In [None]:
# Data preprocessing and modeling libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, 
                             classification_report, roc_curve, auc)

# Neural network libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Visualization and data handling
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import os

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Seed NumPy and TensorFlow with one shared value for reproducibility.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("All modeling libraries imported successfully.")


All modeling libraries imported successfully.


In [None]:
# Read the cleaned Telco churn table written out by the EDA step.
df = pd.read_csv('../data/telco_churn_cleaned.csv')

# Quick overview: dimensions and a preview of the first rows.
print("Dataset loaded from EDA:")
print("Shape: {} rows, {} columns".format(*df.shape))
print("\nFirst few rows:")
print(df.head())

# Report the dtype of TotalCharges and how many of its entries are missing.
print(f"\nTotalCharges data type: {df['TotalCharges'].dtype}")
print(f"Missing values in TotalCharges: {df['TotalCharges'].isna().sum()}")

# Remove the customerID column when present; errors='ignore' makes this a
# no-op if the column does not exist.
df = df.drop(columns=['customerID'], errors='ignore')

print(f"\nData loaded successfully! Shape after removing customerID: {df.shape}")


Dataset loaded from EDA:
Shape: 7043 rows, 22 columns

First few rows:
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... TechSupport  \
0  No phone service             DSL             No  ...          No   
1                No             DSL            Yes  ...          No   
2                No             DSL            Yes  ...          No   
3  No phone service             DSL            Yes  ...         Yes   
4                No     Fiber optic             No  ...          No   

  StreamingTV Streami

In [3]:
def _encode_labels(series, mapping):
    """Replace the labels in `series` using `mapping` and validate the result.

    Parameters
    ----------
    series : pd.Series
        Column holding the raw labels.
    mapping : dict
        Label -> encoded value.

    Returns
    -------
    pd.Series
        The mapped column. Values that were missing in `series` stay missing.

    Raises
    ------
    ValueError
        If a non-null value in `series` has no entry in `mapping`. A bare
        `Series.map` would silently turn such values into NaN.
    """
    encoded = series.map(mapping)
    # Only values that were present before and are missing now are unmapped.
    unmapped = series[encoded.isna() & series.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column '{series.name}' has values not covered by {mapping}: {list(unmapped)}"
        )
    return encoded


# Work on a copy so the loaded frame `df` stays untouched and this cell can be
# re-run safely.
df_processed = df.copy()

yes_no_mapping = {'Yes': 1, 'No': 0}

# Target column: Yes -> 1, No -> 0.
df_processed['Churn'] = _encode_labels(df_processed['Churn'], yes_no_mapping)

# Binary categorical columns (convert to 0/1)
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for col in binary_cols:
    if col in df_processed.columns:
        df_processed[col] = _encode_labels(df_processed[col], yes_no_mapping)

# Gender (convert to 0/1)
if 'gender' in df_processed.columns:
    df_processed['gender'] = _encode_labels(df_processed['gender'], {'Male': 1, 'Female': 0})

# SeniorCitizen might be text from EDA. Test for "not numeric" rather than
# dtype == 'object' so that text stored with pandas' string dtype is encoded too.
if 'SeniorCitizen' in df_processed.columns:
    if not pd.api.types.is_numeric_dtype(df_processed['SeniorCitizen']):
        df_processed['SeniorCitizen'] = _encode_labels(df_processed['SeniorCitizen'], yes_no_mapping)

print("✓ Binary encoding completed!")

# Multi-category columns  (using One-Hot Encoding)
multi_category_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 
                       'OnlineBackup', 'DeviceProtection', 'TechSupport',
                       'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']

# Encode only the columns that actually exist in the frame.
cols_to_encode = [col for col in multi_category_cols if col in df_processed.columns]

# Applying one-hot encoding; drop_first=True drops the first level of each
# encoded column.
df_encoded = pd.get_dummies(df_processed, columns=cols_to_encode, drop_first=True)

print(f"\nShape after encoding: {df_encoded.shape}")
print(f"Features increased from {df.shape[1]} to {df_encoded.shape[1]} columns")
print("\n All categorical variables encoded.")


✓ Binary encoding completed!

Shape after encoding: (7043, 32)
Features increased from 21 to 32 columns

 All categorical variables encoded.
