Task 1: Data Loading & Preparation

In [4]:
import pandas as pd
import numpy as np

# Read the raw export once and keep it untouched; every cleaning step below
# derives a new frame instead of mutating in place.
raw = pd.read_csv('Telco_Customer_Churn_Dataset  (1).csv')

data = (
    raw
    # 'customerID' is a per-row identifier, not a feature
    .drop(columns='customerID')
    # 'TotalCharges' is coerced to numeric; anything unparseable
    # (e.g. empty strings) becomes NaN
    .assign(
        TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce')
    )
    # Fill remaining gaps in numeric columns with that column's median
    .pipe(lambda frame: frame.fillna(frame.median(numeric_only=True)))
)

# Preview the cleaned frame
data.head()


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Task 2: Encoding Categorical Variables

In [5]:
from sklearn.preprocessing import LabelEncoder

# Name of the label column; it is encoded separately from the features so that
# it is transformed exactly once and its encoder keeps the original class labels.
TARGET_COL = 'Churn'

# Identify all categorical (object) columns
cat_cols = data.select_dtypes(include='object').columns

# Categorical *feature* columns only. The target also has two levels, so leaving
# it in here would label-encode it in the loop below and then a second time at
# the end of the cell (on already-numeric values).
feature_cat_cols = [col for col in cat_cols if col != TARGET_COL]

# Two-level features (Yes/No, Male/Female, ...) -> a single 0/1 column.
# A fresh encoder is used per column; none of these mappings is reused later.
binary_cols = [col for col in feature_cat_cols if data[col].nunique() == 2]

for col in binary_cols:
    data[col] = LabelEncoder().fit_transform(data[col])

# One-hot encoding for the remaining (multi-class) categorical features
multi_class_cols = [col for col in feature_cat_cols if col not in binary_cols]
data = pd.get_dummies(data, columns=multi_class_cols)

# Encode the target column. LabelEncoder assigns codes in sorted label order,
# so with the raw 'No'/'Yes' strings this gives No = 0, Yes = 1, and
# `label_encoder.classes_` keeps those labels for mapping predictions back.
label_encoder = LabelEncoder()
data[TARGET_COL] = label_encoder.fit_transform(data[TARGET_COL])  # Yes = 1, No = 0


Task 3: Train-Test Split

In [7]:

from sklearn.model_selection import train_test_split

# The label is the 'Churn' column; the feature matrix is every other column.
y = data['Churn']
X = data.drop(columns='Churn')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Task 4: Feature Selection

In [8]:
# Hand-picked predictors (domain knowledge): customer tenure, the two charge
# columns, and the one-hot columns for contract type.
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
contract_features = [
    'Contract_Month-to-month',
    'Contract_One year',
    'Contract_Two year',
]
selected_features = numeric_features + contract_features

# Restrict both partitions to the same columns, in the same order.
X_train_selected, X_test_selected = (
    partition[selected_features] for partition in (X_train, X_test)
)


Task 5: Model Selection and Training

In [9]:

from sklearn.ensemble import RandomForestClassifier

# Random forest classifier with library defaults; only the seed is pinned so
# that repeated runs build the same forest.
forest_params = {'random_state': 42}
rf_model = RandomForestClassifier(**forest_params)

# Fit on the selected training features only.
rf_model.fit(X_train_selected, y_train)


In [None]:
# Task 6: Model Evaluation

In [10]:

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report
)

# Hard class predictions, plus the second predict_proba column, which is used
# as the score for ROC-AUC.
y_pred = rf_model.predict(X_test_selected)
y_proba = rf_model.predict_proba(X_test_selected)[:, 1]

# Metrics computed from the hard predictions, in the order they are reported.
label_metrics = [
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
]

print("✅ Evaluation Results:")
for metric_name, metric_fn in label_metrics:
    print(f"{metric_name}: {metric_fn(y_test, y_pred):.4f}")

# ROC-AUC is the only metric here that takes scores rather than labels.
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_proba):.4f}")

# Per-class precision / recall / F1 and support.
print("\n📄 Classification Report:\n", classification_report(y_test, y_pred))


✅ Evaluation Results:
Accuracy: 0.7587
Precision: 0.5541
Recall: 0.4652
F1 Score: 0.5058
ROC-AUC Score: 0.7848

📄 Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.86      0.84      1035
           1       0.55      0.47      0.51       374

    accuracy                           0.76      1409
   macro avg       0.69      0.66      0.67      1409
weighted avg       0.75      0.76      0.75      1409

