<a href="https://colab.research.google.com/github/Khushibung05/Ensemble_learning/blob/main/telco_customer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    recall_score,
    precision_score,
    f1_score
)


from google.colab import drive

# Mount your Google Drive
drive.mount('/content/drive')

# Path to the dataset (update if needed)
path = '/content/drive/My Drive/Colab Notebooks/telco_customer/telco_customer_churn.csv'

# Importing the dataset
df= pd.read_csv(path)
print(df.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4         

In [7]:
# Convert TotalCharges to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Fill missing values safely
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Encode target
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Drop customerID safely
df.drop(columns=['customerID'], errors='ignore', inplace=True)


In [8]:
#Encode Categorical Features
label_encoders = {}

for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [18]:
#Features & Target Split
X = df.drop('Churn', axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)
#standardistion
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)


In [19]:
#Train Decision Tree (Baseline)
dt = DecisionTreeClassifier(
    max_depth=5,
    random_state=42
)

dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

In [20]:
#Evaluate Baseline Model
print("=== Decision Tree (Baseline) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))

=== Decision Tree (Baseline) ===
Accuracy: 0.794180269694819
Confusion Matrix:
 [[881 155]
 [135 238]]


In [21]:
#Train Gradient Boosting Model
gb = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

In [22]:
#Evaluate Ensemble Model
print("\n=== Gradient Boosting (Ensemble) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))


=== Gradient Boosting (Ensemble) ===
Accuracy: 0.8026969481902059
Confusion Matrix:
 [[934 102]
 [176 197]]


#Confusion Matrix Interpretation

From confusion matrix:

True Positives (TP) → Churn customers correctly identified

False Positives (FP) → Loyal customers wrongly flagged

False Negatives (FN) → Churn customers missed

Is it better to wrongly flag a loyal customer or miss a churn customer?

✔ Better to wrongly flag a loyal customer

Reason:

Wrong flag → small discount cost

Missed churn → permanent revenue loss

In [23]:
#dentify Important Features
feature_importance = pd.Series(
    gb.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

print(feature_importance.head(10))

Contract            0.385990
tenure              0.148316
MonthlyCharges      0.133734
TotalCharges        0.102850
OnlineSecurity      0.079644
TechSupport         0.051529
PaymentMethod       0.020301
PaperlessBilling    0.017354
InternetService     0.014535
OnlineBackup        0.011247
dtype: float64
