<a href="https://colab.research.google.com/github/Kunalzzxx/Customer-Churn-Prediction/blob/main/Customer_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Fetch the 'blastchar/telco-customer-churn' dataset through kagglehub and keep
# the value returned by the download call (presumably the local directory that
# holds the dataset files -- confirm against the kagglehub documentation).
import kagglehub
blastchar_telco_customer_churn_path = kagglehub.dataset_download('blastchar/telco-customer-churn')

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay,accuracy_score

# Load the raw data from the directory the dataset was downloaded to, rather
# than from a hard-coded Kaggle-only path, so the notebook also runs on Colab
# or a local machine.
df = pd.read_csv(f'{blastchar_telco_customer_churn_path}/WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Preview the first rows only instead of rendering the whole frame.
df.head()



In [None]:
df.info()

In [None]:
# Check if any duplicates exist (returns True/False)
has_duplicates = df.duplicated().any()
print(f"Dataset has duplicates: {has_duplicates}")

In [None]:
df.dtypes

    TotalCharges is stored as object, but it is supposed to be a number. Let's convert its data type.

In [None]:
# Single-space strings stand for missing totals: swap them for NaN first,
# then cast the whole column to float.
total_charges_raw = df['TotalCharges'].replace(' ', np.nan)
df['TotalCharges'] = total_charges_raw.astype('float')

In [None]:
df.isnull().sum()

In [None]:
df['TotalCharges'].isna().sum()

In [None]:
df['TotalCharges'].describe()

In [None]:
mask = df['TotalCharges'].isna()
df.loc[mask, ['tenure', 'TotalCharges']].head()

    All 11 missing rows correspond to tenure == 0. For those customers the company hasn’t billed anything yet, so TotalCharges should logically be 0.

In [None]:
# Rows flagged by `mask` (missing TotalCharges) get a total of zero.
df.loc[mask, 'TotalCharges'] = 0.0

In [None]:
df[df['TotalCharges']==0.0]

    The customerID column can be dropped; it's just an identifier and doesn't help in prediction.

In [None]:
# Remove the identifier column. Rebinding `df` with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns='customerID', errors='ignore')

In [None]:
df.MultipleLines.unique()

In [None]:
df.InternetService.unique()

    We will check later whether any other columns need fixing.

In [None]:
df.SeniorCitizen.unique()

    Let's make the SeniorCitizen column a categorical column.

In [None]:
# Turn the 0/1 flag into 'No'/'Yes' labels. `.replace` leaves values that are
# already labels untouched, so re-running the cell does not wipe the column to
# NaN the way `.map` would.
df['SeniorCitizen'] = df['SeniorCitizen'].replace({0: 'No', 1: 'Yes'})

In [None]:
df.SeniorCitizen.unique()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Category counts and their share (in percent) of all rows.
total_rows = len(df)
gender_counts = df['gender'].value_counts()
gender_percentages = (gender_counts / total_rows) * 100
churn_counts = df['Churn'].value_counts()
churn_percentages = (churn_counts / total_rows) * 100

# One donut per column: (legend group, trace name, counts, percentages, colours).
# Slice colours are applied in the order the categories appear in the counts.
donut_specs = [
    ('gender', 'Gender', gender_counts, gender_percentages, ['#4472C4', '#E55A4E']),
    ('churn', 'Churn', churn_counts, churn_percentages, ['#70AD47', '#9966CC']),
]

# Two pie-type panels side by side.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"}, {"type": "pie"}]],
    subplot_titles=("Gender", "Churn"),
    horizontal_spacing=0.1
)

for panel_col, (group, trace_name, counts, percentages, colors) in enumerate(donut_specs, start=1):
    donut = go.Pie(
        labels=counts.index.tolist(),
        values=percentages.tolist(),
        hole=0.4,
        marker=dict(colors=colors),
        textinfo='label+percent',
        textposition='outside',
        showlegend=True,
        legendgroup=group,
        name=trace_name
    )
    fig.add_trace(donut, row=1, col=panel_col)

# Centered title, legend parked to the right of the panels, fixed canvas size.
fig.update_layout(
    title=dict(
        text='Gender and Churn Distribution',
        x=0.5,
        xanchor='center',
        font=dict(size=20)
    ),
    showlegend=True,
    legend=dict(orientation="v", yanchor="middle", y=0.5, xanchor="left", x=1.05),
    width=800,
    height=500
)

fig.show()

# Echo the underlying percentages so the chart can be checked against numbers.
print("Gender Distribution:")
print(gender_percentages)
print("\nChurn Distribution:")
print(churn_percentages)



In [None]:
# Customer counts per gender, one bar per churn outcome placed side by side.
fig = px.histogram(
    data_frame=df,
    x='gender',
    color='Churn',
    color_discrete_sequence=['#636EFA', '#EF553B'],
    barmode='group',
    title='Churn by gender',
)
fig.show()

In [None]:
# Customer counts per senior-citizen status, one bar per churn outcome.
fig = px.histogram(
    data_frame=df,
    x='SeniorCitizen',
    color='Churn',
    color_discrete_sequence=['#00CC96', '#EF553B'],
    barmode='group',
    title='Churn by Senior Citizen Status',
)
fig.show()

In [None]:
# Customer counts per contract type, one bar per churn outcome.
fig = px.histogram(
    data_frame=df,
    x='Contract',
    color='Churn',
    color_discrete_sequence=['#636EFA', '#EF553B'],
    barmode='group',
    title='Churn by Contract Type',
)
fig.show()

In [None]:
# Overlaid tenure histograms, one per churn outcome. Colours are pinned per
# label instead of relying on the order in which labels first appear in the
# data, so churned customers ('Yes') are red here as in the other churn charts.
fig = px.histogram(df, x='tenure', color='Churn',
                   nbins=30, barmode='overlay',
                   opacity=0.6,
                   title='Tenure Distribution by Churn',
                   color_discrete_map={'No': '#00CC96', 'Yes': '#EF553B'})
fig.show()

In [None]:
# Monthly vs. total charges, coloured by churn outcome. Colours are pinned per
# label (rather than assigned by order of appearance) so churned customers
# ('Yes') are red here as in the other churn charts.
fig = px.scatter(df, x='MonthlyCharges', y='TotalCharges', color='Churn',
                 title='Monthly Charges vs Total Charges by Churn',
                 color_discrete_map={'No': '#636EFA', 'Yes': '#EF553B'})
fig.show()

In [None]:
# Row-normalised cross-tab: for each internet service, the percentage of
# customers in each churn class.
service_by_churn = pd.crosstab(df['InternetService'], df['Churn'], normalize='index') * 100

# Long format (one row per service/churn pair) is what plotly express expects.
churn_pct = service_by_churn.reset_index().melt(
    id_vars='InternetService',
    var_name='Churn',
    value_name='Percent',
)

fig = px.bar(
    churn_pct,
    x='InternetService',
    y='Percent',
    color='Churn',
    color_discrete_sequence=['#00CC96', '#EF553B'],
    text_auto='.1f',
    title='Churn % by Internet Service (Stacked)',
)
fig.update_layout(barmode='stack')
fig.show()

In [None]:
# Spread of monthly charges within each churn class.
fig = px.box(
    data_frame=df,
    x='Churn',
    y='MonthlyCharges',
    color='Churn',
    color_discrete_sequence=['#00CC96', '#EF553B'],
    title='Monthly Charges Distribution by Churn',
)
fig.show()

In [None]:
# Work on a separate frame in which the target is numeric (No -> 0, Yes -> 1).
df_corr = df.assign(Churn=df['Churn'].map({'No': 0, 'Yes': 1}))

# Expand the categorical columns into indicator columns (first level dropped).
df_encoded = pd.get_dummies(df_corr, drop_first=True)

# Correlation of every encoded feature with the target, then the 15 largest
# by absolute value.
churn_corr = df_encoded.corr()['Churn'].drop('Churn')
churn_corr_sorted = (
    churn_corr
    .abs()
    .sort_values(ascending=False)
    .head(15)
)

# Bar chart of those 15 absolute correlations, labelled with rounded values.
fig = px.bar(
    x=churn_corr_sorted.index,
    y=churn_corr_sorted.values,
    text=churn_corr_sorted.round(2),
    color=churn_corr_sorted.values,
    color_continuous_scale='RdBu',
    labels={'x': 'Feature', 'y': 'Absolute Correlation with Churn'},
    title='Top 15 Features Correlated with Churn',
)
fig.update_layout(xaxis_tickangle=45)
fig.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Throw-away copy: the integer codes below are only meant for the heatmap.
df_temp = df.copy()

# Replace every object-typed column with integer codes; the encoder is refit
# for each column.
le = LabelEncoder()
object_columns = [name for name in df_temp.columns if df_temp[name].dtype == 'object']
for name in object_columns:
    df_temp[name] = le.fit_transform(df_temp[name])

# Pairwise correlations of all (now numeric) columns as an annotated heatmap.
correlations = df_temp.corr()
fig = px.imshow(
    correlations,
    text_auto=True,
    color_continuous_scale='RdBu_r',
    title="Correlation Heatmap (Raw + Label Encoded)",
)
fig.update_layout(width=1200, height=700)
fig.show()

In [None]:
# Write the current state of the frame to a CSV file, without the index column.
output_csv = 'telcom_customer_churn_pred.csv'
df.to_csv(output_csv, index=False)
print('successfully saved')

In [None]:
# Step 1: Feature Engineering Function
def add_custom_features(df):
    """Return a copy of ``df`` extended with four engineered features.

    The input frame is left untouched, so data passed through the pipeline
    (e.g. the train/test splits) does not silently gain columns.

    Added columns:
        NumServices        -- how many of the eight service columns equal 'Yes'.
        HighMonthlyCharges -- 1 when MonthlyCharges is above 80, otherwise 0.
        IsSeniorDependent  -- 'Yes' when both SeniorCitizen and Dependents are
                              'Yes', otherwise 'No'.
        PricePerService    -- MonthlyCharges divided by (NumServices + 1); the
                              +1 avoids dividing by zero for customers with no
                              'Yes' service.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the eight service columns listed below plus
        'MonthlyCharges', 'SeniorCitizen' and 'Dependents'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the four features.
    """
    services = ['PhoneService', 'MultipleLines', 'OnlineSecurity',
                'OnlineBackup', 'DeviceProtection', 'TechSupport',
                'StreamingTV', 'StreamingMovies']

    # Work on a copy: mutating the caller's frame is a hidden side effect.
    out = df.copy()

    # Vectorised replacements for the former row-wise `apply` calls; these
    # also behave sensibly on an empty frame.
    out['NumServices'] = out[services].eq('Yes').sum(axis=1)
    out['HighMonthlyCharges'] = out['MonthlyCharges'].gt(80).astype('int64')

    senior_with_dependents = out['SeniorCitizen'].eq('Yes') & out['Dependents'].eq('Yes')
    out['IsSeniorDependent'] = senior_with_dependents.map({True: 'Yes', False: 'No'})

    out['PricePerService'] = out['MonthlyCharges'] / (out['NumServices'] + 1)

    return out
# Step 2: Define feature pipeline block
# Wraps add_custom_features so it can be used as a pipeline step. validate=False
# is passed explicitly; presumably this keeps the input a DataFrame so the
# function can address columns by name -- confirm against the scikit-learn docs.
feature_block = FunctionTransformer(add_custom_features, validate=False)

In [None]:
# Columns handed to the one-hot encoder ('IsSeniorDependent' is produced by
# add_custom_features, the rest come from the raw data).
categorical_cols = ['gender', 'SeniorCitizen','InternetService', 'OnlineSecurity',
                    'TechSupport', 'Contract', 'PaperlessBilling',
                    'PaymentMethod', 'IsSeniorDependent']

# Columns handed to the scaler ('NumServices', 'HighMonthlyCharges' and
# 'PricePerService' are produced by add_custom_features).
numerical_cols = ['tenure', 'MonthlyCharges',
                  'NumServices', 'HighMonthlyCharges', 'PricePerService']

In [None]:
# Step 4: Preprocessing block

# A ColumnTransformer applies a different preprocessing step to each group of
# columns: MinMaxScaler for the numerical ones, one-hot encoding for the
# categorical ones (handle_unknown='ignore' is set for categories not seen
# during fit).
preprocessor=ColumnTransformer(transformers=[
    ('num',MinMaxScaler(),numerical_cols),
    ('cat',OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
   remainder='drop' # Columns in neither list are dropped
   )

In [None]:
# Target: churn label encoded as 0/1 (numeric labels, as needed for XGBoost).
y = df['Churn'].map({'No': 0, 'Yes': 1})
# Predictors: every remaining column.
X = df.drop(columns='Churn')

# Hold out 20% of the rows for testing, stratified on the target, with a
# fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
def create_pipeline(classifier):
    """Assemble the modelling pipeline around ``classifier``.

    Steps, in order: the module-level ``feature_block`` transformer, the
    module-level ``preprocessor``, then the given estimator under the step
    name ``'classifier'`` (the prefix used by the ``classifier__*`` search
    parameters).
    """
    steps = [
        ('feature_engineering', feature_block),
        ('preprocessing', preprocessor),
        ('classifier', classifier),
    ]
    return Pipeline(steps)

In [None]:
# Search space per candidate model: the pipeline to tune ('model') and the
# hyperparameter values to sample from ('params'). Parameter names carry the
# 'classifier__' prefix because they target the pipeline's final step.
logistic_space = {
    'classifier__C': np.logspace(-3, 2, 6),  # 0.001, 0.01, 0.1, 1, 10, 100
    'classifier__penalty': ['l1', 'l2'],
}

forest_space = {
    'classifier__n_estimators': [50, 100, 150, 200],
    'classifier__max_depth': [5, 10, 15, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__bootstrap': [True, False],
}

xgboost_space = {
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [3, 5, 7],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.8, 1.0],
    'classifier__n_estimators': [100, 150],
    'classifier__gamma': [0, 0.1, 0.2],
}

model_params = {
    'logistic_regression': {
        'model': create_pipeline(LogisticRegression(solver='liblinear')),
        'params': logistic_space,
    },
    'random_forest': {
        'model': create_pipeline(RandomForestClassifier(random_state=42)),
        'params': forest_space,
    },
    'xgboost': {
        'model': create_pipeline(XGBClassifier(eval_metric='logloss', verbosity=0)),
        'params': xgboost_space,
    },
}

In [None]:
# Tune every candidate pipeline with a randomized search and collect the best
# cross-validated accuracy and parameters of each.
scores = []

for model_name, mp in model_params.items():
    print(f" Tuning {model_name}...")

    # Every search space here is made of finite value lists, so the number of
    # distinct combinations is the product of their lengths. Asking for more
    # iterations than that (as n_iter=20 did for the 12-combination logistic
    # regression grid) makes scikit-learn emit a warning.
    grid_size = int(np.prod([len(values) for values in mp['params'].values()]))

    clf = RandomizedSearchCV(
        mp['model'],
        mp['params'],
        n_iter=min(20, grid_size),  # at most 20 combinations, never more than exist
        cv=3,                       # 3-fold cross-validation
        scoring='accuracy',
        n_jobs=-1,
        verbose=1,
        random_state=42
    )
    clf.fit(X_train, y_train)

    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# Results DataFrame: one row per tuned model.
model_score_df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
model_score_df

In [None]:
# Print each model's search outcome as a small text block.
separator = "-" * 50
for entry in model_score_df.itertuples(index=False):
    print(f"Model: {entry.model}")
    print(f"Best Score: {entry.best_score}")
    print(f"Best Params: {entry.best_params}")
    print(separator)

In [None]:
# Logistic regression with fixed hyperparameters (L1 penalty, C=0.1,
# liblinear solver) plus balanced class weights, wrapped in the shared
# pipeline.
logistic_model = create_pipeline(
    LogisticRegression(
        class_weight='balanced',
        solver='liblinear',
        C=0.1,
        penalty='l1',
    )
)

# Train on the training split, then predict the held-out split.
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

# Headline metrics.
accuracy = accuracy_score(y_test, y_pred)
print("✅ Accuracy on Test Data: {:.2f}%".format(accuracy * 100))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix, each cell expressed as a percentage of all test samples.
cm = confusion_matrix(y_test, y_pred)
cm_percent = cm / cm.sum() * 100

class_names = ['No Churn', 'Churn']
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm_percent,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix (Percentage)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:


# Define the individual models with best parameters (or reasonable ones)
# Base learners for the ensemble, each with fixed hyperparameters.
log_reg = LogisticRegression(
    penalty='l1',
    C=0.1,
    solver='liblinear',
    class_weight='balanced'
)

rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=42
)

# `use_label_encoder` is no longer passed: the argument is deprecated in
# XGBoost and only produces a warning in current releases. The XGBoost
# pipeline in the search space is built without it as well.
xgb_clf = XGBClassifier(
    n_estimators=150,
    learning_rate=0.1,
    subsample=0.8,
    eval_metric='logloss',
    random_state=42
)

# Soft voting averages the predicted class probabilities of the three models
# (use 'hard' for majority voting on predicted labels instead).
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_reg),
        ('rf', rf_clf),
        ('xgb', xgb_clf)
    ],
    voting='soft'
)

# Final ensemble pipeline: feature engineering -> preprocessing -> voting model.
model = Pipeline(steps=[
    ('feature_engineering', feature_block),
    ('preprocessing', preprocessor),
    ('classifier', voting_clf)
])

# Fit on training data
model.fit(X_train, y_train)

# Predict on test data
y_pred_ensemble = model.predict(X_test)

# Evaluate
print("Classification Report:\n", classification_report(y_test, y_pred_ensemble))

# Confusion matrix normalised per true class (each row sums to 1).
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred_ensemble,
    display_labels=['No Churn', 'Churn'],
    cmap='Blues',
    normalize='true'
)
# Render the figure explicitly so the cell does not end on the display
# object's repr.
plt.show()