<a href="https://colab.research.google.com/github/Mohamed-Nawfal/Telecom-Customer-Churn-Prediction/blob/main/Telecom_CustumerChurnPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Core imports for data handling and plotting used throughout the notebook.
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import seaborn as sns
# Apply seaborn's default theme to all subsequent plots.
sns.set()
import warnings
# NOTE(review): this silences every warning, including library deprecation notices.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw dataset (path is relative to the working directory) and preview the first rows.
customer_churn = pd.read_csv('Telcom Data.csv')
customer_churn.head()

In [None]:
# Summary of column dtypes and non-null counts for the raw data.
customer_churn.info()

In [None]:
# Data cleaning
# Count rows that are exact duplicates of an earlier row.
customer_churn.duplicated().sum()

In [None]:
# List the column names of the raw data.
customer_churn.columns

In [None]:
# Relative frequency of each Churn class (shows how imbalanced the target is).
customer_churn['Churn'].value_counts(normalize=True)

In [None]:
# Absolute count of each Churn class.
customer_churn['Churn'].value_counts()

In [None]:
# Encode the target numerically: 'Yes' -> 1, 'No' -> 0.
customer_churn['Churn'] = customer_churn['Churn'].replace({'Yes':1, 'No':0})

In [None]:
# Class counts again, now on the encoded target.
customer_churn['Churn'].value_counts()

In [None]:
# Column names, shown again before inspecting individual features.
customer_churn.columns

In [None]:
# Category frequencies for gender.
customer_churn['gender'].value_counts()

In [None]:
# Value frequencies for SeniorCitizen.
customer_churn['SeniorCitizen'].value_counts()

In [None]:
# Category frequencies for MultipleLines.
customer_churn['MultipleLines'].value_counts()

In [None]:
# Category frequencies for InternetService.
customer_churn['InternetService'].value_counts()

In [None]:
# Category frequencies for OnlineSecurity.
customer_churn['OnlineSecurity'].value_counts()

In [None]:
# Category frequencies for StreamingTV.
customer_churn['StreamingTV'].value_counts()

In [None]:
# Print the set of distinct values held by every column, one banner per column.
for column_name, column_values in customer_churn.items():
    print("******************************", column_name, "********************************************")
    print()
    distinct_values = set(column_values.tolist())
    print(distinct_values)
    print()

In [None]:
# Mark TotalCharges entries that are exactly a single space as missing (NaN).
customer_churn['TotalCharges'] = customer_churn['TotalCharges'].replace(" ", np.nan)


In [None]:
# Number of missing TotalCharges values.
customer_churn['TotalCharges'].isnull().sum()

In [None]:
# Most frequent TotalCharges value(s), ignoring missing entries.
customer_churn['TotalCharges'].mode()

In [None]:
# Impute missing TotalCharges with the column's most frequent value.
# The value is computed from the data rather than hard-coded, so the imputation
# stays correct if the CSV changes; `.iloc[0]` takes the first mode when several tie.
total_charges_mode = customer_churn['TotalCharges'].mode().iloc[0]
customer_churn['TotalCharges'] = customer_churn['TotalCharges'].fillna(total_charges_mode)

In [None]:
# Re-count missing TotalCharges values after filling.
customer_churn['TotalCharges'].isnull().sum()

In [None]:
# Convert TotalCharges to a floating-point column.
customer_churn['TotalCharges'] = customer_churn['TotalCharges'].astype(float)

In [None]:
# Re-check dtypes and non-null counts after the TotalCharges conversion.
customer_churn.info()

In [None]:
# Remove the customerID column from the frame.
customer_churn = customer_churn.drop(columns=['customerID'])

In [None]:
# Column names after dropping customerID.
customer_churn.columns

In [None]:
# One-hot encode the categorical features; drop_first=True omits the first
# level of each feature so the dummy columns are not perfectly collinear.
categorical_columns = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod',
]
customer_churn = pd.get_dummies(
    customer_churn,
    columns=categorical_columns,
    drop_first=True,
)

In [None]:
# Dtypes and non-null counts after one-hot encoding.
customer_churn.info()

In [None]:
# Preview the encoded frame.
customer_churn.head()

In [None]:
# Store gender_Male as 1 where the dummy is True and 0 otherwise.
customer_churn['gender_Male'] = np.where(customer_churn['gender_Male'].eq(True), 1, 0)

In [None]:
# Convert every boolean column to 0/1 integers in one vectorised step.
# This replaces the element-wise `DataFrame.applymap`, which is deprecated in
# recent pandas releases and slow on wide frames. Non-boolean columns are left
# untouched, and `astype` returns a new frame, so `customer_churn` is not modified.
bool_columns = customer_churn.select_dtypes(include='bool').columns
df = customer_churn.astype({column: int for column in bool_columns})

In [None]:
# Preview the all-numeric modelling frame.
df.head()

In [None]:
# Separate the target (y) from the predictor columns (x).
y = df['Churn']
x = df.drop(columns='Churn')

In [None]:
# Preview the predictor matrix.
x.head()

In [None]:
# Display the target series.
y

In [None]:
# Feature scaling
from sklearn.preprocessing import StandardScaler
scaler_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
# Standardise only the listed columns; every other predictor is left as-is.
# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/test split that happens later in the notebook.
scaler = StandardScaler().fit(x[scaler_columns])
x[scaler_columns] = scaler.transform(x[scaler_columns])

In [None]:
# Preview the predictors after scaling.
x.head()

In [None]:
# Handle the imbalanced target by oversampling the minority class with SMOTE.
import imblearn
from imblearn.over_sampling import SMOTE
# SMOTE draws random synthetic samples; fix the seed (same value used for the
# split and the models) so the resampled data and all metrics are reproducible.
smote = SMOTE(random_state=101)
x_smote, y_smote = smote.fit_resample(x, y)
# Class counts before and after resampling.
print(y.value_counts())
print()
print(y_smote.value_counts())

In [None]:
# Split the data into training and testing sets.
# The ORIGINAL (un-resampled) data is split first and only the training portion
# is oversampled. Splitting after SMOTE would put synthetic samples in the test
# set and let information from test rows leak into training through SMOTE's
# nearest-neighbour interpolation, inflating every reported metric.
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# stratify=y keeps the real-world class ratio in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=101, stratify=y
)
# Balance the classes in the training split only; the test split stays untouched.
x_train, y_train = SMOTE(random_state=101).fit_resample(x_train, y_train)

In [None]:
    pip install -U ydata-profiling

In [None]:
# pandas profiling
# Build an automated profiling report of `df` and write it to output.html.
from ydata_profiling import ProfileReport
reports = ProfileReport(df)
reports.to_file(output_file='output.html')

Logistic Regression

In [None]:
# Fit a logistic regression with scikit-learn's default settings on the training split.
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression()
logit.fit(x_train, y_train)

In [None]:
# Predict labels for both splits so train and test performance can be compared.
y_pred_train, y_pred_test = (logit.predict(features) for features in (x_train, x_test))

In [None]:
# Evaluation metrics
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Classification report for the training split, a blank line, then the test split.
train_report = classification_report(y_train, y_pred_train)
test_report = classification_report(y_test, y_pred_test)
print(train_report, test_report, sep="\n\n")

In [None]:
# Accuracy on the training split, a blank line, then the test split.
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(train_accuracy, test_accuracy, sep="\n\n")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyper-parameters and a fixed seed.
rf_model = RandomForestClassifier(random_state=101)
rf_model.fit(x_train, y_train)

# Predictions for the training and testing splits.
y_pred_train_rf, y_pred_test_rf = (rf_model.predict(features) for features in (x_train, x_test))

# Per-class precision / recall / F1 for each split.
rf_train_report = classification_report(y_train, y_pred_train_rf)
rf_test_report = classification_report(y_test, y_pred_test_rf)
print("Random Forest - Training Classification Report:")
print(rf_train_report)
print("\nRandom Forest - Testing Classification Report:")
print(rf_test_report)

# Overall accuracy for each split.
rf_train_accuracy = accuracy_score(y_train, y_pred_train_rf)
rf_test_accuracy = accuracy_score(y_test, y_pred_test_rf)
print("\nRandom Forest - Training Accuracy:", rf_train_accuracy)
print("Random Forest - Testing Accuracy:", rf_test_accuracy)

In [None]:
# Random forest accuracy on the training split, then the test split
# (the same two values the previous cell already prints with labels).
print(accuracy_score(y_train, y_pred_train_rf))
print()
print(accuracy_score(y_test, y_pred_test_rf))

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates: 2 x 3 x 3 x 1 = 18 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    max_features=['sqrt'],
)

# 3-fold cross-validated search that ranks candidates by recall and runs on all cores.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=101),
    param_grid,
    scoring='recall',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(x_train, y_train)

# Keep the best-scoring estimator and report the parameters that produced it.
best_rf = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# Evaluate the tuned model on the held-out test split.
y_pred_tuned = best_rf.predict(x_test)
print("\n--- Tuned Random Forest Report ---")
print(classification_report(y_test, y_pred_tuned))

In [None]:
# Rank the predictors by the tuned forest's importances and keep the top 10.
importances = best_rf.feature_importances_
feature_names = x.columns
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
    .head(10)
)

# Horizontal bar chart of the ten most important features.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature', palette='viridis')
plt.title('Top 10 Drivers of Customer Churn')
plt.show()

In [None]:
# 1. Churn by Contract Type
# 'Contract' was one-hot encoded earlier, so the original categorical column is
# re-read from the CSV into a temporary frame that is used only for this plot.
temp_churn_df = pd.read_csv('Telcom Data.csv')
# Encode Churn the same way as in the main frame ('Yes' -> 1, 'No' -> 0).
temp_churn_df['Churn'] = temp_churn_df['Churn'].replace({'Yes':1, 'No':0})

plt.figure(figsize=(8, 5))
sns.countplot(x='Contract', hue='Churn', data=temp_churn_df)
plt.title('Churn Rate by Contract Type')
plt.show()

# 2. Tenure vs Churn
# `fill=True` replaces the deprecated `shade=True`, which seaborn has announced
# will become an error; the rendered plot is the same.
tenure_stayed = customer_churn.loc[customer_churn['Churn'] == 0, 'tenure']
tenure_churned = customer_churn.loc[customer_churn['Churn'] == 1, 'tenure']

plt.figure(figsize=(10, 5))
sns.kdeplot(tenure_stayed, label='Stayed', fill=True)
sns.kdeplot(tenure_churned, label='Churned', fill=True)
plt.title('Customer Retention based on Tenure (Months)')
plt.xlabel('Tenure')
plt.legend()
plt.show()