<a href="https://colab.research.google.com/github/MahmoodAbdali79/Telecom-Churn/blob/main/Telecom_Churn_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Seting Up

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load Data & Manipulation

In [3]:
df = pd.read_csv('drive/MyDrive/Data/Telco-Customer-Churn.csv')

In [4]:
def preprocess(df):
  # drop Customer ID
  df.drop(['customerID'], axis=1, inplace=True)

  # turn into numerical and fill NA value
  df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
  df['TotalCharges_missing'] = df['TotalCharges'].isnull().astype(int)

  imputation_value = df['TotalCharges'].mode()[0] # for considering mode
  # imputation_value = df['TotalCharges'].mean() # for considering mean
  # imputation_value = df['TotalCharges'].median() # for considering median
  df['TotalCharges'] = df['TotalCharges'].fillna(imputation_value)

  # drop rows with tenure of 0
  df.drop(labels=df[df['tenure']==0].index, axis=0, inplace=True)

  # map SeniorCitizen
  df['SeniorCitizen'] = df['SeniorCitizen'].map({0:"No", 1:"Yes"})

  for col in df.columns:
    if len(df[col].unique()) < 2:
      df.drop(col, axis=1, inplace=True)
  return df

In [5]:
df = preprocess(df)

# Feature Engineering

 ## Feature Encoding


  - Quantitative data:
    - [ ] Tranformation
    - [ ] Bining
    - [ ] Demention reduction
    - [ ] Standard and scale
  - Categorical data:
    - [ ] One hot encoding
    - [x] Lable endcoding
    - [ ] Hashing

In [6]:
#Create a label encoder object
le = LabelEncoder()
# Label Encoding will be used for columns with 2 or less unique
for col in df.columns:
    if df[col].dtype == 'object':
      le.fit(df[col])
      df[col] = le.transform(df[col])

## Feature Selection

### Filter Method
- Correation
  - Best feature for remove: MonthlyCharges, tenure             
- Vatiance
  - Best feature for remove:  
  PaperlessBilling, gender, Dependents, PhoneService, Partner, SeniorCitizen
- Univariate feature selection:
  - Best feature for remove:  
  MultipleLines, gender, StreamingTV, StreamingMovies, PhoneService, InternetService

#### Correation

In [7]:
corr = df.corr()

threshold = 0.5  # threshold

# Find pairs of features that exceed the threshold
highly_correlated_pairs = []
for i in range(len(corr.columns)):
    for j in range(i+1, len(corr.columns)):
        if abs(corr.iloc[i, j]) > threshold:
            highly_correlated_pairs.append((corr.index[i], corr.columns[j], corr.iloc[i, j].round(3)))

# Print highly correlated pairs
print("Highly correlated pairs with correlation coefficient > {}: ".format(threshold))
for pair in highly_correlated_pairs:
    print(pair)


corr_omissions = ['tenure', 'MonthlyCharges']

Highly correlated pairs with correlation coefficient > 0.5: 
('tenure', 'Contract', 0.677)
('tenure', 'TotalCharges', 0.826)
('MonthlyCharges', 'TotalCharges', 0.651)


In [8]:
correlation = df.corrwith(df.Churn)
correlation = correlation[correlation!=1].sort_values()
correlation

Contract           -0.396150
tenure             -0.354049
OnlineSecurity     -0.289050
TechSupport        -0.282232
TotalCharges       -0.199484
OnlineBackup       -0.195290
DeviceProtection   -0.177883
Dependents         -0.163128
Partner            -0.149982
InternetService    -0.047097
StreamingMovies    -0.038802
StreamingTV        -0.036303
gender             -0.008545
PhoneService        0.011691
MultipleLines       0.038043
PaymentMethod       0.107852
SeniorCitizen       0.150541
PaperlessBilling    0.191454
MonthlyCharges      0.192858
dtype: float64

#### Vatiance

In [9]:
threshold = 0.5  # Example threshold

# Apply VarianceThreshold
selector = VarianceThreshold(threshold=threshold)
features = df.drop('Churn', axis=1)
selector.fit(features)

# Get selected features
selected_features = features.columns[selector.get_support()]

x = set(selected_features)
y = set(features.columns)
print("Selected features based on variance threshold of {}: ".format(threshold))
print('Potential features to remove: ', y-x)

varianc_omissions = list(y-x)

Selected features based on variance threshold of 0.5: 
Potential features to remove:  {'SeniorCitizen', 'PhoneService', 'Partner', 'Dependents', 'PaperlessBilling', 'gender'}


#### Univariate feature selection

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split


X = df.drop('Churn', axis=1)
y = df['Churn']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform feature selection using ANOVA F-value
k = 13  # Number of features to select
selector = SelectKBest(score_func=f_classif, k=k)
X_train_selected = selector.fit_transform(X_train, y_train)
selected_features = X.columns[selector.get_support()]

x = set(selected_features)
y = set(features.columns)
print('Potential features to remove: ', y-x)

univariate_omissions = list(y-x)

Potential features to remove:  {'StreamingTV', 'MultipleLines', 'PhoneService', 'InternetService', 'StreamingMovies', 'gender'}


### conclusion for filtering
- [x] correlation
- [ ] variance
- [ ] univariate

In [11]:
def select_feature(df, method):
  if method == 'corr':
    X = df.drop(univariate_omissions + ['Churn'], axis=1)
  elif method == 'varianc':
    X = df.drop(varianc_omissions + ['Churn'], axis=1)
  elif method == 'univariate':
    X = df.drop(univariate_omissions + ['Churn'], axis=1)
  Y = df['Churn']

  return X, Y

## Feature Scaling

- [ ] Standardization (Z-score normalization)
- [ ] Min-Max sclaing
- [x] Robust sclaling

In [12]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

def plot_numerical_feature(cat, ax):
  sns.kdeplot(X[cat][Y==0], color="Blue", fill=True, ax=ax)
  sns.kdeplot(X[cat][Y==1], color="Red", fill= True, ax=ax)
  ax.legend(["Not Churn","Churn"],loc='upper right')
  ax.set_title(f'Distribution of {col} by churn', fontweight="bold")
  fig.tight_layout()

def scale_feature(data, name):
  if name == 'MinMaxScaler':
    scaler = MinMaxScaler()
    data = scaler.fit_transform(data)
  elif name == 'StandardScaler':
    scaler = StandardScaler()
    data = scaler.fit_transform(data)
  elif name == 'RobustScaler':
    robust_scaler = RobustScaler()
    data = robust_scaler.fit_transform(data)
  return  data

# Model Selection

Some issues:
- Imbalanced data
  - [x] Target
  - [x] train, test
- [ ] Feature selection
  - [ ] corr
  - [ ] variance
  - [x] uni
- [x] Define metrics
- [x] using cross validation
- [ ] Hypetparameter tuning

## Balance Data

Over-sampling & under_sampling

In [13]:
# !pip install -U imbalanced-learn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [14]:
def Test_model(X, Y, clf):

  # Inirial mterics dict
  Metrics = {}

  # Split the data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
  clf.fit(X_train, y_train)
  y_pred_original = clf.predict(X_test)
  recall_original = recall_score(y_test, y_pred_original)
  precision_original = precision_score(y_test, y_pred_original)
  Metrics['Originla'] = [recall_original, precision_original]

  # Over-sampling (SMOTE)
  smote = SMOTE(random_state=42)
  X_resampled_smote, y_resampled_smote = smote.fit_resample(X, Y)
  X_train, X_test, y_train, y_test = train_test_split(X_resampled_smote, y_resampled_smote, test_size=0.2, random_state=42)
  clf.fit(X_train, y_train)
  y_pred_smote = clf.predict(X_test)
  recall_smote = recall_score(y_test, y_pred_smote)
  precision_smote = precision_score(y_test, y_pred_smote)
  Metrics['Smote'] = [recall_smote, precision_smote]

  # Under-sampling (Random Under-sampling)
  rus = RandomUnderSampler(random_state=42)
  X_resampled_rus, y_resampled_rus = rus.fit_resample(X, Y)
  X_train, X_test, y_train, y_test = train_test_split(X_resampled_rus, y_resampled_rus, test_size=0.2, random_state=42)
  clf.fit(X_train, y_train)
  y_pred_rus = clf.predict(X_test)
  recall_rus = recall_score(y_test, y_pred_rus)
  precision_rus = precision_score(y_test, y_pred_rus)
  Metrics['RandomUnderSampler'] = [recall_rus, precision_rus]

  last_balance_name = max(Metrics, key=Metrics.get)

  return last_balance_name, Metrics[last_balance_name]

## Creating Molds

In [26]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

In [None]:
feature_selection = ['corr','varianc','univariate']

scalers = ['MinMaxScaler','StandardScaler','RobustScaler']

Models = {}
Models = {
    'svm_kernel': SVC(kernel='rbf', random_state=42),
    'random_forest': RandomForestClassifier(random_state=42),
    # 'neural_network': MLPClassifier(random_state=42),
    'gradient_boosting': GradientBoostingClassifier(random_state=42),
    'decision_tree': DecisionTreeClassifier(random_state=42),
    'logistic_regression': LogisticRegression(random_state=42),
    'naive_bayes': GaussianNB(),
    'linear_svm': LinearSVC(random_state=42)
}

Modles_metrics = {}
Modles_metrics = {'Model name': [],
                  'Feature selection': [],
                  'Scalre': [],
                  'Balance type': [],
                  'Recall': [],
                  'Precision': [],

                  }

for fs in feature_selection:
  X, Y = select_feature(df, fs)

  for scale in scalers:
    X = scale_feature(X, scale)

    for name, model in Models.items():
      Result = Test_model(X, Y, model)
      Modles_metrics['Model name'].append(name)
      Modles_metrics['Feature selection'].append(fs)
      Modles_metrics['Scalre'].append(scale)
      Modles_metrics['Balance type'].append(Result[0])
      Modles_metrics['Recall'].append(Result[1][0])
      Modles_metrics['Precision'].append(Result[1][1])

Modles_metrics = pd.DataFrame(Modles_metrics).sort_values(by='Recall', ascending=False)

In [None]:
Modles_metrics

Unnamed: 0,Model name,Feature selection,Scalre,Balance type,Recall,Precision
51,gradient_boosting,univariate,StandardScaler,Smote,0.866861,0.800000
9,gradient_boosting,corr,StandardScaler,Smote,0.866861,0.800000
2,gradient_boosting,corr,MinMaxScaler,Smote,0.855199,0.799273
44,gradient_boosting,univariate,MinMaxScaler,Smote,0.855199,0.799273
16,gradient_boosting,corr,RobustScaler,Smote,0.854227,0.787634
...,...,...,...,...,...,...
24,decision_tree,varianc,MinMaxScaler,Smote,0.765792,0.777120
52,decision_tree,univariate,StandardScaler,Smote,0.763848,0.763848
10,decision_tree,corr,StandardScaler,Smote,0.763848,0.763848
17,decision_tree,corr,RobustScaler,Smote,0.759961,0.770443


### Bagginh

In [None]:
X, Y = select_feature(df, 'univariate')
X = scale_feature(X, 'MinMaxScaler')

# Initialize a base learner (e.g., Decision Tree classifier)
base_learner = DecisionTreeClassifier(random_state=42)

# Initialize the Bagging classifier
bagging_clf = BaggingClassifier(estimator=base_learner, n_estimators=10, random_state=42)

Test_model(X, Y, bagging_clf)

('Smote', [0.783284742468416, 0.8249744114636642])

### Boosting

In [None]:
X, Y = select_feature(df, 'univariate')
X = scale_feature(X, 'StandardScaler')

base_learner = DecisionTreeClassifier(random_state=42)

# Initialize the AdaBoost classifier
adaboost_clf = AdaBoostClassifier(base_estimator=base_learner, n_estimators=10, random_state=42)
Test_model(X, Y, bagging_clf)

('Smote', [0.7978620019436345, 0.8251256281407036])

### Hyper parameter

In [16]:
X, Y = select_feature(df, 'univariate')
X = scale_feature(X, 'StandardScaler')

smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X, Y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled_smote, y_resampled_smote, test_size=0.2, random_state=42)

param_grid_gb = {
    'n_estimators': [100, 150],  # Number of trees in the forest
    'learning_rate': [0.01, 0.1],  # Learning rate
    'max_depth': [3, 5],  # Maximum depth of the trees
    # Add more parameters as needed
}

gb_clf = GradientBoostingClassifier(random_state=42)

grid_search_gb = GridSearchCV(gb_clf, param_grid_gb, cv=5, scoring='recall')
grid_search_gb.fit(X_train, y_train)

# Get the best hyperparameters
best_params_gb = grid_search_gb.best_params_

In [17]:
best_params_gb

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}

In [18]:
y_pred = grid_search_gb.predict(X_test)
recall_rus = recall_score(y_test, y_pred)
print(f'Recall: {recall_rus}')

Recall: 0.858114674441205


In [19]:
X, Y = select_feature(df, 'univariate')
X = scale_feature(X, 'MinMaxScaler')

smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X, Y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled_smote, y_resampled_smote, test_size=0.2, random_state=42)


param_grid_rf = {
    'n_estimators': [100, 150],
    'max_depth': [3,5,7],
    'max_features':[0.2,0.7],
    'min_samples_split': [2, 5],
    'min_samples_leaf':[1,5]
}

rf_clf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf_clf, param_grid_rf, cv=5, scoring='recall')
grid_search_rf.fit(X_train, y_train)

# Get the best hyperparameters
best_params_rf = grid_search_rf.best_params_
print("Best Hyperparameters for Random Forest:", best_params_rf)

# Best Hyperparameters for Random Forest: {'max_depth': 3, 'max_features': 0.7, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Best Hyperparameters for Random Forest: {'max_depth': 3, 'max_features': 0.7, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [20]:
print(grid_search_rf.best_score_)

0.8853365968771867


### Ensemble Model

In [22]:
X, Y = select_feature(df, 'univariate')
X = scale_feature(X, 'MinMaxScaler')

smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X, Y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled_smote, y_resampled_smote, test_size=0.2, random_state=42)

# Initialize the best Gradient Boosting classifier with the best hyperparameters
best_gb_clf = GradientBoostingClassifier(**best_params_gb, random_state=42)

# Initialize the best Random Forest classifier with the best hyperparameters
best_rf_clf = RandomForestClassifier(**best_params_rf, random_state=42)

# Create an ensemble model with the best classifiers
ensemble_clf = VotingClassifier(estimators=[
    ('Gradient Boosting', best_gb_clf),
    ('Random Forest', best_rf_clf)
], voting='soft')  # Use 'soft' voting for probability-based ensemble

# Train the ensemble model on the training data
ensemble_clf.fit(X_train, y_train)

y_pred = ensemble_clf.predict(X_test)
recall = recall_score(y_test, y_pred)
print("Ensemble Model F1 Score:", recall)

Ensemble Model F1 Score: 0.8746355685131195


In [28]:
ensemble_clf

In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.66      0.74      1037
           1       0.72      0.87      0.79      1029

    accuracy                           0.77      2066
   macro avg       0.78      0.77      0.76      2066
weighted avg       0.78      0.77      0.76      2066



# Save & Deployment

- [x] Model Serialization
- [ ] Model API
  - [ ] Create API
  - [ ] Documentation
- [ ] Containerization
- [ ] Scalability
- [ ] Monitoring

##  save model

In [35]:
!ls -sh drive/MyDrive/Data

total 1.3M
1.0K Gradient-Boosting_Random-Forest.joblib  340K Telco-Customer-Churn_V1.csv
955K Telco-Customer-Churn.csv


In [31]:
import joblib
model_file_path = 'drive/MyDrive/Data/Gradient-Boosting_Random-Forest.joblib'

# Save the model to a file using joblib
joblib.dump(best_gb_clf, model_file_path)

print("Model saved successfully.")

Model saved successfully.
