In [1]:
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings


In [2]:
data=pd.read_csv('data/churn data.csv')
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
#dropping umwanted columns
data.drop('customerID', axis=1, inplace=True)


### Splitting the data

In [4]:
X=data.drop(['Churn'], axis=1)
y=data['Churn']
y=y.to_frame()

In [5]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

In [6]:
numerical_features=['tenure','MonthlyCharges','TotalCharges']
for column in data.columns:
    if column not in numerical_features:
         print(column,data[column].unique())
         print('-'*30)

gender ['Female' 'Male']
------------------------------
SeniorCitizen [0 1]
------------------------------
Partner ['Yes' 'No']
------------------------------
Dependents ['No' 'Yes']
------------------------------
PhoneService ['No' 'Yes']
------------------------------
MultipleLines ['No phone service' 'No' 'Yes']
------------------------------
InternetService ['DSL' 'Fiber optic' 'No']
------------------------------
OnlineSecurity ['No' 'Yes' 'No internet service']
------------------------------
OnlineBackup ['Yes' 'No' 'No internet service']
------------------------------
DeviceProtection ['No' 'Yes' 'No internet service']
------------------------------
TechSupport ['No' 'Yes' 'No internet service']
------------------------------
StreamingTV ['No' 'Yes' 'No internet service']
------------------------------
StreamingMovies ['No' 'Yes' 'No internet service']
------------------------------
Contract ['Month-to-month' 'One year' 'Two year']
------------------------------
PaperlessBilling

In [7]:
data['TotalCharges'] = data['TotalCharges'].replace('', np.nan)
data['TotalCharges'] = data['TotalCharges'].astype(float)
data['TotalCharges'] = data['TotalCharges'].fillna(data['TotalCharges'].mean())

In [8]:
numerical_features=data.select_dtypes(include=['number']).columns
categorical_features=X.select_dtypes(exclude=['number']).columns

In [9]:
data['TotalCharges'].dtypes

dtype('float64')

In [10]:
print(numerical_features)
print('_'*100)
print(categorical_features)

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')
____________________________________________________________________________________________________
Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'TotalCharges'],
      dtype='object')


In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
Label_Encoder=LabelEncoder()
standered_scaller=StandardScaler()
one_hot_encoder=OneHotEncoder()
column_transformer=ColumnTransformer([
    ('OneHotEncoder', one_hot_encoder, categorical_features),

    ('StandardScaler', standered_scaller, numerical_features),
]
)


In [None]:

# Clean all numeric-looking columns
for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = X[col].astype(str).str.strip().replace('', np.nan)
            X[col] = X[col].astype(float)
            X[col] = X[col].fillna(X[col].mean())
        except:
            pass  


In [13]:
cleaned_data = pd.concat([X, y], axis=1)


In [14]:
import os

# Define the path where you want to save the file
folder_path = r'data'
file_name = 'cleaned_data.csv'
file_path = os.path.join(folder_path, file_name)

# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Save the cleaned_data DataFrame as a CSV file
cleaned_data.to_csv(file_path, index=False)

print(f"Cleaned data saved to: {file_path}")


Cleaned data saved to: data\cleaned_data.csv


In [15]:
X=column_transformer.fit_transform(X)
y=Label_Encoder.fit_transform(y)


  y = column_or_1d(y, warn=True)


In [16]:
X.toarray()

array([[ 1.        ,  0.        ,  0.        , ..., -1.27744458,
        -1.16032292, -0.99497138],
       [ 0.        ,  1.        ,  1.        , ...,  0.06632742,
        -0.25962894, -0.17387565],
       [ 0.        ,  1.        ,  1.        , ..., -1.23672422,
        -0.36266036, -0.96039939],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -0.87024095,
        -1.1686319 , -0.85518222],
       [ 0.        ,  1.        ,  0.        , ..., -1.15528349,
         0.32033821, -0.87277729],
       [ 0.        ,  1.        ,  1.        , ...,  1.36937906,
         1.35896134,  2.01391739]], shape=(7043, 6576))

In [17]:
from sklearn.model_selection import  train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
X_train.shape,X_test.shape

((5634, 6576), (1409, 6576))

In [18]:
pd.Series(y_train).value_counts()

0    4157
1    1477
Name: count, dtype: int64

In [21]:
X_train.toarray()

array([[ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
        -1.15528349e+00, -4.88956928e-01, -9.28962288e-01],
       [ 0.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
        -3.00155854e-01,  2.05674218e-01, -2.27808836e-01],
       [ 1.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
        -5.44478037e-01, -1.47938794e+00, -8.38757218e-01],
       ...,
       [ 1.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         9.21455057e-01, -3.88609919e-04,  5.88915721e-01],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         1.61370124e+00,  1.45866916e+00,  2.40328274e+00],
       [ 1.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         1.24721797e+00,  5.01474084e-01,  1.13833656e+00]],
      shape=(5634, 6576))

We will perform SMOTE to the tranning data

In [22]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(random_state=42)
X_train,y_train=smote.fit_resample(X_train,y_train)



In [23]:
pd.Series(y_train).value_counts()

1    4157
0    4157
Name: count, dtype: int64

## Model Tranning

**Applying cross validation and selects the best model**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score,StratifiedKFold

cv_score={}

models={
    'LogisticRegression':LogisticRegression(),
    'DecissionTree':DecisionTreeClassifier(),
    'RandomForest':RandomForestClassifier(),
    'Xgboost':XGBClassifier()
}

for model_name, model in models.items():
    # Train model
    print(f'Tranning {model_name}  : ')
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    score=cross_val_score(model,X_train, y_train,  cv=skf,scoring='accuracy')
    cv_score[model_name] = score
    print(f'{model_name}Cross validation accuracy : {score.mean():.2f}')
    print('_'*35)

Tranning LogisticRegression  : 


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegressionCross validation accuracy : 0.83
___________________________________
Tranning DecissionTree  : 
DecissionTreeCross validation accuracy : 0.84
___________________________________
Tranning RandomForest  : 
RandomForestCross validation accuracy : 0.87
___________________________________
Tranning Xgboost  : 
XgboostCross validation accuracy : 0.85
___________________________________


In [None]:
from sklearn.metrics import accuracy_score, classification_report

def evauate_model(true,predicted):
    accuracyScore=accuracy_score(true,predicted)
    cls_report=classification_report(true,predicted)
    return accuracyScore,cls_report

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define your models
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Helper function to evaluate a model
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    return accuracy, report

# Lists to store results
model_list = []
accuracy_list = []
cls_report_list = []

# Loop through each model and evaluate
for name, model in models.items():
    print(f"Training model: {name}")

    # Train
    model.fit(X_train, y_train)

    # Predict
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate
    train_accuracy, train_report = evaluate_model(y_train, y_train_pred)
    test_accuracy, test_report = evaluate_model(y_test, y_test_pred)

    # Store results
    model_list.append(name)
    accuracy_list.append(test_accuracy)
    cls_report_list.append(test_report)

    # Print results
    print(f"\n{name} - Model Performance")
    print("Training Set:")
    print(f" Accuracy: {train_accuracy:.4f}")
    print(" Classification Report:\n", train_report)

    print("Test Set:")
    print(f" Accuracy: {test_accuracy:.4f}")
    print(" Classification Report:\n", test_report)
    print("=" * 50, "\n")


Training model: LogisticRegression

LogisticRegression - Model Performance
Training Set:
 Accuracy: 0.8918
 Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.85      0.89      4121
           1       0.86      0.93      0.90      4121

    accuracy                           0.89      8242
   macro avg       0.89      0.89      0.89      8242
weighted avg       0.89      0.89      0.89      8242

Test Set:
 Accuracy: 0.7779
 Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.80      0.84      1053
           1       0.55      0.72      0.62       356

    accuracy                           0.78      1409
   macro avg       0.72      0.76      0.73      1409
weighted avg       0.81      0.78      0.79      1409


Training model: DecisionTree

DecisionTree - Model Performance
Training Set:
 Accuracy: 0.9985
 Classification Report:
               precision    recall  f1-sc

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost - Model Performance
Training Set:
 Accuracy: 0.9037
 Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.92      0.91      4121
           1       0.92      0.89      0.90      4121

    accuracy                           0.90      8242
   macro avg       0.90      0.90      0.90      8242
weighted avg       0.90      0.90      0.90      8242

Test Set:
 Accuracy: 0.7991
 Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.88      0.87      1053
           1       0.61      0.56      0.59       356

    accuracy                           0.80      1409
   macro avg       0.73      0.72      0.73      1409
weighted avg       0.79      0.80      0.80      1409




### Using the XGBoost  model

In [None]:
XG_model=XGBClassifier(fit_intercept=True)
XG_model=XG_model.fit(X_train,y_train)
y_pred=XG_model.predict(X_test)
accurcy=accuracy_score(y_test,y_pred)
C_report=classification_report(y_test,y_pred)
print(accurcy)
print(C_report)


Parameters: { "fit_intercept" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0.7991483321504613
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1053
           1       0.61      0.56      0.59       356

    accuracy                           0.80      1409
   macro avg       0.73      0.72      0.73      1409
weighted avg       0.79      0.80      0.80      1409

