PREDICTING CUSTOMER CHURN

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

In [4]:
#loading the data set
dataset =pd.read_csv(r"C:\Users\nelli\Downloads\archive (4).zip")


In [5]:
dataset.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
#checking for missing values
dataset.isnull().sum()


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [7]:
# Check the data types of each column
data_types = dataset.dtypes
print(data_types)

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


In [8]:
# performing label encoding on the categorical columns in the 'dataset' dataframe. 
for column in dataset.select_dtypes(include=['object']):
    if column != 'customerID':
        dataset[column] = LabelEncoder().fit_transform(dataset[column])

In [9]:
# Check the data types of each column
data_types = dataset.dtypes
print(data_types)

customerID           object
gender                int32
SeniorCitizen         int64
Partner               int32
Dependents            int32
tenure                int64
PhoneService          int32
MultipleLines         int32
InternetService       int32
OnlineSecurity        int32
OnlineBackup          int32
DeviceProtection      int32
TechSupport           int32
StreamingTV           int32
StreamingMovies       int32
Contract              int32
PaperlessBilling      int32
PaymentMethod         int32
MonthlyCharges      float64
TotalCharges          int32
Churn                 int32
dtype: object


In [10]:
# Split the dataset: Divide the dataset into training and testing sets.
X = dataset.drop(['customerID', 'Churn'], axis=1)
y = dataset['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
# Scaling features(normalize/standardize the features)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [12]:
# Training and evaluating models (logistic regression, KNN, SVM, decision tree, random forest,naive-bayes) on the training dataset.


logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg_preds = logreg.predict(X_test)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
svc_preds = svc.predict(X_test)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(max_depth=3, random_state=42)
dtree.fit(X_train, y_train)
predictions = dtree.predict(X_test)

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
predictions = gnb.predict(X_test)

In [13]:
#Evaluate the models: Assess the performance of the models using appropriate metrics such as accuracy, precision, recall, F1-score, and ROC-AUC.
logreg_accuracy = accuracy_score(y_test, logreg_preds)
logreg_precision = precision_score(y_test, logreg_preds)
logreg_recall = recall_score(y_test, logreg_preds)
logreg_f1 = f1_score(y_test, logreg_preds)

rf_accuracy = accuracy_score(y_test, rf_preds)
rf_precision = precision_score(y_test, rf_preds)
rf_recall = recall_score(y_test, rf_preds)
rf_f1 = f1_score(y_test, rf_preds)

svc_accuracy = accuracy_score(y_test, svc_preds)
svc_precision = precision_score(y_test, svc_preds)
svc_recall = recall_score(y_test, svc_preds)
svc_f1 = f1_score(y_test, svc_preds)

# Calculate the performance metrics
#knn_accuracy = accuracy_score(y_test, knn_preds)
#knn_precision = precision_score(y_test, knn_preds)
#knn_recall = recall_score(y_test, knn_preds)
#knn_f1 = f1_score(y_test, knn_preds)

# Assuming you have trained and fitted
dtree_model = DecisionTreeClassifier()
dtree_model.fit(X_train, y_train)
# Use the fitted model to make predictions on the test set
dtree_preds = dtree_model.predict(X_test)

gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)
# Use the fitted model to make predictions on the test set
gnb_preds = gnb_model.predict(X_test)

dtree_accuracy = accuracy_score(y_test, dtree_preds)
dtree_precision = precision_score(y_test, dtree_preds)
dtree_recall = recall_score(y_test, dtree_preds)
dtree_f1 = f1_score(y_test, dtree_preds)

gnb_accuracy = accuracy_score(y_test, gnb_preds)
gnb_precision = precision_score(y_test, gnb_preds)
gnb_recall = recall_score(y_test, gnb_preds)
gnb_f1 = f1_score(y_test, gnb_preds)


In [14]:
print(f"Logistic Regression - Accuracy: {logreg_accuracy}, Precision: {logreg_precision}, Recall: {logreg_recall}, F1: {logreg_f1}")
print(f"Random Forest - Accuracy: {rf_accuracy}, Precision: {rf_precision}, Recall: {rf_recall}, F1: {rf_f1}")
#print(f"KNeighborsClassifier - Accuracy: {knn_accuracy}, Precision: {knn_precision}, Recall: {knn_recall}, F1: {knn_f1}")
print(f"DecisionTreeClassifier - Accuracy: {dtree_accuracy}, Precision: {dtree_precision}, Recall: {dtree_recall}, F1: {dtree_f1}")
print(f"GaussianNB - Accuracy: {gnb_accuracy}, Precision: {gnb_precision}, Recall: {gnb_recall}, F1: {gnb_f1}")

Logistic Regression - Accuracy: 0.8078561287269286, Precision: 0.6842105263157895, Recall: 0.5435540069686411, F1: 0.6058252427184466
Random Forest - Accuracy: 0.7950780880265026, Precision: 0.6766917293233082, Recall: 0.47038327526132406, F1: 0.5549845837615622
DecisionTreeClassifier - Accuracy: 0.7221959299574066, Precision: 0.4883303411131059, Recall: 0.4738675958188153, F1: 0.48099027409372236
GaussianNB - Accuracy: 0.7557974443918599, Precision: 0.5358910891089109, Recall: 0.7543554006968641, F1: 0.6266280752532563


In [16]:
# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

# Feature selection
rfe = RFE(RandomForestClassifier(**best_params), n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

rf_optimized = RandomForestClassifier(**best_params)
rf_optimized.fit(X_train_rfe, y_train)
rf_optimized_preds = rf_optimized.predict(X_test_rfe)

rf_optimized_accuracy = accuracy_score(y_test, rf_optimized_preds)
rf_optimized_precision = precision_score(y_test, rf_optimized_preds)
rf_optimized_recall = recall_score(y_test, rf_optimized_preds)
rf_optimized_f1 = f1_score(y_test, rf_optimized_preds)

print(f"Optimized Random Forest - Accuracy: {rf_optimized_accuracy}, Precision: {rf_optimized_precision}, Recall: {rf_optimized_recall}, F1: {rf_optimized_f1}")


Optimized Random Forest - Accuracy: 0.7946048272598202, Precision: 0.6620370370370371, Recall: 0.49825783972125437, F1: 0.5685884691848907


In [17]:
#Identifying important features
important_features = pd.Series(rf_optimized.feature_importances_, index=X.columns[rfe.support_])
important_features = important_features.sort_values(ascending=False)

print("\nImportant Features:")
print(important_features)


#Conclusion
print("\nBased on the evaluation metrics, the Optimized Random Forest model is the best-performing model.")
print("The top features contributing to customer churn prediction are:")
print(important_features.head(5))


Important Features:
tenure              0.213411
MonthlyCharges      0.191053
TotalCharges        0.160243
Contract            0.144356
OnlineSecurity      0.093193
TechSupport         0.058499
PaymentMethod       0.052468
OnlineBackup        0.031394
InternetService     0.030643
PaperlessBilling    0.024742
dtype: float64

Based on the evaluation metrics, the Optimized Random Forest model is the best-performing model.
The top features contributing to customer churn prediction are:
tenure            0.213411
MonthlyCharges    0.191053
TotalCharges      0.160243
Contract          0.144356
OnlineSecurity    0.093193
dtype: float64
