In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
df = pd.read_csv('tel_churn.csv')

In [3]:
df.head(5)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,1,29.85,29.85,0,True,False,False,True,True,...,False,False,True,False,True,False,False,False,False,False
1,0,34,56.95,1889.5,0,False,True,True,False,True,...,False,False,False,True,False,False,True,False,False,False
2,0,2,53.85,108.15,1,False,True,True,False,True,...,False,False,False,True,True,False,False,False,False,False
3,0,45,42.3,1840.75,0,False,True,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,0,2,70.7,151.65,1,True,False,True,False,True,...,False,False,True,False,True,False,False,False,False,False


In [4]:
X = df.drop('Churn',axis=1)
y = df['Churn']

# Logistic Regression

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [7]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

model_lg = LogisticRegression(max_iter=1000)
model_lg.fit(X_train_scaled,y_train)

In [8]:
y_pred = model_lg.predict(X_test_scaled)
y_pred

array([0, 1, 1, ..., 1, 0, 0], dtype=int64)

In [9]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8088130774697939
Confusion Matrix:
 [[940  98]
 [171 198]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.91      0.87      1038
           1       0.67      0.54      0.60       369

    accuracy                           0.81      1407
   macro avg       0.76      0.72      0.74      1407
weighted avg       0.80      0.81      0.80      1407



As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.
Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [10]:
smote = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X,y)

Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled,test_size=0.2)

In [11]:
scaler = StandardScaler()

Xr_train_scaled = scaler.fit_transform(Xr_train)
Xr_test_scaled = scaler.fit_transform(Xr_test)

model_lg_smote = LogisticRegression(max_iter=1000)
model_lg_smote.fit(Xr_train_scaled,yr_train)

In [12]:
yr_pred_lg = model_lg_smote.predict(Xr_test_scaled)

In [13]:
print("Accuracy:", accuracy_score(yr_test, yr_pred_lg))
print("Confusion Matrix:\n", confusion_matrix(yr_test, yr_pred_lg))
print("Classification Report:\n", classification_report(yr_test, yr_pred_lg))

Accuracy: 0.9464740866610025
Confusion Matrix:
 [[508  36]
 [ 27 606]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94       544
           1       0.94      0.96      0.95       633

    accuracy                           0.95      1177
   macro avg       0.95      0.95      0.95      1177
weighted avg       0.95      0.95      0.95      1177



 Now we can see quite better results, i.e. Accuracy: 94 %, and a very good recall, precision & f1 score for minority class.
Let's try with some other classifier.

# Decision Tree Classifier

In [14]:
model_dt = DecisionTreeClassifier(criterion='gini',random_state=100,max_depth=6,min_samples_leaf=8)
model_dt.fit(X_train, y_train)

In [15]:
y_pred_dt = model_dt.predict(X_test)

In [16]:
print("Accuracy:", accuracy_score(y_test,y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test,y_pred_dt))
print("Classification Report:\n", classification_report(y_test,y_pred_dt))

Accuracy: 0.7924662402274343
Confusion Matrix:
 [[903 135]
 [157 212]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.87      0.86      1038
           1       0.61      0.57      0.59       369

    accuracy                           0.79      1407
   macro avg       0.73      0.72      0.73      1407
weighted avg       0.79      0.79      0.79      1407



As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.
Hence, moving ahead to call SMOTEENN (OverSampling + ENN)


In [17]:
smote = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X,y)

Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled,test_size=0.2)

In [18]:
model_dt_smote = DecisionTreeClassifier(criterion='gini',random_state=100,max_depth=6,min_samples_leaf=8)
model_dt_smote.fit(Xr_train, yr_train)

In [19]:
yr_pred_dt = model_dt_smote.predict(Xr_test)

In [20]:
print("Accuracy:", accuracy_score(yr_test,yr_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(yr_test,yr_pred_dt))
print("Classification Report:\n", classification_report(yr_test,yr_pred_dt))

Accuracy: 0.9362786745964317
Confusion Matrix:
 [[459  43]
 [ 32 643]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.91      0.92       502
           1       0.94      0.95      0.94       675

    accuracy                           0.94      1177
   macro avg       0.94      0.93      0.93      1177
weighted avg       0.94      0.94      0.94      1177



Now we can see quite better results, i.e. Accuracy: 93 %, and a very good recall, precision & f1 score for minority class.
Let's try with some other classifier.

# Random Forest Classifier

In [25]:
model_rf = RandomForestClassifier(criterion='gini',random_state=100,max_depth=6,min_samples_leaf=8)
model_rf.fit(X_train, y_train)

In [26]:
y_pred_rf = model_rf.predict(X_test)

In [27]:
print("Accuracy:", accuracy_score(y_test,y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test,y_pred_rf))
print("Classification Report:\n", classification_report(y_test,y_pred_rf))

Accuracy: 0.7974413646055437
Confusion Matrix:
 [[950  88]
 [197 172]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.92      0.87      1038
           1       0.66      0.47      0.55       369

    accuracy                           0.80      1407
   macro avg       0.74      0.69      0.71      1407
weighted avg       0.78      0.80      0.78      1407



As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.
Hence, moving ahead to call SMOTEENN (OverSampling + ENN)


In [28]:
smote = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X,y)

Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled,test_size=0.2)

In [29]:
model_rf_smote = RandomForestClassifier(criterion='gini',random_state=100,max_depth=6,min_samples_leaf=8)
model_rf_smote.fit(Xr_train, yr_train)

In [30]:
yr_pred_rf = model_dt_smote.predict(Xr_test)

In [40]:
print("Accuracy:", accuracy_score(yr_test,yr_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(yr_test,yr_pred_rf))
print("Classification Report:\n", classification_report(yr_test,yr_pred_rf))

Accuracy: 0.9473237043330501
Confusion Matrix:
 [[494  42]
 [ 20 621]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.92      0.94       536
           1       0.94      0.97      0.95       641

    accuracy                           0.95      1177
   macro avg       0.95      0.95      0.95      1177
weighted avg       0.95      0.95      0.95      1177



Now we can see quite better results, i.e. Accuracy: 94.7 %, and a very good recall, precision & f1 score for minority class.
Let's try with some other classifier.

With RF Classifier, also we are able to get quite good results, infact better than Logistic regression and Decision Tree.


In [73]:
import pickle
filename = "random_forest_churn.pkl"

In [74]:
with open("random_forest_churn.pkl", "wb") as f:
    pickle.dump(model_rf_smote, f)

In [75]:
with open("random_forest_churn.pkl", "rb") as f:
    model = pickle.load(f)

In [77]:
y_pred = model.predict(Xr_test)