In [53]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn .combine import SMOTEENN

In [54]:
data = pd.read_csv('churn_eda.csv')

In [55]:
data.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72,Churn
0,0,0,29,29,1,0,0,1,1,0,...,0,1,0,1,0,0,0,0,0,0
1,1,0,56,1889,0,1,1,0,1,0,...,0,0,1,0,0,1,0,0,0,0
2,2,0,53,108,0,1,1,0,1,0,...,0,0,1,1,0,0,0,0,0,1
3,3,0,42,1840,0,1,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,4,0,70,151,1,0,1,0,1,0,...,0,1,0,1,0,0,0,0,0,1


In [56]:
data = data.drop('Unnamed: 0', axis=1)

In [57]:
x = data.drop('Churn', axis=1)

In [58]:
x.shape

(7032, 50)

In [59]:
y = data['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

In [60]:
y.shape

(7032,)

## Train Test Split

In [61]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x,y,
    test_size=0.2,
    random_state=42,
    # startify=y
)




## Decision tree classifier

In [62]:
model_dt = DecisionTreeClassifier(criterion= "gini", random_state = 100, max_depth= 6, min_samples_leaf =8)

In [63]:
model_dt.fit(X_train, y_train)

In [64]:
y_pred = model_dt.predict(X_test)


In [67]:
model_dt.score(X_test,y_test)

0.7697228144989339

In [68]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.84      0.85      0.84      1033
           1       0.57      0.55      0.56       374

    accuracy                           0.77      1407
   macro avg       0.70      0.70      0.70      1407
weighted avg       0.77      0.77      0.77      1407



In [71]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [72]:
xr_train, xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled, test_size=0.2)

In [73]:
model_dt_smote = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=6, min_samples_leaf=8)

In [74]:
model_dt_smote.fit(xr_train, yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9443493150684932
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       551
           1       0.96      0.94      0.95       617

    accuracy                           0.94      1168
   macro avg       0.94      0.94      0.94      1168
weighted avg       0.94      0.94      0.94      1168



## Rondom forest classifier

In [75]:

from sklearn.ensemble import RandomForestClassifier

In [77]:
model_rf = RandomForestClassifier(n_estimators=100, criterion="gini", random_state = 100, max_depth=6, min_samples_leaf=8)

In [80]:
model_rf.fit(X_train,y_train)

In [82]:
y_pred = model_rf.predict(X_test)

In [84]:
model_rf.score(X_test, y_test)

0.7846481876332623

In [85]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1033
           1       0.64      0.43      0.52       374

    accuracy                           0.78      1407
   macro avg       0.73      0.67      0.69      1407
weighted avg       0.77      0.78      0.77      1407



In [89]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [90]:
xr_train1, xr_test1, yr_train1, yr_test1 = train_test_split(X_resampled1, y_resampled1, test_size = 0.2)

In [94]:
model_rf_smote = RandomForestClassifier(n_estimators=100, criterion="gini", random_state = 100, max_depth=6, min_samples_leaf=8)

In [96]:
model_rf_smote.fit(xr_train1, yr_train1)

In [97]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [99]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [101]:
model_score_r1
print(metrics.classification_report(yr_test1, yr_predict1))

              precision    recall  f1-score   support

           0       0.97      0.90      0.93       527
           1       0.92      0.97      0.95       633

    accuracy                           0.94      1160
   macro avg       0.94      0.94      0.94      1160
weighted avg       0.94      0.94      0.94      1160



In [102]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[475  52]
 [ 16 617]]


## performing pca is important just for now i skip

In [103]:
import pickle

In [104]:
filename = 'model.sav'

In [105]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [106]:
load_model = pickle.load(open(filename, 'rb'))

In [107]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [108]:
model_score_r1

0.9413793103448276