In [1]:
# importing the libraries 
import pandas as pd 
import pickle
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from imblearn.combine import SMOTEENN 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
data = pd.read_csv('/Users/tarakram/Documents/Churn-Prediction/data/processed/pre-processed_data.csv')
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,InternetService,OnlineSecurity,Contract,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,0,0,0,2,29.85,29,0
1,1,0,0,0,34,1,0,1,1,3,56.95,1889,0
2,1,0,0,0,2,1,0,1,0,3,53.85,108,1
3,1,0,0,0,45,0,0,1,1,0,42.3,1840,0
4,0,0,0,0,2,1,1,0,0,2,70.7,151,1


In [3]:
# Spltting the data 
X = data.drop('Churn', axis = 1)
y = data['Churn']


In [4]:
print(X.shape)
print(y.shape)

(7009, 12)
(7009,)


In [5]:
X_train,X_test,y_train,y_test = tts(X,y,test_size= 0.2, random_state= 42)

In [6]:
X_train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,InternetService,OnlineSecurity,Contract,PaymentMethod,MonthlyCharges,TotalCharges
1323,0,0,1,1,43,0,0,1,1,0,51.25,2151
5366,1,0,0,0,4,0,0,0,0,2,30.50,118
1897,1,0,0,0,3,1,1,0,0,2,92.00,266
6658,0,0,1,1,12,1,2,0,2,0,20.30,224
1871,1,0,0,0,72,1,0,1,2,1,88.55,6362
...,...,...,...,...,...,...,...,...,...,...,...,...
3772,0,0,0,0,58,1,1,0,0,2,95.30,5817
5191,1,0,1,1,72,0,0,1,2,1,38.50,2763
5226,0,0,0,0,27,1,1,0,0,3,94.55,2724
5390,0,1,1,0,1,0,0,0,0,2,24.80,24


In [7]:
y_train

1323    0
5366    0
1897    0
6658    0
1871    0
       ..
3772    0
5191    0
5226    1
5390    1
860     0
Name: Churn, Length: 5607, dtype: int64

In [8]:
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

train_data.to_csv('/Users/tarakram/Documents/Churn-Prediction/data/processed/train_data.csv', index=False)
test_data.to_csv('/Users/tarakram/Documents/Churn-Prediction/data/processed/test_data.csv', index=False)


In [9]:
y_test

5133    1
6722    0
245     0
1880    0
5926    0
       ..
4379    0
2574    0
3128    1
2687    0
5259    0
Name: Churn, Length: 1402, dtype: int64

In [10]:
# Creating a object for every model
lg = LogisticRegression()
bnb = BernoulliNB()
mnb = MultinomialNB()
gnb = GaussianNB()
dtc = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)
rfc = RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)


In [11]:
models = [lg, bnb, mnb, gnb, dtc, rfc]
scores = []

for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    acc = accuracy_score(y_test, y_pred) 
    prec = precision_score(y_test, y_pred) 
    rec = recall_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    scores.append([acc, prec, rec, f1_score_value])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
scores_df = pd.DataFrame(columns=['Model'], data=['Logistic Regression', 'Bernoualli Navie Bayes', 'Multinomial Navie Bayes', 'Guassian Navie Bayes','Decision_tree','Random_Forest'])
scores_df = pd.concat([scores_df, pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1-score'])], axis=1)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score
0,Logistic Regression,0.78388,0.575581,0.557746,0.566524
1,Bernoualli Navie Bayes,0.720399,0.456471,0.546479,0.497436
2,Multinomial Navie Bayes,0.668331,0.414861,0.75493,0.535465
3,Guassian Navie Bayes,0.748217,0.501838,0.769014,0.607341
4,Decision_tree,0.798859,0.60961,0.571831,0.590116
5,Random_Forest,0.795292,0.640496,0.43662,0.519263


#### Let's apply scaling to the dataset.


In [13]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [14]:
models = [lg, bnb, mnb, gnb, dtc, rfc]
scores = []

for m in models:
    m.fit(X_train_scaled, y_train)
    y_pred = m.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred) 
    prec = precision_score(y_test, y_pred) 
    rec = recall_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    scores.append([acc, prec, rec, f1_score_value])

In [15]:
pickle.dump(scaler, open('/Users/tarakram/Documents/Customer-Churn/model/scaler.pkl', 'wb'))

In [16]:
scores_df = pd.DataFrame(columns=['Model'], data=['Logistic Regression', 'Bernoualli Navie Bayes', 'Multinomial Navie Bayes', 'Guassian Navie Bayes','Decision_tree','Random_Forest'])
scores_df = pd.concat([scores_df, pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1-score'])], axis=1)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score
0,Logistic Regression,0.791726,0.603279,0.51831,0.557576
1,Bernoualli Navie Bayes,0.720399,0.456471,0.546479,0.497436
2,Multinomial Navie Bayes,0.759629,0.568182,0.211268,0.308008
3,Guassian Navie Bayes,0.74465,0.497207,0.752113,0.598655
4,Decision_tree,0.798859,0.60961,0.571831,0.590116
5,Random_Forest,0.795292,0.640496,0.43662,0.519263


### Since we are dealing with the unbalence dataset, we can oversample the data, by using SMOTE.

In [17]:
sm = SMOTEENN() # using SMOTE.
X_resampled1, y_resampled1 = sm.fit_resample(X,y)


In [18]:
X_train1,X_test1,y_train1,y_test1=tts(X_resampled1, y_resampled1,test_size=0.2)

In [19]:
models = [lg, bnb, mnb, gnb, dtc, rfc]
scores = []

for m in models:
    m.fit(X_train1, y_train1)
    y_pred = m.predict(X_test1)
    acc = accuracy_score(y_test1, y_pred) 
    prec = precision_score(y_test1, y_pred) 
    rec = recall_score(y_test1, y_pred)
    f1_score_value = f1_score(y_test1, y_pred)
    scores.append([acc, prec, rec, f1_score_value])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
scores_df = pd.DataFrame(columns=['Model'], data=['Logistic Regression', 'Bernoualli Navie Bayes', 'Multinomial Navie Bayes', 'Guassian Navie Bayes','Decision_tree','Random_Forest'])
scores_df = pd.concat([scores_df, pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1-score'])], axis=1)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score
0,Logistic Regression,0.899382,0.893376,0.920133,0.906557
1,Bernoualli Navie Bayes,0.849956,0.8437,0.8802,0.861564
2,Multinomial Navie Bayes,0.810238,0.817434,0.826955,0.822167
3,Guassian Navie Bayes,0.872021,0.87013,0.891847,0.880855
4,Decision_tree,0.903795,0.876147,0.953411,0.913147
5,Random_Forest,0.906443,0.894737,0.933444,0.913681


Overall Random forest is performing better, having 94 % of all metrics.

### Saving the Model

In [21]:
pickle.dump(rfc, open('/Users/tarakram/Documents/Churn-Prediction/models/churn_predictor_model.pkl', 'wb'))