### __Importing necessary libraries to train and fit a model__

In [20]:
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score  #The recall is intuitively the ability of the classifier to find 
#all the positive samples.The best value is 1 and the worst value is 0.(tp/tp+tn)
from sklearn.metrics import classification_report #A Classification report is used to measure the quality of predictions
#from a classification algorithm.
from sklearn.metrics import confusion_matrix #to evaluate the accuracy of a classification
from sklearn.tree import DecisionTreeClassifier #The goal is to create a model that predicts the value of a target variable
#by learning simple decision rules inferred from the data features. 
from imblearn.combine import SMOTEENN #to balance imbalance datasets

In [6]:
df = pd.read_csv('telecom_ML_data')
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_bins_1-12,tenure_bins_13-24,tenure_bins_25-36,tenure_bins_37-48,tenure_bins_49-60,tenure_bins_61-72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [7]:
df= df.drop('Unnamed: 0',axis = 1)


In [8]:
x =df.drop('Churn',axis = 1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_bins_1-12,tenure_bins_13-24,tenure_bins_25-36,tenure_bins_37-48,tenure_bins_49-60,tenure_bins_61-72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [9]:
y =df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

### __Train Test Split__

In [10]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

#### __Decision Tree Classifier__

In [13]:
dt_model = DecisionTreeClassifier(criterion= "gini", random_state = 100,max_depth =6, min_samples_leaf =8)

In [14]:
dt_model.fit(x_train, y_train)

In [16]:
y_pred =dt_model.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [17]:
dt_model.score(x_test, y_test)

0.7867803837953091

In [30]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1040
           1       0.62      0.47      0.54       367

    accuracy                           0.79      1407
   macro avg       0.72      0.68      0.70      1407
weighted avg       0.77      0.79      0.78      1407



__Accuracy is quite low(0.79) as the data is an imbalanced dataset(73(unchurn):27(churn)).__

__we shouldn't consider accuracy as our metrics as accuracy is cursed in imbalance dataset.__

__Hence, recall,precision and f1-score has to be consider for the minority class,and it is evident that the corresponding values are too low for label 1(i.e.,Churned customers).__

__Hence moving ahead to SMOOTEENN(Upsampling +ENN) Combination of over and under sampling-(upsampling using SMOOTE and cleaning noisy samples using ENN)__

In [25]:
sm =SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [26]:
xr_train, xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled, test_size = 0.2)

In [27]:
dt_model_smote = DecisionTreeClassifier(criterion= "gini", random_state = 100,max_depth =6, min_samples_leaf =8)

In [28]:
dt_model_smote.fit(xr_train, yr_train)

In [31]:
yr_pred =dt_model_smote.predict(xr_test)
yr_pred

array([0, 1, 1, ..., 0, 1, 1], dtype=int64)

In [32]:
dt_model_smote_score = dt_model_smote.score(xr_test, yr_test)
print(dt_model_smote_score)
print(metrics.classification_report(yr_test, yr_pred))


0.9209183673469388
              precision    recall  f1-score   support

           0       0.96      0.86      0.91       518
           1       0.90      0.97      0.93       658

    accuracy                           0.92      1176
   macro avg       0.93      0.91      0.92      1176
weighted avg       0.92      0.92      0.92      1176



Results are quite better. Accuracy is 92 % and the precision, recall, f1 score for minority class (label 1 (Churned customers)) are high.



In [33]:
print(metrics.confusion_matrix(yr_test, yr_pred))

[[445  73]
 [ 20 638]]


### __Random Forest Classifier__

In [35]:
from sklearn.ensemble import RandomForestClassifier
#A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset
#and uses averaging to improve the predictive accuracy and control over-fitting

In [39]:
rf_model = RandomForestClassifier(n_estimators = 100, criterion= "gini", random_state = 100,max_depth =6, min_samples_leaf =8) 

In [43]:
rf_model.fit(x_train,y_train)
y_pred =rf_model.predict(x_test)
print(y_pred)
rf_model.score(x_test, y_test)

[0 0 0 ... 0 1 0]


0.8031272210376688

In [44]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1040
           1       0.67      0.49      0.56       367

    accuracy                           0.80      1407
   macro avg       0.75      0.70      0.72      1407
weighted avg       0.79      0.80      0.79      1407



In [52]:
sm =SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [53]:
xr_train1, xr_test1, yr_train1, yr_test1 = train_test_split(X_resampled1, y_resampled1, test_size = 0.2)

In [64]:
rf_model_smote = RandomForestClassifier(criterion= "gini", random_state = 100,max_depth =6, min_samples_leaf =8)

In [65]:
rf_model_smote.fit(xr_train1, yr_train1)

In [66]:
yr_pred1 =rf_model_smote.predict(xr_test1)
yr_pred1

array([0, 1, 1, ..., 0, 1, 1], dtype=int64)

In [67]:
rf_model_smote.score(xr_test1, yr_test1)

0.9336734693877551

In [68]:
print(metrics.classification_report(yr_test1, yr_pred1))

              precision    recall  f1-score   support

           0       0.96      0.89      0.92       526
           1       0.92      0.97      0.94       650

    accuracy                           0.93      1176
   macro avg       0.94      0.93      0.93      1176
weighted avg       0.94      0.93      0.93      1176



In [60]:
print(metrics.confusion_matrix(yr_test1, yr_pred1))

[[481  45]
 [ 28 622]]


__Hence RandomForestClassifier model gives better results (Accuracy =93%) than DecisionTreeClassifier__

Diving deep into Multiple classifiers in order to see the model performance.

(Principle Component Analysis)PCA is used to decompose a multivariate dataset in a set of successive orthogonal components that explain a maximum amount of the variance. In scikit-learn, PCA is implemented as a transformer object that learns 
 components in its fit method, and can be used on new data to project it on these components.

__Performing PCA__

In [61]:
from sklearn.decomposition import PCA

In [62]:
pca = PCA(0.9) #no of components set

In [63]:
xr_train_pca = pca.fit_transform(xr_train1) #Fit the model with X and apply the dimensionality reduction on X.
xr_test_pca = pca.transform(xr_test1) 
explained_variance = pca.explained_variance_ratio_

In [70]:
model = RandomForestClassifier(n_estimators = 100, criterion= "gini", random_state = 100,max_depth =6, min_samples_leaf =8) 

In [72]:
model.fit(xr_train_pca,yr_train1)
y_pred_pca =model.predict(xr_test_pca)
print(y_pred_pca)
model.score(xr_test_pca, yr_test1)

[0 1 1 ... 0 1 1]


0.7151360544217688

In [73]:
print(metrics.classification_report(yr_test1, y_pred_pca))

              precision    recall  f1-score   support

           0       0.70      0.63      0.67       526
           1       0.73      0.78      0.75       650

    accuracy                           0.72      1176
   macro avg       0.71      0.71      0.71      1176
weighted avg       0.71      0.72      0.71      1176



By performing RFClassifier in PCA algorithm, there is no better result than RF Classifier in SMOTEENN().

Hence save the model as RFClassifier in SMOTEENN algorithm 

### __Pickling the Model__

Saving the finalized model to pickle saves a lot of time as we don’t have to train the model every time you run the application

In [74]:
import pickle

In [76]:
filename = 'MLmodel.sav'

In [78]:
pickle.dump(rf_model_smote,open(filename,'wb'))

In [79]:
load_model = pickle.load(open(filename,'rb'))

In [80]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [81]:
model_score_r1

0.9336734693877551

__Now,the final model RFClassifier with SMOTEENN algorithm is dumped in MLmodel.sav.__

__By preparing APIs for the model, one can access the model from UI.__ 