In [None]:
#import libraries
#Data Structures
import pandas as pd
import numpy as np


#Sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
#from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
#from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_score, recall_score
#from imblearn.metrics import sensitivity_specificity_support
from sklearn import metrics

import xgboost as xgb
import re

#Plotting
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
%matplotlib inline

#Others
import warnings
warnings.filterwarnings('ignore')



# display all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [None]:
# Read the labelled training data and the unlabelled Kaggle data.
data, unseen = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))
# Preview the first rows of the training frame.
data.head()

In [None]:
#check the target variable
data.churn_probability.value_counts()/len(data)

In [None]:
#check shape and columns details of the dataframe
print(data.shape)
data.info(verbose=True, show_counts=True)

## Data preparation

### Data cleaning

In [None]:
# Missing values
# The info() output above shows that some columns are mostly empty, which will not help the analysis.
# Identify these columns: fraction of missing values per column, largest first.
(data.isna().sum()/len(data)).sort_values(ascending=False)

In [None]:
# Customers with missing values in the columns below made no data recharge in June 2014,
# so the gaps are replaced with zero.
cols = [
    prefix + "_6"
    for prefix in (
        "arpu_3g", "count_rech_2g", "night_pck_user", "arpu_2g", "date_of_last_rech_data",
        "total_rech_data", "av_rech_amt_data", "max_rech_data", "count_rech_3g", "fb_user",
    )
]
imp = SimpleImputer(fill_value=0, strategy="constant")
data[cols] = imp.fit_transform(data[cols])




In [None]:
# Customers with missing values in the columns below made no data recharge in July 2014,
# so the gaps are replaced with zero.
cols = [
    prefix + "_7"
    for prefix in (
        "night_pck_user", "date_of_last_rech_data", "total_rech_data", "max_rech_data", "fb_user",
        "count_rech_2g", "count_rech_3g", "arpu_3g", "av_rech_amt_data", "arpu_2g",
    )
]
imp = SimpleImputer(fill_value=0, strategy="constant")
data[cols] = imp.fit_transform(data[cols])


In [None]:
# Customers with missing values in the columns below made no data recharge in August 2014,
# so the gaps are replaced with zero.
cols = [
    prefix + "_8"
    for prefix in (
        "count_rech_2g", "av_rech_amt_data", "night_pck_user", "max_rech_data", "total_rech_data",
        "arpu_2g", "arpu_3g", "date_of_last_rech_data", "fb_user", "count_rech_3g",
    )
]
imp = SimpleImputer(fill_value=0, strategy="constant")
data[cols] = imp.fit_transform(data[cols])

In [None]:
# There are customers who did not make any voice call in August, July or June
# (see total_og_mou_*/total_ic_mou_* in the previews below): for each month, show the
# minutes-of-usage columns of the rows whose ic_others_<month> value is missing.
# A notebook cell only renders its last expression, so every preview is passed to
# display() explicitly; previously only the June preview was shown.
for month in ("8", "7", "6"):
    cols = [c for c in data.columns if re.search("mou.+" + month, c)]
    display(data[data["ic_others_" + month].isna()][cols].head())



In [None]:
# For such customers, impute zero into the columns that hold voice-call usage.
# The six total_og_mou_*/total_ic_mou_* columns also match the pattern but are not
# imputed; they are filtered out while the list is built instead of being removed
# one by one afterwards (list.remove raises if a name is absent).
voice_pattern = "(mou_6)|(mou_7)|(mou_8)|(ic_others)|(og_others)"
total_mou_cols = {
    "total_og_mou_6", "total_og_mou_7", "total_og_mou_8",
    "total_ic_mou_6", "total_ic_mou_7", "total_ic_mou_8",
}
cols = [c for c in data.columns if re.search(voice_pattern, c) and c not in total_mou_cols]

imp = SimpleImputer(strategy='constant', fill_value=0)
data[cols] = imp.fit_transform(data[cols])

In [None]:
data.shape

In [None]:
# Investigate the remaining columns with missing values.
# Some customers made voice calls although their last-recharge date is blank (previewed
# here for August; June and July are handled the same way). These are treated as genuine
# missing values, so rows missing any of the three dates are dropped.
display(data[data.date_of_last_rech_8.isna()].head())
data = data.dropna(subset=['date_of_last_rech_8', 'date_of_last_rech_7', "date_of_last_rech_6"])
print(data.shape)

# A close look at the rows with missing loc_og_t2o_mou, std_og_t2o_mou and loc_ic_t2o_mou
# shows zeroes in all mou columns, hence these three columns are imputed with zero.
# fillna(0) is used directly so the cell does not depend on whichever imputer object a
# previous cell happened to leave in `imp`.
t2o_cols = ["loc_og_t2o_mou", "std_og_t2o_mou", "loc_ic_t2o_mou"]
display(data[data.loc_og_t2o_mou.isna()].head(5))
display(data[data.loc_og_t2o_mou.isna()].tail(5))
data[t2o_cols] = data[t2o_cols].fillna(0)

In [None]:
#check for missing values again
(data.isna().sum()/len(data)).sort_values(ascending=False).head()
#all missing values are fixed

In [None]:
data.shape
data.info(verbose=1)


In [None]:
# Dropping variables which is not helpful in analysis
data.head()

In [None]:
# Drop id, circle_id and every column whose name contains "date".
d_cols = [name for name in data.columns if "date" in name] + ["id", "circle_id"]
churn = data.drop(columns=d_cols)

churn.shape

In [None]:
# Create the total data-recharge amount for each month (average recharge amount x number
# of recharges) and drop the redundant columns it is derived from.
#
# The new columns are created with item assignment. Attribute assignment
# (`churn.total_rech_amt_data_6 = ...`) does not add a column: it only sets a Python
# attribute on the DataFrame, and the pandas UserWarning about it is hidden by
# warnings.filterwarnings('ignore'), so the feature was lost when the sources were dropped.
# The same feature is added to `unseen` so that `unseen[var_cols]` still finds every
# training column when the Kaggle data is scored (missing products stay NaN there and
# are zero-imputed together with the other unseen columns).
for month in ("6", "7", "8"):
    for frame in (churn, unseen):
        frame["total_rech_amt_data_" + month] = (
            frame["av_rech_amt_data_" + month] * frame["total_rech_data_" + month]
        ).astype("float64")

churn = churn.drop(
    columns=[
        "av_rech_amt_data_6", "av_rech_amt_data_7", "av_rech_amt_data_8",
        "total_rech_data_6", "total_rech_data_7", "total_rech_data_8",
    ],
    errors="ignore",
)
churn.shape

In [None]:
#check the categorical variables
churn.select_dtypes("object").columns
#these cols were initially numerical, became object type due to a side effect of SimpleImputer()

In [None]:
# No outlier treatment is performed for now; it may be tried later if the accuracy on the
# original data turns out to be low.
# Fix the data types: fb_user_* and night_pck_user_* are binary category variables and are
# stored as integers for model building; the data-recharge counts/maxima and the arpu_*
# columns are stored as floats.
# One astype() call with a column -> dtype mapping replaces the per-column attribute
# assignments and the duplicated column list.
flag_cols = [f"{flag}_{month}" for flag in ("fb_user", "night_pck_user") for month in (6, 7, 8)]
float_cols = [
    f"{metric}_{month}"
    for metric in ("max_rech_data", "count_rech_2g", "count_rech_3g", "arpu_3g", "arpu_2g")
    for month in (6, 7, 8)
]
churn = churn.astype({**dict.fromkeys(flag_cols, "int64"), **dict.fromkeys(float_cols, "float64")})

churn.select_dtypes("object").columns
#all columns in numerical format

In [None]:
# The columns listed below have zeros in all rows (see the printed column sums), hence they are dropped.
print(churn.sum(axis=0).sort_values().head(10))
all_zero_cols = [
    "loc_og_t2o_mou", "std_og_t2c_mou_8", "std_ic_t2o_mou_6", "std_ic_t2o_mou_7", "std_ic_t2o_mou_8",
    "std_og_t2c_mou_7", "std_og_t2c_mou_6", "loc_ic_t2o_mou", "std_og_t2o_mou",
]
churn = churn.drop(columns=all_zero_cols)

In [None]:
churn.shape

### Exploratory data analysis

In [None]:
# Absolute correlation of every variable with the target churn_probability, strongest first.
# (No figure is opened here: the result is a table, and the former plt.figure() call only
# rendered an empty 10x50 inch canvas.)
churn.corr()["churn_probability"].abs().sort_values(ascending=False)
#we are seeing many weak correlation, hence we will check if non-linear model can work well with this


In [None]:
# Check the distribution of the variables that include "total" in their name.
t_cols = [c for c in churn.columns if "total" in c]

# One axes per variable on an explicit grid sized from the number of variables.
# histplot(kde=True, stat="density") replaces the deprecated distplot.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(t_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 4 * n_grid_rows), squeeze=False)

for ax, feature in zip(axes.ravel(), t_cols):
    sns.histplot(x=churn[feature], kde=True, stat="density", ax=ax)
    ax.set_title(feature)

# Hide the unused axes of the last grid row.
for ax in axes.ravel()[len(t_cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

#the plots looks like right skewed, we will apply scaling before modelling

In [None]:
# Check the correlation among the variables: list the pairs whose absolute correlation is
# above .8 (excluding the 1.0 diagonal), strongest first.
# The matrix is computed once (it was recomputed three times), and abs() is applied before
# the threshold so that strongly negative pairs are kept as well.
corr_abs = churn.corr().abs()
high_corr = (
    corr_abs[(corr_abs > .8) & (corr_abs < 1.0)]
    .unstack()
    .dropna()
    .sort_values(kind="quicksort", ascending=False)
)
high_corr
#several variables has high value correlation, hence we can use PCA here

In [None]:
#check the target variable
churn.churn_probability.value_counts()/len(churn)
#imbalanced dataset

In [None]:
churn.shape

### Divide the dataset into train and test dataset


In [None]:
churn_X=churn.drop("churn_probability",axis=1)
churn_X.shape
churn_y=churn.churn_probability


In [None]:
# 70/30 split with a fixed seed, stratified on the target values.
X_train, X_test, y_train, y_test = train_test_split(
    churn_X,
    churn_y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
    stratify=churn_y.values,
)
y_test.shape

In [None]:
var_cols=X_train.columns

### Feature Scaling

In [None]:
# Standardise the features: the scaler is fitted on the training partition only and then
# applied to the test partition. StandardScaler is already imported in the first cell.
scaler = StandardScaler()

# Keep the original row index so X_* stays aligned with y_* (fit_transform returns a bare array).
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=var_cols, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=var_cols, index=X_test.index)
X_train.head()


In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
X_train.describe()

### perform PCA

In [None]:
pca = PCA(random_state=42)
pca.fit(X_train)
pca.explained_variance_ratio_

In [None]:
# Scree plot: cumulative explained variance against the number of principal components.
var_cumu = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(var_cumu) + 1)  # entry k of var_cumu covers k+1 components

fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(component_counts, var_cumu)
# Reference lines: 80 components (the number used for the final PCA) and 95% of the variance.
# They previously sat at x=15 and stopped at x=30, which did not match that choice.
ax.axvline(x=80, color="r", linestyle="--")
ax.axhline(y=0.95, color="g", linestyle="--")
ax.set_xlabel("Number of principal components")
ax.set_ylabel("Cumulative variance explained")
plt.show()

In [None]:
#we can see that less than 80 PCs is explaining around 95% of the total variance of the dataset.
#Perform PCA with 80 components
pca_final = IncrementalPCA(n_components=80)
train_pca = pca_final.fit_transform(X_train)
train_pca.shape

In [None]:
#np.corrcoef(train_pca.transpose())
#the principal components are not correlated
#Applying the transformation on the test set
test_pca = pca_final.transform(X_test)
test_pca.shape

### Applying logistic regression (with default arguments) on the data on our Principal components

In [None]:
lr_pca = LogisticRegression()
lr_pca=lr_pca.fit(train_pca, y_train)
#predict probability on test data set
pred_probs_test = lr_pca.predict_proba(test_pca)
metrics.roc_auc_score(y_test, pred_probs_test[:,1])
#test AUC score is pretty good


In [None]:
#predict probability on train data set
pred_probs_train = lr_pca.predict_proba(train_pca)
metrics.roc_auc_score(y_train, pred_probs_train[:,1])
#the test auc score is slightly less than train score, so the model is not overfitting

In [None]:
#calculate probability
pred_probs_test

In [None]:
#confusion matric is built based on default probability cutoff 0.5
confusion = metrics.confusion_matrix(y_train, lr_pca.predict(train_pca) )
confusion

In [None]:
#accuracy score on train data
metrics.accuracy_score(y_train, lr_pca.predict(train_pca))

In [None]:
metrics.accuracy_score(y_test, lr_pca.predict(test_pca))
# 92% of accuracy score is achieved on test partition without any hyperparameter tuning

### Hyperparameter tuning - PCA and Logistic Regression

In [None]:
# specify range of hyperparameters to tune

w=[{0:0.1, 1: 0.9}, {0:0.2, 1: 0.8}, {0:0.15, 1: 0.85}, {0:0.05, 1: 0.95}]
hyper_params = [{'class_weight':w, 'C': [0.1, 0.5, 1, 2, 3, 4, 5, 10], 'penalty': ['l1', 'l2']}]

#create a 5 fold cross-validation scheme

folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 100)


In [None]:
# The hyper-parameter grid contains both 'l1' and 'l2' penalties. The default lbfgs solver
# supports only 'l2', so every 'l1' candidate would fail to fit and be scored as NaN
# (silently, because warnings are suppressed). liblinear supports both penalties.
estimator_model = LogisticRegression(solver="liblinear")

lr_pca2 = GridSearchCV(estimator = estimator_model,
                          param_grid = hyper_params,
                          scoring= 'roc_auc',
                          cv = folds,
                          return_train_score=True,
                          verbose = 1)
lr_pca2.fit(train_pca, y_train)


In [None]:
pd.DataFrame(lr_pca2.cv_results_)

In [None]:
# print best hyperparameters
print("Best AUC: ", lr_pca2.best_score_)
print("Best hyperparameters: ", lr_pca2.best_params_)
#similar AUC score received 
lr_pca2_final=lr_pca2.best_estimator_


In [None]:
# AUC score of the tuned model on the test set.
# The probabilities are recomputed with lr_pca2_final: scoring `pred_probs_test` here would
# re-report the AUC of the untuned baseline model that produced it. No refit is needed
# because GridSearchCV (refit left at its default) already refits best_estimator_ on the
# whole training set.
tuned_probs_test = lr_pca2_final.predict_proba(test_pca)[:, 1]
metrics.roc_auc_score(y_test, tuned_probs_test)
# compare this value with the baseline test AUC computed earlier


In [None]:
#determine cutoff probabality
pred_probs_train = lr_pca2_final.predict_proba(train_pca)
y_train_pred=pd.DataFrame()
y_train_pred["churn_proba"]=pred_probs_train[:,1]
y_train_pred

In [None]:
# Add one 0/1 prediction column per probability cutoff (0.0, 0.1, ..., 0.9):
# 1 where the predicted churn probability is strictly above the cutoff.
numbers = [float(x)/10 for x in range(10)]
for cutoff in numbers:
    y_train_pred[cutoff] = (y_train_pred["churn_proba"] > cutoff).astype("int64")
y_train_pred.head()


In [None]:
# Accuracy, sensitivity and specificity on the training set for each probability cutoff.
# confusion_matrix rows are the actual class and columns the predicted class, so
# ravel() yields TN, FP, FN, TP for the labels [0, 1].
# The rows are collected first and the frame is built once, instead of growing an empty
# frame with .loc inside the loop. (confusion_matrix is already imported in the first cell.)
num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
cutoff_rows = []
for cutoff in num:
    tn, fp, fn, tp = metrics.confusion_matrix(y_train, y_train_pred[cutoff], labels=[0, 1]).ravel()
    cutoff_rows.append({
        "prob": cutoff,
        "accuracy": (tn + tp) / (tn + fp + fn + tp),
        "sensi": tp / (tp + fn),
        "speci": tn / (tn + fp),
    })
cutoff_df = pd.DataFrame(cutoff_rows, columns=['prob','accuracy','sensi','speci'], index=num)
print(cutoff_df)

In [None]:
# From the table above, a cut-off probability of .6 gives the highest train accuracy score. One of our business goals
# is to build an ML model that identifies the customers who will churn more accurately than the ones
# who will not churn, hence we should focus on maximising the true positives and minimising the false negatives. For this
# goal, our target is therefore to maximise sensitivity while keeping accuracy as high as possible.

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi'])
plt.show()


In [None]:
# As per the plot above, the cut-off probability is .33 when both accuracy and sensitivity
# are the evaluation metrics. For the Kaggle competition the evaluation metric is the
# accuracy score, so the cut-off .6 is used there.
# Compute the predictions for both cut-offs on the test data.

# The tuned model's probabilities get their own name: reusing `pred_probs_test` would
# replace the baseline model's 2-D array with a 1-D one and break re-running the cells
# that index it with [:, 1].
tuned_probs_test = lr_pca2_final.predict_proba(test_pca)[:, 1]
test_prediction = pd.DataFrame({"actual": y_test})
test_prediction["proba"] = tuned_probs_test
test_prediction["predicted_.33"] = (test_prediction["proba"] > .33).astype("int64")
test_prediction["predicted_.6"] = (test_prediction["proba"] > .6).astype("int64")
test_prediction.head(5)

In [None]:
test_prediction.head(20)
print("with cutoff .33, the test accuracy score is",metrics.accuracy_score(test_prediction.actual,test_prediction["predicted_.33"]))
print("with cutoff .6, the test accuracy score is",metrics.accuracy_score(test_prediction.actual,test_prediction["predicted_.6"]))



In [None]:
cm33=confusion_matrix(test_prediction.actual,test_prediction["predicted_.33"])
cm60=confusion_matrix(test_prediction.actual,test_prediction["predicted_.6"])
print("with cutoff .33, the test sensitivity is",cm33[1,1]/(cm33[1,0]+cm33[1,1]))
print("with cutoff .6, the test sensitivity is",cm60[1,1]/(cm60[1,0]+cm60[1,1]))
    

In [None]:
# For the Kaggle submission, this model is applied to the unseen data.
# head must be called: a bare `unseen.head` only shows the bound method's repr.
print(unseen.shape)
unseen.head()


In [None]:
# Select the model features from the unseen data and impute missing values with zero,
# using the same logic as on the train dataset.
# .copy() gives an independent frame (no write into a slice of `unseen`), and fillna(0)
# avoids depending on the `imp` name, which a later cell rebinds to a DataFrame.
unseen_test = unseen[var_cols].copy()
display((unseen_test.isna().sum()/len(unseen_test)).sort_values(ascending=False).head())

unseen_test = unseen_test.fillna(0)
(unseen_test.isna().sum()/len(unseen_test)).sort_values(ascending=False).head()

In [None]:
#check if the dataset has object type columns
unseen_test.select_dtypes("object").columns
#all cols are numeric

In [None]:
# Apply the fitted scaler to the unseen features. The result gets a new name so that
# re-running this cell does not scale already-scaled data.
unseen_scaled = pd.DataFrame(scaler.transform(unseen_test), columns=var_cols)

# Apply the fitted PCA.
unseen_test_pca = pca_final.transform(unseen_scaled)
print(unseen_test_pca.shape)

# Class probabilities from the tuned logistic regression.
pred_probs_unseen = lr_pca2_final.predict_proba(unseen_test_pca)
pred_probs_unseen

In [None]:
#output the kaggle submission file based on cutoff prob of .6
#predict churn probalities and storing the same against id
submission_data=pd.DataFrame()
submission_data["id"]=unseen.id
submission_data["proba"]=pred_probs_unseen[:,1]
submission_data["churn_probability"]=submission_data["proba"].map(lambda x: 1 if x > .6 else 0)


In [None]:
# errors="ignore" keeps the cell re-runnable once "proba" has already been dropped,
# consistent with the RandomForest and XGBoost submission cells.
submission_data = submission_data.drop("proba", axis=1, errors="ignore")
submission_data

In [None]:
submission_data.churn_probability.value_counts()

In [None]:
submission_data.to_csv('submission_pca_lr.csv',index=False)

### RandomForest

In [None]:
#We got a fair accuracy score using logistic regression, let proceed with Randon Forest for imbalanced data set, so that 
#we can get important features and we will also compare the accuracy with previous model
#scaling is not required for tree based model

X_train, X_test, y_train, y_test= train_test_split(churn_X, churn_y, train_size = 0.7, test_size = 0.3, random_state = 100,\
                                                   stratify=churn.churn_probability.values)
X_train.shape, y_train.shape, X_test.shape, y_test.shape


In [None]:
forest = RandomForestClassifier(random_state=42, n_jobs=-1, criterion='gini', oob_score=True, class_weight={0:0.2, 1: 0.8})
#using same weight as logistic regression model, to reduce number of fits
params = {
             'n_estimators': [200,300],
             "min_samples_leaf": [30,40,50]
}
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

In [None]:
#performing CV search
rf = GridSearchCV(estimator=forest,
                  param_grid=params,
                  cv = folds,
                  n_jobs=-1, verbose=1, scoring="roc_auc")

rf.fit(X_train,y_train)

In [None]:
# `rf` is the GridSearchCV wrapper and has no oob_score_ itself; the out-of-bag score
# (the forest is built with oob_score=True) is available on the refitted best estimator.
print("oob score", rf.best_estimator_.oob_score_)
print("best roc_auc", rf.best_score_)


In [None]:
#select the model which gives best score
rf_best = rf.best_estimator_
rf_best

In [None]:
#check AUC score on training dataset
metrics.roc_auc_score(y_train, rf_best.predict_proba(X_train)[:,1])


In [None]:
#check AUC score on test dataset
metrics.roc_auc_score(y_test, rf_best.predict_proba(X_test)[:,1])

In [None]:
# Both the test and the train AUC have improved over logistic regression.
# Find the cut-offs for the highest accuracy and for a higher sensitivity:
# add one 0/1 prediction column per probability cutoff (0.0, 0.1, ..., 0.9).
y_train_rf_pred = pd.DataFrame({"churn_proba": rf_best.predict_proba(X_train)[:, 1]})
numbers = [float(x)/10 for x in range(10)]
for cutoff in numbers:
    y_train_rf_pred[cutoff] = (y_train_rf_pred["churn_proba"] > cutoff).astype("int64")
y_train_rf_pred.head()


In [None]:
# Accuracy, sensitivity and specificity on the training set for each probability cutoff.
# confusion_matrix rows are the actual class and columns the predicted class, so
# ravel() yields TN, FP, FN, TP for the labels [0, 1].
# The rows are collected first and the frame is built once, instead of growing an empty
# frame with .loc inside the loop.
num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
cutoff_rows = []
for cutoff in num:
    tn, fp, fn, tp = metrics.confusion_matrix(y_train, y_train_rf_pred[cutoff], labels=[0, 1]).ravel()
    cutoff_rows.append({
        "prob": cutoff,
        "accuracy": (tn + tp) / (tn + fp + fn + tp),
        "sensi": tp / (tp + fn),
        "speci": tn / (tn + fp),
    })
cutoff_rf_df = pd.DataFrame(cutoff_rows, columns=['prob','accuracy','sensi','speci'], index=num)
print(cutoff_rf_df)

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_rf_df.plot.line(x='prob', y=['accuracy','sensi'])
plt.show()



In [None]:
#based on above graph, for highest accuracy, we are taking the cut off .6 
#for higher sentivity with as high as possible accuracy,  we can take cutoff probability of .25
test_rf_prediction=pd.DataFrame()
test_rf_prediction["actual"]=y_test
test_rf_prediction["proba"]=rf_best.predict_proba(X_test)[:,1]
test_rf_prediction["predicted_.25"]=test_rf_prediction["proba"].map(lambda x: 1 if x>.25 else 0)
test_rf_prediction["predicted_.6"]=test_rf_prediction["proba"].map(lambda x: 1 if x>.6 else 0)

In [None]:
test_rf_prediction.head()

In [None]:
#check test accuracy with .6 of cutoff probobility
print("Using random forest the test accuracy is",metrics.accuracy_score(test_rf_prediction["actual"],test_rf_prediction["predicted_.6"]))
confusion=metrics.confusion_matrix(test_rf_prediction["actual"],test_rf_prediction["predicted_.6"])
confusion
#we got more test accuracy than logistic regression model

In [None]:
#check sensitivity with .25 of cutoff probability
# TP = confusion[1,1] # true positive 
# TN = confusion[0,0] # true negatives
# FP = confusion[0,1] # false positives
# FN = confusion[1,0] # false negatives
confusion=metrics.confusion_matrix(test_rf_prediction["actual"],test_rf_prediction["predicted_.25"])
sensi_rf_test=confusion[1,1]/(confusion[1,0]+confusion[1,1])
print("Using random forest the sensitivity on test set is",sensi_rf_test)
#we got more sensitivity on test data than logistic regression 
confusion

In [None]:
#as we got higher accuracy with RandonForest, lets use the same model on unseen data for kaggle submissin
unseen.head()
unseen_rf_test=unseen[var_cols]
unseen_rf_test.shape


In [None]:
# Check the missing values of the unseen features, then impute them with zero using the
# same logic as on the train dataset.
# fillna(0) returns a new frame (no write into a slice of `unseen`) and does not depend
# on the `imp` name, which a later cell rebinds to a DataFrame.
display((unseen_rf_test.isna().sum()/len(unseen_rf_test)).sort_values(ascending=False).head())
unseen_rf_test = unseen_rf_test.fillna(0)
(unseen_rf_test.isna().sum()/len(unseen_rf_test)).sort_values(ascending=False).head()


In [None]:
#predict churn probalities and storing the same against id
#output the kaggle submission file based on cutoff prob of .6
pred_probs_rf_unseen = rf_best.predict_proba(unseen_rf_test)

submission_rf_data=pd.DataFrame()
submission_rf_data["id"]=unseen.id
submission_rf_data["proba"]=pred_probs_rf_unseen[:,1]
submission_rf_data["churn_probability"]=submission_rf_data["proba"].map(lambda x: 1 if x > .6 else 0)
submission_rf_data.head()

In [None]:
#preparing the file for submission
submission_rf_data=submission_rf_data.drop("proba", axis=1, errors="ignore")
submission_rf_data.head()

In [None]:
submission_rf_data.churn_probability.value_counts()

In [None]:
submission_rf_data.to_csv('submission_rf.csv',index=False)

### XGBoost Classifier

In [None]:
#as another ensemble technique XGBoost is preferred for large dataset and high precision accuracy, lets try to predict churn using same
X_train, X_test, y_train, y_test= train_test_split(churn_X, churn_y, train_size = 0.7, test_size = 0.3, random_state = 100,\
                                                   stratify=churn.churn_probability.values)
X_train.shape, y_train.shape, X_test.shape, y_test.shape


In [None]:
# XGBClassifier has no `class_weight` parameter (an unknown keyword does not re-weight the
# classes). The class imbalance is handled with scale_pos_weight, the weight of the
# positive class relative to the negative one: 0.8 / 0.2 = 4 reproduces the
# {0: 0.2, 1: 0.8} weighting used for the other models.
xgb_cl = xgb.XGBClassifier(n_jobs = -1,objective = 'binary:logistic',random_state=42, scale_pos_weight=4)


In [None]:
# A parameter grid for XGBoost
params = {
        'n_estimators' : [200], # no of trees
        'learning_rate' : [0.05],  # eta
        'min_child_weight': [1, 5, 7],
        'gamma': [0.1],
        'subsample': [0.8],
        'colsample_bytree': [0.6, 0.8, 1.0],

        }

folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

# Every entry of `params` is a finite list, so the search space has a fixed number of
# combinations (3 x 3 = 9 here). The former n_iter of 800 exceeded that and was cut down
# by scikit-learn with a warning; request exactly the number of distinct combinations.
param_comb = int(np.prod([len(values) for values in params.values()]))

sgb = RandomizedSearchCV(xgb_cl, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=-1, cv=folds, verbose=3, random_state=42)

In [None]:
sgb.fit(X_train,y_train)

In [None]:
print("The best ROC_AUC is ",sgb.best_score_)
xgb_best=sgb.best_estimator_
xgb_best

In [None]:
#check AUC score on train dataset
metrics.roc_auc_score(y_train, xgb_best.predict_proba(X_train)[:,1])

In [None]:
#check AUC score on test dataset
metrics.roc_auc_score(y_test, xgb_best.predict_proba(X_test)[:,1])

In [None]:
# Both the test and the train AUC have improved over Random Forest.
# Find the cut-offs for the highest accuracy and for a higher sensitivity:
# add one 0/1 prediction column per probability cutoff (0.0, 0.1, ..., 0.9).
y_train_xgb_pred = pd.DataFrame({"churn_proba": xgb_best.predict_proba(X_train)[:, 1]})
numbers = [float(x)/10 for x in range(10)]
for cutoff in numbers:
    y_train_xgb_pred[cutoff] = (y_train_xgb_pred["churn_proba"] > cutoff).astype("int64")
y_train_xgb_pred.head()

In [None]:
# Accuracy, sensitivity and specificity on the training set for each probability cutoff.
# confusion_matrix rows are the actual class and columns the predicted class, so
# ravel() yields TN, FP, FN, TP for the labels [0, 1].
# The rows are collected first and the frame is built once, instead of growing an empty
# frame with .loc inside the loop.
num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
cutoff_rows = []
for cutoff in num:
    tn, fp, fn, tp = metrics.confusion_matrix(y_train, y_train_xgb_pred[cutoff], labels=[0, 1]).ravel()
    cutoff_rows.append({
        "prob": cutoff,
        "accuracy": (tn + tp) / (tn + fp + fn + tp),
        "sensi": tp / (tp + fn),
        "speci": tn / (tn + fp),
    })
cutoff_xgb_df = pd.DataFrame(cutoff_rows, columns=['prob','accuracy','sensi','speci'], index=num)
print(cutoff_xgb_df)

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_xgb_df.plot.line(x='prob', y=['accuracy','sensi'])
plt.show()

In [None]:
#based on above graph, for highest accuracy, we are taking the cut off .5
#for higher sentivity with as high as possible accuracy,  we can take cutoff probabilities of .12
test_xgb_prediction=pd.DataFrame()
test_xgb_prediction["actual"]=y_test
test_xgb_prediction["proba"]=xgb_best.predict_proba(X_test)[:,1]
test_xgb_prediction["predicted_.12"]=test_xgb_prediction["proba"].map(lambda x: 1 if x>.12 else 0)
test_xgb_prediction["predicted_.5"]=test_xgb_prediction["proba"].map(lambda x: 1 if x>.5 else 0)

In [None]:
test_xgb_prediction.head()

In [None]:
#check test accuracy with .5 of cutoff probobility
print("Using xgboost the test accuracy is",metrics.accuracy_score(test_xgb_prediction["actual"],test_xgb_prediction["predicted_.5"]))
confusion=metrics.confusion_matrix(test_xgb_prediction["actual"],test_xgb_prediction["predicted_.5"])
confusion
#we got more test accuracy than logistic regression model

In [None]:
#check sensitivity with .12 of cutoff probability
# TP = confusion[1,1] # true positive 
# TN = confusion[0,0] # true negatives
# FP = confusion[0,1] # false positives
# FN = confusion[1,0] # false negatives
confusion=metrics.confusion_matrix(test_xgb_prediction["actual"],test_xgb_prediction["predicted_.12"])
sensi_xgb_test=confusion[1,1]/(confusion[1,0]+confusion[1,1])
print("Using XGBoost the sensitivity on test set is",sensi_xgb_test)
#we got more sensitivity on test data than logistic regression 
confusion

In [None]:
#XGBoost is performing like Random Forest, with only a very minimal improvement in accuracy and sensitivity

In [None]:
# Select the model features from the unseen data for the XGBoost Kaggle submission
# (the earlier comment named RandomForest, carried over from the previous section).
unseen.head()
unseen_xgb_test=unseen[var_cols]
unseen_xgb_test.shape

In [None]:
 
# Impute the missing values with zero, using the same logic as on the train dataset.
# fillna(0) returns a new frame (no write into a slice of `unseen`) and does not depend
# on the `imp` name, which a later cell rebinds to a DataFrame.
unseen_xgb_test = unseen_xgb_test.fillna(0)
(unseen_xgb_test.isna().sum()/len(unseen_xgb_test)).sort_values(ascending=False).head()


In [None]:
#predict churn probalities and storing the same against id
#output the kaggle submission file based on cutoff prob of .5
pred_probs_xgb_unseen = xgb_best.predict_proba(unseen_xgb_test)

submission_xgb_data=pd.DataFrame()
submission_xgb_data["id"]=unseen.id
submission_xgb_data["proba"]=pred_probs_xgb_unseen[:,1]
submission_xgb_data["churn_probability"]=submission_xgb_data["proba"].map(lambda x: 1 if x > .5 else 0)
submission_xgb_data.head()

In [None]:
#preparing the file for submission
submission_xgb_data=submission_xgb_data.drop("proba", axis=1, errors="ignore")
submission_xgb_data.head()

In [None]:
submission_xgb_data.churn_probability.value_counts()

In [None]:
submission_xgb_data.to_csv('submission_xgb.csv',index=False)


## Kaggle Submission

In [None]:
#Since, XGBoost is giving highest accuracy with a cut-off probability of .5, we are using the predicted output of the same
#for final kaggle submission.

In [None]:
submission_xgb_data.to_csv('Submission.csv',index=False)

## Feature Importance

In [None]:
#as both RandomForest and XGBoost are on the par in terms of both accuracy and sensitivity, either of them can be used for 
#feature importance. We are using RandomForest for the same.
rf_best.feature_importances_
imp_df = pd.DataFrame({
    "Varname": var_cols,
    "Importance_pct": rf_best.feature_importances_*100
})
print(imp_df.shape)
imp_df.sort_values(by="Importance_pct", ascending=False)


In [None]:
#the above list shows importance of predictor variables in a descending order but it doesn't give the sign/direction
#of the same.Hence, taking top 30 variables with highest importance and building a logistic regression on top of these.

In [None]:
# Names of the 30 most important variables according to the random forest.
# The sorted table gets its own name: `imp` is the SimpleImputer used by the imputation
# cells, and rebinding it to a DataFrame would make those cells fail when re-run.
imp_sorted = imp_df.sort_values(by="Importance_pct", ascending=False)
log_cols = imp_sorted["Varname"].head(30).tolist()

In [None]:
X_train, X_test, y_train, y_test= train_test_split(churn_X, churn_y, train_size = 0.7, test_size = 0.3, random_state = 100,\
                                                   stratify=churn.churn_probability.values)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
X_train_log=X_train[log_cols]
X_test_log=X_test[log_cols]

X_train_log.shape, X_test_log.shape

In [None]:
# Normalise the variables with min-max scaling, as we want to retain the original data
# distribution for feature interpretation. The scaler is fitted on the training
# partition only and then applied to the test partition.
# MinMaxScaler comes from the import cell at the top of the notebook.
scaler_log = MinMaxScaler()
X_train_log = pd.DataFrame(scaler_log.fit_transform(X_train_log), columns=log_cols)
X_test_log = pd.DataFrame(scaler_log.transform(X_test_log), columns=log_cols)
X_train_log.describe()

In [None]:
X_test_log.describe()


In [None]:
X_train_log.shape, X_test_log.shape

In [None]:
#build logistic regression using gridsearch
# specify range of hyperparameters to tune

w=[{0:0.2, 1: 0.8}]
hyper_params = [{'class_weight':w, 
                 'C': [0.1, 0.5, 1, 2, 3, 4, 5, 10], 
                 'penalty': ['l1', 'l2']}]

#create a 5 fold cross-validation scheme

folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 100)


In [None]:
# The hyper-parameter grid contains both 'l1' and 'l2' penalties. The default lbfgs solver
# supports only 'l2', so every 'l1' candidate would fail to fit and be scored as NaN
# (silently, because warnings are suppressed). liblinear supports both penalties.
logistic = LogisticRegression(solver="liblinear")


lr_feature = GridSearchCV(estimator = logistic,
                          param_grid = hyper_params,
                          scoring= 'roc_auc',
                          cv = folds,
                          return_train_score=True,
                          verbose = 1)
lr_feature.fit(X_train_log, y_train)

In [None]:
#best AUC score
lr_feature.best_score_

In [None]:
lr_feat_best=lr_feature.best_estimator_
lr_feature.best_params_

In [None]:
print("the test AUC score is",metrics.roc_auc_score(y_test,lr_feat_best.predict_proba(X_test_log)[:,1]))
print("the train AUC score is",metrics.roc_auc_score(y_train,lr_feat_best.predict_proba(X_train_log)[:,1]))
#overfitting is avoided

In [None]:
#extract coefficients for feature importance
lr_feat_best.coef_

In [None]:
# One row per feature with its logistic-regression coefficient
# (coef_ is flattened to one value per column of X_train_log).
feature_coeff = pd.DataFrame({
    "feature": X_train_log.columns,
    "coefficients": lr_feat_best.coef_.ravel(),
})
feature_coeff

In [None]:
#the features which are positively impacting churn
feature_coeff[feature_coeff.coefficients>0]

## Recommendations

1. Customers who make calls while roaming tend to churn more. The roaming packages/rates should be
revisited based on customer feedback.

2. Voice call usage, rather than data usage, is what greatly reduces the churn tendency. This is an indicator of good network
quality. However, it is surprising that few of the data-related attributes appear to impact churn. Since we
observed that the highest data recharge amount reduces the churn tendency among customers, more attractive data offers should
be proposed in the market, considering the current technical advancement of electronic devices.

3. STD voice call usage does not reduce churn the way local call usage does; the company should review its STD rates to improve
STD usage.

4. Apart from these, the company should gather information on customer queries and complaints and adjust its business strategies accordingly to reduce the churn rate.
