In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load the data set and preview it
- 7043 entries
- 21 columns
- 0 null values

In [None]:
# Folder prefix for every CSV used in this notebook; later cells build paths
# with `path_data + <file name>`, so it is kept as a plain string.
path_data = 'churn/'
raw_csv_path = path_data + 'dataset-churn.csv'

df = pd.read_csv(raw_csv_path)

# Column dtypes / non-null counts first, then a preview as the cell output.
df.info()
df.head()

In [None]:
def assess_NA(data):
    """
    Summarise the missing values of a dataframe, one row per column.

    Parameters
    ----------
    data: dataframe

    Returns
    -------
    dataframe indexed by the column names of `data`, with two columns:
    'Number of NA' (count of nulls) and 'Percent NA' (share of rows that are
    null, in percent, rounded to 2 decimals). Columns of `data` for which
    both figures are 0 are left out.
    """
    na_counts = data.isnull().sum()
    row_count = len(data.index)

    # Both summaries are ordered from the most to the least affected column.
    by_count = na_counts.sort_values(ascending=False)
    by_share = na_counts.div(row_count).mul(100).round(2).sort_values(ascending=False)

    summary = pd.concat({'Number of NA': by_count, 'Percent NA': by_share}, axis=1)

    # Keep only the rows where at least one of the two figures is non-zero.
    has_missing = (summary != 0).any(axis=1)
    return summary[has_missing]
# Missing-value summary of the loaded frame; `head()` is the cell's displayed output.
miss = assess_NA(df)
miss.head()

## Change type of some columns

In [None]:
def changeTypeCol(df):
    """
    Cast the identifier column to string and the label-like columns to `category`.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df: dataframe
        Must contain `customerID` and every column listed in `categorical_columns` below.
    """
    categorical_columns = [
        'SeniorCitizen', 'gender', 'Partner', 'Dependents',
        'PhoneService', 'MultipleLines', 'InternetService',
        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV', 'StreamingMovies',
        'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn',
    ]

    # set customerID as string
    df['customerID'] = df['customerID'].astype('str')

    # set some columns to categorical (TotalCharges is handled separately by the caller)
    for name in categorical_columns:
        df[name] = df[name].astype('category')

    return df

df = changeTypeCol(df)

# TotalCharges holds mixed content (numbers plus entries the notebook calls
# "empty"), so it has to be parsed explicitly. `pd.to_numeric(errors="coerce")`
# turns every unparsable entry into NaN; the former `Series.convert_objects`
# call was removed from pandas in 1.0.
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
print(total_charges.shape)

## Replace TotalCharges empty to median
# Fill the unparsable entries with the median of the valid values.
# (The previous statement did the opposite: it replaced the median VALUE with
# an empty string, after the blanks had already been turned into 0.)
df["TotalCharges"] = total_charges.fillna(total_charges.median())
print(df.isnull().sum())
df.to_csv(path_data+"dataset-churn-whnull.csv", index=False)

# Reload the cleaned file. CSV does not carry dtypes, so the categorical
# dtypes are re-applied; the following cells select columns with
# `select_dtypes(include='category')` and would otherwise find none.
df = changeTypeCol(pd.read_csv(path_data+"dataset-churn-whnull.csv"))

miss = assess_NA(df)
miss.head()

In [None]:
# Report fully duplicated rows, if any.
repeated_rows = df[df.duplicated()]
if len(repeated_rows) == 0:
    print("No duplicated entries found")
else:
    print("len of duplicated entries: ", len(repeated_rows))
    # keep=False marks every member of a duplicate group, so sorting on all
    # columns puts identical rows next to each other in the preview.
    every_copy = df[df.duplicated(keep=False)]
    print(every_copy.sort_values(by=list(df.columns)).head())

## Distribution categorical data
This step gives an idea of the unique values taken by each categorical column and how often each one occurs.

In [None]:
def dist_cat(df, col):
    """
    Draw a bar chart of the value counts of one categorical column.

    Parameters
    ----------
    df: series
        Values to count. Despite the name, the caller passes a single column
        (`df[col]`), not a whole dataframe.
    col: str
        Column name, used in the title and as the x-axis label.
    """
    plt.figure(figsize=(8,4))
    count = df.value_counts()
    # x / y must be passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
    sns.barplot(x=count.index, y=count.values, alpha=0.8)

    plt.title('Distribution '+col)
    plt.ylabel("Number of Occurence", fontsize=12)
    plt.xlabel(col, fontsize=12)
    sns.despine()
    
# One bar chart per column that currently has the `category` dtype.
category_columns = df.select_dtypes(include='category').columns
cat = [dist_cat(df[name], name) for name in category_columns]
#scatter = pd.scatter_matrix(X_train, c= y_train, marker = 'o', s=40, hist_kwds={'bins':15}, figsize=(9,9), cmap=cmap)

In [None]:
# Names of the categorical columns, captured before they are replaced by codes.
all_cat = list(df.select_dtypes(include='category').columns.values)

# Plain Yes/No columns.
yes_no_codes = {'Yes':1, 'No':0}
bool_cat = ['Partner', 'Dependents', 'PhoneService','PaperlessBilling', 'Churn']
for c in bool_cat:
    df[c] = df[c].map(yes_no_codes)

# Internet add-ons: "No internet service" is folded into "No".
addon_codes = {'No':0, 'No internet service':0, 'Yes':1}
addon_columns = ["OnlineSecurity", "DeviceProtection", "TechSupport",
                 "StreamingTV", "StreamingMovies", "OnlineBackup"]
for addon in addon_columns:
    df[addon] = df[addon].map(addon_codes)

# Columns with their own code table.
specific_codes = {
    "MultipleLines": {'Yes':1, 'No':0, 'No phone service':0},
    "gender": {'Male':1, 'Female':0},
    "InternetService": {'No':0, 'DSL':1, 'Fiber optic':2},
    "Contract": {'Month-to-month':0, 'One year':1,'Two year':2},
    "PaymentMethod": {'Electronic check':0, 'Mailed check':1,
                      'Bank transfer (automatic)':2,'Credit card (automatic)':3},
}
for name, codes in specific_codes.items():
    df[name] = df[name].map(codes)

# A label missing from its code table shows up here as a null.
print(df.isnull().sum())

df.to_csv(path_data+"dataset-churn-mapped.csv", index=False)
df.head()

In [None]:
# load data mapped
df = pd.read_csv(path_data+"dataset-churn-mapped.csv")
print(df.isnull().sum())
df.info()
# The preview must be the last expression of the cell: a bare `df.head()`
# followed by another statement is evaluated but never displayed.
df.head()

## Relationship between the categorical features and the churn label

In [None]:
def pourcentDiff(distr_churn0, distr_churn1):
    """
    Add a "percent" column to both frames: for each group, the smaller of the
    two "Churn" counts expressed as a percentage of the larger one.

    Parameters
    ----------
    distr_churn0, distr_churn1: dataframe
        Per-group counts stored in a "Churn" column. Groups are matched by
        index label; a group missing from one frame counts as 0 there.

    Returns
    -------
    (distr_churn0, distr_churn1): the same two objects, each with a float
    "percent" column added (NaN where both counts are 0).
    """
    # Align on the union of group labels so the result does not depend on the
    # labels being 0..n-1 or on both frames listing the same groups.
    groups = distr_churn0.index.union(distr_churn1.index)
    counts0 = distr_churn0["Churn"].reindex(groups, fill_value=0)
    counts1 = distr_churn1["Churn"].reindex(groups, fill_value=0)

    smaller = np.minimum(counts0, counts1)
    larger = np.maximum(counts0, counts1)
    percent = smaller * 100 / larger

    # Whole-column assignment instead of element-wise chained indexing
    # (`frame["percent"][j] = ...`), which does not reliably write to the frame.
    distr_churn0["percent"] = percent.reindex(distr_churn0.index)
    distr_churn1["percent"] = percent.reindex(distr_churn1.index)
    return distr_churn0, distr_churn1

def piechart(distr_churn0):
    """
    Draw a pie chart of the "percent" column on the current matplotlib axes.

    Each slice is labelled with its position (0, 1, 2, ...) and annotated with
    its share of the pie.
    """
    shares = distr_churn0["percent"]
    plt.pie(
        shares.values,
        labels=range(len(shares)),
        startangle=90,
        shadow=True,
        autopct='%1.1f%%',
    )

#electronic check
def distr_bet_churn(df, col):
    """
    Plot how the values of `col` are distributed among churned and
    non-churned customers.

    Left subplot: grouped bar chart of the per-value row counts, one bar for
    Churn == 0 and one for Churn == 1. Right subplot: pie chart of the
    "percent" column computed by `pourcentDiff`.

    Parameters
    ----------
    df: dataframe
        Must contain `col` and a "Churn" column that is compared with 0 and 1.
    col: str
        Column to analyse. The bar positions are computed as index -/+ 0.125,
        so the group keys must be numeric, and the Churn == 1 bars are placed
        at the Churn == 0 keys, so both groups must have the same keys.
    """
    # Split the rows by churn label.
    df_no_churn = df.loc[df["Churn"]==0]
    df_churn = df.loc[df["Churn"]==1]
    # Row count per value of `col` within each group (stored in the "Churn" column).
    distr_churn0 = df_no_churn[[col, "Churn"]].groupby(col).count()
    distr_churn1 = df_churn[[col, "Churn"]].groupby(col).count()
    distr_churn0, distr_churn1 = pourcentDiff(distr_churn0, distr_churn1)
    plt.figure(figsize=(11,5))
    plt.subplot(121)
    plt.tight_layout()
    # Bar chart in the first subplot: the two series are shifted by half a bar
    # width to either side of each group key.
    plt.bar(distr_churn0["Churn"].index - 0.25/2, distr_churn0["Churn"].values, color = '#008E8E', width = 0.25)
    plt.bar(distr_churn0["Churn"].index + 0.25/2, distr_churn1["Churn"].values, color = '#DB1702', width = 0.25)
    plt.title("Relation "+col+" and Churn")
    plt.xlabel(col)
    plt.legend(["No Churn", "Churn"])
    plt.ylabel("freq")
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.xticks(range(len(distr_churn0)))

    plt.subplot(122)
    # Pie chart in the second subplot.
    plt.tight_layout()
    piechart(distr_churn0)
    plt.title("Customer churn (%) between "+col)

    # Legend entries are listed in slice order, i.e. in group-key order.
    if col=="PaymentMethod":
        plt.legend(["Electronic", "Mailed", "Bank transfer", "Credit Card"])

    elif col=="Contract":
        plt.legend(["Month-to-Month", "One year", "Two year"])

    else:
        # NOTE(review): only two labels are supplied here; confirm this is
        # intended for columns that have more than two distinct values.
        plt.legend(["no use "+col, "use "+col])
    
    
    
# Build a new list instead of aliasing `all_cat`: with `list_cat = all_cat`
# the `.remove()` calls also stripped the entries from `all_cat`, and running
# the cell a second time raised ValueError.
excluded = ("Churn", "SeniorCitizen")
list_cat = [name for name in all_cat if name not in excluded]
distr_ch = [distr_bet_churn(df, col) for col in list_cat]

## Univariate plots (Numerical variable)

In [None]:
def dist_num(df, col):
    """
    Draw a 20-bin histogram (no density curve) of one numerical column.

    Parameters
    ----------
    df: series
        Values to plot; the caller passes a single column (`df[col]`).
    col:
        Column name, shown in the title.
    """
    plt.figure()
    sns.distplot(df, bins=20, kde=False)
    plt.title(f"Distribution of {col}")
    plt.ylabel("freq")
    sns.despine()
col_num = ["tenure", "MonthlyCharges", "TotalCharges"]

# One histogram per numerical column.
numeric_series = {name: df[name] for name in col_num}
num = [dist_num(values, name) for name, values in numeric_series.items()]

In [None]:
col = ["tenure", "MonthlyCharges", "TotalCharges", "Churn"]
# `height` is the current name of the facet-size argument (it replaced `size`
# in seaborn 0.9; recent releases no longer accept `size`).
sns.pairplot(df[col], hue="Churn", height=2);

## Check for outliers in the numerical variables

In [None]:
# Box plots of the numerical columns, split by churn label.
shared_box_options = dict(figsize=(15,6), by="Churn")
boxplot = df.boxplot(column=['tenure', 'MonthlyCharges'], layout=(1, 2), **shared_box_options)
boxplot = df.boxplot(column=['TotalCharges'], **shared_box_options)

In [None]:
#Capping the outlier rows with Percentiles
def detOutlier(data, column):
    """
    Replace the IQR outliers of `column` with the column median.

    A value is an outlier when it lies above Q3 + 1.5*IQR or below
    Q1 - 1.5*IQR. `data` is modified in place and returned.

    Parameters
    ----------
    data: dataframe
    column: str
        Name of a numerical column of `data`.
    """
    values = data[column]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr

    # Computed once, before anything is replaced, so both tails receive the
    # same replacement value.
    median = values.median()

    # Assign the whole column rather than `data[column].loc[mask] = ...`:
    # chained indexing may write to a temporary and leave `data` unchanged.
    # `mask` also upcasts an integer column when the median is fractional.
    is_outlier = (values > upper) | (values < lower)
    data[column] = values.mask(is_outlier, median)

    return data
print(df["TotalCharges"].sum())
outlier = ["TotalCharges", "tenure"]
for c in outlier:
    # Cap each churn class on its own statistics, then stitch the two halves
    # back together. The slices are copied because detOutlier modifies its
    # argument, and `pd.concat` replaces `DataFrame.append`, which was removed
    # in pandas 2.0.
    data_train1 = detOutlier(df[df["Churn"]==0].copy(), c)
    df = pd.concat([data_train1, detOutlier(df[df["Churn"]==1].copy(), c)])
print(df["TotalCharges"].sum())


## Replace the outliers with the median value

#Capping the outlier rows with Percentiles
def detOutlier(data, column, target=None, target_col="Churn"):
    """
    Replace the IQR outliers of `column` with the median, optionally
    restricted to the rows of one class.

    Quartiles and median are computed on the selected rows only, and only
    those rows are modified. `data` is modified in place and returned.

    Parameters
    ----------
    data: dataframe
    column: str
        Name of a numerical column of `data`.
    target: optional
        Class value; only rows where `data[target_col] == target` are
        considered. With the default `None` every row is used, which keeps the
        two-argument call `detOutlier(data, column)` working.
    target_col: str
        Name of the class column, "Churn" by default.
    """
    if target is None:
        selected = pd.Series(True, index=data.index)
    else:
        # Select on the class column. The previous code compared
        # `data[column]` itself with `target`, so the quartiles were taken
        # over (almost always) no rows and nothing was ever capped.
        selected = data[target_col] == target

    values = data.loc[selected, column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr
    median = values.median()

    # Whole-column assignment instead of chained `data[column].loc[...] = ...`,
    # which may write to a temporary and leave `data` unchanged.
    is_outlier = selected & ((data[column] > upper) | (data[column] < lower))
    data[column] = data[column].mask(is_outlier, median)

    return data

# Mean TotalCharges of the churned customers before ...
churned_rows = df["Churn"] == 1
print(df.loc[churned_rows, "TotalCharges"].mean())

#data = detOutlier(df, "TotalCharges", 0)
data = detOutlier(df, "TotalCharges", 1)

# ... and after the outlier replacement.
print(data.loc[data["Churn"] == 1, "TotalCharges"].mean())


## Correlation Matrix

In [None]:
plt.figure(figsize=(15,15))

# Pearson correlation is only defined for numerical columns. Selecting them
# explicitly keeps the string `customerID` column out of the computation;
# pandas >= 2.0 raises on non-numeric columns instead of dropping them.
corr = df.select_dtypes(include='number').corr(method='pearson')

sns.heatmap(corr, annot=True, cmap=plt.cm.Reds)
plt.show()

## Features selection

In [None]:
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

df = pd.read_csv(path_data+"dataset-churn-mapped.csv")

# Extra-trees are randomised; without a fixed random_state the importances
# (and their ranking) change on every run.
model = ExtraTreesClassifier(random_state=0)

# Every column except the first and the last one of the mapped file
# (presumably customerID and Churn; verify against the CSV header).
col_features = list(df.columns[1:-1].values)

X = df[col_features]

y = df["Churn"]

model = model.fit(X, y)

# display the relative importance of each attribute
importances = model.feature_importances_

plt.barh(col_features, importances, color="r",align="center");

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize, scale
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn import preprocessing
import pickle
from sklearn.utils import shuffle
import xgboost as xgb
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error


seed = 0
np.random.seed(seed)


# Features fed to the models.
# NOTE(review): "OnlineBackup" is mapped earlier in the notebook but is not
# listed here; confirm the omission is intentional.
colUsed = ["TotalCharges", "MonthlyCharges", "PaymentMethod", "PaperlessBilling", "Contract", "StreamingMovies",
            "StreamingTV", "TechSupport", "DeviceProtection", "OnlineSecurity", "InternetService", "MultipleLines",
            "PhoneService", "tenure", "Dependents", "Partner", "SeniorCitizen", "gender"]


# Pass the seed explicitly: with random_state=None the split depends on how
# much of the global NumPy random stream was consumed before this line.
x_train, x_test, y_train, y_test = train_test_split(X[colUsed], y, test_size=0.3, random_state=seed)
print("data splited")

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix of `y_pred` against `y_true` and draw it as an
    annotated heat map.

    With `normalize=True` every row is divided by its sum, so the cells show
    fractions of the true class instead of counts. `classes` supplies the tick
    labels of both axes. When `title` is empty a default title matching the
    normalisation mode is used. Returns the matplotlib axes of the plot.
    """
    banner = 'Normalized confusion matrix' if normalize else 'Confusion matrix, without normalization'
    if not title:
        title = banner

    # Compute (and optionally row-normalise) the matrix.
    matrix = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = matrix.sum(axis=1, keepdims=True)
        matrix = matrix.astype('float') / row_totals
    print(banner)
    print(matrix)

    fig, ax = plt.subplots()
    image = ax.imshow(matrix, interpolation='nearest', cmap=cmap)
    fig.colorbar(image, ax=ax)

    # One tick per row / column, labelled with the class list.
    n_rows, n_cols = matrix.shape
    ax.set(xticks=np.arange(n_cols),
           yticks=np.arange(n_rows),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Slanted x labels, anchored at their right end.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each value in its cell; white text on the darker half of the scale.
    cell_format = '.2f' if normalize else 'd'
    half_max = matrix.max() / 2.
    for (row, column), value in np.ndenumerate(matrix):
        text_colour = "white" if value > half_max else "black"
        ax.text(column, row, format(value, cell_format),
                ha="center", va="center",
                color=text_colour)
    fig.tight_layout()
    return ax

In [None]:
# `eta` is xgboost's alias for `learning_rate`; only one of them is passed so
# the effective step size is unambiguous.
xgb_model = xgb.XGBClassifier(learning_rate=0.01, max_depth=5,
                              n_estimators=125, min_child_weight=7)

xgb_model.fit(x_train, y_train)

y_pred_train = xgb_model.predict(x_train)

y_pred_test = xgb_model.predict(x_test)

print("accuracy xgboost TR:",np.round(accuracy_score(y_train, y_pred_train), 3))
print("accuracy xgboost TS:",np.round(accuracy_score(y_test, y_pred_test), 3))

plot_confusion_matrix(y_train, y_pred_train,classes=[0, 1],
                      title='Confusion matrix, train', normalize=True)

plot_confusion_matrix(y_test, y_pred_test,classes=[0, 1],
                      title='Confusion matrix, test', normalize=True);


from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error

# The learning-rate key must be spelled `learning_rate`: XGBClassifier has no
# `lr` parameter, so the former grid did not set the learning rate at all.
grid_params = {'learning_rate':[0.01],
               'min_child_weight':[5,7,10,15],
               'max_depth':[3],
               'n_estimators':[100, 125, 150]}
gs = GridSearchCV(
    xgb.XGBClassifier(),
    grid_params,
    verbose=1,
    cv=5,
    n_jobs=-1)

# Tune on the training split only; searching on the full frame lets the
# held-out rows influence the chosen hyper-parameters.
res = gs.fit(x_train, y_train)

print(res.best_score_)
print(res.best_params_)

In [None]:
# Random-forest baseline. It gets its own variable name and its own label in
# the printed scores; it was previously stored in `xgb_model` and reported as
# "xgboost".
rf_model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=0, criterion='gini')

rf_model.fit(x_train, y_train)

y_pred_train = rf_model.predict(x_train)

y_pred_test = rf_model.predict(x_test)

print("accuracy random forest TR:",np.round(accuracy_score(y_train, y_pred_train), 3))
print("accuracy random forest TS:",np.round(accuracy_score(y_test, y_pred_test), 3))

plot_confusion_matrix(y_train, y_pred_train,classes=[0, 1],
                      title='Confusion matrix, train', normalize=True)

plot_confusion_matrix(y_test, y_pred_test,classes=[0, 1],
                      title='Confusion matrix, test', normalize=True);

In [None]:
from imblearn.over_sampling import SMOTE

df = shuffle(df, random_state=seed)

colUsed = ["TotalCharges", "MonthlyCharges", "PaymentMethod", "PaperlessBilling", "Contract", "StreamingMovies",
            "StreamingTV", "TechSupport", "DeviceProtection", "OnlineSecurity", "InternetService", "MultipleLines",
            "PhoneService", "tenure", "Dependents", "Partner", "SeniorCitizen", "gender"]

# Split FIRST, oversample the training part only. SMOTE builds synthetic rows
# by interpolating between neighbours; oversampling before the split puts
# points derived from test rows into the training set and inflates the test
# score. The test set stays untouched, with its original class balance.
x_train, x_test, y_train, y_test = train_test_split(df[colUsed], df["Churn"], test_size=0.3, random_state=seed)
print("data splited")

# `sampling_strategy` is passed by keyword and `fit_resample` replaces the
# former `fit_sample`, as required by current imbalanced-learn releases.
smote = SMOTE(sampling_strategy='minority', random_state=seed)

x_data, y_data = smote.fit_resample(x_train, y_train)
x_train, y_train = x_data, y_data
x_data.shape, y_data.shape

In [None]:
# `eta` is xgboost's alias for `learning_rate`. The former call passed both
# with different values (0.1 and 0.01), which left the effective step size
# ambiguous; the explicit `learning_rate` is kept.
# NOTE(review): confirm 0.1 (not 0.01) is the intended value.
xgb_model = xgb.XGBClassifier(learning_rate=0.1, max_depth=7,
                              n_estimators=125, min_child_weight=3)

xgb_model.fit(x_train, y_train)

y_pred_train = xgb_model.predict(x_train)

y_pred_test = xgb_model.predict(x_test)

print("accuracy xgboost TR:",np.round(accuracy_score(y_train, y_pred_train), 3))
print("accuracy xgboost TS:",np.round(accuracy_score(y_test, y_pred_test), 3))

plot_confusion_matrix(y_train, y_pred_train,classes=[0, 1],
                      title='Confusion matrix, train', normalize=True)

plot_confusion_matrix(y_test, y_pred_test,classes=[0, 1],
                      title='Confusion matrix, test', normalize=True);

In [None]:
from sklearn.ensemble import BaggingClassifier

# Base learner of the bagging ensemble. `eta` (alias of `learning_rate`) is no
# longer passed alongside a different `learning_rate` value.
clf_stump = xgb.XGBClassifier(learning_rate=0.1, max_depth=7,
                              n_estimators=125, min_child_weight=3)

# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn, and the positional form is
# accepted by both old and new releases.
xgb_model = BaggingClassifier(clf_stump, n_estimators=100, max_samples=0.35)

xgb_model.fit(x_train, y_train)

y_pred_train = xgb_model.predict(x_train)

y_pred_test = xgb_model.predict(x_test)

print("accuracy xgboost TR:",np.round(accuracy_score(y_train, y_pred_train), 3))
print("accuracy xgboost TS:",np.round(accuracy_score(y_test, y_pred_test), 3))

plot_confusion_matrix(y_train, y_pred_train,classes=[0, 1],
                      title='Confusion matrix, train', normalize=True)

plot_confusion_matrix(y_test, y_pred_test,classes=[0, 1],
                      title='Confusion matrix, test', normalize=True);