# 1 hour machine learning project

Project taken from Kaggle : https://www.kaggle.com/blastchar/telco-customer-churn


"Predict behavior to retain customers. You can analyze all relevant customer data and develop focused customer retention programs." [IBM Sample Data Sets]




## My company : Fabdev (fabdev.fr)

## My name : Lucas Berbesson


# Let's get started

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go

In [None]:
# Load the raw export. Whitespace-only cells are turned into NaN and every
# NaN is then filled with 0, so that TotalCharges can be cast to float.
df = (
    pd.read_csv('Telco-Customer-Churn.csv')
    .replace(r'^\s*$', np.nan, regex=True)
    .fillna(0)
)
df["TotalCharges"] = df["TotalCharges"].astype(float)

In [None]:
# Preview the first rows of the loaded data frame.
df.head()

In [None]:
# Show pandas' summary statistics for the data frame.
df.describe()

In [None]:
# Report the size of the data set, then the number of nulls in each column.
n_rows, n_columns = df.shape
print("Rows     : ", n_rows)
print("Columns  : ", n_columns)
df.isnull().sum()

In [None]:
# Treat 'No internet service' as a plain 'No' in the internet add-on columns.
replace_cols = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]
# One vectorised replacement over all listed columns instead of a per-column loop.
df[replace_cols] = df[replace_cols].replace('No internet service', 'No')

In [None]:
# Class balance of the target. The column is passed by keyword: current
# seaborn releases treat the first positional argument as `data`, not `x`,
# so the positional form no longer plots the per-class counts.
sns.countplot(x="Churn", data=df)

In [None]:
def kdeplot(feature):
    """Plot the kernel density estimate of ``feature`` for each churn class.

    Reads the module-level ``df`` and draws, on a new 9 x 4.5 figure, one
    curve for the rows where ``Churn == 'No'`` (navy) and one for the rows
    where ``Churn == 'Yes'`` (orange). Null values of ``feature`` are dropped
    before plotting.

    Parameters
    ----------
    feature : str
        Name of the column of ``df`` to plot.
    """
    plt.figure(figsize=(9, 4.5))
    plt.title("KDE for {}".format(feature))
    for churn_value, colour in (('No', 'navy'), ('Yes', 'orange')):
        values = df.loc[df['Churn'] == churn_value, feature].dropna()
        sns.kdeplot(values, color=colour, label='Churn: {}'.format(churn_value))
    # Recent seaborn releases no longer add a legend automatically for
    # `label=`, so request it explicitly; otherwise the curves are unlabeled.
    plt.legend()


kdeplot('tenure')
kdeplot('MonthlyCharges')

From the plots above we can conclude that:

- Recent clients are more likely to churn
- Clients with higher MonthlyCharges are also more likely to churn


In [None]:
# Names of the columns that have fewer than six distinct values.
unique_counts = df.nunique()
unique_counts[unique_counts < 6].index.tolist()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#customer id col
Id_col     = ['customerID']
#Target columns
target_col = ["Churn"]

# Count the distinct values per column once; every column group below is
# derived from this single result instead of recomputing df.nunique().
unique_counts = df.nunique()

#categorical columns: fewer than 6 distinct values, target excluded
cat_cols   = [x for x in unique_counts[unique_counts < 6].keys().tolist()
              if x not in target_col]
#numerical columns: everything that is neither categorical, target nor id
num_cols   = [x for x in df.columns if x not in cat_cols + target_col + Id_col]
#Binary columns with 2 values (computed over all columns, so a two-valued
#target is label encoded as well)
bin_cols   = unique_counts[unique_counts == 2].keys().tolist()
#Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]

#Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    df[i] = le.fit_transform(df[i])

#Duplicating columns for multi value columns (one indicator column per value)
df = pd.get_dummies(data = df,columns = multi_cols )

#Scaling Numerical columns
std = StandardScaler()
# Keep df's index on the scaled frame: the merge below joins on the index, and
# a fresh 0..n-1 index would misalign rows whenever df is not default-indexed.
scaled = pd.DataFrame(std.fit_transform(df[num_cols]),
                      columns=num_cols, index=df.index)

#dropping original values merging scaled values for numerical columns
# drop() already returns a new frame, so df itself is left untouched.
telcom = df.drop(columns = num_cols)
telcom = telcom.merge(scaled,left_index=True,right_index=True,how = "left")

In [None]:
# Preview the first rows of the encoded and scaled frame.
telcom.head()

In [None]:
#correlation
# telcom still contains the string customerID column; recent pandas versions
# raise when corr() meets a non-numeric column, so drop it before correlating.
plt.figure(figsize=(30,20))
correlation = sns.heatmap(telcom.drop(columns=Id_col).corr(), annot=True)

In [None]:
# Absolute correlation of every feature with the target, strongest first.
# The string customerID column is dropped first because recent pandas versions
# raise when corr() meets a non-numeric column.
t = telcom.drop(columns=Id_col).corr()["Churn"].abs()
t.sort_values(ascending=False)

In [None]:
# Preview the first rows of telcom again before changing its columns.
telcom.head()

In [None]:
# Remove the TotalCharges column from the modelling frame.
telcom = telcom.drop(columns=['TotalCharges'])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

# Feature columns: everything except the customer id and the target.
cols = [name for name in telcom.columns if name not in Id_col + target_col]

# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
train, test = train_test_split(telcom, random_state=111, test_size=.25)
train_X, train_Y = train[cols], train[target_col]
test_X, test_Y = test[cols], test[target_col]

def telecom_churn_prediction(algorithm,training_x,testing_x,training_y,testing_y,cols,cf=False) :
    """Fit ``algorithm``, print its test metrics and optionally plot its weights.

    The estimator is fitted on the training data, used to predict the test
    data, and then the estimator itself, a classification report and the
    confusion matrix are printed. When ``cf`` is set, the per-feature weights
    are sorted in descending order and shown as a plotly bar chart.

    Parameters
    ----------
    algorithm : object
        Estimator exposing ``fit`` and ``predict`` (plus ``coef_`` or
        ``feature_importances_`` when ``cf`` asks for them).
    training_x, testing_x :
        Feature tables used for fitting and for prediction.
    training_y, testing_y :
        Targets. ``training_y`` must expose ``.values``; it is flattened with
        ``ravel()`` before fitting.
    cols : sequence of str
        Feature names, paired by position with the fitted weights.
    cf : {False, "coefficients", "features"}, optional
        ``"coefficients"`` plots ``algorithm.coef_``, ``"features"`` plots
        ``algorithm.feature_importances_``; any falsy value skips the plot.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``cf`` is truthy but is not one of the two supported strings.
    """
    # Fail fast: an unsupported value used to surface only after fitting, as a
    # NameError on the never-assigned coefficient table.
    if cf and cf not in ("coefficients", "features"):
        raise ValueError(
            "cf must be False, 'coefficients' or 'features', got {!r}".format(cf)
        )

    #model
    algorithm.fit(training_x,training_y.values.ravel())
    predictions   = algorithm.predict(testing_x)

    #coeffs
    weights = None
    if   cf == "coefficients" :
        weights = algorithm.coef_.ravel()
    elif cf == "features" :
        weights = algorithm.feature_importances_

    print (algorithm)
    print ("\n Classification report : \n",classification_report(testing_y,predictions))

    #confusion matrix
    conf_matrix = confusion_matrix(testing_y,predictions)
    print("\n Confusion matrix : \n")
    print(conf_matrix)

    if not cf:
        return

    # Pair each weight with its feature name by position. Assigning a Series
    # aligns on the index, i.e. a left join on the weights.
    coef_sumry = pd.DataFrame({"coefficients": weights})
    coef_sumry["features"] = pd.Series(list(cols))
    coef_sumry = coef_sumry.sort_values(by = "coefficients",ascending = False)

    features_importance = go.Bar(x = coef_sumry["features"],y = coef_sumry["coefficients"],
                    name = "coefficients",
                    marker = dict(color = coef_sumry["coefficients"],
                                  colorscale = "Portland",
                                  line = dict(width = .6,color = "black")))

    py.plot([features_importance])


In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default settings; also plot its coefficients.
logit = LogisticRegression()
telecom_churn_prediction(
    algorithm=logit,
    training_x=train_X,
    testing_x=test_X,
    training_y=train_Y,
    testing_y=test_Y,
    cols=cols,
    cf="coefficients",
)

**The precision** is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.  
Ex: a precision of 81% for the "no churn" class means that, when the algorithm predicts that a customer will not churn, it is right 81% of the time.


**The recall** is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.  
Ex: a recall of 65% for the "churn" class means that, 35% of the time, the algorithm does not detect that a customer will churn.

**The F1 score** is the harmonic mean of the precision and recall; it reaches its best value at 1 and its worst value at 0. The formula for the F1 score is:
F1 = 2 * (precision * recall) / (precision + recall)


**The support** is the number of occurrences of each class in y_true.

**Confusion matrix** : a matrix where C_{i, j} is equal to the number of observations known to be in group i but predicted to be in group j.



We got 81% classification accuracy from our logistic regression classifier. But the precision and recall for predictions in the positive class (churn) are relatively low, which suggests our data set may be imbalanced.


In [None]:
# Count the rows in each Churn class to check the class balance.
telcom['Churn'].value_counts()

In [None]:
from sklearn.utils import resample
 
# Balance the classes by down-sampling the majority class (Churn == 0) to the
# size of the minority class (Churn == 1).
data_majority = telcom[telcom['Churn']==0]
data_minority = telcom[telcom['Churn']==1]

data_majority_downsampled = resample(
    data_majority,
    # Sample without replacement: with replacement the same majority row can
    # be drawn several times and later land in both the train and test sets.
    replace=False,
    # Same number of samples as the minority class, derived from the data
    # rather than hard-coded.
    n_samples=len(data_minority),
    random_state=1,  # set the seed for random resampling
)
# Combine resampled results
telcom_balanced = pd.concat([data_minority, data_majority_downsampled])

telcom_balanced['Churn'].value_counts()

In [None]:
# Feature columns of the balanced frame: everything except id and target.
cols = [name for name in telcom_balanced.columns if name not in Id_col + target_col]

# Same 75/25 split as before, now on the balanced data.
train, test = train_test_split(telcom_balanced, random_state=111, test_size=.25)
train_X, train_Y = train[cols], train[target_col]
test_X, test_Y = test[cols], test[target_col]

In [None]:
from sklearn.linear_model import LogisticRegression

# Refit the default logistic regression on the balanced split and plot its coefficients.
logit = LogisticRegression()
telecom_churn_prediction(
    algorithm=logit,
    training_x=train_X,
    testing_x=test_X,
    training_y=train_Y,
    testing_y=test_Y,
    cols=cols,
    cf="coefficients",
)

In [None]:
# List the columns of the balanced frame.
telcom_balanced.columns

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default settings; metrics only, no weight plot.
gbc = GradientBoostingClassifier()
telecom_churn_prediction(
    algorithm=gbc,
    training_x=train_X,
    testing_x=test_X,
    training_y=train_Y,
    testing_y=test_Y,
    cols=cols,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


# Classifier classes to compare; each one is instantiated with its defaults.
classifiers = [
    KNeighborsClassifier,
    SVC,
    LinearSVC,
    NuSVC,
    DecisionTreeClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    GaussianNB,
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
]

# Fit each candidate on the current split and print its report and confusion matrix.
for algorithm in classifiers:
    algo = algorithm()
    telecom_churn_prediction(algo, train_X, test_X, train_Y, test_Y, cols)