In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
#load the raw churn dataset from a path relative to the notebook
df = pd.read_csv('data/churn_data.csv')

In [3]:
#'TotalCharges' is read as object dtype; parse it into numbers
#errors='coerce' turns entries that cannot be parsed (e.g. a single space " ") into NaN
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [4]:
#show id and raw value of the rows whose 'TotalCharges' could not be parsed
df.loc[total_charges.isna(), ['customerID', 'TotalCharges']]

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,
753,3115-CZMZD,
936,5709-LVOEQ,
1082,4367-NUYAO,
1340,1371-DWPAZ,
3331,7644-OMVMY,
3826,3213-VVOLG,
4380,2520-SGTTA,
5218,2923-ARZLG,
6670,4075-WKNIU,


In [5]:
#parse 'TotalCharges' into numbers and replace the unparseable (NaN) entries with 0
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

In [6]:
#normalize column names: lowercase, spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')
#names of all object-dtype (string) columns
string_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

#apply the same normalization to the values of every string column
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [7]:
#preview the first 3 rows
df.head(3)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes


In [8]:
#encode the target variable as an integer: 'yes' becomes 1, anything else becomes 0
#note: re-running this cell on the already converted column would set every value to 0
df['churn'] = df['churn'].eq('yes').astype(int)

In [9]:
#splitting the dataset in different subsets
from sklearn.model_selection import train_test_split


In [10]:
#shuffle df and split it into two sets:
#df_train_full (80%) and df_test (20%)
#a fixed random_state makes the shuffle (and therefore the split) reproducible
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [11]:
#carve the validation set (33%) out of df_train_full; the remainder is the training set
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)
#pull the target out of both frames: keep it as a numpy array and drop the column
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values

In [12]:
#count missing values per column in the full training set
df_train_full.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [13]:
#distribution of the target variable in the full training set
#most customers did not churn (0 = stayed, 1 = churned)
df_train_full.churn.value_counts()

0    4113
1    1521
Name: churn, dtype: int64

In [14]:
#global churn rate = mean of the 0/1 target over the full training set
#roughly 27% of the customers churned, so the classes are imbalanced
global_mean = df_train_full.churn.mean()
round(global_mean, 3)

0.27

In [15]:
#the unrounded global churn rate
global_mean

0.26996805111821087

In [16]:
#names of the feature columns, grouped by how they are analysed below
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [17]:
#number of distinct values of each categorical variable
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

Feature importance: Which features really influence our target variable?<br>
It helps us to answer these questions:
* What makes customers churn?
* What are the characteristics of people who churn?

In [18]:
#churn rate of female and male customers
#both rates are close to the global mean,
#--> gender is not useful for predicting churn
female_mean = df_train_full.loc[df_train_full['gender'] == 'female', 'churn'].mean()
male_mean = df_train_full.loc[df_train_full['gender'] == 'male', 'churn'].mean()
print('female', round(female_mean, 3))
print('male', round(male_mean, 3))

female 0.277
male 0.263


In [19]:
#churn rate of customers with and without a partner
#both rates differ clearly from the global mean,
#--> this variable is useful for predicting churn
partner_yes = df_train_full.loc[df_train_full['partner'] == 'yes', 'churn'].mean()
partner_no = df_train_full.loc[df_train_full['partner'] == 'no', 'churn'].mean()
print('partner == yes', round(partner_yes, 3))
print('partner == no', round(partner_no, 3))

partner == yes 0.205
partner == no 0.33


Risk ratio:<br> 
* Ratio between the churn rate of a group and the global churn rate
* i.e. a ratio between probabilities in different groups
* --> risk = group rate / global rate
* risk of churning for females: risk = 27.7% / 27% ≈ 1.03
* it tells us how likely members of a group are to churn<br>
compared with the whole population
* if risk ratio < 1, the group has a lower risk of churning<br>
than the whole population, and vice versa<br>
<img src="data/pic-1.png" width=400 align="left">

In [20]:
#average churn (= churn rate) within each gender group
df_group = df_train_full.groupby('gender')['churn'].agg(['mean'])
df_group

Unnamed: 0_level_0,mean
gender,Unnamed: 1_level_1
female,0.276824
male,0.263214


In [21]:
#difference between each group's churn rate and the global churn rate
df_group['diff'] = df_group['mean'] - global_mean
df_group

Unnamed: 0_level_0,mean,diff
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.276824,0.006856
male,0.263214,-0.006755


In [22]:
#risk ratio: group churn rate divided by the global churn rate
df_group['risk'] = df_group['mean'] / global_mean

df_group

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


In [23]:
#build the same mean / diff / ratio overview for every categorical variable
#note: the ratio column is named 'rate' here; it is the same quantity as 'risk' above
from IPython.display import display
for col in categorical:
    df_group = df_train_full.groupby(col)['churn'].agg(['mean']).assign(
        diff=lambda grp: grp['mean'] - global_mean,
        rate=lambda grp: grp['mean'] / global_mean,
    )
    display(df_group)

Unnamed: 0_level_0,mean,diff,rate
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


Unnamed: 0_level_0,mean,diff,rate
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,-0.027698,0.897403
1,0.413377,0.143409,1.531208


Unnamed: 0_level_0,mean,diff,rate
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,0.059841,1.221659
yes,0.205033,-0.064935,0.759472


Unnamed: 0_level_0,mean,diff,rate
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,0.043792,1.162212
yes,0.165666,-0.104302,0.613651


Unnamed: 0_level_0,mean,diff,rate
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,-0.028652,0.89387
yes,0.273049,0.003081,1.011412


Unnamed: 0_level_0,mean,diff,rate
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,-0.012561,0.953474
no_phone_service,0.241316,-0.028652,0.89387
yes,0.290742,0.020773,1.076948


Unnamed: 0_level_0,mean,diff,rate
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.077621,0.712482
fiber_optic,0.425171,0.155203,1.574895
no,0.077805,-0.192163,0.288201


Unnamed: 0_level_0,mean,diff,rate
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,0.150953,1.559152
no_internet_service,0.077805,-0.192163,0.288201
yes,0.153226,-0.116742,0.56757


Unnamed: 0_level_0,mean,diff,rate
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,0.134355,1.497672
no_internet_service,0.077805,-0.192163,0.288201
yes,0.217232,-0.052736,0.80466


Unnamed: 0_level_0,mean,diff,rate
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,0.125907,1.466379
no_internet_service,0.077805,-0.192163,0.288201
yes,0.230412,-0.039556,0.85348


Unnamed: 0_level_0,mean,diff,rate
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,0.148946,1.551717
no_internet_service,0.077805,-0.192163,0.288201
yes,0.159926,-0.110042,0.59239


Unnamed: 0_level_0,mean,diff,rate
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,0.072864,1.269897
no_internet_service,0.077805,-0.192163,0.288201
yes,0.302723,0.032755,1.121328


Unnamed: 0_level_0,mean,diff,rate
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,0.068938,1.255358
no_internet_service,0.077805,-0.192163,0.288201
yes,0.307273,0.037305,1.138182


Unnamed: 0_level_0,mean,diff,rate
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161733,1.599082
one_year,0.120573,-0.149395,0.446621
two_year,0.028274,-0.241694,0.10473


Unnamed: 0_level_0,mean,diff,rate
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,-0.097897,0.637375
yes,0.338151,0.068183,1.25256


Unnamed: 0_level_0,mean,diff,rate
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,-0.101797,0.622928
credit_card_(automatic),0.164339,-0.10563,0.608733
electronic_check,0.45589,0.185922,1.688682
mailed_check,0.19387,-0.076098,0.718121


<b>Mutual Information (MI)</b>
* tells us how much information we learn about one variable if we know the value<br>
of another variable
* we use it to measure mutual dependency between 2 variables
* higher mutual info means higher dependence
* MI only works for categorical but not for numerical variables

In [24]:
#mutual information between every categorical variable and the target
#contract is the most useful, and gender the least useful variable
from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    """Return the mutual information score between `series` and the churn column of df_train_full."""
    target = df_train_full.churn
    return mutual_info_score(series, target)

#one score per categorical column, highest first, as a one-column frame named 'MI'
df_mi = (
    df_train_full[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='MI')
)
df_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


<b>Correlation Coefficient (Pearson's correlation coefficient)</b>
* values from -1 to 1
* value above 0: when one value goes up, the other one goes up as well<br>
and we might see more 1's than 0's in the target variable
* that also works for binary target variables (like churn)
* value = 0: no correlation

In [25]:
#correlation coefficient between each of the 3 numerical variables and the target variable
df_train_full[numerical].corrwith(df_train_full.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

We use 'DictVectorizer' to do one-hot encoding for categorical variables.<br>
It converts a dataframe to a dictionary and then is doing one-hot encoding

In [26]:
#convert the training set to a list of dictionaries, one per row
#'records' is the documented orient name; the alias 'rows' only worked
#as an abbreviation and is rejected by pandas 2.0 and later
train_dict = df_train[categorical + numerical].to_dict(orient='records')

In [27]:
#'dv' is a DictVectorizer instance; sparse=False makes it return a dense numpy array
#string-valued features are one-hot encoded (one column per 'name=value' pair)
#numeric-valued features are left intact
#note: 'seniorcitizen' holds 0/1 integers, so it stays a single numeric column
#fit() learns the set of output columns from the training dictionaries
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
               sparse=False)

In [28]:
#transform() turns the list of dictionaries into the feature matrix
X_train = dv.transform(train_dict)

In [29]:
#first row of the feature matrix
#the numerical variables keep their original values
X_train[0]

array([0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 8.6100e+01, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 7.1000e+01, 6.0459e+03])

In [30]:
#names of the matrix columns, in column order
#'feature_names_' is the list that get_feature_names() used to return;
#get_feature_names() itself was removed in scikit-learn 1.2
dv.feature_names_

['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'dependents=no',
 'dependents=yes',
 'deviceprotection=no',
 'deviceprotection=no_internet_service',
 'deviceprotection=yes',
 'gender=female',
 'gender=male',
 'internetservice=dsl',
 'internetservice=fiber_optic',
 'internetservice=no',
 'monthlycharges',
 'multiplelines=no',
 'multiplelines=no_phone_service',
 'multiplelines=yes',
 'onlinebackup=no',
 'onlinebackup=no_internet_service',
 'onlinebackup=yes',
 'onlinesecurity=no',
 'onlinesecurity=no_internet_service',
 'onlinesecurity=yes',
 'paperlessbilling=no',
 'paperlessbilling=yes',
 'partner=no',
 'partner=yes',
 'paymentmethod=bank_transfer_(automatic)',
 'paymentmethod=credit_card_(automatic)',
 'paymentmethod=electronic_check',
 'paymentmethod=mailed_check',
 'phoneservice=no',
 'phoneservice=yes',
 'seniorcitizen',
 'streamingmovies=no',
 'streamingmovies=no_internet_service',
 'streamingmovies=yes',
 'streamingtv=no',
 'streamingtv=no_internet_servic

Formula for logistic regression:<br>
<img src="data/pic-2.png" width=200 align="left"><br><br><br>
sigmoid function maps any value to a number between 0 and 1<br>
<img src="data/pic-3.png" width=400 align="left"><br>

CODE FOR LINEAR REGRESSION<br>
`def linear_regression(xi):
    result = bias
    for j in range(n):
        result = result + xi[j] * w[j]
    return result`
    
CODE FOR LOGISTIC REGRESSION<br>
`def logistic_regression(xi):
    score = bias
    for j in range(n):
        score = score + xi[j] * w[j]
    prob = sigmoid(score)
    return prob`
    
DEFINITION OF SIGMOID FUNCTION<br>
`import math
def sigmoid(score):
    return 1 / (1 + math.exp(- score))`

In [31]:
#train logistic regression model
from sklearn.linear_model import LogisticRegression
#calling 'fit' trains the model on the training features and targets
#X_train is derived from the training set (the validation and test sets are kept apart)
model = LogisticRegression(solver='liblinear', random_state = 1)
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
#convert the validation set to a list of dictionaries, one per row
#('records' is the documented orient name; the alias 'rows' fails on pandas 2.0 and later)
val_dict = df_val[categorical + numerical].to_dict(orient='records')
#encode with the DictVectorizer 'dv' that was already fitted on the training set,
#so the validation matrix has exactly the same columns as X_train
X_val = dv.transform(val_dict)

In [33]:
#predict class probabilities for the validation set
y_pred = model.predict_proba(X_val)

In [34]:
#'y_pred' is a two-column matrix
#first column: probability that the target is negative --> no churn (0)
#second column: probability that the target is positive --> churn (1)
y_pred

array([[0.76509203, 0.23490797],
       [0.73114243, 0.26885757],
       [0.68054933, 0.31945067],
       ...,
       [0.9427494 , 0.0572506 ],
       [0.38477113, 0.61522887],
       [0.93872737, 0.06127263]])

In [35]:
#the two columns carry the same information:
#if the probability of churning is p, the probability of not churning is 1-p
#so keep only the second column (probability of churn)
y_pred = model.predict_proba(X_val)[:, 1]

In [36]:
#turn the soft predictions (probabilities) into hard (binary) ones with a 0.5 threshold
y_pred >= 0.5

array([False, False, False, ..., False,  True, False])

how the above comparison operator works<br>
<img src="data/pic-4.png" width=400 align="left">

In [37]:
#store the hard predictions in the boolean array 'churn'
churn = y_pred >= 0.5

use 'accuracy' to check the quality of the model<br>
have to know:<br> 
* compare each prediction with actual value
* if prediction is churn and real value is also churn, then model is good
* if pred is churn and real value not churn, model is bad
* How to measure quality of model? calculate number of times,<br>
where predictions match real values and divide it by total number of cases

In [38]:
#'y_val' holds the actual labels as integers (0 and 1)
#'churn' holds the predictions as booleans (False and True)
#'==' compares the two arrays element-wise; integers and booleans
#can be compared directly because True equals 1 and False equals 0
#the mean of the resulting boolean array is the accuracy
(y_val == churn).mean()

0.8016129032258065

the result of the comparison is a boolean array<br>
<img src="data/pic-5.png" width=400 align="left"><br><br><br><br><br>

the mean() function casts the boolean values to integers<br>
the result tells us that 80 percent of the predictions are correct<br>
(similar to the respective values of y_val)

<b>Model interpretation</b><br>
Model has 2 parameters, that it learns from data:<br>
* w_0 is bias term
* w = (w_1, w_2, ..., w_n) is weights vector

In [39]:
#the bias term (intercept) of the model
model.intercept_[0]

-0.121988402285897

In [40]:
#the weights, one per feature column
model.coef_[0]

array([ 5.63358844e-01, -8.58950440e-02, -5.99452202e-01, -3.02715213e-02,
       -9.17168810e-02,  9.99326465e-02, -1.15872472e-01, -1.06048577e-01,
       -2.73673749e-02, -9.46210274e-02, -3.23341996e-01,  3.17226066e-01,
       -1.15872472e-01,  7.83927198e-04, -1.68103922e-01,  1.27130908e-01,
       -8.10153887e-02,  1.35696746e-01, -1.15872472e-01, -1.41812676e-01,
        2.57846840e-01, -1.15872472e-01, -2.63962770e-01, -2.12619978e-01,
        9.06315759e-02, -4.80049611e-02, -7.39834412e-02, -2.66742071e-02,
       -1.36241079e-01,  1.74736082e-01, -1.33809198e-01,  1.27130908e-01,
       -2.49119310e-01,  2.97086480e-01, -8.48433099e-02, -1.15872472e-01,
        7.87273800e-02, -9.90600891e-02, -1.15872472e-01,  9.29441591e-02,
        1.78133778e-01, -1.15872472e-01, -1.84249708e-01, -6.94875281e-02,
        4.47701427e-04])

In [41]:
#pair every feature name with its weight (rounded to 3 decimals)
#'feature_names_' replaces get_feature_names(), which was removed in scikit-learn 1.2
dict(zip(dv.feature_names_, model.coef_[0].round(3)))

{'contract=month-to-month': 0.563,
 'contract=one_year': -0.086,
 'contract=two_year': -0.599,
 'dependents=no': -0.03,
 'dependents=yes': -0.092,
 'deviceprotection=no': 0.1,
 'deviceprotection=no_internet_service': -0.116,
 'deviceprotection=yes': -0.106,
 'gender=female': -0.027,
 'gender=male': -0.095,
 'internetservice=dsl': -0.323,
 'internetservice=fiber_optic': 0.317,
 'internetservice=no': -0.116,
 'monthlycharges': 0.001,
 'multiplelines=no': -0.168,
 'multiplelines=no_phone_service': 0.127,
 'multiplelines=yes': -0.081,
 'onlinebackup=no': 0.136,
 'onlinebackup=no_internet_service': -0.116,
 'onlinebackup=yes': -0.142,
 'onlinesecurity=no': 0.258,
 'onlinesecurity=no_internet_service': -0.116,
 'onlinesecurity=yes': -0.264,
 'paperlessbilling=no': -0.213,
 'paperlessbilling=yes': 0.091,
 'partner=no': -0.048,
 'partner=yes': -0.074,
 'paymentmethod=bank_transfer_(automatic)': -0.027,
 'paymentmethod=credit_card_(automatic)': -0.136,
 'paymentmethod=electronic_check': 0.175,


to understand how the model works, we just use 3 features: contract, tenure and total charges

In [42]:
small_subset = ['contract', 'tenure', 'totalcharges']
#convert the selected columns to a list of dictionaries, one per row
#('records' is the documented orient name; the alias 'rows' fails on pandas 2.0 and later)
train_dict_small = df_train[small_subset].to_dict(orient='records')
#a separate DictVectorizer performs the one-hot encoding for the small feature set
dv_small = DictVectorizer(sparse=False)
#fit learns the output columns from the training dictionaries
dv_small.fit(train_dict_small)
#transform the dictionaries into the feature matrix
X_small_train = dv_small.transform(train_dict_small)



In [43]:
#check out the resulting features
#one-hot encoding turns the 'contract' feature into 3 new features
#'tenure' and 'totalcharges' are numerical and stay the same
#'feature_names_' replaces get_feature_names(), which was removed in scikit-learn 1.2
dv_small.feature_names_

['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'tenure',
 'totalcharges']

In [44]:
#train the small model on the 5 features produced by dv_small
model_small = LogisticRegression(solver='liblinear', random_state=1)
model_small.fit(X_small_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
#the bias term (intercept) of the small model
model_small.intercept_[0]

-0.6384442006590022

In [46]:
#pair every feature of the small model with its weight (rounded to 3 decimals)
#'feature_names_' replaces get_feature_names(), which was removed in scikit-learn 1.2
dict(zip(dv_small.feature_names_, model_small.coef_[0].round(3)))

{'contract=month-to-month': 0.909,
 'contract=one_year': -0.145,
 'contract=two_year': -1.403,
 'tenure': -0.097,
 'totalcharges': 0.001}

If we apply the sigmoid function to the bias term alone,<br>
the result describes a customer who does not churn<br>
--> probability (output value) less than 0.5<br>
<img src="data/pic-6.png" width=400 align="left">

How do we consider the other weights?<br>
* If we have a customer with a month-to-month contract, the weight 0.909 becomes relevant
* If we have a customer with a one-year contract, the weight -0.145 becomes relevant
* If we have a customer with a two-year contract, the weight -1.403 becomes relevant<br>
<img src="data/pic-7.png" width=400 align="left"><br><br><br><br><br><br><br><br><br>
--> if sign of weight is positive, customer is likely to churn<br>
--> if sign of weight is negative, customer is likely not to churn

How the weights would affect the probab to churn<br>
Probab > 0.5 --> churn<br>
Probab < 0.5 --> stay with company<br>
<img src="data/pic-8.png" width=400 align="left">

Applying the model to score individual customers

In [47]:
#describe the customer we want to score as a dictionary of features
#note: 'customerid' is not among the features 'dv' was fitted on
#(categorical + numerical); DictVectorizer ignores unseen feature names in transform
customer = {
    'customerid': '8879-zkjof',
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',
    'tenure': 41,
    'phoneservice': 'yes',
    'multiplelines': 'no',
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'no',
    'deviceprotection': 'yes',
    'techsupport': 'yes',
    'streamingtv': 'yes',
    'streamingmovies': 'yes',
    'contract': 'one_year',
    'paperlessbilling': 'yes',
    'paymentmethod': 'bank_transfer_(automatic)',
    'monthlycharges': 79.85,
    'totalcharges': 3320.75,
}

In [48]:
#convert the dictionary to a one-row matrix with DictVectorizer:
#one-hot encoded categorical features plus the unchanged numerical features
#(despite its name, X_test holds this single customer, not df_test)
X_test = dv.transform([customer])
X_test

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 7.98500e+01, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
        1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 4.10000e+01, 3.32075e+03]])

In [49]:
#feed this matrix into the trained model
model.predict_proba(X_test)

array([[0.92667596, 0.07332404]])

In [50]:
#probability that this customer churns: about 7.3 percent
model.predict_proba(X_test)[0,1]

0.07332403595205103

In [51]:
#score another customer
#this customer has a probability of about 83% to churn
customer = {
    'gender': 'female',
    'seniorcitizen': 1,
    'partner': 'no',
    'dependents': 'no',
    'phoneservice': 'yes',
    'multiplelines': 'yes',
    'internetservice': 'fiber_optic',
    'onlinesecurity': 'no',
    'onlinebackup': 'no',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'yes',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 85.7,
    'totalcharges': 85.7
}
#encode the customer as a one-row matrix and take the probability of churn (second column)
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

0.8321645264352351