In [1]:
"""
This project consists of three parts:

* Part 1: Logistic Regression for Customer Churn: Completed.
* Part 2: Evaluation of the Metrics used for Customer Churn.
* Part 3: Deployment of the Logistic Regression Model for Customer Churn

Part 1
------

"""
# load libs
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline

In [3]:
# Load the customer-churn table from the CSV file in the working directory.

df = pd.read_csv(filepath_or_buffer='churn_data.csv')

In [6]:
# Report how many rows were loaded, then preview the first five of them.

print("Number of rows: ", df.shape[0])
df.head(5)

Number of rows:  7043


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
# The wide preview elides columns; transpose the first five rows so that
# every column name is listed as a row instead.

df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [10]:
"""
Churn is the most important quantity here; it's the target variable for the model.

=> churn: yes/no.

"""

# determine the data type pandas inferred for each column

df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [11]:
"""

TotalCharges is apparently a string; the object dtype represents a string type.
We need this to be a numerical value.

"""

# Parse TotalCharges as numbers; errors='coerce' turns unparseable entries into NaN.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [12]:
# Confirm the column holds non-numeric entries: list the rows whose
# TotalCharges could not be parsed, to see what is missing.

df.loc[total_charges.isna(), ['customerID', 'TotalCharges']]

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,
753,3115-CZMZD,
936,5709-LVOEQ,
1082,4367-NUYAO,
1340,1371-DWPAZ,
3331,7644-OMVMY,
3826,3213-VVOLG,
4380,2520-SGTTA,
5218,2923-ARZLG,
6670,4075-WKNIU,


In [13]:
# Make TotalCharges numeric and set the unparseable (NaN) entries to zero.

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

In [14]:
# Normalise text: lowercase the column names and the values of every string
# column, replacing spaces with underscores.


df.columns = df.columns.map(lambda name: name.lower().replace(' ', '_'))

string_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

for col in string_columns:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

In [15]:
# Churn value is yes/no. Transform it to 1/0 for binary classification; Logistic Regression.
#
# The conversion is guarded so that this cell is safe to re-run: once churn is
# already an integer column, comparing it to 'yes' again would silently turn
# every label into 0.

if not pd.api.types.is_integer_dtype(df.churn):
    df.churn = (df.churn == 'yes').astype(int)

In [19]:
# Re-inspect the first rows (transposed) to verify that churn is now 0/1 and
# that the text normalisation was applied.

df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [20]:

# Time to test some data; requires splitting


from sklearn.model_selection import train_test_split

# perform the split of the processed original data:
# 20% of the rows are held out as df_test; random_state=1 is passed as a fixed seed

df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [21]:
"""

train_test_split takes the original data (df), shuffles it and creates 2 new dataframes:

1) df_train_full = 80% of original data
2) df_test = 20% of original data

Parameters:

1) test_size = 0.2; 20% of original dataset is used for testing
2) random_state = 1; the data is shuffled in exactly the same way on each run of the program; seed = 1.

train_test_split only splits the data into 2 parts. We can split it further into 3 parts for example:

I.e. train, validation and test.

The validation set is produced by splitting the training set.

"""

# Carve a validation set (33%) out of the full training data, with a fixed seed.
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

# Take the target column out of each feature frame and keep it as an array of labels.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values

In [22]:
"""

Exploratory data analysis

- check for any missing values in the dataset; ML algorithms struggle with this.

"""

# Count the missing values per column in the full training set.
df_train_full.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [23]:
"""

No missing values found in the dataset.

Check the distribution of values in the target variable.

"""

# Count how many customers fall into each target class (0 = stayed, 1 = churned).
df_train_full['churn'].value_counts()

0    4113
1    1521
Name: churn, dtype: int64

In [24]:
"""

Didn't churn (0) = 4113
Churned (1) = 1521

The probability of Churning; AKA The Churn Rate

"""

# Overall churn rate: the mean of the 0/1 target over the full training set.
global_mean = df_train_full['churn'].mean()

round(global_mean, 3)

0.27

In [None]:
"""

The probability that a customer will churn is 27%.

The dataset is imbalanced. There were three times as many people who didn't churn in the dataset as those 
who did churn.

We can say that the non-churn class dominates the churn class. 
A churn rate of 27% is a strong indicator of class imbalance.

"""

In [25]:
"""

Both the categorical and numerical variables in the dataset are important, but they are also different
and need different treatment. For that, we want to look at them separately.

To manage this, 2 lists are created:

1) categorical; contains names of the categorical variables.
2) numerical; contains the names of numerical variables.


"""

# Column names grouped by type, so each group can be handled separately.

categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]



In [26]:
"""

Feature Importance

If we want to understand churn and create a good model, we need to know how the other variables affect churn.
It allows us to decide which features/variables are important for the model and which ones are not.

Two features are present:

1) categorical
2) numerical

Consider categorical features first

Firstly, look at the churn rate for each variable.

gender

Takes two values: female or male.

There are two groups here:

gender == 'female'
gender == 'male'

Calculate Churn Rate for each gender


"""

# Churn rate within each gender group: mean of the 0/1 target over the matching rows.
female_mean = df_train_full.loc[df_train_full['gender'] == 'female', 'churn'].mean()

male_mean = df_train_full.loc[df_train_full['gender'] == 'male', 'churn'].mean()

In [31]:
# Print the churn rate of each gender, rounded to three decimals.

for label, rate in (('Churn Rate (female):', female_mean),
                    ('Churn Rate (male):', male_mean)):
    print(label, round(rate, 3))



Churn Rate (female): 0.277
Churn Rate (male): 0.263


In [32]:
# Next: does having a partner matter?
# Churn rate for customers with a partner and for customers without one.

partner_yes = df_train_full.loc[df_train_full['partner'] == 'yes', 'churn'].mean()
partner_no = df_train_full.loc[df_train_full['partner'] == 'no', 'churn'].mean()

In [34]:
# Print the churn rate for customers with and without a partner, rounded to three decimals.

for label, rate in (('Customer has a Partner:', partner_yes),
                    ("Customer dosen't have a Partner:", partner_no)):
    print(label, round(rate, 3))

Customer has a Partner: 0.205
Customer dosen't have a Partner: 0.33


In [None]:
"""

Customer with a partner = 20.5%

Customer without a partner = 33%

Customers without a partner are more likely to churn.


"""

In [37]:
"""

Risk Ratio

The ratio between probabilities in different groups.
Risk refers to the risk of having the effect.
The risk in this project is churn: the risk of churning.

In general: 

    risk = group-rate/global-rate

e.g. for the gender variable and we look at female:

    risk = 27.7% / 27% = 1.02

Risk is a number between 0 & infinity.
It tells us how likely the elements of the group are to have the effect (churn) compared with the entire population.


If the difference between the group rate and the global rate is small, the risk is close to 1: 
this group has the same level of risk as the rest of the population. 
Customers in the group are as likely to churn as anyone else. 
In other words, a group with a risk close to 1 is not risky at all.

If the risk is lower than 1, the group has lower risks: 
the churn rate in this group is smaller than the global churn. 
For example, the value 0.5 means that the clients in this group are two times 
less likely to churn than clients in general.

On the other hand, if the value is higher than 1, the group is risky: 
there’s more churn in the group than in the population. 
So a risk of 2 means that customers from the group are two times more likely to churn.


Now look at risk for all categorical variables


"""

from IPython.display import display 
 
# For every categorical feature, show one table with, per feature value:
#   mean - churn rate of the group
#   diff - group rate minus the global churn rate
#   rate - group rate divided by the global churn rate (the risk ratio)
for col in categorical:
    df_group = df_train_full.groupby(by=col)['churn'].agg(['mean'])
    df_group = df_group.assign(
        diff=df_group['mean'] - global_mean,
        rate=df_group['mean'] / global_mean,
    )

    display(df_group)

Unnamed: 0_level_0,mean,diff,rate
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


Unnamed: 0_level_0,mean,diff,rate
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,-0.027698,0.897403
1,0.413377,0.143409,1.531208


Unnamed: 0_level_0,mean,diff,rate
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,0.059841,1.221659
yes,0.205033,-0.064935,0.759472


Unnamed: 0_level_0,mean,diff,rate
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,0.043792,1.162212
yes,0.165666,-0.104302,0.613651


Unnamed: 0_level_0,mean,diff,rate
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,-0.028652,0.89387
yes,0.273049,0.003081,1.011412


Unnamed: 0_level_0,mean,diff,rate
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,-0.012561,0.953474
no_phone_service,0.241316,-0.028652,0.89387
yes,0.290742,0.020773,1.076948


Unnamed: 0_level_0,mean,diff,rate
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.077621,0.712482
fiber_optic,0.425171,0.155203,1.574895
no,0.077805,-0.192163,0.288201


Unnamed: 0_level_0,mean,diff,rate
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,0.150953,1.559152
no_internet_service,0.077805,-0.192163,0.288201
yes,0.153226,-0.116742,0.56757


Unnamed: 0_level_0,mean,diff,rate
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,0.134355,1.497672
no_internet_service,0.077805,-0.192163,0.288201
yes,0.217232,-0.052736,0.80466


Unnamed: 0_level_0,mean,diff,rate
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,0.125907,1.466379
no_internet_service,0.077805,-0.192163,0.288201
yes,0.230412,-0.039556,0.85348


Unnamed: 0_level_0,mean,diff,rate
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,0.148946,1.551717
no_internet_service,0.077805,-0.192163,0.288201
yes,0.159926,-0.110042,0.59239


Unnamed: 0_level_0,mean,diff,rate
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,0.072864,1.269897
no_internet_service,0.077805,-0.192163,0.288201
yes,0.302723,0.032755,1.121328


Unnamed: 0_level_0,mean,diff,rate
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,0.068938,1.255358
no_internet_service,0.077805,-0.192163,0.288201
yes,0.307273,0.037305,1.138182


Unnamed: 0_level_0,mean,diff,rate
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161733,1.599082
one_year,0.120573,-0.149395,0.446621
two_year,0.028274,-0.241694,0.10473


Unnamed: 0_level_0,mean,diff,rate
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,-0.097897,0.637375
yes,0.338151,0.068183,1.25256


Unnamed: 0_level_0,mean,diff,rate
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,-0.101797,0.622928
credit_card_(automatic),0.164339,-0.10563,0.608733
electronic_check,0.45589,0.185922,1.688682
mailed_check,0.19387,-0.076098,0.718121


In [39]:
"""

Looking at a few of the tables above, we can see that customers that are signed up to a month-to-month contract are more
likely to churn.
Recall: for a risk ratio greater than 1, a customer is more likely to churn.

As another example, a senior citizen is much more likely to churn compared to a non-senior citizen.


Mutual Information

Used to measure the mutual dependency between two variables.

I.e. how much information we learn about a variable if we learn the value of the other variable.

Higher values of mutual information means a higher degree of dependence.
If the mutual information between a categorical variable and the target is high, this categorical variable
will be quite useful for predicting the target.

Calculating mutual information is pretty easy because it's a function from the metrics package of Scikit-learn.

"""

from sklearn.metrics import mutual_info_score
 
def calculate_mi(series, target=None):
    """Return the mutual information between a feature column and the target.

    Parameters
    ----------
    series : array-like
        Values of one feature, one entry per row.
    target : array-like, optional
        Labels aligned with ``series``. When omitted, the ``churn`` column of
        the notebook-level ``df_train_full`` is used, which is the original
        behaviour and what ``DataFrame.apply(calculate_mi)`` relies on.

    Returns
    -------
    The score computed by ``sklearn.metrics.mutual_info_score``.
    """
    if target is None:
        # Looked up at call time so the function follows the current df_train_full.
        target = df_train_full.churn

    return mutual_info_score(series, target)
 
# Score every categorical feature against churn and list them from the
# highest to the lowest mutual information.
df_mi = (
    df_train_full[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='Mutual Information')
)
df_mi

Unnamed: 0,Mutual Information
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


In [None]:
"""

The most useful features, best to least.

contract, onlinesecurity and techsupport are the most useful features for the ML model.


"""

In [40]:
"""

Correlation Coefficient

Mutual information is a way to quantify the degree of dependency between two categorical variables, but it 
doesn’t work when one of the features is numerical, so we cannot apply it to the three numerical variables that we have.

We can, however, measure the dependency between a binary target variable and a numerical variable. 
We can pretend that the binary variable is numerical (containing only the numbers zero and one) 
and then use the classical methods from statistics to check for any dependency between these variables.

One such method is the correlation coefficient (sometimes referred as Pearson’s correlation coefficient). 
It is a value from –1 to 1:

    * Positive correlation: as one variable goes up, the other variable tends to go up as well.
    * Zero correlation: no relationship between the variables; completely independent.
    * Negative correlation: when one variable goes up, the other goes down.

"""

# Correlation of each numerical feature with the 0/1 churn target.

df_train_full[numerical].corrwith(df_train_full['churn'])


tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [None]:
"""

Correlation Coefficient Analysis

1) Correlation between tenure and churn: -0.35

The longer the customer stays, the less likely they are to churn.
For customers that stay for 2 months or less, there is a churn rate of 60%.
For customers who've stayed between 3 and 12 months, the churn rate is 40%.
For a customer staying longer than a year, the churn rate is 17%

2) Correlation between monthlycharges and churn: 0.19:

The more a customer pays, the more likely it is that the customer will leave.
Only 8% of those who pay less than $20 monthly churned.
Customers paying between $21 and $50 churn more frequently with a churn rate of 18%.
Customers paying more than $50 churned with a rate of 32%.

3) Correlation between totalcharges and churn: -0.19:

The longer people have stayed with the company, the more they have paid in total
which implies that it is less likely that they'll leave.


"""

In [41]:
"""

Feature Engineering

=>  transforming all categorical variables to numeric features.

ML models take numerical input in matrix form, so the categorical data must be encoded as numbers; how can this be done?

One way is to use the one-hot encoding technique.

Consider the variable contract. The possible values it can take:

* two-year
* yearly
* monthly

So a customer with a yearly contract could be represented by (0, 1, 0); two-year = 0, yearly = 1, monthly = 0.
In this case, the yearly value is active, or hot, so it gets 1, whereas the remaining values are not active, or cold, so they are 0.

We can perform one-hot encoding in multiple ways in Scikit-learn, but the one used in this project is DictVectorizer.
Using this method, DictVectorizer takes a dictionary and vectorizes it.
The vectors are then put together as rows of one matrix.
This matrix is used as input to a ML algorithm.

To use this method, the dataframe needs to be converted to a list of dictionaries.


"""

# One dictionary per training row, holding the categorical and numerical feature columns.
train_dict = df_train.loc[:, categorical + numerical].to_dict(orient='records')

In [42]:
# Now we can use DictVectorizer

from sklearn.feature_extraction import DictVectorizer
 
# Create the vectorizer with dense (non-sparse) output and fit it on the
# training records; fit() returns the vectorizer itself.
dv = DictVectorizer(sparse=False).fit(train_dict)

dv

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
               sparse=False)

In [43]:
# converting the dictionaries to a matrix:
# the vectorizer fitted on train_dict is applied to the same training records to produce X_train

X_train = dv.transform(train_dict)

In [44]:
# The resulting matrix has 45 columns; inspect the first row.

X_train[0, :]

array([0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 8.6100e+01, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 7.1000e+01, 6.0459e+03])

In [45]:
# get the column names
#
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(). Prefer the new method
# and fall back to the old one so the cell runs on both old and new versions.

if hasattr(dv, 'get_feature_names_out'):
    # The new method returns a NumPy array; convert it to a plain list of str
    # so the displayed output matches the old method's output.
    feature_names = dv.get_feature_names_out().tolist()
else:
    feature_names = dv.get_feature_names()

feature_names

['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'dependents=no',
 'dependents=yes',
 'deviceprotection=no',
 'deviceprotection=no_internet_service',
 'deviceprotection=yes',
 'gender=female',
 'gender=male',
 'internetservice=dsl',
 'internetservice=fiber_optic',
 'internetservice=no',
 'monthlycharges',
 'multiplelines=no',
 'multiplelines=no_phone_service',
 'multiplelines=yes',
 'onlinebackup=no',
 'onlinebackup=no_internet_service',
 'onlinebackup=yes',
 'onlinesecurity=no',
 'onlinesecurity=no_internet_service',
 'onlinesecurity=yes',
 'paperlessbilling=no',
 'paperlessbilling=yes',
 'partner=no',
 'partner=yes',
 'paymentmethod=bank_transfer_(automatic)',
 'paymentmethod=credit_card_(automatic)',
 'paymentmethod=electronic_check',
 'paymentmethod=mailed_check',
 'phoneservice=no',
 'phoneservice=yes',
 'seniorcitizen',
 'streamingmovies=no',
 'streamingmovies=no_internet_service',
 'streamingmovies=yes',
 'streamingtv=no',
 'streamingtv=no_internet_servic

In [46]:
"""

Machine Learning: Classification
--------------------------------

Logistic Regression is used as classification model.
-------------------

Logistic regression has a lot in common with linear regression, the model we learned in the previous chapter. If you remember, the linear regression model is a regression model that can predict a number. It has the form

    g(x_i) = w_0 + x_i^T w

where

    xi is the feature vector corresponding to the i th observation.
    w0 is the bias term.
    w is a vector with the weights of the model.

We apply this model and get g(xi)—the prediction of what we think the value for xi should be. 
Linear regression is trained to predict the target variable yi—the actual value of the observation i. 
In the previous chapter, this was the price of a car.

Linear regression is a linear model. It’s called linear because it combines the weights of 
the model with the feature vector linearly, using the dot product. Linear models are simple to 
implement, train, and use. Because of their simplicity, they are also fast.

Logistic regression is also a linear model, but unlike linear regression, it’s a classification 
model, not regression, even though the name might suggest that. It’s a binary classification model, 
so the target variable yi is binary; the only values it can have are zero and one. 
Observations with yi = 1 are typically called positive examples: examples in 
which the effect we want to predict is present. Likewise, examples with yi = 0 are 
called negative examples: the effect we want to predict is absent. For our project, yi = 1 means 
that the customer churned, and yi = 0 means the opposite: the customer stayed with us.

The output of logistic regression is probability—the probability that the observation xi is positive, or, in 
other words, the probability that yi = 1. For our case, it’s the probability that the customer i will churn.

To be able to treat the output as a probability, we need to make sure that 
the predictions of the model always stay between zero and one. 
We use a special mathematical function for this purpose called sigmoid.
The full formula for the logistic regression model is

    g(x_i) = sigmoid(w_0 + x_i^T w)

If we compare it with the linear regression formula, the only difference is this sigmoid function: 
in case of linear regression, we have only w_0 + x_i^T w. This is why both of these models are linear; 
they are both based on the dot product operation.

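The sigmoid function maps any value to a number between zero and one (figure 3.24). It’s defined this way:

    sigmoid(z) = 1 / (1 + exp(-z))

Written out feature by feature, the logistic regression model is therefore:

g(x_i) = sigmoid(w_0 + x_i1*w_1 + x_i2*w_2 + ... + x_in*w_n)

x_i = observation.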

code
----

def logistic_regression(xi):

    score = bias

    for j in range(n):
    
        score = score + xi[j] * w[j]

    prob = sigmoid(score)

    return prob




Definition of the sigmoid function:

import math
 
def sigmoid(score):

    return 1 / (1 + math.exp(-score))
    
    

We use score to mean the intermediate result before applying the sigmoid function. 
The score can take any real value. The probability is the result of applying the sigmoid function to the score; 
this is the final output, and it can take only the values between zero and one.

The parameters of the logistic regression model are the same as for linear regression:

    * w_0: the bias term.
    * w = (w_1, w_2, ... , w_n): weight vector.

To learn the weights, we need to train the model, which we will do now using Scikit-learn.

Training logistic regression
----------------------------

# first import the model

from sklearn.linear_model import LogisticRegression

# train it by calling the fit method

model = LogisticRegression(solver='liblinear', random_state=1)

model.fit(X_train, y_train)



"""

# Train the full logistic regression model on the training feature matrix.

# The estimator class is imported here, next to its first use in the notebook.
from sklearn.linear_model import LogisticRegression

# liblinear solver with a fixed random seed, so repeated runs give the same model
model = LogisticRegression(random_state=1, solver='liblinear')

# Last expression of the cell: fits the model and displays the fitted estimator.
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
"""

The class LogisticRegression from Scikit-learn encapsulates the training logic behind this model. 
It’s configurable, and we can change quite a few parameters. 
In fact, we already specify two of them: solver and random_state. Both are needed for reproducibility:

    random_state. The seed number for the random-number generator. 
    It shuffles the data when training the model; to make sure the shuffle is the same every time, we fix the seed.
    solver. The underlying optimization library.
    
Other useful parameters for the model include C, which controls the regularization level. We talk about it in the next chapter when we cover parameter tuning. Specifying C is optional; by default, it gets the value 1.0.

The training takes a few seconds, and when it’s done, the model is ready to make predictions. 
Let’s see how well the model performs. We can apply it to our validation data to obtain
the probability of churn for each customer in the validation dataset.

To do that, we need to apply the one-hot encoding scheme to all the categorical variables. 
First, we convert the dataframe to a list of dictionaries and then feed it to the DictVectorizer we fit previously:

"""

# Turn the validation rows into a list of per-row dictionaries, restricted to the
# categorical and numerical feature columns.
val_dict = df_val.loc[:, categorical + numerical].to_dict(orient='records')

# Encode with the previously fitted DictVectorizer (transform only, no refit).
X_val = dv.transform(val_dict)


In [48]:
"""

As a result, we get X_val, a matrix with features from the validation dataset. Now we are ready to put this matrix to the model. 
To get the probabilities, we use the predict_proba method of the model:

"""

# Soft predictions for the validation set as a two-column array
# (column 0: probability of no churn, column 1: probability of churn).
# The following cell reassigns y_pred to the churn column only.
y_pred = model.predict_proba(X_val)


In [49]:
"""

The result of predict_proba is a two-dimensional NumPy array, or a two-column matrix. 
The first column of the array contains the probability that the target is
negative (no churn), and the second column contains the probability that the target is positive (churn).
 
These columns convey the same information. We know the probability of churn—it’s p—and the probability of not churning is always 1 – p, 
so we don’t need both columns.

Thus, it’s enough to take only the second column of the prediction. 
To select only one column from a two-dimensional array in NumPy, we can use the slicing operation [:, 1]:

 
"""
 
# Keep only column 1 - the probability of the positive class (churn) - as a 1-D array.
y_pred = model.predict_proba(X_val)[:, 1]
 


In [50]:
"""

Two positions are inside the brackets, the first one for rows and the second one for columns.

When we use [:, 1], NumPy interprets it this way:

    : means select all the rows.
    1 means select only the column at index 1, and because the indexing starts at 0, it’s the second column.

As a result, we get a one-dimensional NumPy array that contains the values from the second column only.

This output (probabilities) is often called soft predictions. 
These tell us the probability of churning as a number between zero and one. 
It’s up to us to decide how to interpret this number and how to use it.

Remember how we wanted to use this model: 
we wanted to retain customers by identifying those who are about to cancel their contract with 
the company and send them promotional messages, offering discounts and other benefits. 
We do this in the hope that after receiving the benefit, they will stay with the company. 
On the other hand, we don’t want to give promotions to all our customers, because it will hurt us 
financially: we will make less profit, if any.

To make the actual decision about whether to send a promotional letter to our customers, using the probability alone is not enough. We need hard predictions—binary values of True (churn, so send the mail) or False (not churn, so don’t send the mail).

To get the binary predictions, we take the probabilities and cut them above a certain threshold. 
If the probability for a customer is higher than this threshold, we predict churn, otherwise, not churn. 
If we select 0.5 to be this threshold, making the binary predictions is easy. We just use the “>=” operator:

"""

# Element-wise comparison with the 0.5 threshold; the cell displays the resulting
# Boolean array (True = predicted churn).
y_pred >= 0.5

array([False, False, False, ..., False,  True, False])

In [51]:
"""

The comparison operators in NumPy are applied element-wise, and the result is 
a new array that contains only Boolean values: 

True and False. Under the hood, it performs the comparison for each element of the y_pred array. 
If the element is greater than 0.5 or equal to 0.5, the corresponding element 
in the output array is True, and otherwise, it’s False.

Write the results to the churn result

"""

# Hard predictions: True where the predicted churn probability is at least 0.5.
churn = y_pred >= 0.5

In [52]:
"""

When we have these hard predictions made by our model, we would like to 
understand how good they are, so we are ready to move to the next step: 
evaluating the quality of these predictions. In the next chapter, we will spend a lot 
more time learning about different evaluation techniques for 
binary classification, but for now, let’s do a simple check to make sure our model learned something useful.

The simplest thing to check is to take each prediction and compare it with the actual value. 
If we predict churn and the actual value is churn, or we 
predict non-churn and the actual value is non-churn, our model made the correct prediction. If the predictions don’t match, they aren’t good. If we calculate the number of times our predictions match the actual value, we can use it for measuring the quality of our model.

This quality measure is called accuracy. It’s very easy to calculate accuracy with NumPy:

"""

# Accuracy on the validation set: the fraction of positions where the hard
# prediction equals the actual label.
(y_val == churn).mean()

0.8016129032258065

In [None]:
"""

What does this mean?

First, we apply the == operator to compare two NumPy arrays: y_val and churn. 
If you remember, the first array, y_val, contains only numbers: 
zeros and ones. This is our target variable: one if the customer churned and zero otherwise. 
The second array contains Boolean predictions: True and False values. 
In this case True means we predict the customer will churn, and False means the customer will not churn.

Even though these two arrays have different types inside (integer and Boolean), it’s still possible to compare them.
The Boolean array is cast to integer such that True values are turned to “1” and False values are turned to “0.” 
Then it’s possible for NumPy to perform the actual comparison.

Like the >= operator, the == operator is applied element-wise. 
In this case, however, we have two arrays to compare, and here, we compare each element 
of one array with the respective element of the other array. 
The result is again a Boolean array with True or False values, depending on the outcome of the comparison.

In our case, if the true value in y_val matches our prediction in churn, the label is True, and if it doesn’t, 
the label is False. In other words, we have True if our prediction is correct and False if it’s not.

Finally, we take the results of comparison—the Boolean array—and compute its mean using 
the mean() method. This method, however, is applied to numbers, not Boolean values, 
so before calculating the mean, the values are cast to integers: True values to “1” and False values to “0”.

Finally, as we already know, if we compute the mean of an array that contains only ones and zeros, 
the result is the fraction of ones in that array, which we already used for calculating the 
churn rate. Because “1” (True) in this case is a correct prediction and “0” (False) is an 
incorrect prediction, the resulting number tells us the percentage of correct predictions.

After executing this line of code, we see 0.8 in output. This means that the model 
predictions matched the actual value 80% of the time, or the model makes correct predictions 
in 80% of cases. This is what we call the accuracy of the model.

Now we know how to train a model and evaluate its accuracy, but it’s still useful to 
understand how it makes the predictions. In the next section, we try to look inside 
the models and see how we can interpret the coefficients it learned.


"""


In [53]:
"""

Model interpretation

We know that the logistic regression model has two parameters that it learns from data:

    * w_0 is the bias term
    * w = (w_1, w_2, ... , w_n) is the weight vector
    

We can get the bias term from model.intercept_[0]. 
When we train our model on all features, the bias term is –0.12.

The rest of the weights are stored in model.coef_[0]. If we look inside, it’s just 
an array of numbers, which is hard to understand on its own.

To see which feature is associated with each weight, let’s use the get_feature_names method of the DictVectorizer (renamed get_feature_names_out in scikit-learn 1.0 and later). 
We can zip the feature names together with the coefficients before looking at them:

"""

# Pair every feature name with the weight the full model learned for it (rounded to 3 decimals).
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed version provides it and fall back
# to the old method otherwise, so the cell runs on both old and new scikit-learn.
if hasattr(dv, 'get_feature_names_out'):
    feature_names = dv.get_feature_names_out()
else:
    feature_names = dv.get_feature_names()

# str() turns the NumPy strings returned by get_feature_names_out() into plain dict keys.
dict(zip(map(str, feature_names), model.coef_[0].round(3)))


{'contract=month-to-month': 0.563,
 'contract=one_year': -0.086,
 'contract=two_year': -0.599,
 'dependents=no': -0.03,
 'dependents=yes': -0.092,
 'deviceprotection=no': 0.1,
 'deviceprotection=no_internet_service': -0.116,
 'deviceprotection=yes': -0.106,
 'gender=female': -0.027,
 'gender=male': -0.095,
 'internetservice=dsl': -0.323,
 'internetservice=fiber_optic': 0.317,
 'internetservice=no': -0.116,
 'monthlycharges': 0.001,
 'multiplelines=no': -0.168,
 'multiplelines=no_phone_service': 0.127,
 'multiplelines=yes': -0.081,
 'onlinebackup=no': 0.136,
 'onlinebackup=no_internet_service': -0.116,
 'onlinebackup=yes': -0.142,
 'onlinesecurity=no': 0.258,
 'onlinesecurity=no_internet_service': -0.116,
 'onlinesecurity=yes': -0.264,
 'paperlessbilling=no': -0.213,
 'paperlessbilling=yes': 0.091,
 'partner=no': -0.048,
 'partner=yes': -0.074,
 'paymentmethod=bank_transfer_(automatic)': -0.027,
 'paymentmethod=credit_card_(automatic)': -0.136,
 'paymentmethod=electronic_check': 0.175,


In [54]:
"""

To understand how the model works, let’s consider what happens when we apply this model. 
To build the intuition, let’s train a simpler and smaller model that uses only three variables: 

    contract, tenure, and totalcharges.
    
    
The variables tenure and totalcharges are numeric so we don’t need to do any additional preprocessing; 
we can take them as is. On the other hand, contract is a categorical variable, so to be able to use it, we need to apply one-hot encoding.

Let’s redo the same steps we did for training, this time using a smaller set of features:

"""

# Build the training matrix for a smaller model that uses only three variables.
# Every name carries a "small" marker so the full model's objects are not overwritten.
small_subset = ['contract', 'tenure', 'totalcharges']
train_dict_small = df_train[small_subset].to_dict(orient='records')

# fit() returns the vectorizer itself, so construction and fitting chain into one statement.
dv_small = DictVectorizer(sparse=False).fit(train_dict_small)

X_small_train = dv_small.transform(train_dict_small)


In [55]:

"""

So as not to confuse it with the previous model, we add small to all the names. This way, it’s clear that 
we use a smaller model, and it saves us from accidentally overwriting the results we already have. 
Additionally, we will use it to compare the quality of the small model with the full one.

Let’s see which features the small model will use. For that, as previously, we use the get_feature_names method from DictVectorizer:

"""

# List the features the small model uses.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when available and fall back to the old method otherwise.
if hasattr(dv_small, 'get_feature_names_out'):
    feature_names_small = dv_small.get_feature_names_out()
else:
    feature_names_small = dv_small.get_feature_names()

# Display a plain list of Python strings regardless of which method supplied the names.
[str(name) for name in feature_names_small]

['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'tenure',
 'totalcharges']

In [56]:

"""

There are five features. As expected, we have tenure and totalcharges, and because 
they are numeric, their names are not changed.

As for the contract variable, it’s categorical, so DictVectorizer applies the one-hot encoding scheme
to convert it to numbers.
contract has three distinct values: month-to-month, one year, and two years. 
Thus, one-hot encoding scheme creates three new features: 

    contract=month-to-month, contract=one_year, and contract=two_year.

Let’s train the small model on this set of features:

"""

# Same estimator settings as the full model, trained on the three-variable feature matrix.
model_small = LogisticRegression(random_state=1, solver='liblinear')

# Last expression of the cell: fits the model and displays the fitted estimator.
model_small.fit(X_small_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [57]:
"""

The model is ready after a few seconds, and we can look inside the weights it learned. 
Let’s first check the bias term:

"""

# Bias term (w_0) learned by the small model.
model_small.intercept_[0]



-0.5772299124846147

In [58]:
"""

Output: -0.577. Then we can check the other weights, using the same code as previously:

"""

# Pair every feature of the small model with its learned weight (rounded to 3 decimals).
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when available and fall back to the old method otherwise.
if hasattr(dv_small, 'get_feature_names_out'):
    feature_names_small = dv_small.get_feature_names_out()
else:
    feature_names_small = dv_small.get_feature_names()

# str() turns the NumPy strings returned by get_feature_names_out() into plain dict keys.
dict(zip(map(str, feature_names_small), model_small.coef_[0].round(3)))



{'contract=month-to-month': 0.866,
 'contract=one_year': -0.327,
 'contract=two_year': -1.117,
 'tenure': -0.094,
 'totalcharges': 0.001}

In [None]:
"""

Now let’s take a look at these weights and try to understand what they mean and how we can interpret them.

First, let’s think about the bias term and what it means. Recall that in the case of linear regression, it’s the baseline prediction: the prediction we would make without knowing anything else about the observation. In the car price prediction project, it would be the price of a car on average. This is not the final prediction; later, this baseline is corrected with other weights.

In the case of logistic regression, it’s similar: it’s the baseline prediction—or the score we would make on average. Likewise, we later correct this score with the other weights. However, for logistic regression, interpretation is a bit trickier because we also need to apply the sigmoid function before we get the final output. Let’s consider an example to help us understand that.

In our case, the bias term has the value of –0.577. This value is negative. If we look at the sigmoid function, we can see that for negative values, the output is lower than 0.5 (figure 3.31). For –0.577, the resulting probability of churning is about 36%. This means that on average, a customer is more likely to stay with us than churn.

The reason why the sign before the bias term is negative is the class imbalance. There are a lot fewer churned users in the training data than non-churned ones, meaning the probability of churn on average is low, so this value for the bias term makes sense.

The next three weights are the weights for the contract variable. Because we use one-hot encoding, we have three contract features and three weights, one for each feature:

To build our intuition on how one-hot encoded weights can be understood and interpreted, let’s think 
of a client with a month-to-month contract. The contract variable has the following one-hot encoding representation: 
the first position corresponds to the month-to-month value and is hot, so it’s set to “1.” 
The remaining positions correspond to one_year and two_year, so they are cold and set to “0”.

We also know the weights w1, w2, and w3 that correspond to contract=month-to-month, contract=one_year, 
and contract=two_year.

To make a prediction, we perform the dot product between the feature vector and the weights, 
which is multiplication of the values in each position and then summation. 
The result of the multiplication is 0.866, which turns out to be the same as the weight 
of the contract=month-to-month feature.

Let’s consider another example: a client with a two-year contract. 
In this case, the contract=two_year feature is hot and has a value of “1,” and the 
rest are cold. When we multiply the vector with the one-hot encoding representation 
of the variable by the weight vector, we get –1.117.

As we see, during the prediction, only the weight of the hot feature is taken into account, 
and the rest of the weights are not considered in calculating the score. 
This makes sense: the cold features have values of zero, and when we multiply by zero, we get zero again

"""

In [59]:
"""

Note: lots more text in the project book; look at when project done

Using the Model


Now we know a lot better how logistic regression works, and we can also interpret what our model learned 
and understand how it makes the predictions.

Additionally, we applied the model to the validation set, computed
the probabilities of churning for every customer there, and concluded that 
the model is 80% accurate. 
In the next chapter we will evaluate whether this number is satisfactory, 
but for now, let’s try to use the model we trained. 
Now we can apply the model to customers for scoring them. It’s quite easy.


"""

# Example customer to score: one value per variable, keyed by the lowercase column name.
customer = dict(
    customerid='8879-zkjof',
    gender='female',
    seniorcitizen=0,
    partner='no',
    dependents='no',
    tenure=41,
    phoneservice='yes',
    multiplelines='no',
    internetservice='dsl',
    onlinesecurity='yes',
    onlinebackup='no',
    deviceprotection='yes',
    techsupport='yes',
    streamingtv='yes',
    streamingmovies='yes',
    contract='one_year',
    paperlessbilling='yes',
    paymentmethod='bank_transfer_(automatic)',
    monthlycharges=79.85,
    totalcharges=3320.75,
)

In [60]:
# Score this customer with the trained model to see whether they are likely to churn.

# The vectorizer expects a list of dictionaries, so the single customer is wrapped
# in a list; the result is a one-row feature matrix.

X_test = dv.transform([customer])

In [61]:
# Show the encoded feature row produced for this customer.
print(X_test)

[[0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 1.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00
  0.00000e+00 7.98500e+01 1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00
  1.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 4.10000e+01 3.32075e+03]]


In [62]:
# Feed the one-row matrix to the trained model; the cell displays the probabilities
# of no churn (column 0) and churn (column 1) for this customer.

model.predict_proba(X_test)

array([[0.92667739, 0.07332261]])

In [63]:
"""

All we need from the matrix is the number at the first row and second column: 
the probability of churning for this customer. 
To select this number from the array, we use the brackets operator:


"""



# Row 0, column 1: the probability of churning for this single customer.
model.predict_proba(X_test)[0, 1]



0.07332260940444735

In [65]:
"""

The predicted churn probability for this customer (about 7%) is far below the 50% threshold.

ACTION TO PERFORM:
------------------

No need to send promotional materials/offers to this customer.



Now consider another customer


"""

# Second example customer; this reassigns the `customer` name used for the first example.
customer = dict(
    gender='female',
    seniorcitizen=1,
    partner='no',
    dependents='no',
    phoneservice='yes',
    multiplelines='yes',
    internetservice='fiber_optic',
    onlinesecurity='no',
    onlinebackup='no',
    deviceprotection='no',
    techsupport='no',
    streamingtv='yes',
    streamingmovies='no',
    contract='month-to-month',
    paperlessbilling='yes',
    paymentmethod='electronic_check',
    tenure=1,
    monthlycharges=85.7,
    totalcharges=85.7,
)

In [66]:
# Make a prediction for the second customer: encode the single record (this
# reassigns X_test), then display the churn probability from row 0, column 1.

X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

0.8321645137210785

In [None]:
"""

The output of the model is 83% likelihood of churn.

ACTION TO PERFORM:
------------------

so we should send this client a promotional email in the hope of retaining them.


"""