In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
## Load the Telco customer-churn CSV and show how many rows were read
df = pd.read_csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.shape[0]

7043

In [3]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# List the column labels of the raw frame
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

- **CustomerID**: the ID of the customer
- **Gender**: male/female
- **SeniorCitizen**: whether the customer is a senior citizen (0/1)
- **Partner**: whether they live with a partner (yes/no)
- **Dependents**: whether they have dependents (yes/no)
- **Tenure**: number of months since the start of the contract
- **PhoneService**: whether they have phone service (yes/no)
- **MultipleLines**: whether they have multiple phone lines (yes/no/no phone service)
- **InternetService**: the type of internet service (DSL/fiber optic/no)
- **OnlineSecurity**: if online security is enabled (yes/no/no internet)
- **OnlineBackup**: if online backup service is enabled (yes/no/no internet)
- **DeviceProtection**: if the device protection service is enabled (yes/no/no internet)
- **TechSupport**: if the customer has tech support (yes/no/no internet)
- **StreamingTV**: if the TV streaming service is enabled (yes/no/no internet)
- **StreamingMovies**: if the movie streaming service is enabled (yes/no/no internet)
- **Contract**: the type of contract (monthly/yearly/two years)
- **PaperlessBilling**: if the billing is paperless (yes/no)
- **PaymentMethod**: payment method (electronic check, mailed check, bank transfer, credit card)
- **MonthlyCharges**: the amount charged monthly (numeric)
- **TotalCharges**: the total amount charged (numeric)
- **Churn**: if the client has canceled the contract (yes/no)


In [5]:
# Show the dtype pandas assigned to each column
df.dtypes


customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [6]:
# Count how many columns share each dtype
df.dtypes.value_counts()

object     18
int64       2
float64     1
Name: count, dtype: int64

In [7]:
## Rows whose TotalCharges cannot be parsed as a number
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.loc[total_charges.isna(), ['customerID', 'TotalCharges']]

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,
753,3115-CZMZD,
936,5709-LVOEQ,
1082,4367-NUYAO,
1340,1371-DWPAZ,
3331,7644-OMVMY,
3826,3213-VVOLG,
4380,2520-SGTTA,
5218,2923-ARZLG,
6670,4075-WKNIU,


In [8]:
## Parse TotalCharges as numeric; entries that fail to parse become 0
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

In [9]:
## Make column names and string values uniform: lowercase, underscores instead of spaces
df.columns = [name.lower().replace(' ', '_') for name in df.columns]
string_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

for col in string_columns:
    # Lowercasing and space replacement commute, so the order is irrelevant
    df[col] = df[col].str.replace(' ', '_').str.lower()

In [10]:
## Encode the target as an integer flag: 1 where churn is 'yes', 0 otherwise
df['churn'] = df['churn'].eq('yes').astype(int)

In [11]:
## data splitting
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
df_train_full, df_test = train_test_split(df, random_state=42, test_size=0.2)

In [13]:
# Split the remaining rows again: 33% for validation, the rest for training
df_train, df_val = train_test_split(df_train_full, random_state=42, test_size=0.33)

In [14]:
# Target vectors as NumPy arrays, taken before the column is removed from the frames
y_train = df_train['churn'].to_numpy()
y_val = df_val['churn'].to_numpy()

In [15]:
# Remove the target from the feature frames so it cannot be used as a feature.
# drop(..., errors='ignore') keeps this cell re-runnable: `del` raises KeyError
# when the cell is executed a second time because the column is already gone.
df_train = df_train.drop(columns='churn', errors='ignore')
df_val = df_val.drop(columns='churn', errors='ignore')

### EDA: Exploratory data analysis

In [16]:
# Count missing values per column (isna is the same check as isnull)
df_train_full.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [17]:
## How many customers fall into each target class (0 = stayed, 1 = churned)
df_train_full['churn'].value_counts()

churn
0    4138
1    1496
Name: count, dtype: int64

In [18]:
## The proportion of churned users, displayed as a whole-number percentage
global_mean = df_train_full.churn.mean()
# Scale first, then round: rounding to 2 decimals and multiplying afterwards can
# leave float artifacts in the displayed value (e.g. 0.29 * 100 == 28.999999999999996).
round(global_mean * 100, 0)

27.0

The dataset is imbalanced: the churn rate in our data is about 27%.

In [19]:
# Feature groups used by the analysis below.
# seniorcitizen is stored as 0/1 but is treated as a categorical feature here.
categorical = (
    ['gender', 'seniorcitizen', 'partner', 'dependents']        # customer profile
    + ['phoneservice', 'multiplelines', 'internetservice']      # connectivity
    + ['onlinesecurity', 'onlinebackup', 'deviceprotection',
       'techsupport', 'streamingtv', 'streamingmovies']         # add-on services
    + ['contract', 'paperlessbilling', 'paymentmethod']         # contract and billing
)
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [20]:
# Number of distinct values in each categorical feature
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

### Feature importance analysis

What makes customers churn, and what are the characteristics of the people who churn?

#### churn rate

In [21]:
# Churn rate within each gender group, reported as percentages
female_mean = df_train_full[df_train_full.gender == 'female'].churn.mean()
male_mean = df_train_full[df_train_full.gender == 'male'].churn.mean()
# Both halves of the message use the same "<group> is <rate>%" wording
print(f'male is {round(male_mean*100,2)}% and female is {round(female_mean*100,2)}%')

male is 26.05% and female27.08% 


The difference between the group rates for both females
and males is quite small, which indicates that knowing the gender of the customer
doesn’t help us identify whether they will churn.

In [22]:
# Churn rate for customers with and without a partner
partner_yes = df_train_full.loc[df_train_full['partner'] == 'yes', 'churn'].mean()
partner_no = df_train_full.loc[df_train_full['partner'] == 'no', 'churn'].mean()
print(f'"yes" is {round(partner_yes*100,2)}% and  "no" {round(partner_no*100,2)}% ')

"yes" is 20.07% and  "no" 32.64% 


As we see, the rates for those who have a partner are quite different from rates for
those who don’t: 20% and 33%, respectively. It means that clients with no partner are
more likely to churn than the ones with a partner

### Risk ratio

In [23]:
# Churn rate per gender, kept as a one-column frame named 'mean'
df_group = df_train_full.groupby('gender')['churn'].agg(['mean'])
df_group

Unnamed: 0_level_0,mean
gender,Unnamed: 1_level_1
female,0.270841
male,0.260478


In [24]:
# diff: group rate minus the global rate; risk: group rate divided by the global rate
df_group = df_group.assign(
    diff=df_group['mean'] - global_mean,
    risk=df_group['mean'] / global_mean,
)
df_group

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.270841,0.00531,1.019998
male,0.260478,-0.005053,0.980971


In [25]:
from IPython.display import display

# For every categorical feature, show each group's churn rate together with its
# difference from, and ratio to, the global churn rate.
for col in categorical:
    df_group = (
        df_train_full.groupby(by=col).churn.agg(['mean'])
        .assign(
            diff=lambda g: g['mean'] - global_mean,
            risk=lambda g: g['mean'] / global_mean,
        )
    )
    display(df_group)
    

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.270841,0.00531,1.019998
male,0.260478,-0.005053,0.980971


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.237098,-0.028433,0.892922
1,0.413907,0.148377,1.558793


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.326446,0.060916,1.229411
yes,0.200733,-0.064798,0.755968


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.312326,0.046795,1.176233
yes,0.155674,-0.109856,0.586276


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.253623,-0.011908,0.955156
yes,0.266824,0.001293,1.004871


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.251397,-0.014134,0.946771
no_phone_service,0.253623,-0.011908,0.955156
yes,0.284105,0.018574,1.069952


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.191851,-0.073679,0.722521
fiber_optic,0.415558,0.150028,1.56501
no,0.076606,-0.188924,0.288502


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.416014,0.150484,1.566727
no_internet_service,0.076606,-0.188924,0.288502
yes,0.145342,-0.120189,0.547363


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.398693,0.133162,1.501494
no_internet_service,0.076606,-0.188924,0.288502
yes,0.216531,-0.048999,0.815467


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.387706,0.122175,1.460117
no_internet_service,0.076606,-0.188924,0.288502
yes,0.226825,-0.038705,0.854234


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.413472,0.147941,1.557153
no_internet_service,0.076606,-0.188924,0.288502
yes,0.152855,-0.112676,0.575657


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.335418,0.069887,1.263197
no_internet_service,0.076606,-0.188924,0.288502
yes,0.298945,0.033415,1.125841


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.333333,0.067803,1.255348
no_internet_service,0.076606,-0.188924,0.288502
yes,0.30132,0.035789,1.134784


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.426533,0.161002,1.60634
one_year,0.117987,-0.147544,0.444343
two_year,0.028379,-0.237151,0.106878


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.16414,-0.10139,0.618159
yes,0.33594,0.070409,1.265164


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.174475,-0.091056,0.65708
credit_card_(automatic),0.152404,-0.113126,0.573961
electronic_check,0.449921,0.18439,1.69442
mailed_check,0.190328,-0.075203,0.716782


- For **gender**, there is not much difference between females and males. Both means are approximately the same, and for both groups, the risks are close to 1.
- **Senior citizens** tend to churn more than nonseniors: the risk of churning is 1.56 for seniors and 0.89 for nonseniors.
- People with a **partner** churn less than people with no partner. The risks are 0.76 and 1.23, respectively.
- People who use **phone service** are not at risk of churning: the risk is close to 1, and there’s almost no difference with the global churn rate. People who don’t use phone service are even less likely to churn: the risk is below 1, and the difference with the global churn rate is negative. 
- Clients with no **tech support** tend to churn more than those who do.
- People with **monthly contracts** cancel the contract a lot more often than others, and people with **two-year contracts** churn very rarely.


### Mutual information

In [26]:
# Mutual information is implemented in scikit-learn as the
# mutual_info_score function from the metrics package.
from sklearn.metrics import mutual_info_score

In [27]:
def calculate_mi(series):
    """Return the mutual information between `series` and the churn target.

    The target is read from the notebook-level `df_train_full` frame, so
    `series` is expected to be a column of that same frame (same rows).
    """
    churn_labels = df_train_full['churn']
    return mutual_info_score(series, churn_labels)

In [28]:
# Score every categorical feature against churn and list them, highest MI first
df_mi = (
    df_train_full[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='MI')
)
df_mi

Unnamed: 0,MI
contract,0.096652
onlinesecurity,0.063393
techsupport,0.060935
internetservice,0.053313
onlinebackup,0.045424
paymentmethod,0.042861
deviceprotection,0.042007
streamingtv,0.030844
streamingmovies,0.030705
paperlessbilling,0.019077


In [29]:
## The five most useful features according to the mutual information score
df_mi.head(5)

Unnamed: 0,MI
contract,0.096652
onlinesecurity,0.063393
techsupport,0.060935
internetservice,0.053313
onlinebackup,0.045424


In [30]:
## The five least useful features according to the mutual information score
df_mi.tail(5)

Unnamed: 0,MI
partner,0.010227
seniorcitizen,0.010059
multiplelines,0.000654
gender,6.9e-05
phoneservice,4e-05


### Correlation Coefficient

In [31]:
# Correlation of each numerical feature with the churn target
df_train_full[numerical].corrwith(df_train_full.churn)

tenure           -0.344925
monthlycharges    0.188574
totalcharges     -0.193370
dtype: float64

### Feature engineering

In [74]:
## One-hot encoding for categorical variables

In [83]:
# One dict per training row, holding the selected categorical and numerical features
train_dict = df_train[categorical + numerical].to_dict(orient='records')
# Preview only the first two records: echoing the whole list would print every
# training row into the notebook output.
train_dict[:2]

[{'gender': 'female',
  'seniorcitizen': 0,
  'partner': 'no',
  'dependents': 'no',
  'phoneservice': 'yes',
  'multiplelines': 'no',
  'internetservice': 'fiber_optic',
  'onlinesecurity': 'no',
  'onlinebackup': 'no',
  'deviceprotection': 'no',
  'techsupport': 'no',
  'streamingtv': 'no',
  'streamingmovies': 'no',
  'contract': 'month-to-month',
  'paperlessbilling': 'yes',
  'paymentmethod': 'electronic_check',
  'tenure': 18,
  'monthlycharges': 71.1,
  'totalcharges': 1247.75},
 {'gender': 'female',
  'seniorcitizen': 1,
  'partner': 'no',
  'dependents': 'no',
  'phoneservice': 'yes',
  'multiplelines': 'no',
  'internetservice': 'no',
  'onlinesecurity': 'no_internet_service',
  'onlinebackup': 'no_internet_service',
  'deviceprotection': 'no_internet_service',
  'techsupport': 'no_internet_service',
  'streamingtv': 'no_internet_service',
  'streamingmovies': 'no_internet_service',
  'contract': 'one_year',
  'paperlessbilling': 'no',
  'paymentmethod': 'mailed_check',
  't

In [84]:
from sklearn.feature_extraction import DictVectorizer
# Fit the vectorizer on the training records only; sparse=False requests dense output
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [86]:
# Turn the training records into the feature matrix and inspect the first row
X_train = dv.transform(train_dict)
X_train[0]

array([1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
       1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
       0.00000e+00, 1.00000e+00, 0.00000e+00, 7.11000e+01, 1.00000e+00,
       0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
       1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
       1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
       0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
       0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
       1.00000e+00, 0.00000e+00, 0.00000e+00, 1.80000e+01, 1.24775e+03])

In [94]:
# Names of the generated columns, in the same order as the columns of X_train
feature_names = dv.get_feature_names_out()
print(feature_names)

['contract=month-to-month' 'contract=one_year' 'contract=two_year'
 'dependents=no' 'dependents=yes' 'deviceprotection=no'
 'deviceprotection=no_internet_service' 'deviceprotection=yes'
 'gender=female' 'gender=male' 'internetservice=dsl'
 'internetservice=fiber_optic' 'internetservice=no' 'monthlycharges'
 'multiplelines=no' 'multiplelines=no_phone_service' 'multiplelines=yes'
 'onlinebackup=no' 'onlinebackup=no_internet_service' 'onlinebackup=yes'
 'onlinesecurity=no' 'onlinesecurity=no_internet_service'
 'onlinesecurity=yes' 'paperlessbilling=no' 'paperlessbilling=yes'
 'partner=no' 'partner=yes' 'paymentmethod=bank_transfer_(automatic)'
 'paymentmethod=credit_card_(automatic)' 'paymentmethod=electronic_check'
 'paymentmethod=mailed_check' 'phoneservice=no' 'phoneservice=yes'
 'seniorcitizen' 'streamingmovies=no'
 'streamingmovies=no_internet_service' 'streamingmovies=yes'
 'streamingtv=no' 'streamingtv=no_internet_service' 'streamingtv=yes'
 'techsupport=no' 'techsupport=no_interne

### Machine learning for classification