In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('Telco-Customer-Churn.csv')
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.columns = df.columns.str.lower().str.replace(' ', "_")

In [5]:
columns = list(df.columns)
for c in columns:
    print(c)
    print(df[c].nunique())
    print(df[c].unique()[:5])
    print(df[c].dtypes)
    print('\n \n')

customerid
7043
['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' '7795-CFOCW' '9237-HQITU']
object

 

gender
2
['Female' 'Male']
object

 

seniorcitizen
2
[0 1]
int64

 

partner
2
['Yes' 'No']
object

 

dependents
2
['No' 'Yes']
object

 

tenure
73
[ 1 34  2 45  8]
int64

 

phoneservice
2
['No' 'Yes']
object

 

multiplelines
3
['No phone service' 'No' 'Yes']
object

 

internetservice
3
['DSL' 'Fiber optic' 'No']
object

 

onlinesecurity
3
['No' 'Yes' 'No internet service']
object

 

onlinebackup
3
['Yes' 'No' 'No internet service']
object

 

deviceprotection
3
['No' 'Yes' 'No internet service']
object

 

techsupport
3
['No' 'Yes' 'No internet service']
object

 

streamingtv
3
['No' 'Yes' 'No internet service']
object

 

streamingmovies
3
['No' 'Yes' 'No internet service']
object

 

contract
3
['Month-to-month' 'One year' 'Two year']
object

 

paperlessbilling
2
['Yes' 'No']
object

 

paymentmethod
4
['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (a

In [6]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors = 'coerce')
df.totalcharges.isnull().sum()

11

In [7]:
df.totalcharges = df.totalcharges.fillna(0)

In [8]:
categorical = list(df.dtypes[df.dtypes == 'object'].index)
categorical

['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'churn']

In [9]:
for c in categorical:
    df[c] = df[c].str.lower().str.replace(' ', "_")

In [10]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [11]:
df['churn'] = (df['churn'] == 'yes').astype(int)

In [12]:
numerical =[]; categorical =[]
for c in columns: 
    if df[c].nunique() < 5:
        assert c not in categorical
        categorical.append(c)
    else:
        numerical.append(c)

In [13]:
numerical.remove('customerid')
categorical.remove('churn')

In [14]:
def mi_score(series):
    return mutual_info_score(series,df.churn)

In [15]:
mi = df[categorical].apply(mi_score)
mi.sort_values(ascending = False).round(4)

contract            0.0985
onlinesecurity      0.0647
techsupport         0.0630
internetservice     0.0556
onlinebackup        0.0468
paymentmethod       0.0445
deviceprotection    0.0439
streamingmovies     0.0320
streamingtv         0.0319
paperlessbilling    0.0192
dependents          0.0145
partner             0.0115
seniorcitizen       0.0106
multiplelines       0.0008
phoneservice        0.0001
gender              0.0000
dtype: float64

In [16]:
mi_high = list(mi[mi>= 0.04].sort_values(ascending = False).index)

In [17]:
df[numerical].corrwith(df.churn)

tenure           -0.352229
monthlycharges    0.193356
totalcharges     -0.198324
dtype: float64

In [18]:
features = numerical+categorical



In [19]:
full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 42)

In [20]:
df_train, df_val = train_test_split(full_train, test_size = 0.25, random_state = 42)

In [21]:
print(len(df_train),len(df_val),len(df_test))

4225 1409 1409


In [22]:
y_train = df_train['churn']
y_test = df_test['churn']
y_val = df_val['churn']

In [23]:
del df_train['churn']
del df_test['churn']
del df_val['churn']

In [24]:
X_train = df_train[features]
X_test = df_test[features]
X_val = df_val[features]

In [25]:
dict_X_train = X_train.to_dict(orient = 'records')
dict_X_test = X_test.to_dict(orient = 'records')
dict_X_val = X_val.to_dict(orient = 'records')

In [26]:
dv = DictVectorizer(sparse = False)
X_train = dv.fit_transform(dict_X_train)
X_test = dv.transform(dict_X_test)
X_val = dv.transform(dict_X_val)

In [27]:
model = LogisticRegression()
model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
y_pred = model.predict_proba(X_val)[:,1]

In [29]:
churn_decision = y_pred >= 0.5

In [30]:
df_pred =pd.DataFrame()
df_pred['probability']  = y_pred
df_pred['prediction'] = churn_decision.astype(int)
df_pred['actual'] = y_val
df_pred

Unnamed: 0,probability,prediction,actual
0,0.106369,0,
1,0.230152,0,0.0
2,0.427553,0,
3,0.545482,1,0.0
4,0.057238,0,1.0
...,...,...,...
1404,0.051298,0,
1405,0.280021,0,
1406,0.738973,1,
1407,0.077293,0,


In [31]:
y_val

3575    0
4958    0
1447    1
3334    1
4041    0
       ..
6757    1
1118    0
6590    1
2782    0
2979    0
Name: churn, Length: 1409, dtype: int32

In [32]:
df_pred['actual'] = y_val

In [33]:
y_val

3575    0
4958    0
1447    1
3334    1
4041    0
       ..
6757    1
1118    0
6590    1
2782    0
2979    0
Name: churn, Length: 1409, dtype: int32