In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
df=pd.read_csv('churn.csv')


In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


## Data Preparation

In [4]:
df.columns=df.columns.str.lower()

In [5]:
categorical_col=list(df.dtypes[df.dtypes=='object'].index)
categorical_col

['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'totalcharges',
 'churn']

In [6]:
for c in categorical_col:
    df[c]=df[c].str.lower().str.replace(' ','_')

In [7]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [8]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [9]:
tc=pd.to_numeric(df.totalcharges, errors='coerce')

In [10]:
df.totalcharges=pd.to_numeric(df.totalcharges, errors='coerce')

In [11]:
tc

0         29.85
1       1889.50
2        108.15
3       1840.75
4        151.65
         ...   
7038    1990.50
7039    7362.90
7040     346.45
7041     306.60
7042    6844.50
Name: totalcharges, Length: 7043, dtype: float64

In [12]:
df[tc.isnull()][['customerid','totalcharges']]

Unnamed: 0,customerid,totalcharges
488,4472-lvygi,
753,3115-czmzd,
936,5709-lvoeq,
1082,4367-nuyao,
1340,1371-dwpaz,
3331,7644-omvmy,
3826,3213-vvolg,
4380,2520-sgtta,
5218,2923-arzlg,
6670,4075-wkniu,


In [13]:
df.totalcharges=df.totalcharges.fillna(0)

In [14]:
df.totalcharges.isnull().sum()

0

In [15]:

df.churn

0        no
1        no
2       yes
3        no
4       yes
       ... 
7038     no
7039     no
7040     no
7041    yes
7042     no
Name: churn, Length: 7043, dtype: object

In [16]:
df.churn=(df.churn=='yes').astype(int)

## Setting up vaidation data

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
df_full_train, df_test=train_test_split(df, test_size=0.2, random_state=1)

In [230]:
df_train, df_val=train_test_split(df_full_train, test_size=0.25, random_state=42)

In [20]:
len(df_train),len(df_test),len(df_val)

(4225, 1409, 1409)

In [21]:
df_train=df_train.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)
df_val=df_val.reset_index(drop=True)

In [22]:
df_full_train=df_full_train.reset_index(drop=True)

In [23]:
y_train=df_train.churn.values
y_test=df_test.churn.values
y_val=df_val.churn.values

In [24]:
del df_train['churn']

In [25]:
del df_val['churn']

In [26]:
del df_test['churn']

In [27]:
cat=df.dtypes[df.dtypes=='object']
cat

customerid          object
gender              object
partner             object
dependents          object
phoneservice        object
multiplelines       object
internetservice     object
onlinesecurity      object
onlinebackup        object
deviceprotection    object
techsupport         object
streamingtv         object
streamingmovies     object
contract            object
paperlessbilling    object
paymentmethod       object
dtype: object

In [28]:
catgorical=['customerid','gender','partner','dependents','phoneservice','multiplelines','internetservice','onlinesecurity',
            'onlinebackup','deviceprotection','techsupport','streamingtv','streamingmovies','contract',
            'paperlessbilling','paymentmethod']

In [29]:
df_full_train

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.70,258.35,0
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.90,3160.55,1
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.40,2044.75,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,0781-lkxbr,male,1,no,no,9,yes,yes,fiber_optic,no,...,yes,no,yes,yes,month-to-month,yes,electronic_check,100.50,918.60,1
5630,3507-gasnp,male,0,no,yes,60,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.95,1189.90,0
5631,8868-wozgu,male,0,no,no,28,yes,yes,fiber_optic,no,...,yes,no,yes,yes,month-to-month,yes,electronic_check,105.70,2979.50,1
5632,1251-krreg,male,0,no,no,2,yes,yes,dsl,no,...,no,no,no,no,month-to-month,yes,mailed_check,54.40,114.10,1


## Explanatory Data analysis

In [30]:
df_full_train.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [31]:
numerical=['monthlycharges', 'totalcharges','tenure']

In [32]:
categorical=['gender','partner','dependents','phoneservice','multiplelines','internetservice','onlinesecurity',
            'onlinebackup','deviceprotection','techsupport','streamingtv','streamingmovies','contract',
            'paperlessbilling','paymentmethod']

In [33]:
df_full_train[catgorical].nunique()

customerid          5634
gender                 2
partner                2
dependents             2
phoneservice           2
multiplelines          3
internetservice        3
onlinesecurity         3
onlinebackup           3
deviceprotection       3
techsupport            3
streamingtv            3
streamingmovies        3
contract               3
paperlessbilling       2
paymentmethod          4
dtype: int64

## FEature importanmce still EDA

In [34]:
global_churn=df.churn.mean()

In [35]:
df_full_train[df_full_train.gender=='male'].churn.mean()

0.2632135306553911

In [36]:
df_full_train[df_full_train.gender=='female'].churn.mean()

0.27682403433476394

In [37]:
## wecan see that the churn rate relating to gender is or no much difference

In [38]:
df_full_train.partner.value_counts()

no     2932
yes    2702
Name: partner, dtype: int64

In [39]:
churn_no_partner=df_full_train[df_full_train.partner=='no'].churn.mean()
churn_no_partner

0.3298090040927694

In [40]:
churn_partner=df_full_train[df_full_train.partner=='yes'].churn.mean()
churn_partner

0.20503330866025166

In [41]:
global_churn-churn_partner

0.06033656213344424

In [42]:
global_churn-churn_no_partner

-0.06443913329907353

### 1 if global churn rate - group churn rate is 
### <0 then the are more likely to churn
### >0 less likely to churn

In [43]:
global_churn/churn_partner

1.294276878853008

In [44]:
global_churn/churn_no_partner

0.8046168160983623

## Risk ratio
### 1 if group churn rate / global churn rate is 
### >1 then the are more likely to churn
### <1 less likely to churn

In [45]:
from IPython.display import display

In [46]:
#calculating risk for al categorical variable

for c in categorical:
    df_group=df_full_train.groupby(c).churn.agg(['mean','count'])
    df_group['diff']=df_group['mean']-global_churn
    df_group['risk']=df_group['mean']/global_churn
    display(df_group)

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.011454,1.043163
male,0.263214,2838,-0.002156,0.991874


Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,0.064439,1.242828
yes,0.205033,2702,-0.060337,0.772632


Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,0.04839,1.18235
yes,0.165666,1666,-0.099704,0.624284


Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,-0.024054,0.909358
yes,0.273049,5087,0.007679,1.028937


Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,-0.007962,0.969995
no_phone_service,0.241316,547,-0.024054,0.909358
yes,0.290742,2387,0.025372,1.095609


Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,-0.073022,0.724828
fiber_optic,0.425171,2479,0.159802,1.602184
no,0.077805,1221,-0.187565,0.293195


Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,0.155551,1.586168
no_internet_service,0.077805,1221,-0.187565,0.293195
yes,0.153226,1612,-0.112144,0.577405


Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,0.138954,1.523622
no_internet_service,0.077805,1221,-0.187565,0.293195
yes,0.217232,1915,-0.048137,0.818602


Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,0.130506,1.491787
no_internet_service,0.077805,1221,-0.187565,0.293195
yes,0.230412,1940,-0.034957,0.868269


Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,0.153544,1.578604
no_internet_service,0.077805,1221,-0.187565,0.293195
yes,0.159926,1632,-0.105443,0.602655


Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,0.077462,1.291901
no_internet_service,0.077805,1221,-0.187565,0.293195
yes,0.302723,2167,0.037353,1.140757


Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,0.073537,1.27711
no_internet_service,0.077805,1221,-0.187565,0.293195
yes,0.307273,2200,0.041903,1.157904


Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,0.166331,1.62679
one_year,0.120573,1186,-0.144797,0.45436
two_year,0.028274,1344,-0.237096,0.106545


Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,-0.093299,0.648419
yes,0.338151,3321,0.072781,1.274264


Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,-0.097199,0.633722
credit_card_(automatic),0.164339,1217,-0.101031,0.619281
electronic_check,0.45589,1893,0.19052,1.717942
mailed_check,0.19387,1305,-0.0715,0.730564


## Feature importance  of categorical variables Mutual Information

In [47]:
from sklearn.metrics import mutual_info_score

In [48]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train['churn'])

In [49]:
mi=df_full_train[categorical].apply(mutual_info_churn_score)

In [50]:
mi.sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

## Feature importance of numerical variables by corelation coefficient

In [51]:
df_full_train[numerical].corr

<bound method DataFrame.corr of       monthlycharges  totalcharges  tenure
0              19.70        258.35      12
1              73.90       3160.55      42
2              65.15       4681.75      71
3              85.45       6300.85      71
4              70.40       2044.75      30
...              ...           ...     ...
5629          100.50        918.60       9
5630           19.95       1189.90      60
5631          105.70       2979.50      28
5632           54.40        114.10       2
5633           68.25       1114.85      16

[5634 rows x 3 columns]>

In [52]:
df_full_train[numerical].corrwith(df_full_train.churn)

monthlycharges    0.196805
totalcharges     -0.196353
tenure           -0.351885
dtype: float64

In [53]:
df_full_train[numerical].corrwith(df_full_train.churn).abs()

monthlycharges    0.196805
totalcharges      0.196353
tenure            0.351885
dtype: float64

higher monthly charges -- increases churn rate
tenure increase ---- less churn rate

In [54]:
df_full_train[df_full_train['tenure']>12].churn.mean()

0.17634908339788277

In [55]:
df_full_train[df_full_train['tenure']<12].churn.mean()

0.48383233532934133

In [56]:
df_full_train[df_full_train['monthlycharges']<=20].churn.mean()

0.08795411089866156

In [57]:
df_full_train[(df_full_train['monthlycharges']>20)&(df_full_train['monthlycharges']<=50)].churn.mean()

0.18340943683409436

In [58]:
df_full_train[df_full_train['monthlycharges']>=50].churn.mean()

0.3248223216635957

In [59]:
# as Tenure decreases churn rate increases (negative correlation)
# as monthly charge increases churn rate increases(positive correlation)

## One-hot Encoding

In [60]:
from sklearn.feature_extraction import DictVectorizer

In [61]:
dicts=df_train[['gender','contract','tenure']].iloc[:100].to_dict(orient='record')
dicts

[{'gender': 'female', 'contract': 'two_year', 'tenure': 72},
 {'gender': 'male', 'contract': 'month-to-month', 'tenure': 10},
 {'gender': 'female', 'contract': 'month-to-month', 'tenure': 5},
 {'gender': 'female', 'contract': 'month-to-month', 'tenure': 5},
 {'gender': 'female', 'contract': 'two_year', 'tenure': 18},
 {'gender': 'male', 'contract': 'month-to-month', 'tenure': 4},
 {'gender': 'male', 'contract': 'month-to-month', 'tenure': 1},
 {'gender': 'female', 'contract': 'month-to-month', 'tenure': 1},
 {'gender': 'female', 'contract': 'two_year', 'tenure': 72},
 {'gender': 'female', 'contract': 'month-to-month', 'tenure': 6},
 {'gender': 'female', 'contract': 'two_year', 'tenure': 72},
 {'gender': 'male', 'contract': 'month-to-month', 'tenure': 17},
 {'gender': 'female', 'contract': 'two_year', 'tenure': 66},
 {'gender': 'female', 'contract': 'month-to-month', 'tenure': 2},
 {'gender': 'female', 'contract': 'month-to-month', 'tenure': 4},
 {'gender': 'male', 'contract': 'month-to

In [62]:
dv=DictVectorizer(sparse=False)

In [63]:
dv.fit(dicts)

DictVectorizer(sparse=False)

In [64]:
dv.transform(dicts)

array([[ 0.,  0.,  1.,  1.,  0., 72.],
       [ 1.,  0.,  0.,  0.,  1., 10.],
       [ 1.,  0.,  0.,  1.,  0.,  5.],
       [ 1.,  0.,  0.,  1.,  0.,  5.],
       [ 0.,  0.,  1.,  1.,  0., 18.],
       [ 1.,  0.,  0.,  0.,  1.,  4.],
       [ 1.,  0.,  0.,  0.,  1.,  1.],
       [ 1.,  0.,  0.,  1.,  0.,  1.],
       [ 0.,  0.,  1.,  1.,  0., 72.],
       [ 1.,  0.,  0.,  1.,  0.,  6.],
       [ 0.,  0.,  1.,  1.,  0., 72.],
       [ 1.,  0.,  0.,  0.,  1., 17.],
       [ 0.,  0.,  1.,  1.,  0., 66.],
       [ 1.,  0.,  0.,  1.,  0.,  2.],
       [ 1.,  0.,  0.,  1.,  0.,  4.],
       [ 1.,  0.,  0.,  0.,  1.,  3.],
       [ 0.,  0.,  1.,  1.,  0., 71.],
       [ 1.,  0.,  0.,  1.,  0., 32.],
       [ 0.,  1.,  0.,  0.,  1., 53.],
       [ 0.,  0.,  1.,  0.,  1., 56.],
       [ 1.,  0.,  0.,  0.,  1., 61.],
       [ 0.,  1.,  0.,  1.,  0., 41.],
       [ 1.,  0.,  0.,  1.,  0.,  1.],
       [ 0.,  0.,  1.,  1.,  0.,  3.],
       [ 1.,  0.,  0.,  0.,  1.,  3.],
       [ 0.,  0.,  1.,  0

In [65]:
dv.get_feature_names()

['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'gender=female',
 'gender=male',
 'tenure']

In [113]:
numerical


['monthlycharges', 'totalcharges', 'tenure']

In [66]:
train_dicts=df_train[categorical+numerical].to_dict(orient='record')
dv=DictVectorizer(sparse=False)
dv.fit(train_dicts)
x_train=dv.transform(train_dicts)

In [67]:
dv.get_feature_names()

['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'dependents=no',
 'dependents=yes',
 'deviceprotection=no',
 'deviceprotection=no_internet_service',
 'deviceprotection=yes',
 'gender=female',
 'gender=male',
 'internetservice=dsl',
 'internetservice=fiber_optic',
 'internetservice=no',
 'monthlycharges',
 'multiplelines=no',
 'multiplelines=no_phone_service',
 'multiplelines=yes',
 'onlinebackup=no',
 'onlinebackup=no_internet_service',
 'onlinebackup=yes',
 'onlinesecurity=no',
 'onlinesecurity=no_internet_service',
 'onlinesecurity=yes',
 'paperlessbilling=no',
 'paperlessbilling=yes',
 'partner=no',
 'partner=yes',
 'paymentmethod=bank_transfer_(automatic)',
 'paymentmethod=credit_card_(automatic)',
 'paymentmethod=electronic_check',
 'paymentmethod=mailed_check',
 'phoneservice=no',
 'phoneservice=yes',
 'streamingmovies=no',
 'streamingmovies=no_internet_service',
 'streamingmovies=yes',
 'streamingtv=no',
 'streamingtv=no_internet_service',
 'streamingtv=

In [68]:
x_train[0]

array([0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
       0.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 0.00000e+00,
       0.00000e+00, 1.00000e+00, 0.00000e+00, 1.15500e+02, 0.00000e+00,
       0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
       0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
       0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
       0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
       1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
       0.00000e+00, 1.00000e+00, 7.20000e+01, 8.42515e+03])

In [69]:
x_train.shape


(4225, 44)

In [70]:
val_dicts=df_val[categorical+numerical].to_dict(orient='record')
x_val=dv.transform(val_dicts)
x_val[0]

array([0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 7.0850e+01, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       1.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 1.0000e+00, 7.1000e+01, 4.9734e+03])

## Logistics Regression


In [71]:
#ogistics regressionsolves binary problem, betwen 1/0 or yes orno positive or negative, spamor no spam
#g(xi)--0_1

# this is very similar to linear regresson only that it uses asigmoid to convert any number in to a number betwen -0 to 1

In [76]:
from sklearn.linear_model import LogisticRegression

In [114]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(x_train,y_train)

LogisticRegression(solver='liblinear')

In [115]:
#toget our bycharm (W0)which is our prediction when we do not know anythingabout our target variables
model.intercept_[0]

-0.12250622279455545

In [135]:
#now these are arederived weights after fitting(W)
model.coef_[0].round(3)

array([ 0.619, -0.06 , -0.681, -0.006, -0.116,  0.053, -0.103, -0.073,
       -0.042, -0.08 , -0.382,  0.362, -0.103,  0.003, -0.251,  0.138,
       -0.009,  0.058, -0.103, -0.077,  0.219, -0.103, -0.238, -0.251,
        0.128, -0.127,  0.005, -0.1  , -0.048,  0.073, -0.047,  0.138,
       -0.26 , -0.103, -0.103,  0.083, -0.064, -0.103,  0.045,  0.178,
       -0.103, -0.198, -0.067,  0.   ])

In [139]:
#hard prediction,this gives us the exact prediction we have in our y train
# but not what we reay need
model.predict(x_train)

array([0, 1, 1, ..., 1, 0, 1])

In [140]:
#to earn the probability
x_pred=model.predict_proba(x_train)[:,1]
#first col: probabiity of being negative
#second col: the probabilty of beonging to the positve class i.e churning
#we areony concerned on th scond col

In [143]:
y_pred_for_val=model.predict_proba(x_val)[:,1]
y_pred_for_val

array([0.00733333, 0.20402659, 0.22232674, ..., 0.1506309 , 0.78724589,
       0.82287331])

In [146]:
churn_des=y_pred_for_val>=0.5

In [147]:
churn_des

array([False, False, False, ..., False,  True,  True])

In [148]:
df_val[churn_des]

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges
3,8433-wxgna,male,0,no,no,2,yes,no,fiber_optic,yes,no,no,no,no,no,month-to-month,yes,electronic_check,75.70,189.20
8,3440-jpscl,female,0,no,no,6,yes,no,fiber_optic,no,no,yes,yes,yes,yes,month-to-month,yes,mailed_check,99.95,547.65
12,7228-omtpn,male,0,no,no,4,yes,no,fiber_optic,no,no,no,no,yes,yes,month-to-month,yes,electronic_check,88.45,370.65
19,6711-fldfb,female,0,no,no,7,yes,yes,fiber_optic,no,no,no,no,no,no,month-to-month,yes,electronic_check,74.90,541.15
24,2612-ranwt,female,0,no,no,12,yes,yes,fiber_optic,no,no,yes,no,yes,yes,month-to-month,yes,bank_transfer_(automatic),100.15,1164.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1397,5976-jcjrh,male,0,yes,no,10,yes,no,fiber_optic,no,no,no,no,no,no,month-to-month,yes,electronic_check,70.30,738.20
1398,2034-cgrhz,male,1,no,no,24,yes,yes,fiber_optic,no,yes,yes,no,yes,yes,month-to-month,yes,credit_card_(automatic),102.95,2496.70
1399,5276-kqwhg,female,1,no,no,2,yes,no,fiber_optic,no,no,no,no,no,no,month-to-month,yes,electronic_check,69.60,131.65
1407,6521-yytyi,male,0,no,yes,1,yes,yes,fiber_optic,no,no,no,no,yes,yes,month-to-month,yes,electronic_check,93.30,93.30


In [149]:
#geting the cus id of churners

In [158]:
df_val[churn_des][['contract','customerid']]

Unnamed: 0,contract,customerid
3,month-to-month,8433-wxgna
8,month-to-month,3440-jpscl
12,month-to-month,7228-omtpn
19,month-to-month,6711-fldfb
24,month-to-month,2612-ranwt
...,...,...
1397,month-to-month,5976-jcjrh
1398,month-to-month,2034-cgrhz
1399,month-to-month,5276-kqwhg
1407,month-to-month,6521-yytyi


In [None]:
#checking the accuracy of our model

In [161]:
churn_des.astype(int)

array([0, 0, 0, ..., 0, 1, 1])

In [160]:
y_val

array([0, 0, 0, ..., 0, 1, 1])

In [164]:
(churn_des.astype(int)==y_val).mean()

0.7998580553584103

In [166]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred_for_val
df_pred['prediction'] = churn_des.astype(int)
df_pred['actual'] = y_val

In [168]:
df_pred['final']=df_pred['prediction']==df_pred['actual']

In [169]:
df_pred

Unnamed: 0,probability,prediction,actual,final
0,0.007333,0,0,True
1,0.204027,0,0,True
2,0.222327,0,0,True
3,0.586562,1,1,True
4,0.219410,0,0,True
...,...,...,...,...
1404,0.304577,0,0,True
1405,0.040693,0,1,False
1406,0.150631,0,0,True
1407,0.787246,1,1,True


In [171]:
df_pred.final.mean()

0.7998580553584103

## MOdel intrepretattion

In [180]:
a=dv.get_feature_names()

In [181]:
b=model.coef_[0].round(3)

In [188]:
dict(zip(a,b))

{'contract=month-to-month': 0.619,
 'contract=one_year': -0.06,
 'contract=two_year': -0.681,
 'dependents=no': -0.006,
 'dependents=yes': -0.116,
 'deviceprotection=no': 0.053,
 'deviceprotection=no_internet_service': -0.103,
 'deviceprotection=yes': -0.073,
 'gender=female': -0.042,
 'gender=male': -0.08,
 'internetservice=dsl': -0.382,
 'internetservice=fiber_optic': 0.362,
 'internetservice=no': -0.103,
 'monthlycharges': 0.003,
 'multiplelines=no': -0.251,
 'multiplelines=no_phone_service': 0.138,
 'multiplelines=yes': -0.009,
 'onlinebackup=no': 0.058,
 'onlinebackup=no_internet_service': -0.103,
 'onlinebackup=yes': -0.077,
 'onlinesecurity=no': 0.219,
 'onlinesecurity=no_internet_service': -0.103,
 'onlinesecurity=yes': -0.238,
 'paperlessbilling=no': -0.251,
 'paperlessbilling=yes': 0.128,
 'partner=no': -0.127,
 'partner=yes': 0.005,
 'paymentmethod=bank_transfer_(automatic)': -0.1,
 'paymentmethod=credit_card_(automatic)': -0.048,
 'paymentmethod=electronic_check': 0.073,
 '

In [189]:
df_full_train

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.70,258.35,0
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.90,3160.55,1
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.40,2044.75,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,0781-lkxbr,male,1,no,no,9,yes,yes,fiber_optic,no,...,yes,no,yes,yes,month-to-month,yes,electronic_check,100.50,918.60,1
5630,3507-gasnp,male,0,no,yes,60,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.95,1189.90,0
5631,8868-wozgu,male,0,no,no,28,yes,yes,fiber_optic,no,...,yes,no,yes,yes,month-to-month,yes,electronic_check,105.70,2979.50,1
5632,1251-krreg,male,0,no,no,2,yes,yes,dsl,no,...,no,no,no,no,month-to-month,yes,mailed_check,54.40,114.10,1


In [192]:
y_full_train=df_full_train['churn']
y_full_train

0       0
1       1
2       0
3       0
4       0
       ..
5629    1
5630    0
5631    1
5632    1
5633    0
Name: churn, Length: 5634, dtype: int32

In [193]:
del df_full_train['churn']

In [194]:
df_full_train

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.70,258.35
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,yes,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.90,3160.55
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,yes,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,no,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,yes,no,yes,yes,no,one_year,no,electronic_check,70.40,2044.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,0781-lkxbr,male,1,no,no,9,yes,yes,fiber_optic,no,no,yes,no,yes,yes,month-to-month,yes,electronic_check,100.50,918.60
5630,3507-gasnp,male,0,no,yes,60,yes,no,no,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.95,1189.90
5631,8868-wozgu,male,0,no,no,28,yes,yes,fiber_optic,no,yes,yes,no,yes,yes,month-to-month,yes,electronic_check,105.70,2979.50
5632,1251-krreg,male,0,no,no,2,yes,yes,dsl,no,yes,no,no,no,no,month-to-month,yes,mailed_check,54.40,114.10


In [195]:
full_train_dicts=df_full_train[categorical+numerical].to_dict(orient='record')
dv=DictVectorizer(sparse=False)
dv.fit(full_train_dicts)
x_full_train=dv.transform(full_train_dicts)
x_full_train

array([[0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        1.20000e+01, 2.58350e+02],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        4.20000e+01, 3.16055e+03],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        7.10000e+01, 4.68175e+03],
       ...,
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.80000e+01, 2.97950e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.00000e+00, 1.14100e+02],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        1.60000e+01, 1.11485e+03]])

In [196]:
stop for abakpa juncton take keke to nowas enter nowas stop for nowas you go se zenith uba roban 

SyntaxError: invalid syntax (<ipython-input-196-a600b4b02069>, line 1)

In [197]:
model=LogisticRegression(solver='liblinear')
model.fit(x_full_train,y_full_train)

LogisticRegression(solver='liblinear')

In [200]:
dicts_test = df_test[categorical + numerical].to_dict(orient='records')
x_test = dv.transform(dicts_test)

In [199]:
y_test

array([0, 0, 0, ..., 0, 0, 1])

In [201]:
x_test

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        4.10000e+01, 3.32075e+03],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.60000e+01, 6.47185e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.20000e+01, 5.24350e+02],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        7.10000e+01, 3.88865e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        6.50000e+01, 5.68845e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.70000e+01, 1.74350e+03]])

In [206]:
y_pred_for_test = model.predict_proba(x_test)[:, 1]
y_pred_for_test

array([0.06281307, 0.10334356, 0.33781456, ..., 0.00681238, 0.18047403,
       0.64380376])

In [212]:
test_churn_des=(y_pred_for_test>=0.5).astype(int)

In [214]:
(test_churn_des==y_test).mean()

0.8090844570617459

## using our model

In [223]:
dicts_test[10]

{'gender': 'male',
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'no',
 'onlinebackup': 'yes',
 'deviceprotection': 'no',
 'techsupport': 'no',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'mailed_check',
 'monthlycharges': 93.95,
 'totalcharges': 2861.45,
 'tenure': 32}

In [218]:
custormer1_dict=dicts_test[10]

In [226]:
x_cust=dv.transform([custormer1_dict])
x_cust.shape

(1, 44)

In [228]:
model.predict_proba(x_cust)[:,1]

array([0.44404055])

In [229]:
y_test[10]

0