### Churn Prediction Project (Classification)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
import urllib.request

url = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"
file_path = "data/Telco-Customer-Churn.csv"

#urllib.request.urlretrieve(url, file_path)

In [3]:
df = pd.read_csv('data/Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [36]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [37]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
dtype: object

In [38]:
df.totalcharges

0         24.80
1        996.45
2       1031.70
3         76.35
4       3260.10
         ...   
1404    4378.80
1405    5686.40
1406     329.75
1407    2960.10
1408      31.35
Name: totalcharges, Length: 1409, dtype: float64

In [39]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')
df.totalcharges = df.totalcharges.fillna(0)

In [40]:
df.churn.head()

AttributeError: 'DataFrame' object has no attribute 'churn'

In [9]:
df.churn = (df.churn == 'yes').astype(int)

### Setting up the validation framework

In [41]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [42]:
len(df_full_train), len(df_test)

(1127, 282)

In [43]:
from sklearn.model_selection import train_test_split
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [44]:
len(df_train), len(df_val)

(845, 282)

In [45]:
y_train, y_val, y_test = None, None, None

for i in ['train', 'val', 'test']:
   df = globals()[f'df_{i}']
   #print(f'y_{i} = {globals().get(f"y_{i}", "not defined")}')

   y = globals()[f'y_{i}']
   
   df = df.reset_index(drop=True)
   
   y = df.churn.values

   df = df.drop(columns=['churn'])

   # Reassign the variables to the updated versions
   globals()[f'df_{i}'] = df
   globals()[f'y_{i}'] = y

AttributeError: 'DataFrame' object has no attribute 'churn'

In [46]:
globals()['df_train'].totalcharges

389     1785.65
627      308.25
1230     157.55
1265      24.40
524     1411.20
         ...   
420     6557.75
878       45.40
1082    5602.25
26      4016.85
1126    5878.90
Name: totalcharges, Length: 845, dtype: float64

### EDA

* Check missing values
* Look at the target variable (churn)
* Look at numerical and categorical variables

In [47]:
df_full_train.isnull().sum() 

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
dtype: int64

In [48]:
df_full_train.reset_index(drop=True, inplace=True)

In [49]:
df_full_train.churn.value_counts(normalize=True)

AttributeError: 'DataFrame' object has no attribute 'churn'

In [50]:
df_full_train.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
dtype: object

### Feature importance: Churn rate and risk ratio

In [51]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [52]:
categorical = categorical_columns.copy()
categorical.remove('customerid')
categorical.append('seniorcitizen')

In [53]:
categorical

['gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'seniorcitizen']

In [54]:
df_full_train[categorical].nunique()

gender              2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
seniorcitizen       2
dtype: int64