In [27]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt


In [28]:
df = pd.read_csv('Telco-Customer-Churn.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [29]:
df.dtypes


customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [30]:
# TotalCharges is an object type, we need to convert it to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [31]:
df['Churn']

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7043, dtype: object

In [32]:
# Churn is a categorical variable, we need to convert it to numeric
df['Churn'] = (df['Churn']=='Yes').astype(int)
df['Churn'].value_counts(normalize=True)

Churn
0    0.73463
1    0.26537
Name: proportion, dtype: float64

In [56]:
# Churn is a categorical variable, we need to convert it to numeric

df['seniorcitizen'] = df['seniorcitizen'].map({0: 'No', 1: 'Yes'})
df['seniorcitizen'].value_counts(normalize=True)

seniorcitizen
No     0.837853
Yes    0.162147
Name: proportion, dtype: float64

In [33]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [34]:
df.churn

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: churn, Length: 7043, dtype: int64

# EDA
  #Check missing values
 #Look at the target variable (churn)
 #Look at numerical and categorical variables

In [57]:
df.dtypes

customerid           object
gender               object
seniorcitizen        object
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [68]:
# Segment variables
numerical = df.select_dtypes(include = ['int','float']).columns.to_list()
                             
categorical_columns  = df.select_dtypes(include=['object']).columns.to_list()
numerical

['tenure', 'monthlycharges', 'totalcharges', 'churn']

In [70]:

categorical_columns 
#numerical.drop('churn')

['customerid',
 'gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']


# 3.3 Setting up the validation framework

In [37]:
from sklearn.model_selection import train_test_split
def train_test(df,test_ratio):
    df = df.copy()
    #df.drop('churn')
    df_train_full, df_test = train_test_split(df, test_size=test_ratio, random_state=1)

    return df_train_full, df_test 

In [44]:
df_train_full, df_test = train_test(df, 0.2)
df_train, df_val = train_test(df_train_full, 0.25)

In [45]:
len(df_train_full),len(df_test)


(5634, 1409)

In [46]:
len(df_train), len(df_val)

(4225, 1409)

In [50]:
df_train_full.head(2)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
1814,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,0
5946,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,1


In [49]:
df_train.head(2)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
3897,8015-ihcgw,female,0,yes,yes,72,yes,yes,fiber_optic,yes,...,yes,yes,yes,yes,two_year,yes,electronic_check,115.5,8425.15,0
1980,1960-uycnn,male,0,no,no,10,yes,yes,fiber_optic,no,...,yes,no,no,yes,month-to-month,yes,electronic_check,95.25,1021.55,0


In [52]:
# Reset the indexes 
df_train_full.reset_index(drop=True)
df_train.reset_index(drop=True)
df_val.reset_index(drop=True)  
df_test.reset_index(drop=True)       

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,8879-zkjof,female,0,no,no,41,yes,no,dsl,yes,...,yes,yes,yes,yes,one_year,yes,bank_transfer_(automatic),79.85,3320.75,0
1,0201-mibol,female,1,no,no,66,yes,yes,fiber_optic,yes,...,no,no,yes,yes,two_year,yes,bank_transfer_(automatic),102.40,6471.85,0
2,1600-dilpe,female,0,no,no,12,yes,no,dsl,no,...,no,no,no,no,month-to-month,yes,bank_transfer_(automatic),45.00,524.35,0
3,8601-qacrs,female,0,no,no,5,yes,yes,dsl,no,...,no,no,no,no,month-to-month,yes,mailed_check,50.60,249.95,1
4,7919-zodzz,female,0,yes,yes,10,yes,no,dsl,no,...,yes,no,no,yes,one_year,yes,mailed_check,65.90,660.05,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,5130-iekqt,male,1,no,no,25,yes,yes,fiber_optic,no,...,yes,no,yes,yes,month-to-month,no,mailed_check,105.95,2655.25,1
1405,4452-rohmo,female,0,no,no,15,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.60,331.60,0
1406,6164-haqtx,male,0,no,no,71,no,no_phone_service,dsl,yes,...,yes,yes,yes,no,two_year,no,bank_transfer_(automatic),53.95,3888.65,0
1407,3982-dqlus,male,1,yes,yes,65,yes,yes,fiber_optic,yes,...,no,no,no,no,month-to-month,yes,electronic_check,85.75,5688.45,0


In [None]:
# Target variables

y_train = df_train.churn.values

y_val = df_val.churn.values

y_test = df_test.churn.values

In [53]:
# Remove the target variable from the dataframes
df_train.drop('churn', axis=1, inplace=True)
df_val.drop('churn', axis=1, inplace=True)
df_test.drop('churn', axis=1, inplace=True)


In [75]:
categorical_columns = [
 'gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

In [76]:

df_train_full[categorical_columns].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [78]:
df_train_full.head(2).T

Unnamed: 0,1814,5946
customerid,5442-pptjy,6261-rcvns
gender,male,female
seniorcitizen,0,0
partner,yes,no
dependents,yes,no
tenure,12,42
phoneservice,yes,yes
multiplelines,no,no
internetservice,no,dsl
onlinesecurity,no_internet_service,yes


In [80]:
global_churn = df_train_full.churn.mean()
global_churn 

np.float64(0.26996805111821087)

In [87]:
def choose_features_by_churn_risk(diff,risk):
    """
    This function takes a DataFrame and a risk value, and returns the features that have a higher churn risk than the global churn risk.
    """
    base_features= []

    for col in categorical_columns:
        df_group = df.groupby(col)['churn'].agg(['mean','count'])
        df_group['diff'] = df_group['mean'] - global_churn
        df_group['risk'] = df_group['mean']/global_churn

        for val in df[col].value_counts().index:

            perc_diff = (df.groupby(df[col]==val)['churn'].mean() - global_churn) 
            perc_risk = (df.groupby(df[col]==val)['churn'].mean() - global_churn) / global_churn

            if perc_diff > diff or perc_risk > risk:
                base_features.append(col)
    return base_features

In [88]:
base = choose_features_by_churn_risk(0.10,40)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().