In [1]:
## Importing libraries ##

# Data manipulation
import pandas as pd
import numpy as np

# Data visualisation
import seaborn as sns
import matplotlib as plt

In [2]:
# Loading the data
df = pd.read_csv('telco-customer-churn\WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Previewing data frame (.T to examine series in full)
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [3]:
## Preprocessing the data ##

# Cleaning series titles
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Cleaning string-type values
categorical = list(df.dtypes[df.dtypes == 'object'].index)
for i in categorical:
    df[i] = df[i].str.lower().str.replace(' ', '_')

# Converting target series to binary for future classification
df.churn = (df.churn == 'yes').astype(int)

# totalcharges is 'object' type but contains numerical data - want to convert to numeric type
pd.to_numeric(df.totalcharges, errors="coerce").isna().sum()

# Choose NA values to be 0 (not always the best option but works here)
df.totalcharges = pd.to_numeric(df.totalcharges, errors="coerce").fillna(0)

In [4]:
## Establishing Validation Framework - scikit-learn ##

# Importing function to shuffle and split dataset
from sklearn.model_selection import train_test_split

# Establishing 60/20/20 splits
train_val, test = train_test_split(df, test_size=0.2, random_state=1)
train, val = train_test_split(train_val, test_size=0.2/0.8, random_state=1)

# Feature matrices
train_val = train_val.reset_index(drop=True)
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

# Target vectors
y_train_val = train_val.churn
y_train = train.churn
y_val = val.churn
y_test = test.churn

# Removing target vectors from feature matrices
del train['churn']
del val['churn']
del test['churn']

In [5]:
## Exploratory Data Analysis (EDA) ##

# Global churn rate
global_churn = train_val.churn.value_counts(normalize=True)[1]

# Numerical series
numeric_ser = ['tenure', 'monthlycharges', 'totalcharges']

# Categorical series
categorical_ser = []
for i in df.columns:
    if i not in numeric_ser and i != 'customerid' and i != 'churn':
        categorical_ser.append(i)

### Key Definitions:

- **Difference** = `Group Churn Rate - Global Churn Rate`
- **Risk Ratio** =  `Group Churn Rate / Global Churn Rate`

These metrics can be used to quantify the importance of **categorical** variables. Similarly, the importance of numeric values can be quantified using correlations.

In [6]:
## Exploratory Data Analysis (EDA) Cont. ##

# Importing tool to iteratively display data in for loop
from IPython.display import display

for i in train_val[categorical_ser]:
    train_val_group = train_val.groupby(i).churn.agg(['mean', 'count'])

    train_val_group['diff'] = train_val_group['mean'] - train_val.churn.mean()
    train_val_group['risk'] = train_val_group['mean'] / train_val.churn.mean()

    display(train_val_group)
    print()

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006856,1.025396
male,0.263214,2838,-0.006755,0.97498





Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.027698,0.897403
1,0.413377,912,0.143409,1.531208





Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,0.059841,1.221659
yes,0.205033,2702,-0.064935,0.759472





Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,0.043792,1.162212
yes,0.165666,1666,-0.104302,0.613651





Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,-0.028652,0.89387
yes,0.273049,5087,0.003081,1.011412





Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,-0.012561,0.953474
no_phone_service,0.241316,547,-0.028652,0.89387
yes,0.290742,2387,0.020773,1.076948





Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,-0.077621,0.712482
fiber_optic,0.425171,2479,0.155203,1.574895
no,0.077805,1221,-0.192163,0.288201





Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,0.150953,1.559152
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.153226,1612,-0.116742,0.56757





Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,0.134355,1.497672
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.217232,1915,-0.052736,0.80466





Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,0.125907,1.466379
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.230412,1940,-0.039556,0.85348





Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,0.148946,1.551717
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.159926,1632,-0.110042,0.59239





Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,0.072864,1.269897
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.302723,2167,0.032755,1.121328





Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,0.068938,1.255358
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.307273,2200,0.037305,1.138182





Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,0.161733,1.599082
one_year,0.120573,1186,-0.149395,0.446621
two_year,0.028274,1344,-0.241694,0.10473





Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,-0.097897,0.637375
yes,0.338151,3321,0.068183,1.25256





Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,-0.101797,0.622928
credit_card_(automatic),0.164339,1217,-0.10563,0.608733
electronic_check,0.45589,1893,0.185922,1.688682
mailed_check,0.19387,1305,-0.076098,0.718121





Additionally, Mutual Information is a method of comparing the importance of entire variables. This enables us to compare whole variables to each other.

In [7]:
## Exploratory Data Analysis (EDA) Cont. ##

# Importing function for calculation mutual information score
from sklearn.metrics import mutual_info_score

# Function to calculating MI scores for each series (compared to churn)
def mutual_info_churn_score(series):
    return mutual_info_score(series, train_val.churn)

train_val[categorical_ser].apply(mutual_info_churn_score).sort_values(ascending=False)[:5]

contract           0.098320
onlinesecurity     0.063085
techsupport        0.061032
internetservice    0.055868
onlinebackup       0.046923
dtype: float64

> The top 5 most important categorical series (in their ability to affect `churn`) are `contract`, `onlinesecurity`, `techsupport`, `internetservice` and `onlinebackup`.

In [8]:
## Exploratory Data Analysis (EDA) Cont. ##

# Calculating correlation between numeric type series and churn
train_val[numeric_ser].corrwith(train_val.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

As `tenure` and `totalcharges` increase `churn` decreases. On the contrary, as `monthlycharges` increases `churn` also increases.

In [9]:
## Exploratory Data Analysis (EDA) Cont. ##

# Check the churn rate under specific conditions (tenure between 2 months and 12 months)
train_val[(train_val.tenure > 2) & (train_val.tenure <= 12)].churn.mean()

0.3994413407821229

We can encode categorical variables using scikit-learn's `DictVectorization()` by first converting our feature matrix to a dictionary and then calling the function. This is known as One-Hot encoding.

In [10]:
## One-Hot encoding ##

# Importing function for dictionary vectorization
from sklearn.feature_extraction import DictVectorizer

# Converting training set to dictionary
train_dicts = train[categorical_ser + numeric_ser].to_dict(orient="records")
val_dicts = val[categorical_ser + numeric_ser].to_dict(orient="records")

dv = DictVectorizer(sparse=False)

# Feature matrices
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [11]:
## Logistic Regression (converts linear regression values into probabilities using sigmoid) ##

# Importing logistic regression function
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

# Training the model
model.fit(X_train, y_train)

# Soft predictions
y_train_pred = model.predict_proba(X_train)[:,1]
y_val_pred   = model.predict_proba(X_val)[:,1]

# Classification framework for probabilities
churn_decision_train = (y_train_pred >= 0.5).astype(int)
churn_decision_val   = (y_val_pred >= 0.5).astype(int)

# Accuracy of model on training set
score_train = (churn_decision_train == y_train).mean()
score_val =   (churn_decision_val == y_val).mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Now want to perform training on extended train set (train + val) - must first del .churn
del train_val['churn']

In [13]:
# Converting splits to dictionaries
train_val_dicts = train_val[categorical_ser + numeric_ser].to_dict(orient="records")
test_dicts = test[categorical_ser + numeric_ser].to_dict(orient="records")

# Defining dictionary vectoriser
dv = DictVectorizer(sparse=False)

# Defining feature matrices
X_train_val = dv.fit_transform(train_val_dicts)
X_test = dv.transform(test_dicts)

# Defining model
model = LogisticRegression()

# Training model on extended dataset
model.fit(X_train_val, y_train_val)

# Predicted test target vector
y_test_pred = model.predict_proba(X_test)[:,1]

# Classifying probabilities into binary outcomes
churn_decision_test = (y_test_pred >= 0.5).astype(int)

# Model score
score_test = (churn_decision_test == y_test).mean()

# Comparing different scores
score_train, score_val, score_test

(0.805207100591716, 0.8034066713981547, 0.815471965933286)

All scores are relatively similar which suggests our model is well-fitting and hasn't overfit or gone wrong anywhere.

In [14]:
## Using the model ##

rand = np.random.randint(0, 200)

customer = dv.transform(test_dicts[rand])
if model.predict_proba(customer)[:,1] >= 0.5:
    print('Model estimate: 1')
else:
    print('Model estimate: 0')

print('True value:    ', y_test[rand])

Model estimate: 1
True value:     1
