In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score, plot_roc_curve

In [45]:
# Load the raw customer churn data (path is relative to the working directory).
df = pd.read_csv('data/tele-churn.csv')
#df.info()

0. state: string: Code of the US State 
1. account length: numerical: Number of months the customer has been with the telco provider 
2. area code: string: Area code 
3. phone number: string: Customer’s phone number 
4. international plan: (yes/no): State of subscription of international plan 
5. voice mail plan: (yes/no): State of subscription of voice mail plan 
6. number vmail messages: numerical: Count of Voice mail messages 
7. total day minutes: numerical: Total minutes of day calls 
8. total day calls: numerical: Total number of day calls 
9. total day charge: numerical: Total charge of day calls 
10. total eve minutes: numerical: Total minutes of evening calls 
11. total eve calls: numerical: Total number of evening calls 
12. total eve charge: numerical: Total charge of evening calls 
13. total night minutes: numerical: Total minutes of night calls 
14. total night calls: numerical: Total number of night calls 
15. total night charge: numerical: Total charge of night calls 
16. total intl minutes: numerical: Total minutes of international calls 
17. total intl calls: numerical: Total number of international calls 
18. total intl charge: numerical: Total charge of international calls 
19. customer service calls: numerical: Number of calls to customer service 
20. churn: (True/False): Whether the customer churned or not 

### The target (`churn`) is imbalanced (far fewer churned than retained customers), so the modeling will need to adjust for it

In [5]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,account length,area code,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,101.064806,437.182418,8.09901,179.775098,100.435644,30.562307,200.980348,100.114311,17.08354,200.872037,100.107711,9.039325,10.237294,4.479448,2.764581,1.562856
std,39.822106,42.37129,13.688365,54.467389,20.069084,9.259435,50.713844,19.922625,4.310668,50.573847,19.568609,2.275873,2.79184,2.461214,0.753773,1.315491
min,1.0,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.2,33.0,1.04,0.0,0.0,0.0,0.0
25%,74.0,408.0,0.0,143.7,87.0,24.43,166.6,87.0,14.16,167.0,87.0,7.52,8.5,3.0,2.3,1.0
50%,101.0,415.0,0.0,179.4,101.0,30.5,201.4,100.0,17.12,201.2,100.0,9.05,10.3,4.0,2.78,1.0
75%,127.0,510.0,20.0,216.4,114.0,36.79,235.3,114.0,20.0,235.3,113.0,10.59,12.1,6.0,3.27,2.0
max,243.0,510.0,51.0,350.8,165.0,59.64,363.7,170.0,30.91,395.0,175.0,17.77,20.0,20.0,5.4,9.0


### State, area code, and phone number will not be used to evaluate churn; they are dropped further below, once area code has been one-hot encoded for the correlation check

In [46]:
# Area code is a label rather than a quantity (the data dictionary lists it as a
# string), so cast it from the numeric dtype it was read as.
df['area code'] = df['area code'].astype(str)

In [47]:
# Number of customers per area code.
df['area code'].value_counts()

415    1655
510     840
408     838
Name: area code, dtype: int64

In [48]:
# One-hot encode area code: the column is replaced by one indicator column per
# distinct code, named 'area code_<code>'.
df = pd.get_dummies(df, columns=['area code'])

In [11]:
# Number of customers with and without an international plan.
df['international plan'].value_counts()

no     3010
yes     323
Name: international plan, dtype: int64

In [12]:
# Number of customers with and without a voice mail plan.
df['voice mail plan'].value_counts()

no     2411
yes     922
Name: voice mail plan, dtype: int64

In [13]:
# Class balance of the target column.
df['churn'].value_counts()

False    2850
True      483
Name: churn, dtype: int64

In [49]:
# Encode the yes/no plan flags as 0/1 integers.
# Series.map turns every value missing from the mapping into NaN, so mapping a
# column that is already encoded (e.g. when this cell is run a second time)
# would wipe it out. Only map columns that still hold the raw labels.
YES_NO_CODES = {'no': 0, 'yes': 1}
for plan_col in ['international plan', 'voice mail plan']:
    if not pd.api.types.is_numeric_dtype(df[plan_col]):
        df[plan_col] = df[plan_col].map(YES_NO_CODES)

# The target holds True/False values; cast it to 1/0 (safe to repeat).
df['churn'] = df['churn'].astype(int)

In [35]:
# Count rows that are exact repeats of an earlier row.
int(df.duplicated().sum())

0

In [29]:
#df.corr()

In [36]:
# Checking for multicollinearity: rank every pair of distinct columns by the
# absolute value of their correlation.
def abs_correlation_pairs(frame):
    """Return the absolute pairwise correlations of ``frame`` as a ranked table.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to analyse. Only numeric and boolean columns are correlated; any
        other column (e.g. string identifiers) is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per unordered pair of distinct columns, indexed by a
        ``(column_a, column_b)`` tuple under the index name ``pairs``, with a
        single column ``cc`` holding the absolute correlation, sorted from
        strongest to weakest. Pairs whose correlation is undefined (NaN) are
        omitted.
    """
    # Select numeric/boolean columns explicitly: pandas >= 2.0 raises instead
    # of silently skipping string columns in DataFrame.corr().
    numeric = frame.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr().abs()

    # The matrix is symmetric and its diagonal holds self-correlations, so the
    # strictly upper triangle contains each distinct pair exactly once.
    # Selecting by position (instead of dropping duplicated values) cannot
    # discard two different pairs that happen to share the same correlation.
    upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    stacked = corr.where(upper).stack().dropna()

    # Flatten the (row, column) MultiIndex into a single index of tuples.
    pairs = pd.DataFrame(
        {'cc': stacked.to_numpy()},
        index=pd.Index(list(stacked.index), name='pairs', tupleize_cols=False),
    )
    return pairs.sort_values('cc', ascending=False)


df_mulcol = abs_correlation_pairs(df)

In [37]:
# Show only the strongly correlated pairs: above 0.5 but below a perfect 1.
is_strong = (df_mulcol['cc'] > .50) & (df_mulcol['cc'] < 1)
print(df_mulcol[is_strong])

                                                 cc
pairs                                              
(total day minutes, total day charge)      1.000000
(total eve charge, total eve minutes)      1.000000
(total night charge, total night minutes)  0.999999
(total intl minutes, total intl charge)    0.999993
(voice mail plan, number vmail messages)   0.956927
(area code_415, area code_510)             0.576476
(area code_415, area code_408)             0.575559


### Dropping the total minutes columns since they are redundant
- each total minutes column is almost perfectly correlated (cc ≈ 1.0) with its matching total charge column

### Dropping the area code dummy columns because of high multicollinearity
- also dropping state and phone number

In [51]:
# Columns excluded from modelling: the identifier columns, the one-hot area
# code indicators, and the minutes columns that are near-perfectly correlated
# with the matching charge columns.
unused_columns = [
    'state', 'phone number',
    'area code_408', 'area code_415', 'area code_510',
    'total day minutes', 'total eve minutes',
    'total night minutes', 'total intl minutes',
]
df.drop(columns=unused_columns, inplace=True)

In [52]:
# Check which columns remain, along with their dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   account length          3333 non-null   int64  
 1   international plan      3333 non-null   int64  
 2   voice mail plan         3333 non-null   int64  
 3   number vmail messages   3333 non-null   int64  
 4   total day calls         3333 non-null   int64  
 5   total day charge        3333 non-null   float64
 6   total eve calls         3333 non-null   int64  
 7   total eve charge        3333 non-null   float64
 8   total night calls       3333 non-null   int64  
 9   total night charge      3333 non-null   float64
 10  total intl calls        3333 non-null   int64  
 11  total intl charge       3333 non-null   float64
 12  customer service calls  3333 non-null   int64  
 13  churn                   3333 non-null   int32  
dtypes: float64(4), int32(1), int64(9)
memory