<h1>Initial EDA for telecom data</h1>

In [81]:
import os
import sys

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.dummy import DummyClassifier

import matplotlib.pyplot as plt
import seaborn as sns

import psycopg2 as pg

Read the data file

In [82]:
# Load the training split and discard the leftover 'Unnamed: 0' column
# (presumably an index that was written out when the CSV was created).
train_df = pd.read_csv('../data/telecom_train.csv').drop(labels='Unnamed: 0', axis='columns')

In [83]:
# Preview the first rows of the freshly loaded training data.
train_df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7242-QZLXF,Male,0,No,Yes,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,25.3,25.3,Yes
1,8325-QRPZR,Female,0,No,No,7,Yes,No,Fiber optic,No,...,Yes,No,No,No,Month-to-month,Yes,Electronic check,75.15,525.0,No
2,7874-ECPQJ,Female,0,No,Yes,4,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Bank transfer (automatic),20.05,85.5,No
3,2476-YGEFM,Female,0,No,No,29,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Credit card (automatic),76.0,2215.25,No
4,2276-YDAVZ,Female,0,No,No,3,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Credit card (automatic),75.1,270.7,Yes


In [84]:
# Summarise column dtypes and non-null counts to spot columns that were
# read with an unexpected type.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5282 entries, 0 to 5281
Data columns (total 21 columns):
customerID          5282 non-null object
gender              5282 non-null object
SeniorCitizen       5282 non-null int64
Partner             5282 non-null object
Dependents          5282 non-null object
tenure              5282 non-null int64
PhoneService        5282 non-null object
MultipleLines       5282 non-null object
InternetService     5282 non-null object
OnlineSecurity      5282 non-null object
OnlineBackup        5282 non-null object
DeviceProtection    5282 non-null object
TechSupport         5282 non-null object
StreamingTV         5282 non-null object
StreamingMovies     5282 non-null object
Contract            5282 non-null object
PaperlessBilling    5282 non-null object
PaymentMethod       5282 non-null object
MonthlyCharges      5282 non-null float64
TotalCharges        5282 non-null object
Churn               5282 non-null object
dtypes: float64(1), int64(2), obj

In [85]:
# Inspect the raw TotalCharges values: the column was loaded with object
# dtype, i.e. the charges are stored as strings rather than floats.
train_df['TotalCharges'].values

array(['25.3', '525', '85.5', ..., '306.05', '1200.15', '457.3'],
      dtype=object)

In [86]:
# Convert the total charges to a number.
# TotalCharges is loaded as strings (object dtype). Entries that cannot be
# parsed as a number are coerced to NaN and then set to 0.0, which keeps the
# previous fallback value without a bare `except:` that would also swallow
# unrelated errors (e.g. KeyboardInterrupt).
train_df['TotalCharges'] = (
    pd.to_numeric(train_df['TotalCharges'], errors='coerce')
    .fillna(0.0)
    .astype(float)  # always float, even if every value parses as an integer
)

Convert the Yes/No columns to 1/0 (Yes = 1, No = 0)

In [87]:
# Encode the binary answers as integers: 'Yes' -> 1, 'No' -> 0.
# The replacement is applied to the whole frame, not just Dependents, so a
# 'No' in any column (InternetService included) becomes the integer 0.
train_df = train_df.replace({'Yes': 1, 'No': 0})

In [88]:
# Treat "no service" answers the same as a plain 'No': both map to 0.
train_df = train_df.replace(['No internet service', 'No phone service'], 0)

In [89]:
# Re-check the first rows after the Yes/No replacements.
train_df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7242-QZLXF,Male,0,0,1,1,0,0,DSL,0,...,0,0,0,0,Month-to-month,1,Electronic check,25.3,25.3,1
1,8325-QRPZR,Female,0,0,0,7,1,0,Fiber optic,0,...,1,0,0,0,Month-to-month,1,Electronic check,75.15,525.0,0
2,7874-ECPQJ,Female,0,0,1,4,1,0,0,0,...,0,0,0,0,Month-to-month,1,Bank transfer (automatic),20.05,85.5,0
3,2476-YGEFM,Female,0,0,0,29,1,1,Fiber optic,0,...,0,0,0,0,Month-to-month,1,Credit card (automatic),76.0,2215.25,0
4,2276-YDAVZ,Female,0,0,0,3,1,1,Fiber optic,0,...,0,0,0,0,Month-to-month,1,Credit card (automatic),75.1,270.7,1


Create dummies for each of the categorical features

In [90]:
# Build one indicator frame per categorical column, in the order
# Contract, InternetService, gender.
contract_dummies, internet_dummies, gender_dummies = (
    pd.get_dummies(train_df[column_name])
    for column_name in ('Contract', 'InternetService', 'gender')
)

Add the dummy columns to the data frame, then save the new clean data frame

In [91]:
# Attach every indicator column to the main frame under its category label.
for dummy_frame in (contract_dummies, internet_dummies, gender_dummies):
    train_df[dummy_frame.columns] = dummy_frame

In [92]:
# Re-inspect the dtypes and the column count after adding the dummy columns.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5282 entries, 0 to 5281
Data columns (total 29 columns):
customerID          5282 non-null object
gender              5282 non-null object
SeniorCitizen       5282 non-null int64
Partner             5282 non-null int64
Dependents          5282 non-null int64
tenure              5282 non-null int64
PhoneService        5282 non-null int64
MultipleLines       5282 non-null int64
InternetService     5282 non-null object
OnlineSecurity      5282 non-null int64
OnlineBackup        5282 non-null int64
DeviceProtection    5282 non-null int64
TechSupport         5282 non-null int64
StreamingTV         5282 non-null int64
StreamingMovies     5282 non-null int64
Contract            5282 non-null object
PaperlessBilling    5282 non-null int64
PaymentMethod       5282 non-null object
MonthlyCharges      5282 non-null float64
TotalCharges        5282 non-null float64
Churn               5282 non-null int64
Month-to-month      5282 non-null uint8
One 

In [93]:
# Drop one indicator per encoded feature so the remaining dummies are not
# redundant. The integer label 0 is the InternetService level that used to be
# 'No' before the frame-wide 'No' -> 0 replacement.
train_df.drop(['Two year', 0, 'Male'], axis=1, inplace=True)

In [94]:
# The original categorical columns are now represented by their dummies,
# so remove them from the frame.
encoded_source_columns = ['Contract', 'gender', 'InternetService']
train_df.drop(encoded_source_columns, axis='columns', inplace=True)

In [95]:
# Standardise the two charge columns with a scaler fitted on this training frame.
columns_to_scale = ['MonthlyCharges', 'TotalCharges']
std = StandardScaler()
scaled_charges = std.fit_transform(train_df[columns_to_scale])
train_df[columns_to_scale] = scaled_charges

In [96]:
# Persist the cleaned frame. The row index is written as well (the default),
# so the file starts with an unnamed index column.
train_df.to_csv('../data/churn_train_clean.csv', index=True)

Data Analysis

In [97]:
# Count how many customers fall into each Churn class (0 = stayed, 1 = churned).
train_df.groupby('Churn')['Churn'].agg('count')

Churn
0    3892
1    1390
Name: Churn, dtype: int64

Data Summary:
* Churn is not hugely imbalanced: 3892 customers stayed and 1390 churned (roughly 74% vs. 26%)
* Many categorical features are binary
* One-hot encoding has been applied to the categorical features Contract, InternetService and gender; PaymentMethod has multiple groups but is still a raw string column
    