<h1>Initial EDA for telecom data</h1>

In [1]:
import os
import sys

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.dummy import DummyClassifier

import matplotlib.pyplot as plt
import seaborn as sns

import psycopg2 as pg

Read the data file

In [2]:
# Load the training data and discard the unnamed leading column that the CSV
# carries (read_csv labels it 'Unnamed: 0').
train_df = (
    pd.read_csv('../data/telecom_train.csv')
    .drop(columns=['Unnamed: 0'])
)

In [3]:
# Preview the first five rows (five is also the default for head()).
train_df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7242-QZLXF,Male,0,No,Yes,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,25.3,25.3,Yes
1,8325-QRPZR,Female,0,No,No,7,Yes,No,Fiber optic,No,...,Yes,No,No,No,Month-to-month,Yes,Electronic check,75.15,525.0,No
2,7874-ECPQJ,Female,0,No,Yes,4,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Bank transfer (automatic),20.05,85.5,No
3,2476-YGEFM,Female,0,No,No,29,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Credit card (automatic),76.0,2215.25,No
4,2276-YDAVZ,Female,0,No,No,3,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Credit card (automatic),75.1,270.7,Yes


In [4]:
# Print each column's dtype and non-null count.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5282 entries, 0 to 5281
Data columns (total 21 columns):
customerID          5282 non-null object
gender              5282 non-null object
SeniorCitizen       5282 non-null int64
Partner             5282 non-null object
Dependents          5282 non-null object
tenure              5282 non-null int64
PhoneService        5282 non-null object
MultipleLines       5282 non-null object
InternetService     5282 non-null object
OnlineSecurity      5282 non-null object
OnlineBackup        5282 non-null object
DeviceProtection    5282 non-null object
TechSupport         5282 non-null object
StreamingTV         5282 non-null object
StreamingMovies     5282 non-null object
Contract            5282 non-null object
PaperlessBilling    5282 non-null object
PaymentMethod       5282 non-null object
MonthlyCharges      5282 non-null float64
TotalCharges        5282 non-null object
Churn               5282 non-null object
dtypes: float64(1), int64(2), obj

In [5]:
# Look at the raw TotalCharges values: the column holds strings, not numbers.
train_df.loc[:, 'TotalCharges'].values

array(['25.3', '525', '85.5', ..., '306.05', '1200.15', '457.3'],
      dtype=object)

In [6]:
# Convert the total charges to a number
def _to_float_or_zero(value):
    """Return float(value), or 0.0 when value cannot be parsed as a float."""
    try:
        return float(value)
    except (TypeError, ValueError):
        # Only parsing failures are swallowed (a bare ``except`` would also
        # hide KeyboardInterrupt and genuine bugs). Unparseable entries are
        # presumably blank strings -- TODO confirm against the raw data.
        return 0.0


# .astype pins the dtype to float64 even if the column happens to be empty.
train_df['TotalCharges'] = (
    train_df['TotalCharges'].map(_to_float_or_zero).astype('float64')
)

Convert the Yes/No columns to 1/0. Any value other than 'Yes' (including 'No phone service' and 'No internet service') becomes 0.

In [7]:
# Convert Dependents to 1 = 'Yes', 0 = anything else.
# Vectorized comparison instead of one .loc scalar lookup per row.
train_df['Dependents'] = train_df['Dependents'].eq('Yes').astype('int64')

In [8]:
# Partner: 1 = 'Yes', 0 = anything else (vectorized, no per-row .loc lookup).
train_df['Partner'] = train_df['Partner'].eq('Yes').astype('int64')

In [9]:
# PhoneService: 1 = 'Yes', 0 = anything else (vectorized, no per-row .loc lookup).
train_df['PhoneService'] = train_df['PhoneService'].eq('Yes').astype('int64')

In [10]:
# MultipleLines: 1 = 'Yes', 0 = anything else, so 'No phone service' also maps to 0.
train_df['MultipleLines'] = train_df['MultipleLines'].eq('Yes').astype('int64')

In [11]:
# OnlineSecurity: 1 = 'Yes', 0 = anything else, so 'No internet service' also maps to 0.
train_df['OnlineSecurity'] = train_df['OnlineSecurity'].eq('Yes').astype('int64')

In [12]:
# OnlineBackup: 1 = 'Yes', 0 = anything else (vectorized, no per-row .loc lookup).
train_df['OnlineBackup'] = train_df['OnlineBackup'].eq('Yes').astype('int64')

In [13]:
# DeviceProtection: 1 = 'Yes', 0 = anything else, so 'No internet service' also maps to 0.
train_df['DeviceProtection'] = train_df['DeviceProtection'].eq('Yes').astype('int64')

In [14]:
# TechSupport: 1 = 'Yes', 0 = anything else, so 'No internet service' also maps to 0.
train_df['TechSupport'] = train_df['TechSupport'].eq('Yes').astype('int64')

In [15]:
# StreamingTV: 1 = 'Yes', 0 = anything else, so 'No internet service' also maps to 0.
train_df['StreamingTV'] = train_df['StreamingTV'].eq('Yes').astype('int64')

In [16]:
# StreamingMovies: 1 = 'Yes', 0 = anything else, so 'No internet service' also maps to 0.
train_df['StreamingMovies'] = train_df['StreamingMovies'].eq('Yes').astype('int64')

In [17]:
# PaperlessBilling: 1 = 'Yes', 0 = anything else (vectorized, no per-row .loc lookup).
train_df['PaperlessBilling'] = train_df['PaperlessBilling'].eq('Yes').astype('int64')

Create dummies for each of the categorical features

In [19]:
# One-hot encode the categorical features; each dummy column is named after a
# category value. The dtype is pinned to uint8 so the indicators are 0/1
# integers on every pandas version (pandas >= 2.0 otherwise returns booleans,
# which would be written to the CSV as True/False).
contract_dummies = pd.get_dummies(train_df['Contract'], dtype=np.uint8)
internet_dummies = pd.get_dummies(train_df['InternetService'], dtype=np.uint8)
gender_dummies = pd.get_dummies(train_df['gender'], dtype=np.uint8)

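Add the dummy columns to the data frame, drop the 'Two year', 'DSL' and 'Male' dummy columns and the original categorical columns, then save the new clean data frame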

In [20]:
# Attach every indicator column to train_df, one dummy frame at a time.
for dummy_frame in (contract_dummies, internet_dummies, gender_dummies):
    train_df[dummy_frame.columns] = dummy_frame

In [None]:
# Remove these three indicator columns in turn, in the same order as before.
for dropped_level in ('Two year', 'DSL', 'Male'):
    train_df.drop(columns=dropped_level, inplace=True)

In [23]:
# The original string columns are no longer needed once their indicator
# columns have been added.
encoded_sources = ['Contract', 'gender', 'InternetService']
train_df.drop(columns=encoded_sources, inplace=True)

In [24]:
# Persist the cleaned frame. index=True is the pandas default, spelled out
# here: the row index is written as an unnamed first column, which read_csv
# later surfaces as 'Unnamed: 0'.
train_df.to_csv('../data/churn_train_clean.csv', index=True)

Data Analysis

In [25]:
# Number of customers in each Churn class.
churn_groups = train_df.groupby('Churn')
churn_groups['Churn'].count()

Churn
No     3892
Yes    1390
Name: Churn, dtype: int64

Data Summary:
* Churn is not hugely imbalanced: 1,390 of 5,282 customers (about 26%) churned
* Many categorical features are binary
* One-hot encoding has been applied to the categorical features that have multiple groups (Contract, InternetService), as well as to gender
    