# EDA of Customer Churn Telco Data
## Author: Hristo Panov

## 00 Prerequisite and packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.api as sm
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

# Custom Imports
import sys
sys.path.insert(0, '..')

from utils.const_folders import folder_paths

%matplotlib inline

## 01 Read Data

In [3]:
# Read From CSV
df = pd.read_csv(
    f"{folder_paths.path}/WA_Fn-UseC_-Telco-Customer-Churn.csv"
    ,sep=","
    ,index_col=None
)

## 02 Data Audit

In [12]:
# Columns and Rows
print('The dataset contains {one} rows and {two} columns'.format(one=df.shape[0],two=df.shape[1]))
print('Those columns are {one}'.format(one=df.columns.to_list()))

The dataset contains 7043 rows and 21 columns
Those columns are ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [14]:
# Preview
df.sample(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
564,7319-VENRZ,Male,0,No,No,7,Yes,No,DSL,No,...,Yes,Yes,Yes,No,Month-to-month,No,Bank transfer (automatic),64.3,445.95,No
584,2393-DIVAI,Female,0,No,No,3,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Mailed check,20.0,61.7,No
2325,3097-PYWXL,Female,0,Yes,Yes,4,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Electronic check,19.95,82.9,No
220,9408-SSNVZ,Female,0,No,No,4,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.15,268.35,Yes
4525,6578-KRMAW,Male,0,No,No,32,Yes,No,Fiber optic,Yes,...,Yes,No,No,Yes,Month-to-month,Yes,Credit card (automatic),93.4,2979.3,No


In [16]:
#Check if the dataframe is unique on Customer Level - aka each row cointains information about one customer
df.shape[0] == len(df['customerID'].unique())

True

In [17]:
#Set the index of the data set to be equal to the customerID
df.set_index('customerID',inplace=True)

In [18]:
#Explore Feature types
df.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

## 03 Data Prep 

In [23]:
# Total Charges are expected to be numeric. Lets Convert them
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
# Check if the conversion is sucesfull
print('The datatype of {one} is {two}'.format(one="TotalCharges",two=df["TotalCharges"].dtypes ))

The datatype of TotalCharges is float64


In [20]:
print(df['TotalCharges'].dtypes)

float64
