# Exploratory Data Analysis

## Data preprocessing

In [7]:
import numpy as np
import pandas as pd

In [8]:
%matplotlib inline

import seaborn as sns
import matplotlib.pyplot as plt

# Use the white seaborn-like matplotlib style. Matplotlib 3.6 renamed the
# bundled seaborn styles to 'seaborn-v0_8-*' and later releases dropped the
# old names, so pick whichever spelling this installation provides.
_white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(_white_style)

# Shared palette reused by the plots in this notebook
colors = ['maroon', '#d98047', '#d94758']

In [10]:
# Read the raw Telco customer-churn CSV and preview its first rows
csv_path = '../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
data = pd.read_csv(csv_path)
data.head()

In [12]:
# Work on an independent (deep) copy so the raw frame `data` stays untouched
df = data.copy(deep=True)

In [17]:
# Print column dtypes and non-null counts of the working copy
df.info()

In [14]:
# Change data types.
# TotalCharges is parsed as a number; entries that cannot be parsed become NaN
# (errors='coerce'). SeniorCitizen is treated as a categorical variable.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
dtype_overrides = {'SeniorCitizen': 'category'}
df = df.astype(dtype_overrides)

In [19]:
# Re-check column dtypes and non-null counts after the dtype conversion
df.info()

In [16]:
# Number of missing values per column
df.isna().sum()

In [17]:
# Remove missing values: drop every row that has at least one NaN
df = df.dropna(axis=0, how='any')

In [19]:
# Convert the target variable from strings to numbers (Yes -> 1, No -> 0).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `df['Churn']` is chained assignment, which warns on recent pandas and
# leaves `df` unchanged once Copy-on-Write is active.
# The mapping also accepts already-encoded 0/1 so re-running the cell is safe;
# any other value becomes NaN and makes astype(int) fail loudly.
churn_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
df['Churn'] = df['Churn'].map(churn_codes).astype(int)

In [12]:
# Show all column names as an array
df.columns.values

In [20]:
# Change the column order: ID and target first, then demographics, account
# information and finally the subscribed services.
column_order = [
    'customerID', 'Churn',
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'tenure', 'MonthlyCharges', 'TotalCharges',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]
df = df[column_order]

In [9]:
# Preview the first rows of the reordered frame
df.head()

In [21]:
import pandas_profiling as pp

# Build a profiling report for the cleaned frame and write it to an HTML file
report_title = "Pandas Profiling Report"
profile = pp.ProfileReport(df, title=report_title)
profile.to_file("ThisReport.html")

In [11]:
#了解各變項相關性

import phik
from phik.report import plot_correlation_matrix
from phik import report

In [12]:
# phik correlation matrix over every column except the first one (customerID)
columns_without_id = df.iloc[:, 1:]
phik_overview = columns_without_id.phik_matrix()
phik_overview.round(decimals=2)

In [17]:
# Bar chart of each variable's phik correlation with Churn, strongest first.
# The first sorted entry is skipped (presumably Churn's correlation with itself).
churn_phik = phik_overview['Churn'].sort_values(ascending=False)
churn_phik.iloc[1:].plot(kind='bar', color='#d94758', rot=82, title='Phik correlation')

In [33]:
# Heat map of the full phik correlation matrix
heatmap_options = dict(
    x_labels=phik_overview.columns,
    y_labels=phik_overview.index,
    identity_layout=False,
    vmin=0,
    vmax=1,
    color_map="YlOrRd",
    title=r"correlation $\phi_K$",
    fontsize_factor=1.5,
    figsize=(20, 16),
)
plot_correlation_matrix(phik_overview.values, **heatmap_options)
plt.tight_layout()

In [35]:
# Customer churn rate: percentage of customers per Churn value, as a pie chart
churn_share = df['Churn'].value_counts() * 100.0 / len(df)

# value_counts() orders by frequency, so the wedge labels are derived from the
# index instead of being hard-coded; this keeps them correct if the class
# balance ever flips. Unknown codes fall back to their raw value.
churn_names = {0: 'No', 1: 'Yes'}
churn_labels = [churn_names.get(code, code) for code in churn_share.index]

ax = churn_share.plot.pie(autopct='%.1f%%', labels=churn_labels, figsize=(5, 5),
                          fontsize=12, colors=colors)
ax.set_title('Churn Rate', size=16)
ax.set_ylabel('')

In [10]:
# Contract: one churn pie per contract type
contract_churn = df.groupby(['Churn', 'Contract'])

# Rows = Churn value, columns = contract type; each column becomes one pie
contract_counts = contract_churn.size().unstack()
ax = contract_counts.plot.pie(autopct='%1.1f%%', labels=['', ''], stacked=True,
                              subplots=True, legend=False, figsize=(16, 16),
                              colors=colors)
ax[0].legend(loc=2, labels=['Churn:No', 'Churn:Yes'])

# Titles are listed in the column order assumed by this cell - verify against
# contract_counts.columns if the data changes
for pie_ax, pie_title in zip(ax, ['Month-to-month', 'One year', 'Two year']):
    pie_ax.set_title(pie_title)
    pie_ax.set_ylabel('')

In [75]:
# Tenure: distribution of tenure for each Churn value
ax = sns.boxplot(data=df, x='Churn', y='tenure')
# Relabel the two box positions with readable names
ax.set_xticks([0, 1], labels=['No', 'Yes'])

In [23]:
# Bucket tenure into 'new' (0 < tenure <= 10) and 'regular' (10 < tenure <= 72);
# pd.cut intervals are right-closed by default.
tenure_edges = [0, 10, 72]
tenure_names = ['new', 'regular']
df['tenure_group'] = pd.cut(df['tenure'], bins=tenure_edges, labels=tenure_names)

In [24]:
# Keep only the customers in the 'new' tenure group
is_new_client = df['tenure_group'] == 'new'
df_NewClient = df[is_new_client]
df_NewClient.head()

In [26]:
# Balance the new-client subset: keep all churned rows and draw an equally
# sized random sample (fixed seed) of non-churned rows.
churned_mask = df_NewClient['Churn'] == 1
retained_mask = df_NewClient['Churn'] == 0

df_Churn = df_NewClient[churned_mask]
df_NoChurn = df_NewClient[retained_mask].sample(n=len(df_Churn), random_state=0)

df_NewClientChurn = pd.concat([df_Churn, df_NoChurn], axis=0)
print(df_NewClientChurn['Churn'].value_counts())

In [18]:
from scipy import stats

services = ['PhoneService','MultipleLines','InternetService','OnlineSecurity',
           'OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']

# Significance level; constant across services, so defined once outside the loop
alpha = 0.05

# For each service: contingency table vs. Churn, chi-square test of
# independence, Cramer's V effect size, and a count plot.
for item in services:

    print('--------' + item + '--------')

    # Rows = categories of the service, columns = Churn values
    # (assumes exactly two Churn values, ordered 0 then 1)
    Thisdf_crosstab = pd.crosstab(index=df_NewClientChurn[item], columns=df_NewClientChurn['Churn'])
    Thisdf_crosstab.columns=['Churn:No', 'Churn:Yes']
    print(Thisdf_crosstab, '\n')

    print('----Chi-square test----')
    chi2, p, dof, expected = stats.chi2_contingency(Thisdf_crosstab)
    # %.3g keeps very small p-values visible instead of rounding them to 0.00
    print('chi-square=%.2f, df=%d, p=%.3g' %(chi2, dof, p))
    if p >= alpha:
        print('H0 is accepted')
    else:
        print('H0 is rejected')

    # Cramer's V = sqrt((chi2 / n) / (min(rows, cols) - 1))
    crosstab_arr=np.array(Thisdf_crosstab)
    obs=np.sum(crosstab_arr)
    minDim=min(crosstab_arr.shape)-1
    # A table with a single row or column has no association to measure;
    # report NaN instead of dividing by zero
    V=np.sqrt((chi2/obs)/minDim) if minDim > 0 else np.nan
    print('Cramer\'s V=%.2f' %(V))
    print('\n')

    # Count plot of the service categories split by Churn
    fig, ax=plt.subplots(1, 1)
    sns.countplot(data=df_NewClientChurn, x=item, hue='Churn', palette=colors, dodge=True, alpha=1, ax=ax)
    ax.legend(title='Churn', loc=0, labels=['Churn:No', 'Churn:Yes'])
    plt.show()
    # Release the figure so figures do not accumulate across iterations
    plt.close(fig)

In [28]:
services = ['PhoneService','MultipleLines','InternetService','OnlineSecurity',
           'OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20,20))

# One count plot per service on a 3x3 grid, filled column by column:
# services 0-2 go down column 0, 3-5 down column 1, 6-8 down column 2.
for i, item in enumerate(services):
    grid_col, grid_row = divmod(i, 3)
    panel = axes[grid_row, grid_col]
    sns.countplot(data=df_NewClientChurn, x=item, hue='Churn', palette=colors,
                  dodge=True, alpha=1, ax=panel)
    panel.legend(title='Churn', loc=0, labels=['Churn:No', 'Churn:Yes'])

In [19]:
# PaymentMethod: group customers by churn status and payment method
payment_keys = ['Churn', 'PaymentMethod']
PaymentMethod_churn = df.groupby(by=payment_keys)

In [28]:
# Show the distinct payment-method values present in the data
df['PaymentMethod'].unique()

In [34]:
# One churn pie per payment method (rows = Churn value, columns = payment method)
payment_counts = PaymentMethod_churn.size().unstack()
ax = payment_counts.plot.pie(autopct='%1.1f%%', labels=['', ''], stacked=True,
                             subplots=True, legend=False, figsize=(24, 8),
                             colors=colors)
ax[0].legend(loc=2, labels=['Churn:No', 'Churn:Yes'])

# Titles in the column order assumed by this cell
PaymentMethod_list = ['Bank transfer (automatic)', 'Credit card (automatic)', 'Electronic check', 'Mailed check']
for pie_ax, pie_title in zip(ax, PaymentMethod_list):
    pie_ax.set_title(pie_title)
    pie_ax.set_ylabel('')

In [31]:
# PaperlessBilling: group customers by churn status and paperless-billing flag
paperless_keys = ['Churn', 'PaperlessBilling']
PaperlessBilling_churn = df.groupby(by=paperless_keys)

In [35]:
# One churn pie per PaperlessBilling value (rows = Churn value, columns = flag)
paperless_counts = PaperlessBilling_churn.size().unstack()
ax = paperless_counts.plot.pie(autopct='%1.1f%%', labels=['', ''], stacked=True,
                               subplots=True, legend=False, figsize=(16, 8),
                               colors=colors)
ax[0].legend(loc=2, labels=['Churn:No', 'Churn:Yes'])

# Titles in the column order assumed by this cell
PaperlessBilling_list = ['PaperlessBilling:No', 'PaperlessBilling:Yes']
for pie_ax, pie_title in zip(ax, PaperlessBilling_list):
    pie_ax.set_title(pie_title)
    pie_ax.set_ylabel('')

In [36]:
# Dependents: group customers by churn status and dependents flag
dependents_keys = ['Churn', 'Dependents']
Dependents_churn = df.groupby(by=dependents_keys)

In [38]:
# One churn pie per Dependents value (rows = Churn value, columns = flag)
dependents_counts = Dependents_churn.size().unstack()
ax = dependents_counts.plot.pie(autopct='%1.1f%%', labels=['', ''], stacked=True,
                                subplots=True, legend=False, figsize=(16, 8),
                                colors=colors)
ax[0].legend(loc=2, labels=['Churn:No', 'Churn:Yes'])

# Titles in the column order assumed by this cell
Dependents_list = ['Dependents:No', 'Dependents:Yes']
for pie_ax, pie_title in zip(ax, Dependents_list):
    pie_ax.set_title(pie_title)
    pie_ax.set_ylabel('')

In [19]:
# Seniority: group customers by churn status and senior-citizen flag
seniority_keys = ['Churn', 'SeniorCitizen']
seniority_churn = df.groupby(by=seniority_keys)

In [82]:
# One churn pie per SeniorCitizen value (rows = Churn value, columns = flag)
seniority_counts = seniority_churn.size().unstack()
ax = seniority_counts.plot.pie(autopct='%1.1f%%', labels=['', ''], stacked=True,
                               subplots=True, legend=False, figsize=(16, 16),
                               colors=colors)
ax[0].legend(loc=2, labels=['Churn:No', 'Churn:Yes'])

# Titles in the column order assumed by this cell
for pie_ax, pie_title in zip(ax, ['seniority:No', 'seniority:Yes']):
    pie_ax.set_title(pie_title)
    pie_ax.set_ylabel('')

In [17]:
# MonthlyCharges: density of monthly charges for retained vs. churned customers.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
ax = sns.kdeplot(df.MonthlyCharges[(df["Churn"] == 0) ], color="maroon", fill=True)
ax = sns.kdeplot(df.MonthlyCharges[(df["Churn"] == 1) ], ax=ax, color="#d98047", fill=True)
# Legend entries follow the drawing order: Churn == 0 first, then Churn == 1
ax.legend(["Churn:No", "Churn:Yes"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Monthly Charges')
ax.set_title('Distribution of monthly charges by churn')

In [18]:
# TotalCharges: density of total charges for retained vs. churned customers.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 0) ], color="maroon", fill=True)
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 1) ], ax=ax, color="#d98047", fill=True)
# Legend entries follow the drawing order: Churn == 0 first, then Churn == 1
ax.legend(["Churn:No", "Churn:Yes"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Total Charges')
ax.set_title('Distribution of total charges by churn')

In [48]:
# Services: build a class-balanced sample of the whole customer base.
# All churned rows are kept; an equally sized random sample (fixed seed) is
# drawn from the non-churned rows.
churned_mask = df['Churn'] == 1
retained_mask = df['Churn'] == 0

df_Churn = df[churned_mask]
df_NoChurn = df[retained_mask].sample(n=len(df_Churn), random_state=0)
df_ChurnSample = pd.concat([df_Churn, df_NoChurn], axis=0)

In [49]:
# Count rows per Churn value to confirm the sample is balanced
df_ChurnSample['Churn'].value_counts()

In [52]:
services = ['PhoneService','MultipleLines','InternetService','OnlineSecurity',
           'OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20,20))

# One count plot per service on a 3x3 grid, filled column by column:
# services 0-2 go down column 0, 3-5 down column 1, 6-8 down column 2.
for i, item in enumerate(services):
    grid_col, grid_row = divmod(i, 3)
    # Rebind `ax` to the subplot being drawn so the title below lands on it
    # rather than on an axes object left over from an earlier cell.
    ax = axes[grid_row, grid_col]
    sns.countplot(data=df_ChurnSample, x=item, hue='Churn', palette=colors, dodge=True, alpha=1, ax=ax)
    ax.legend(title='Churn', loc=0, labels=['Churn:No', 'Churn:Yes'])
    ax.set_title(item)

In [55]:
from scipy import stats

services = ['PhoneService','MultipleLines','InternetService','OnlineSecurity',
           'OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']

# Significance level; constant across services, so defined once outside the loop
alpha = 0.05

# For each service: contingency table vs. Churn, chi-square test of
# independence, Cramer's V effect size, and a count plot.
for item in services:

    print('--------' + item + '--------')

    # Rows = categories of the service, columns = Churn values
    # (assumes exactly two Churn values, ordered 0 then 1)
    Thisdf_crosstab = pd.crosstab(index=df_ChurnSample[item], columns=df_ChurnSample['Churn'])
    Thisdf_crosstab.columns=['Churn:No', 'Churn:Yes']
    print(Thisdf_crosstab, '\n')

    print('----Chi-square test----')
    chi2, p, dof, expected = stats.chi2_contingency(Thisdf_crosstab)
    # %.3g keeps very small p-values visible instead of rounding them to 0.00
    print('chi-square=%.2f, df=%d, p=%.3g' %(chi2, dof, p))
    if p >= alpha:
        print('H0 is accepted')
    else:
        print('H0 is rejected')

    # Cramer's V = sqrt((chi2 / n) / (min(rows, cols) - 1))
    crosstab_arr=np.array(Thisdf_crosstab)
    obs=np.sum(crosstab_arr)
    minDim=min(crosstab_arr.shape)-1
    # A table with a single row or column has no association to measure;
    # report NaN instead of dividing by zero
    V=np.sqrt((chi2/obs)/minDim) if minDim > 0 else np.nan
    print('Cramer\'s V=%.2f' %(V))
    print('\n')

    # Count plot of the service categories split by Churn
    fig, ax=plt.subplots(1, 1)
    sns.countplot(data=df_ChurnSample, x=item, hue='Churn', palette=colors, dodge=True, alpha=1, ax=ax)
    ax.legend(title='Churn', loc=0, labels=['Churn:No', 'Churn:Yes'])
    plt.show()
    # Release the figure so figures do not accumulate across iterations
    plt.close(fig)

# Predictive Model

In [72]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [71]:
# One-hot encode every non-numeric column except the first one (customerID).
# NOTE(review): if the tenure_group cell has been run, that derived column is
# encoded here as well - confirm it is meant to be a model feature.
model_columns = df.iloc[:, 1:]
df_dummies = pd.get_dummies(model_columns)
df_dummies.head()

In [73]:
# Target vector and feature matrix
y = df_dummies['Churn'].values
x = df_dummies.drop(columns=['Churn'])
features = x.columns.values

# Split BEFORE scaling: fitting the scaler on the full data would let the
# test rows influence the min/max used to scale the training rows (leakage).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)

# Learn the [0, 1] scaling from the training split only, then apply it to both
scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train), columns=features)
x_test = pd.DataFrame(scaler.transform(x_test), columns=features)

# Keep `x` as the scaled full feature frame; later cells read x.columns
x = pd.DataFrame(scaler.transform(x), columns=features)

In [74]:
# SVM = .706 (score noted by the author; presumably from an earlier run)
# Single-step pipeline wrapping a class-weight-balanced SVC
svm_steps = [('clf', SVC(class_weight='balanced', random_state=0))]
ThisSvm = Pipeline(steps=svm_steps)

# Hyper-parameter candidates: kernel type and regularisation strength C
param_grid = [
    {
        'clf__kernel': ['rbf', 'linear'],
        'clf__C': [1.0, 10.0, 100.0],
    }
]

# 10-fold cross-validated grid search scored by ROC AUC, using all CPU cores
gs_ThisSvm = GridSearchCV(estimator=ThisSvm, param_grid=param_grid,
                          scoring='roc_auc', cv=10, verbose=2, n_jobs=-1)

In [75]:
# Run the cross-validated grid search on the training split
gs_ThisSvm.fit(x_train, y_train)

In [76]:
# Report the winning parameter combination and its cross-validated ROC AUC
best_params = gs_ThisSvm.best_params_
best_cv_auc = gs_ThisSvm.best_score_
print('Best parameter set: %s ' % (best_params,))
print('CV AUC: %.3f' % (best_cv_auc,))

In [77]:
clf = gs_ThisSvm.best_estimator_
# The estimator's own .score() reports mean accuracy for a classifier, so the
# ROC AUC is computed explicitly from the SVM decision values to match the
# 'Test AUC' label and the 'roc_auc' scoring used during the grid search.
test_decision_values = clf.decision_function(x_test)
print('Test AUC: %.3f' % metrics.roc_auc_score(y_test, test_decision_values))

In [78]:
# Hard class predictions of the best SVM on the test split, and their confusion matrix
pred = clf.predict(x_test)
svm_confusion = metrics.confusion_matrix(y_test, pred)
print("confusion matrix:", svm_confusion, sep="\n")

In [88]:
# LogisticRegression = .77 (score noted by the author; presumably from an earlier run)
# Logistic regression with balanced class weights, fitted on the training split
model = LogisticRegression(class_weight='balanced')
result = model.fit(X=x_train, y=y_train)

# Accuracy of the hard predictions on the test split
prediction_test = model.predict(x_test)
test_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=prediction_test)
print(test_accuracy)

In [89]:
from sklearn.metrics import roc_auc_score

# ROC AUC needs a continuous score to rank the samples; hard 0/1 predictions
# collapse the curve to a single threshold. Use the predicted probability of
# the positive class (column 1, assuming the classes are 0 and 1).
roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])

In [90]:
# Logistic-regression coefficients labelled by feature name;
# plot the ten largest ones
weights = pd.Series(model.coef_[0], index=x.columns.values)
largest_weights = weights.sort_values(ascending=False).head(10)
print(largest_weights.plot(kind='bar', color='maroon'))

In [91]:
# Plot the ten smallest logistic-regression coefficients
smallest_weights = weights.sort_values(ascending=False).tail(10)
print(smallest_weights.plot(kind='bar', color='maroon'))

In [93]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the logistic-regression predictions on the test split
lr_confusion = confusion_matrix(y_test, prediction_test)
print(lr_confusion)

# Others

In [43]:
# Histogram of customer tenure. sns.distplot is deprecated; histplot is its
# replacement for plain histograms. alpha=0.4 keeps the bars semi-transparent
# as distplot drew them.
ax = sns.histplot(df['tenure'], color='#d94758', edgecolor='maroon', alpha=0.4)
ax.set_ylabel('# of Customers')
ax.set_xlabel('Tenure (months)')
ax.set_title('# of Customers by their tenure')

In [47]:
fig, (ax1,ax2,ax3) = plt.subplots(nrows=1, ncols=3, sharey = True, figsize = (20,6))

# One tenure histogram per contract type: (axes, Contract value, colour, title)
contract_panels = [
    (ax1, 'Month-to-month', 'maroon', 'Month to Month Contract'),
    (ax2, 'One year', '#d94758', 'One Year Contract'),
    (ax3, 'Two year', '#d98047', 'Two Year Contract'),
]

for ax, contract_name, bar_color, panel_title in contract_panels:
    contract_tenure = df[df['Contract'] == contract_name]['tenure']
    # sns.distplot is deprecated; histplot is its replacement for plain
    # histograms. alpha=0.4 keeps the bars semi-transparent as distplot drew them.
    sns.histplot(contract_tenure, color=bar_color, edgecolor='black', alpha=0.4, ax=ax)
    # The y axis is shared, so only the left-most panel carries a y label
    ax.set_ylabel('# of Customers' if ax is ax1 else '')
    # Same default font size on every panel
    ax.set_xlabel('Tenure (months)')
    ax.set_title(panel_title)

In [48]:
services = ['PhoneService','MultipleLines','InternetService','OnlineSecurity',
           'OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20,20))

# Bar chart of category counts per service on a 3x3 grid, filled column by
# column: services 0-2 go down column 0, 3-5 down column 1, 6-8 down column 2.
for i, item in enumerate(services):
    grid_col, grid_row = divmod(i, 3)
    category_counts = df[item].value_counts()
    ax = category_counts.plot(kind='bar', ax=axes[grid_row, grid_col], rot=0, color=colors)
    ax.set_title(item)

In [51]:
# Group customers by churn status and contract type
contract_keys = ['Churn', 'Contract']
contract_churn = df.groupby(by=contract_keys)

In [22]:
# Number of customers in each (Churn, Contract) group
contract_group_sizes = contract_churn.size()
print(contract_group_sizes)

In [23]:
# Same counts as a table: rows = Churn value, columns = contract type
contract_count_table = contract_churn.size().unstack()
print(contract_count_table)