In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.inspection import PartialDependenceDisplay
from xgboost import plot_importance
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import optuna
import statsmodels.api as sm
from sklearn.model_selection import StratifiedKFold
import pickle

In [None]:
# NOTE(review): machine-specific absolute path; point this at your local copy of the CSV.
DATA_PATH = 'E:/MyProject/Python/CustomerChurn/data/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Number of distinct customerID values (a missing value, if any, counts as one value).
df['customerID'].nunique(dropna=False)

In [None]:
# Drop the identifier column.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: a second execution no longer raises KeyError because the column
# is already gone.
df = df.drop(columns='customerID', errors='ignore')
df.info()

In [None]:
# Rows whose TotalCharges is the single-space placeholder string.
df.loc[df['TotalCharges'].eq(' ')]

In [None]:
# Replace the blank TotalCharges placeholders with tenure * MonthlyCharges,
# then cast the whole column to float and show the rows that ended up at 0.
blank_total = df['TotalCharges'] == ' '
df.loc[blank_total, 'TotalCharges'] = (
    df.loc[blank_total, 'tenure'] * df.loc[blank_total, 'MonthlyCharges']
)
df['TotalCharges'] = df['TotalCharges'].astype(float)
df[df['TotalCharges'] == 0]

In [None]:
# Pie chart of the target class balance.
churn_labels, churn_counts = np.unique(df['Churn'], return_counts=True)
plt.pie(x=churn_counts, labels=churn_labels, autopct='%.0f%%')
plt.xlabel('Churn')
plt.title('Churn Class Distribution')
plt.show()

In [None]:
# Contingency table of payment method vs. churn, drawn as an annotated heatmap.
# `tab` is read again by the chi-square cell that follows.
tab = pd.crosstab(index=df['PaymentMethod'], columns=df['Churn'])
sns.heatmap(data=tab, annot=True, fmt='.4g')

In [None]:
# Chi-square test of independence on the PaymentMethod x Churn table (`tab`),
# labelled at the 5% significance level.
pvalue = stats.chi2_contingency(tab)[1]
if pvalue < 0.05:
    dependencies = 'Dependent'
else:
    dependencies = 'Independent'
pd.DataFrame({
    'columns': ['PaymentMethod'],
    'p_value': [pvalue],
    'Dependence': [dependencies],
})

It seems that many churned customers use Electronic check as their payment method.

In [None]:
# Distribution of MonthlyCharges for each churn class.
sns.boxplot(data=df, x='Churn', y='MonthlyCharges')
plt.show()

Churned customers have higher monthly charges overall than loyal customers. Hypothesis 1: High monthly charges may be one of the reasons why customers churn.

In [None]:
# Distribution of tenure for each churn class.
sns.boxplot(data=df, x='Churn', y='tenure')
plt.show()

In [None]:
# Kendall rank correlation between contract length (ordinal) and churn (binary).
contract_rank = {'Month-to-month':0, 'One year':1, 'Two year':2}
contract = df[['Contract','Churn']].copy()
# An unexpected contract label raises KeyError rather than being silently encoded.
contract['Contract'] = [contract_rank[level] for level in contract['Contract']]
contract['Churn'] = [1 if flag == 'Yes' else 0 for flag in contract['Churn']]
sns.heatmap(contract.corr('kendall'), annot=True)

Loyal customers tend to have longer tenure and longer contracts than churned customers.

Hypothesis 2: Churned customers may feel that the monthly charges are too high or that the services provided are not worth the price, OR some of them may be new customers who were only trying out the services.

Let's compare the behaviour of churned and loyal customers for each service.

In [None]:
# One annotated service-vs-churn count heatmap per service, laid out on a 3x3 grid.
services = ['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies']
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(30,10))
# axes.flat walks the grid row by row, matching the order of `services`.
for panel, service in zip(axes.flat, services):
    service_counts = pd.crosstab(df[service], df['Churn'])
    sns.heatmap(service_counts, ax=panel, annot=True, fmt='.4g')
    panel.set_ylabel(service)

In [None]:
# Chi-square test of independence between each service column and Churn.
# The test is run once per column (it was previously evaluated twice: once for
# the p-value and once more for the threshold check).
chi2_rows = []
for col in services:
    tab = pd.crosstab(df[col], df['Churn'])
    p_value = stats.chi2_contingency(tab)[1]
    chi2_rows.append({
        'Columns': col,
        'Dependency': 'Dependent' if p_value < 0.05 else 'Independent',
        'P_Values': p_value,
    })
chi2_test = pd.DataFrame(chi2_rows, columns=['Columns', 'Dependency', 'P_Values'])
chi2_test

In [None]:
# Copy the service columns plus the target into their own frame.
# NOTE: `services` is rebound here from a list of column names to a DataFrame,
# so this cell cannot be re-run without first re-running the cell that defines the list.
services = df[[*services, 'Churn']].copy(deep=True)
services.head()

In [None]:
# Ordinal-encode every column whose set of distinct values exactly matches the
# keys of one of these code tables; any other column is left untouched.
service_codes = [
    {'No':0,'Yes':1},
    {'No internet service':0,'No':1,'Yes':2},
    {'No phone service':0,'No':1,'Yes':2},
    {'No':0,'DSL':1,'Fiber optic':2},
]
for name in list(services.columns):
    levels = set(services[name].unique())
    for codes in service_codes:
        if levels == set(codes):
            # Every value present is a key of `codes`, so the lookup is total.
            services[name] = services[name].map(codes)
            break
services.head()

In [None]:
# Kendall rank correlation between the encoded service columns and Churn.
_, ax = plt.subplots(figsize=(30,10))
kendall_corr = services.corr('kendall')
sns.heatmap(kendall_corr, annot=True, ax=ax)
plt.show()

In [None]:
# Logistic regression of Churn on the encoded service columns plus an intercept.
logit_features = sm.add_constant(services.drop(columns='Churn'))
logit_target = services['Churn']
model = sm.Logit(logit_target, logit_features).fit()
print(model.summary())

In [None]:
# Persist the fitted model from the previous cell to 'logistic.pickle' in the working directory.
model.save('logistic.pickle')

In [None]:
# Plot exp(coefficient), i.e. the odds ratio, of every predictor of the fitted model.
# The intercept is removed by name: slicing `[1:]` after sorting dropped whichever
# parameter happened to be smallest, which is not necessarily the intercept.
importances = np.exp(model.params.drop('const', errors='ignore')).sort_values()
plt.title('Model Coefficients')
plt.barh(importances.index, importances.values, color='b', align='center')
plt.xlabel('Odds ratio (exp of coefficient)')
plt.show()

There is a moderately strong association between Internet Service and customer churn. Hypothesis 3: Customers may churn because they are not satisfied with the internet service.

Let's check the association between services and monthly charges.

In [None]:
# Swap the target for MonthlyCharges and show the Spearman correlations.
# errors='ignore' plus rebinding (instead of an in-place drop) keeps the cell
# re-runnable: a second execution no longer raises KeyError on the missing 'Churn'.
services = services.drop(columns='Churn', errors='ignore')
services['MonthlyCharges'] = df['MonthlyCharges']
_, ax = plt.subplots(figsize=(30,10))
sns.heatmap(services.corr('spearman'), annot=True, ax=ax)
plt.show()

In [None]:
# Linear regression of MonthlyCharges on the encoded service columns plus an intercept.
# NOTE: this rebinds `model`, replacing the logistic model fitted earlier.
ols_features = sm.add_constant(services.drop(columns='MonthlyCharges'))
ols_target = services['MonthlyCharges']
model = sm.OLS(ols_target, ols_features).fit()
print(model.summary())

In [None]:
# Plot the fitted coefficient of every predictor.
# The intercept is removed by name: slicing `[1:]` after sorting dropped whichever
# parameter happened to be smallest, which is not necessarily the intercept.
importances = model.params.drop('const', errors='ignore').sort_values()
plt.title('Model Coefficients')
plt.barh(importances.index, importances.values, color='b', align='center')
plt.show()

It seems Phone Service and Internet Service contribute a lot to Monthly Charges. Hypothesis 4: The high contribution of Internet Service to Monthly Charges may be one of the reasons customers churn.

Now let's try to model all variables with XGBoost.

In [None]:
# Encode the target as 1 ('Yes') / 0 (anything else) and split features from target.
# The guard makes the cell safe to re-run: without it, a second execution compares
# the already-encoded integers to 'Yes' and silently turns every label into 0.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = (df['Churn'] == 'Yes').astype('int64')
X, y = df.drop(columns='Churn'), df['Churn']

In [None]:
def preprocessing(df:pd.DataFrame):
    """Ordinal-encode the known categorical columns of a Telco churn feature frame.

    Parameters
    ----------
    df : pd.DataFrame
        Feature frame. Any subset of the known categorical columns may be present.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` in which each known categorical column is replaced by
        integer codes; a value missing from that column's code table becomes -1.
        Every other column is returned unchanged, including 'PaymentMethod'
        (left for one-hot encoding), the numeric columns, and any column this
        function has no code table for.

    Notes
    -----
    Columns are looked up in an explicit table. The previous if/elif chain reused
    the ``mapping`` left over from the preceding loop iteration for any column it
    had no branch for (or raised UnboundLocalError if such a column came first),
    and the iteration order came from a set, so the outcome was not deterministic.
    """
    yes_no = {'No':0, 'Yes':1}
    internet_addon = {'No internet service':0,'No':1,'Yes':2}
    encodings = {
        'gender': {'Female':0,'Male':1},
        'Contract': {'Month-to-month':0, 'One year':1, 'Two year':2},
        'Dependents': yes_no,
        'PaperlessBilling': yes_no,
        'PhoneService': yes_no,
        'Partner': yes_no,
        'MultipleLines': {'No phone service':0,'No':1,'Yes':2},
        'DeviceProtection': internet_addon,
        'TechSupport': internet_addon,
        'OnlineSecurity': internet_addon,
        'StreamingTV': internet_addon,
        'StreamingMovies': internet_addon,
        'OnlineBackup': internet_addon,
        'InternetService': {'No':0,'DSL':1,'Fiber optic':2},
    }
    df1 = df.copy()
    for col, mapping in encodings.items():
        if col not in df1.columns:
            continue
        # Bind `mapping` as a default so the lambda never depends on loop state.
        df1[col] = df1[col].apply(lambda x, mapping=mapping: mapping[x] if x in mapping else -1)
    return df1

In [None]:
# One-hot encode PaymentMethod (first level dropped, unseen levels ignored) and
# pass every other column through unchanged; both steps emit pandas DataFrames.
onehot = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False,
    dtype=np.float64,
).set_output(transform='pandas')
transformer = ColumnTransformer(
    transformers=[('onehot', onehot, ['PaymentMethod'])],
    remainder='passthrough',
    n_jobs=-1,
    verbose_feature_names_out=False,
)
transformer.set_output(transform='pandas')

In [None]:
def objective(trial):
    """Optuna objective: mean ROC AUC of an XGBoost classifier over 10 stratified folds.

    Reads the notebook-level ``X``, ``y``, ``transformer`` and ``preprocessing``.
    For each fold the transformer is fitted on the training part only and applied
    to the held-out part.

    NOTE(review): the held-out fold is passed as ``eval_set`` for early stopping
    and is also the fold that is scored, so the reported AUC may be optimistic.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Arithmetic mean of the per-fold ROC AUC values.
    """
    # The suggest_* calls are kept in their original order.
    tuned = {
        'max_depth': trial.suggest_int("max_depth", 6, 20, step=1),
        'n_estimators': trial.suggest_int("n_estimators", 100, 900, step=100),
        'subsample': trial.suggest_float("subsample", 0.5, 1, step=0.1),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1, step=0.1),
        'reg_lambda': trial.suggest_float("reg_lambda", 0, 2e-1, step=0.025),
        'reg_alpha': trial.suggest_float("reg_alpha", 0, 2e-1, step=0.025),
        'max_leaves': trial.suggest_int("max_leaves", 12, 40, step=2),
        'learning_rate': trial.suggest_float("learning_rate", 0.05, 0.3, step=0.05),
        'max_bin': trial.suggest_int("max_bin", 256, 4096, step=16),
    }
    fixed = {
        'early_stopping_rounds': 10,
        'eval_metric': 'logloss',
        'grow_policy': 'lossguide',
        'random_state': 42,
        'n_jobs': -1,
    }
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    fold_aucs = []
    for fit_idx, holdout_idx in splitter.split(X, y):
        y_fit, y_holdout = y.iloc[fit_idx], y.iloc[holdout_idx]
        X_fit = transformer.fit_transform(preprocessing(X.iloc[fit_idx]))
        X_holdout = transformer.transform(preprocessing(X.iloc[holdout_idx]))

        booster = XGBClassifier(**tuned, **fixed)
        booster.fit(X_fit, y_fit, eval_set=[(X_holdout, y_holdout)], verbose=False)
        churn_proba = booster.predict_proba(X_holdout)[:, 1]
        fold_aucs.append(roc_auc_score(y_holdout, churn_proba))
    return sum(fold_aucs) / len(fold_aucs)

In [None]:
# Maximize the cross-validated AUC over 50 trials with a seeded TPE sampler.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

In [None]:
# Best objective value found by the study (the objective returns the mean fold AUC).
print('Best AUC Score: ', study.best_value)

In [None]:
# Best sampled hyper-parameters plus the fixed settings needed for the final fit.
# 'early_stopping_rounds' and 'eval_metric' from the objective are not carried over.
best_params = {
    **study.best_params,
    'random_state': 42,
    'grow_policy': 'lossguide',
    'n_jobs': -1,
}

In [None]:
# Encode the full feature matrix, fit the final classifier on all rows, and
# persist both artifacts. NOTE: `X` is rebound to the encoded frame here.
X = transformer.fit_transform(preprocessing(X))
with open('transformer.pkl', mode='wb') as transformer_file:
    pickle.dump(transformer, transformer_file)

clf = XGBClassifier(**best_params).fit(X, y)
with open('xgb.pkl', mode='wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Reload the trained classifier.
# X has already been encoded by the training cell; encoding it a second time fails
# because the 'PaymentMethod' column the transformer selects no longer exists.
# Only encode when X is still the raw feature frame.
if 'PaymentMethod' in X.columns:
    X = transformer.fit_transform(preprocessing(X))
# pickle.load can execute arbitrary code: only load files this notebook wrote itself.
with open('xgb.pkl','rb') as f:
    clf = pickle.load(f)

In [None]:
# Feature importance of the classifier, using the 'gain' importance type.
plot_importance(clf,importance_type='gain')

In [None]:
# Partial dependence of the classifier's output on every feature column of X.
feature_names = X.columns.values
pdd = PartialDependenceDisplay.from_estimator(clf, X, feature_names)
# Enlarge the figure: 20 wide by 25 high.
pdd.figure_.set_size_inches(20, 25)