In [None]:
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the Telco customer-churn CSV into the working DataFrame.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact file exists.
customerDf=pd.read_csv('C:/Users/wjyjy/Downloads/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [None]:
# Shape of the DataFrame as (rows, columns).
customerDf.shape

In [None]:

# Lift pandas' column-display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns',None)

# Preview the first 10 rows.
customerDf.head(10)

In [None]:
# Number of null (NaN/None) entries per column.
pd.isnull(customerDf).sum()

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
customerDf.info()

In [None]:
# Per-column report: number of non-null entries, dtype and value frequencies.
for column in customerDf.columns:
    counts = customerDf[column].value_counts()
    print(f'The number of rows of {column}：{counts.sum()}')
    print(f'The type of {column}：{customerDf[column].dtypes}')
    print(f'The context of {column}：\n{counts}\n')

- We found 11 users with missing data in the "TotalCharges" column

In [None]:
# Turn whitespace-only TotalCharges entries into NaN; every other value is kept as is.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
customerDf['TotalCharges']=customerDf['TotalCharges'].apply(lambda x: np.nan if str(x).isspace() else x)

In [None]:
# Cast TotalCharges to float.
customerDf['TotalCharges'] = customerDf['TotalCharges'].astype(float)

In [None]:
# Number of null (NaN/None) entries per column, re-checked at this point of the notebook.
pd.isnull(customerDf).sum()

In [None]:
# Show the rows whose TotalCharges is NaN together with the related columns.
missing_total = customerDf['TotalCharges'].isna()
print(customerDf.loc[missing_total, ['tenure','MonthlyCharges','TotalCharges','Churn']])

- We found that these 11 users have a 'tenure' (length of time on the network) of 0; presumably they joined the network in the current month. Even if a user churns in the month of registration, they still need to pay that month's fee. Therefore, we change the tenure of these 11 users to 1 and fill their total charges with their monthly charges, which is in line with the actual situation.

In [None]:
# Fill each missing TotalCharges with the same customer's MonthlyCharges.
# This targeted fill replaces the row-wise `fillna(method='pad', axis=1)`, which
# only worked because MonthlyCharges sits immediately left of TotalCharges,
# converted every column to `object`, and relies on the `method=` argument that
# pandas 2.1 deprecated.
customerDf['TotalCharges']=customerDf['TotalCharges'].astype(float)
customerDf['TotalCharges']=customerDf['TotalCharges'].fillna(customerDf['MonthlyCharges'])
# Keep the numeric columns on explicit dtypes.
customerDf['SeniorCitizen']=customerDf['SeniorCitizen'].astype(int)
customerDf['tenure']=customerDf['tenure'].astype(int)
customerDf['MonthlyCharges']=customerDf['MonthlyCharges'].astype(float)
# NOTE(review): the accompanying text says tenure is changed from 0 to 1 for
# these customers, but no statement in this cell does so; tenure is left untouched.

In [None]:
# Print the tenure-0 rows to inspect their MonthlyCharges and TotalCharges.
print(customerDf[customerDf['tenure']==0][['tenure','MonthlyCharges','TotalCharges']])

- View the number and percentage of churned users

In [None]:
def savePic(name, directory='C:/Users/wjyjy/Downloads'):
    """Save the current matplotlib figure as ``<directory>/<name>.png``.

    Parameters
    ----------
    name : str
        File name without the ``.png`` extension.
    directory : str, optional
        Folder to write into. The default is the folder this notebook has
        always used, so existing ``savePic(name)`` calls behave as before.
    """
    # 600 dpi with a tight bounding box, so labels outside the axes are not cropped.
    plt.savefig('{0}/{1}.png'.format(directory, name), dpi=600,bbox_inches='tight')

In [None]:
# Pie chart with the share of each Churn class.
plt.rcParams['figure.figsize']=6,6
churn_counts = customerDf['Churn'].value_counts()
plt.pie(
    churn_counts,
    labels=churn_counts.index,
    autopct='%1.2f%%',
    explode=(0.1,0),  # pull the first wedge slightly out of the pie
)
plt.title('Churn(Yes/No) Ratio')
savePic('Churn(Yes or No) Ratio')
plt.show()

In [None]:
# Bar chart with the absolute number of customers per Churn class.
churnDf=customerDf['Churn'].value_counts().to_frame()
x=churnDf.index
# Select the counts by position: the single column is named 'Churn' on
# pandas < 2.0 but 'count' from 2.0 on, so churnDf['Churn'] is not portable.
y=churnDf.iloc[:,0]

plt.bar(x,y,width = 0.5,color = 'c')
plt.title('Churn(Yes/No) Num')
# Write each count just above its bar.
for a,b in zip(x,y):
    plt.text(a,b+10,'%.0f' % b, ha='center', va= 'bottom')
savePic('Churn(Yes or No) Num')
plt.show();

- The data set is imbalanced: churned users make up 26.54% of all users.

In [None]:
def barplot_percentages(feature,orient='v',axis_name="percentage of customers"):
    """Bar plot of the share of all customers per (`feature`, Churn) combination.

    Parameters
    ----------
    feature : str
        Column of ``customerDf`` to group by.
    orient : str, optional
        'v' puts `feature` on the x axis; any other value puts it on the y axis.
    axis_name : str, optional
        Label of the percentage axis.

    The figure is saved through ``savePic`` and then shown.
    """
    # Share of the whole customer base in every (feature value, Churn) group.
    # Naming the Series before reset_index() gives the columns
    # [feature, 'Churn', axis_name] regardless of what value_counts() calls its
    # result ('Churn' before pandas 2.0, 'count' afterwards).
    g = (
        (customerDf.groupby(feature)["Churn"].value_counts()/len(customerDf))
        .rename(axis_name)
        .reset_index()
    )

    if orient == 'v':
        ax = sns.barplot(x=feature, y= axis_name, hue='Churn', data=g, orient=orient)
        # Show the tick values as whole percentages.
        ax.set_yticklabels(['{:,.0%}'.format(y) for y in ax.get_yticks()])
        plt.rcParams.update({'font.size': 13})
    else:
        ax = sns.barplot(x= axis_name, y=feature, hue='Churn', data=g, orient=orient)
        ax.set_xticklabels(['{:,.0%}'.format(x) for x in ax.get_xticks()])
    plt.title('Churn(Yes/No) Ratio as {0}'.format(feature))
    savePic('Churn(Yes or No) Ratio as {0}'.format(feature))
    plt.show()
barplot_percentages("SeniorCitizen")
barplot_percentages("gender")

In [None]:
# Numeric churn indicator (No -> 0, Yes -> 1) so that bar heights are churn rates.
# An explicit mapping yields a numeric column directly; the chained
# replace("No", 0).replace("Yes", 1) depended on pandas silently downcasting an
# object column, which recent pandas versions warn about.
customerDf['churn_rate'] = customerDf['Churn'].map({"No": 0, "Yes": 1})
# One panel per SeniorCitizen value, bars per gender.
g = sns.FacetGrid(customerDf, col="SeniorCitizen", height=4, aspect=.9)
ax = g.map(sns.barplot, "gender", "churn_rate", palette = "Blues_d", order= ['Female', 'Male'])
plt.rcParams.update({'font.size': 13})
savePic('Churn(Yes or No) Ratio as gender and SeniorCitizen')
plt.show()

- User churn appears to be independent of gender; senior citizens churn at a significantly higher rate than non-senior users.

In [None]:
# Side-by-side bar charts: share of all customers by Partner / Dependents, split by Churn.
fig, axis = plt.subplots(1, 2, figsize=(12,4))
axis[0].set_title("Has Partner")
axis[1].set_title("Has Dependents")
axis_y = "percentage of customers"

# Plot Partner column
# Naming the ratio Series before reset_index() gives a column called axis_y on
# every pandas version (value_counts() names its result 'count' since pandas
# 2.0, so renaming a 'Churn' column afterwards no longer has any effect).
gp_partner = (
    (customerDf.groupby('Partner')["Churn"].value_counts()/len(customerDf))
    .rename(axis_y)
    .reset_index()
)
ax1 = sns.barplot(x='Partner', y= axis_y, hue='Churn', data=gp_partner, ax=axis[0])
ax1.legend(fontsize=10)

# Plot Dependents column
gp_dep = (
    (customerDf.groupby('Dependents')["Churn"].value_counts()/len(customerDf))
    .rename(axis_y)
    .reset_index()
)
ax2 = sns.barplot(x='Dependents', y= axis_y, hue='Churn', data=gp_dep, ax=axis[1])

plt.rcParams.update({'font.size': 20})
ax2.legend(fontsize=10)

savePic('Churn(Yes or No) Ratio as partner and dependents')
plt.show()

- The churn rate of users with partners is lower than that of users without partners; the churn rate of users with dependents is lower than that of users without dependents.

In [None]:
# Kernel density estimation
def kdeplot(feature,xlabel):
    """Plot the kernel density of `feature` for retained and churned customers.

    Parameters
    ----------
    feature : str
        Numeric column of ``customerDf`` to plot.
    xlabel : str
        Label of the x axis.

    A new figure is opened on every call; the figure is neither saved nor shown here.
    """
    plt.figure(figsize=(9, 4))
    plt.title("KDE for {0}".format(feature))
    # fill=True is the current spelling of the area shading; the former
    # shade='True' passed a string to a parameter seaborn has deprecated.
    for churn_value, colour in (('No', 'navy'), ('Yes', 'orange')):
        values = customerDf.loc[customerDf['Churn'] == churn_value, feature].dropna()
        sns.kdeplot(values, color=colour, label='Churn: {0}'.format(churn_value), fill=True)
    plt.xlabel(xlabel)
    plt.rcParams.update({'font.size': 20})
    plt.legend(fontsize=10)
# Each savePic call stores the figure opened by the kdeplot call just before it.
kdeplot('tenure','tenure')
savePic('Churn(Yes or No) Ratio as tenure kde')
kdeplot('MonthlyCharges','MonthlyCharges')
savePic('Churn(Yes or No) Ratio as MonthlyCharges kde')
kdeplot('TotalCharges','TotalCharges')
savePic('Churn(Yes or No) Ratio as TotalCharges kde')
plt.show();

In [None]:
# Horizontal churn-ratio bars per InternetService value, drawn on a 9x4.5 figure.
plt.figure(figsize=(9, 4.5))
barplot_percentages("InternetService", orient="h");

In [None]:
# Number of internet customers with / without each additional service.
cols = ["PhoneService","MultipleLines","OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]
# Long format: one row per (customer, service column) for customers that have internet service.
df1 = pd.melt(customerDf[customerDf["InternetService"] != "No"][cols])
df1.rename(columns={'value': 'Has service'},inplace=True)
plt.figure(figsize=(20, 8))
# The legend below relabels the hue levels by position, so the order is pinned
# to ['No', 'Yes'] (as in the churn-only version of this chart) instead of
# depending on which value happens to appear first in the data.
ax = sns.countplot(data=df1, x='variable', hue='Has service', hue_order=['No', 'Yes'])
ax.set(xlabel='Internet Additional service', ylabel='Num of customers')
plt.rcParams.update({'font.size':20})
plt.legend( labels = ['No Service', 'Has Service'],fontsize=15)
plt.title('Num of Customers as Internet Additional Service')
savePic('Churn(Yes or No) Num as Internet Additional Service')
plt.show()

In [None]:
# Same service chart, restricted to churned customers that have internet service.
plt.figure(figsize=(20, 8))
has_internet = customerDf.InternetService != "No"
churned = customerDf.Churn == "Yes"
df1 = pd.melt(customerDf[has_internet & churned][cols]).rename(columns={'value': 'Has service'})
ax = sns.countplot(data=df1, x='variable', hue='Has service', hue_order=['No', 'Yes'])
ax.set(xlabel='Internet Additional service', ylabel='Churn Num')
plt.rcParams.update({'font.size':20})
plt.legend(labels=['No Service', 'Has Service'], fontsize=15)
plt.title('Num of Churn Customers as Internet Additional Service')
savePic('Churn Num as Internet Additional Service')
plt.show()

In [None]:
# Churn rate per contract type, one panel per PaperlessBilling value.
contract_order = ['Month-to-month', 'One year', 'Two year']
g = sns.FacetGrid(customerDf, col="PaperlessBilling", height=6, aspect=.9)
ax = g.map(sns.barplot, "Contract", "churn_rate", palette="Blues_d", order=contract_order)
plt.rcParams.update({'font.size':18})
savePic('Churn Ratio as PaperlessBilling')
plt.show()

In [None]:
# Horizontal churn-ratio bars per MultipleLines value.
plt.figure(figsize=(9, 4.5))
barplot_percentages("MultipleLines", orient='h')

In [None]:
# Horizontal churn-ratio bars per PaymentMethod value.
plt.figure(figsize=(9, 4.5))
barplot_percentages("PaymentMethod",orient='h')

In [None]:
# Remove the customerID column.
customerDf = customerDf.drop(columns=['customerID'])

In [None]:
# Remove the PhoneService column.
customerDf = customerDf.drop(columns=['PhoneService'])

In [None]:
# Treat 'No internet service' as 'No'. The result is assigned back to the
# column: an inplace replace on a `.loc` selection may act on a temporary copy
# and leave the frame unchanged.
customerDf['OnlineSecurity'] = customerDf['OnlineSecurity'].replace('No internet service', 'No')

In [None]:
# Treat 'No internet service' as 'No'. The result is assigned back to the
# column: an inplace replace on a `.loc` selection may act on a temporary copy
# and leave the frame unchanged.
customerDf['OnlineBackup'] = customerDf['OnlineBackup'].replace('No internet service', 'No')

In [None]:
# Treat 'No internet service' as 'No'. The result is assigned back to the
# column: an inplace replace on a `.loc` selection may act on a temporary copy
# and leave the frame unchanged.
customerDf['DeviceProtection'] = customerDf['DeviceProtection'].replace('No internet service', 'No')

In [None]:
# Treat 'No internet service' as 'No'. The result is assigned back to the
# column: an inplace replace on a `.loc` selection may act on a temporary copy
# and leave the frame unchanged.
customerDf['TechSupport'] = customerDf['TechSupport'].replace('No internet service', 'No')

In [None]:
# Treat 'No internet service' as 'No' for both streaming columns. StreamingTV is
# included because it is one-hot encoded like the other add-on services but was
# the only one never collapsed. The result is assigned back to the columns
# because an inplace replace on a `.loc` selection may act on a temporary copy.
for streaming_col in ('StreamingTV', 'StreamingMovies'):
    customerDf[streaming_col] = customerDf[streaming_col].replace('No internet service', 'No')

In [None]:
# Remove the helper churn_rate column that was added for plotting.
customerDf = customerDf.drop(columns=['churn_rate'])

## SPLIT

In [None]:
# Per-column report: number of non-null entries, dtype and value frequencies.
for column in customerDf.columns:
    counts = customerDf[column].value_counts()
    print(f'The number of rows of {column}：{counts.sum()}')
    print(f'The type of {column}：{customerDf[column].dtypes}')
    print(f'The context of {column}：\n{counts}\n')

In [None]:
from sklearn.model_selection import StratifiedKFold

## PREPROCESS

- Looking at the data types, we find that "tenure", "MonthlyCharges", and "TotalCharges" are continuous features, while all the others are categorical features. Continuous features are standardized with StandardScaler; categorical features are one-hot encoded.

# CV

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score
import xgboost
import warnings
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
import shap
import pickle
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Target vector (Churn) and feature matrix (every other column).
y = customerDf['Churn']
X = customerDf.drop(columns='Churn')

In [None]:

# Categorical columns, to be one-hot encoded.
onehot_ftrs = ['gender','SeniorCitizen', 'Partner', 'Dependents', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod']

# Continuous columns, to be standardized.
std_ftrs = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Dense one-hot output. scikit-learn 1.2 renamed `sparse` to `sparse_output`
# and 1.4 removed the old name, so try the new keyword first and fall back to
# the old one on releases that do not know it yet.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# collect all the encoders into one preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', onehot_encoder, onehot_ftrs),
        ('std', StandardScaler(), std_ftrs)])

prep = Pipeline(steps=[('preprocessor', preprocessor)]) # for now we only preprocess, later we will add other steps here

## Ridge Classifier (L2-regularized logistic regression)

In [None]:
# L2-penalised logistic regression: for each of nr_states random 60/20/20
# train/validation/test splits, fit every grid point, keep the model with the
# best validation accuracy and score it on the test split.
nr_states = 5
test_scores = np.zeros(nr_states)  # test accuracy of the selected model per split
final_models = []                  # selected model per split
test_set = []                      # preprocessed test features plus 'y_true' per split
param_grid = {
              'C':  [1e-3,1e-2,1e-1,1,1e+3,1e+2,1e+1],
              'max_iter':[10,100]
              }
grid = ParameterGrid(param_grid)
for i in range(nr_states):
    print('randoms state '+str(i+1))
    seed = 42*i
    X_train, X_other, y_train, y_other = train_test_split(X, y, train_size=0.6, stratify=y, random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, train_size=0.5, stratify=y_other, random_state=seed)
    # The preprocessor is fitted on the training part only.
    X_train_prep = prep.fit_transform(X_train)
    X_val_prep = prep.transform(X_val)
    X_test_prep = prep.transform(X_test)

    # Keep the preprocessed test split with its true labels for later cells.
    test_features = pd.DataFrame(X_test_prep, columns=preprocessor.get_feature_names_out())
    test_labels = pd.DataFrame(np.array(y_test), columns=['y_true'])
    test_set.append(pd.concat([test_features, test_labels], axis=1))

    train_score = np.zeros(len(grid))
    val_score = np.zeros(len(grid))
    val_score_f1 = np.zeros(len(grid))
    models = []
    for p, params in enumerate(grid):
        print('   ',params)
        clf = LogisticRegression(**params, penalty='l2', solver='liblinear', random_state=seed)
        clf.fit(X_train_prep, y_train)
        models.append(clf)
        # train / validation accuracy and validation F1 of this grid point
        y_train_pred = clf.predict(X_train_prep)
        y_val_pred = clf.predict(X_val_prep)
        train_score[p] = accuracy_score(y_train, y_train_pred)
        val_score[p] = accuracy_score(y_val, y_val_pred)
        val_score_f1[p] = f1_score(y_val, y_val_pred, pos_label='Yes')
        print('   ',train_score[p],val_score[p],val_score_f1[p])
    best = np.argmax(val_score)
    print('best model parameters:',grid[best])
    print('corresponding validation accuracy score:',val_score[best])
    # keep the best model of this split and evaluate it on the test part
    final_models.append(models[best])
    y_test_pred = final_models[-1].predict(X_test_prep)
    test_scores[i] = accuracy_score(y_test, y_test_pred)
    print('test accuracy score:',test_scores[i])
    print('test F1 score:', f1_score(y_test, y_test_pred,pos_label='Yes'))

In [None]:
# Predict every stored test split with its selected model and pickle the results.
# The list is created BEFORE the loop: it used to be appended to before it
# existed and was then reset to [] right before pickling, so the saved
# predictions were always empty.
testset_pred = []
for i in range(nr_states):
    features = test_set[i].loc[:, test_set[i].columns != 'y_true']
    y_pred = final_models[i].predict(features)
    # These are predictions, so the column is labelled 'y_pred'.
    testset_pred.append(pd.DataFrame(np.asarray(y_pred).ravel(), columns=['y_pred']))
os.makedirs('results', exist_ok=True)  # open() does not create the folder
# The context manager closes the file even if pickling fails.
with open('results/ridge.save', 'wb') as file:
    pickle.dump((final_models,test_set,testset_pred),file)

In [None]:
# Summarise the selected models across the nr_states random splits.
# val_score / val_score_f1 are overwritten in every split and indexed by
# hyper-parameter combination, so averaging them only describes the grid of the
# last split. The per-split test scores are used instead.
test_f1_scores = np.array([
    f1_score(
        test_set[i]['y_true'],
        final_models[i].predict(test_set[i].loc[:, test_set[i].columns != 'y_true']),
        pos_label='Yes',
    )
    for i in range(nr_states)
])
print('mean of F1 score:',np.mean(test_f1_scores))
print('mean of accuracy:',np.mean(test_scores))
print('std of F1 score:',np.std(test_f1_scores))
print('std of accuracy:',np.std(test_scores))

In [None]:
# Confusion matrix of the selected model on the first stored test split only.
frames = []
for state in range(0,1):
    split = test_set[state]
    y_pred = final_models[state].predict(split.loc[:, split.columns != 'y_true'])
    frames.append(pd.concat([split, pd.DataFrame(y_pred, columns=['y_pred'])], axis=1))
final_set = pd.concat(frames)

cm = confusion_matrix(final_set['y_true'], final_set['y_pred'])
disp = ConfusionMatrixDisplay(cm, display_labels=['class 0', 'class 1'])
disp.plot()
plt.tight_layout()
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.title('Confusion matrix (Ridge)', fontsize=15)
savePic('Confusion matrix (Ridge)')
plt.show();

## Lasso Classifier (L1-regularized logistic regression)

In [None]:
# L1-penalised logistic regression: for each of nr_states random 60/20/20
# train/validation/test splits, fit every grid point, keep the model with the
# best validation accuracy and score it on the test split.
nr_states = 5
test_scores = np.zeros(nr_states)  # test accuracy of the selected model per split
final_models = []                  # selected model per split
# Start from an empty list: without this reset the splits were appended behind
# the ones stored by earlier sections, so the list kept growing and later cells
# indexed entries that did not belong to this run.
test_set = []
param_grid = {
              'C':  [1e-3,1e-2,1e-1,1,1e+3,1e+2,1e+1],
              'max_iter':[10,100]
              }
grid = ParameterGrid(param_grid)
for i in range(nr_states):
    print('randoms state '+str(i+1))
    seed = 42*i
    X_train, X_other, y_train, y_other = train_test_split(X, y, train_size=0.6, stratify=y, random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, train_size=0.5, stratify=y_other, random_state=seed)
    # The preprocessor is fitted on the training part only.
    X_train_prep = prep.fit_transform(X_train)
    X_val_prep = prep.transform(X_val)
    X_test_prep = prep.transform(X_test)

    # Keep the preprocessed test split with its true labels for later cells.
    test_features = pd.DataFrame(X_test_prep, columns=preprocessor.get_feature_names_out())
    test_labels = pd.DataFrame(np.array(y_test), columns=['y_true'])
    test_set.append(pd.concat([test_features, test_labels], axis=1))

    train_score = np.zeros(len(grid))
    val_score = np.zeros(len(grid))
    val_score_f1 = np.zeros(len(grid))
    models = []
    for p, params in enumerate(grid):
        print('   ',params)
        clf = LogisticRegression(**params, penalty='l1', solver='liblinear', random_state=seed)
        clf.fit(X_train_prep, y_train)
        models.append(clf)
        # train / validation accuracy and validation F1 of this grid point
        y_train_pred = clf.predict(X_train_prep)
        y_val_pred = clf.predict(X_val_prep)
        train_score[p] = accuracy_score(y_train, y_train_pred)
        val_score[p] = accuracy_score(y_val, y_val_pred)
        val_score_f1[p] = f1_score(y_val, y_val_pred, pos_label='Yes')
        print('   ',train_score[p],val_score[p],val_score_f1[p])
    best = np.argmax(val_score)
    print('best model parameters:',grid[best])
    print('corresponding validation score:',val_score[best])
    # keep the best model of this split and evaluate it on the test part
    final_models.append(models[best])
    y_test_pred = final_models[-1].predict(X_test_prep)
    test_scores[i] = accuracy_score(y_test, y_test_pred)
    print('test score:',test_scores[i])
    print('test F1 score:', f1_score(y_test, y_test_pred,pos_label='Yes'))

In [None]:
# Predict every stored test split with its selected model and pickle the results.
# The list is created BEFORE the loop: it used to be appended to before it
# existed and was then reset to [] right before pickling, so the saved
# predictions were always empty.
testset_pred = []
for i in range(nr_states):
    features = test_set[i].loc[:, test_set[i].columns != 'y_true']
    y_pred = final_models[i].predict(features)
    # These are predictions, so the column is labelled 'y_pred'.
    testset_pred.append(pd.DataFrame(np.asarray(y_pred).ravel(), columns=['y_pred']))
os.makedirs('results', exist_ok=True)  # open() does not create the folder
# The context manager closes the file even if pickling fails.
with open('results/Lasso.save', 'wb') as file:
    pickle.dump((final_models,test_set,testset_pred),file)

In [None]:
# Summarise the selected models across the nr_states random splits.
# val_score / val_score_f1 are overwritten in every split and indexed by
# hyper-parameter combination, so averaging them only describes the grid of the
# last split. The per-split test scores are used instead.
test_f1_scores = np.array([
    f1_score(
        test_set[i]['y_true'],
        final_models[i].predict(test_set[i].loc[:, test_set[i].columns != 'y_true']),
        pos_label='Yes',
    )
    for i in range(nr_states)
])
print('mean of F1 score:',np.mean(test_f1_scores))
print('mean of accuracy:',np.mean(test_scores))
print('std of F1 score:',np.std(test_f1_scores))
print('std of accuracy:',np.std(test_scores))

In [None]:
# Confusion matrix of the selected model on the first stored test split only.
frames = []
for state in range(0,1):
    split = test_set[state]
    y_pred = final_models[state].predict(split.loc[:, split.columns != 'y_true'])
    frames.append(pd.concat([split, pd.DataFrame(y_pred, columns=['y_pred'])], axis=1))
final_set = pd.concat(frames)

cm = confusion_matrix(final_set['y_true'], final_set['y_pred'])
disp = ConfusionMatrixDisplay(cm, display_labels=['class 0', 'class 1'])
disp.plot()
plt.tight_layout()
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.title('Confusion matrix (Lasso)', fontsize=15)
savePic('Confusion matrix (Lasso)')
plt.show();

## RF

In [None]:
# Random forest: for each of nr_states random 60/20/20 train/validation/test
# splits, fit every grid point, keep the model with the best validation
# accuracy and score it on the test split.
nr_states = 5
test_scores = np.zeros(nr_states)  # test accuracy of the selected model per split
final_models = []                  # selected model per split
# Start from an empty list: without this reset the splits were appended behind
# the ones stored by earlier sections, so the list kept growing and later cells
# indexed entries that did not belong to this run.
test_set = []
param_grid = {
              'max_depth': [7,8,9,10,11,12,13,14], # consecutive integer depths
              'max_features': [0.25, 0.5,0.75,1.0] # fraction of features per split, linearly spaced in (0, 1]
              }
grid = ParameterGrid(param_grid)
for i in range(nr_states):
    print('randoms state '+str(i+1))
    seed = 42*i
    X_train, X_other, y_train, y_other = train_test_split(X, y, train_size=0.6, stratify=y, random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, train_size=0.5, stratify=y_other, random_state=seed)
    # The preprocessor is fitted on the training part only.
    X_train_prep = prep.fit_transform(X_train)
    X_val_prep = prep.transform(X_val)
    X_test_prep = prep.transform(X_test)

    # Keep the preprocessed test split with its true labels for later cells.
    test_features = pd.DataFrame(X_test_prep, columns=preprocessor.get_feature_names_out())
    test_labels = pd.DataFrame(np.array(y_test), columns=['y_true'])
    test_set.append(pd.concat([test_features, test_labels], axis=1))

    train_score = np.zeros(len(grid))
    val_score = np.zeros(len(grid))
    val_score_f1 = np.zeros(len(grid))
    models = []
    for p, params in enumerate(grid):
        print('   ',params)
        clf = RandomForestClassifier(**params, n_jobs=-1, random_state=seed)
        clf.fit(X_train_prep, y_train)
        models.append(clf)
        # train / validation accuracy and validation F1 of this grid point
        y_train_pred = clf.predict(X_train_prep)
        y_val_pred = clf.predict(X_val_prep)
        train_score[p] = accuracy_score(y_train, y_train_pred)
        val_score[p] = accuracy_score(y_val, y_val_pred)
        val_score_f1[p] = f1_score(y_val, y_val_pred, pos_label='Yes')
        print('   ',train_score[p],val_score[p],val_score_f1[p])
    best = np.argmax(val_score)
    print('best model parameters:',grid[best])
    print('corresponding validation score:',val_score[best])
    # keep the best model of this split and evaluate it on the test part
    final_models.append(models[best])
    y_test_pred = final_models[-1].predict(X_test_prep)
    test_scores[i] = accuracy_score(y_test, y_test_pred)
    print('test score:',test_scores[i])
    print('test F1 score:', f1_score(y_test, y_test_pred,pos_label='Yes'))

In [None]:
# Predict every stored test split with its selected model and pickle the results.
# The list is created BEFORE the loop: it used to be appended to before it
# existed and was then reset to [] right before pickling, so the saved
# predictions were always empty.
testset_pred = []
for i in range(nr_states):
    features = test_set[i].loc[:, test_set[i].columns != 'y_true']
    y_pred = final_models[i].predict(features)
    # These are predictions, so the column is labelled 'y_pred'.
    testset_pred.append(pd.DataFrame(np.asarray(y_pred).ravel(), columns=['y_pred']))
os.makedirs('results', exist_ok=True)  # open() does not create the folder
# The context manager closes the file even if pickling fails.
with open('results/RF.save', 'wb') as file:
    pickle.dump((final_models,test_set,testset_pred),file)

In [None]:
# Summarise the selected models across the nr_states random splits.
# val_score / val_score_f1 are overwritten in every split and indexed by
# hyper-parameter combination, so averaging them only describes the grid of the
# last split. The per-split test scores are used instead.
test_f1_scores = np.array([
    f1_score(
        test_set[i]['y_true'],
        final_models[i].predict(test_set[i].loc[:, test_set[i].columns != 'y_true']),
        pos_label='Yes',
    )
    for i in range(nr_states)
])
print('mean of F1 score:',np.mean(test_f1_scores))
print('mean of accuracy:',np.mean(test_scores))
print('std of F1 score:',np.std(test_f1_scores))
print('std of accuracy:',np.std(test_scores))

In [None]:
# Confusion matrix of the selected model on the first stored test split only.
frames = []
for state in range(0,1):
    split = test_set[state]
    y_pred = final_models[state].predict(split.loc[:, split.columns != 'y_true'])
    frames.append(pd.concat([split, pd.DataFrame(y_pred, columns=['y_pred'])], axis=1))
final_set = pd.concat(frames)

cm = confusion_matrix(final_set['y_true'], final_set['y_pred'])
disp = ConfusionMatrixDisplay(cm, display_labels=['class 0', 'class 1'])
disp.plot()
plt.tight_layout()
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.title('Confusion matrix (RF)', fontsize=15)
savePic('Confusion matrix (RF)')
plt.show();


## XGBoost

In [None]:
# XGBoost: for each of nr_states random 60/20/20 train/validation/test splits,
# fit every grid point, keep the model with the best validation accuracy and
# score it on the test split.
nr_states = 5
test_scores = np.zeros(nr_states)  # test accuracy of the selected model per split
final_models = []                  # selected model per split
# Start from an empty list: without this reset the splits were appended behind
# the ones stored by earlier sections, so the list kept growing and later cells
# indexed entries that did not belong to this run.
test_set = []
param_grid = {"learning_rate": [0.03],
              "n_estimators": [100,1000],
              "max_depth": [3,5,7,9,11,13,15],
              'min_child_weight':[1,5,7,9],
              'eval_metric':['auc']}
grid = ParameterGrid(param_grid)
# Silence UserWarnings for the rest of the session.
warnings.simplefilter(action='ignore', category=UserWarning)
for i in range(nr_states):
    print('randoms state '+str(i+1))
    seed = 42*i
    X_train, X_other, y_train, y_other = train_test_split(X, y, train_size=0.6, stratify=y, random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, train_size=0.5, stratify=y_other, random_state=seed)
    # The preprocessor is fitted on the training part only.
    X_train_prep = prep.fit_transform(X_train)
    X_val_prep = prep.transform(X_val)
    X_test_prep = prep.transform(X_test)

    # Keep the preprocessed test split with its true labels for later cells.
    test_features = pd.DataFrame(X_test_prep, columns=preprocessor.get_feature_names_out())
    test_labels = pd.DataFrame(np.array(y_test), columns=['y_true'])
    test_set.append(pd.concat([test_features, test_labels], axis=1))

    train_score = np.zeros(len(grid))
    val_score = np.zeros(len(grid))
    val_score_f1 = np.zeros(len(grid))
    models = []
    for p, params in enumerate(grid):
        print('   ',params)
        clf = xgboost.sklearn.XGBClassifier(**params, random_state=seed, n_jobs=-1)
        # NOTE(review): y_train holds the strings 'Yes'/'No'; confirm that the
        # installed xgboost version accepts non-numeric class labels.
        clf.fit(X_train_prep, y_train)
        models.append(clf)
        # train / validation accuracy and validation F1 of this grid point
        y_train_pred = clf.predict(X_train_prep)
        y_val_pred = clf.predict(X_val_prep)
        train_score[p] = accuracy_score(y_train, y_train_pred)
        val_score[p] = accuracy_score(y_val, y_val_pred)
        val_score_f1[p] = f1_score(y_val, y_val_pred, pos_label='Yes')
        print('   ',train_score[p],val_score[p],val_score_f1[p])
    best = np.argmax(val_score)
    print('best model parameters:',grid[best])
    print('corresponding validation score:',val_score[best])
    # keep the best model of this split and evaluate it on the test part
    final_models.append(models[best])
    y_test_pred = final_models[-1].predict(X_test_prep)
    test_scores[i] = accuracy_score(y_test, y_test_pred)
    print('test score:',test_scores[i])
    print('test F1 score:', f1_score(y_test, y_test_pred,pos_label='Yes'))

In [None]:
# Predict every stored test split with its selected model and pickle the results.
# The list is created BEFORE the loop: it used to be appended to before it
# existed and was then reset to [] right before pickling, so the saved
# predictions were always empty.
testset_pred = []
for i in range(nr_states):
    features = test_set[i].loc[:, test_set[i].columns != 'y_true']
    y_pred = final_models[i].predict(features)
    # These are predictions, so the column is labelled 'y_pred'.
    testset_pred.append(pd.DataFrame(np.asarray(y_pred).ravel(), columns=['y_pred']))
os.makedirs('results', exist_ok=True)  # open() does not create the folder
# The context manager closes the file even if pickling fails.
with open('results/XGB.save', 'wb') as file:
    pickle.dump((final_models,test_set,testset_pred),file)

In [None]:
# Summarise the selected models across the nr_states random splits.
# val_score / val_score_f1 are overwritten in every split and indexed by
# hyper-parameter combination, so averaging them only describes the grid of the
# last split. The per-split test scores are used instead.
test_f1_scores = np.array([
    f1_score(
        test_set[i]['y_true'],
        final_models[i].predict(test_set[i].loc[:, test_set[i].columns != 'y_true']),
        pos_label='Yes',
    )
    for i in range(nr_states)
])
print('mean of F1 score:',np.mean(test_f1_scores))
print('mean of accuracy:',np.mean(test_scores))
print('std of F1 score:',np.std(test_f1_scores))
print('std of accuracy:',np.std(test_scores))

In [None]:
# Confusion matrix of the selected model on the first stored test split only.
frames = []
for state in range(0,1):
    split = test_set[state]
    y_pred = final_models[state].predict(split.loc[:, split.columns != 'y_true'])
    frames.append(pd.concat([split, pd.DataFrame(y_pred, columns=['y_pred'])], axis=1))
final_set = pd.concat(frames)

cm = confusion_matrix(final_set['y_true'], final_set['y_pred'])
disp = ConfusionMatrixDisplay(cm, display_labels=['class 0', 'class 1'])
disp.plot()
plt.tight_layout()
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.title('Confusion matrix (XGBoost)', fontsize=15)
savePic('Confusion matrix (XGBoost)')
plt.show();


In [None]:
# Refit one XGBoost model on the random_state=42 split for the SHAP analysis below.
X_train, X_other, y_train, y_other = train_test_split(X,y,train_size = 0.6,stratify=y,random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_other,y_other,train_size = 0.5,stratify=y_other,random_state=42)
X_train_prep = prep.fit_transform(X_train)
X_val_prep = prep.transform(X_val)
X_test_prep = prep.transform(X_test)

# Preprocessed test features with readable column names; the SHAP cells read `test_set`.
test_set = pd.DataFrame(X_test_prep,columns=preprocessor.get_feature_names_out())

# Hyper-parameters of this model, defined here explicitly. The classifier used
# to be built from `params`, a variable left over from the grid-search loop of
# an earlier cell, so the settings written in this cell were never applied.
params = {"learning_rate": 0.03,
          "n_estimators": 100,
          "max_depth": 5,
          'eval_metric': 'auc'}
clf = xgboost.sklearn.XGBClassifier(**params,random_state = 42,n_jobs=-1) # initialize the classifier
# NOTE(review): y_train holds the strings 'Yes'/'No'; confirm that the installed
# xgboost version accepts non-numeric class labels.
clf.fit(X_train_prep,y_train)
clf.feature_importances_

In [None]:
# Compute SHAP values of `clf` for `test_set` and draw the bar-type summary
# plot limited to 10 features; show=False keeps the figure open so savePic can store it.
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(test_set)
shap.summary_plot(shap_values, test_set, plot_type="bar",max_display=10,show=False)
savePic('global1')

In [None]:
# Default-type SHAP summary plot limited to 10 features, saved as 'global2'.
shap.summary_plot(shap_values, test_set,max_display=10,show=False)
savePic('global2')

In [None]:
from scipy.special import expit
# Logistic sigmoid of -3.73.
# NOTE(review): -3.73 is presumably a log-odds value read off one of the SHAP plots — confirm.
expit(-3.73)

In [None]:
# Force plot explaining a single row of test_set, rendered with matplotlib and saved as 'local'.
shap.initjs()
import matplotlib.pyplot as pyplot
index = 0  # position of the test_set row to explain
shap.force_plot(explainer.expected_value, 
                shap_values[index,:], 
                test_set.iloc[index,:],show=False,matplotlib=True)
savePic('local')
#,show=False

## SVC

In [None]:
# Support vector classifier: for each of nr_states random 60/20/20
# train/validation/test splits, fit every grid point, keep the model with the
# best validation accuracy and score it on the test split.
nr_states = 5
test_scores = np.zeros(nr_states)  # test accuracy of the selected model per split
final_models = []                  # selected model per split
test_set = []                      # preprocessed test features plus 'y_true' per split
param_grid = {
              'max_iter':[10,100] ,
              'C': [1e-3,1e-2,1e-1, 1e0, 1e1,1e2]
              }
grid = ParameterGrid(param_grid)

for i in range(nr_states):
    print('randoms state '+str(i+1))
    seed = 42*i
    X_train, X_other, y_train, y_other = train_test_split(X, y, train_size=0.6, stratify=y, random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, train_size=0.5, stratify=y_other, random_state=seed)
    # The preprocessor is fitted on the training part only.
    X_train_prep = prep.fit_transform(X_train)
    X_val_prep = prep.transform(X_val)
    X_test_prep = prep.transform(X_test)

    # Keep the preprocessed test split with its true labels for later cells.
    test_features = pd.DataFrame(X_test_prep, columns=preprocessor.get_feature_names_out())
    test_labels = pd.DataFrame(np.array(y_test), columns=['y_true'])
    test_set.append(pd.concat([test_features, test_labels], axis=1))

    train_score = np.zeros(len(grid))
    val_score = np.zeros(len(grid))
    val_score_f1 = np.zeros(len(grid))
    models = []
    for p, params in enumerate(grid):
        print('   ',params)
        clf = SVC(**params, random_state=seed)
        clf.fit(X_train_prep, y_train)
        models.append(clf)
        # train / validation accuracy and validation F1 of this grid point
        y_train_pred = clf.predict(X_train_prep)
        y_val_pred = clf.predict(X_val_prep)
        train_score[p] = accuracy_score(y_train, y_train_pred)
        val_score[p] = accuracy_score(y_val, y_val_pred)
        val_score_f1[p] = f1_score(y_val, y_val_pred, pos_label='Yes')
        print('   ',train_score[p],val_score[p],val_score_f1[p])
    best = np.argmax(val_score)
    print('best model parameters:',grid[best])
    print('corresponding validation score:',val_score[best])
    # keep the best model of this split and evaluate it on the test part
    final_models.append(models[best])
    y_test_pred = final_models[-1].predict(X_test_prep)
    test_scores[i] = accuracy_score(y_test, y_test_pred)
    print('test score:',test_scores[i])
    print('test F1 score:', f1_score(y_test, y_test_pred,pos_label='Yes'))

In [None]:
# Predict every stored test split with its selected model and pickle the results.
# The list is created BEFORE the loop: it used to be appended to before it
# existed and was then reset to [] right before pickling, so the saved
# predictions were always empty.
testset_pred = []
for i in range(nr_states):
    features = test_set[i].loc[:, test_set[i].columns != 'y_true']
    y_pred = final_models[i].predict(features)
    # These are predictions, so the column is labelled 'y_pred'.
    testset_pred.append(pd.DataFrame(np.asarray(y_pred).ravel(), columns=['y_pred']))
os.makedirs('results', exist_ok=True)  # open() does not create the folder
# The context manager closes the file even if pickling fails.
with open('results/SVC.save', 'wb') as file:
    pickle.dump((final_models,test_set,testset_pred),file)

In [None]:
# Summarise the selected models across the nr_states random splits.
# val_score / val_score_f1 are overwritten in every split and indexed by
# hyper-parameter combination, so averaging them only describes the grid of the
# last split. The per-split test scores are used instead.
test_f1_scores = np.array([
    f1_score(
        test_set[i]['y_true'],
        final_models[i].predict(test_set[i].loc[:, test_set[i].columns != 'y_true']),
        pos_label='Yes',
    )
    for i in range(nr_states)
])
print('mean of F1 score:',np.mean(test_f1_scores))
print('mean of accuracy:',np.mean(test_scores))
print('std of F1 score:',np.std(test_f1_scores))
print('std of accuracy:',np.std(test_scores))

In [None]:
# Confusion matrix of the selected model on the first stored test split only.
frames = []
for state in range(0,1):
    split = test_set[state]
    y_pred = final_models[state].predict(split.loc[:, split.columns != 'y_true'])
    frames.append(pd.concat([split, pd.DataFrame(y_pred, columns=['y_pred'])], axis=1))
final_set = pd.concat(frames)

cm = confusion_matrix(final_set['y_true'], final_set['y_pred'])
disp = ConfusionMatrixDisplay(cm, display_labels=['class 0', 'class 1'])
disp.plot()
plt.tight_layout()
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.title('Confusion matrix (SVC)', fontsize=15)
savePic('Confusion matrix (SVC)')
plt.show();


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load seaborn's "tips" example dataset; it is unrelated to the churn data used above.
tips = sns.load_dataset("tips")

In [None]:
# Display the tips DataFrame.
tips

In [None]:
# Two box plots of the same data with the x and y roles swapped: total_bill per
# day with day on the x axis (left) and with day on the y axis (right).
fig, ax =plt.subplots(1,2,constrained_layout=True, figsize=(8, 3))
pic = sns.boxplot(x="day", y="total_bill", data=tips, ax=ax[0])
pic.set_title('x="day", y="total_bill"')
pic = sns.boxplot(x="total_bill", y="day", data=tips, ax=ax[1])
pic.set_title('x="total_bill", y="day"')