The outcome is a binary classification target. The prompt suggests using a RandomForest model to identify the most predictive variables.

In [2]:
import pandas as pd
# Load the customer booking data and discard every row that has a missing
# value, in a single chained expression.
df = pd.read_csv(
    "./data/customer_booking.csv",
    encoding="ISO-8859-1",
).dropna()

In [None]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, roc_curve, auc, roc_auc_score
import pandas as pd
import matplotlib.pyplot as plt

# --- Model setup and feature encoding ---------------------------------------
# Every column except the last one; not referenced again in this cell.
predictors = df.columns[:-1]
k = 5  # number of cross-validation folds

kF = KFold(n_splits=k, shuffle=True)
clf = RandomForestClassifier()
# Dense output so the encoded matrix can be wrapped directly in a DataFrame.
enc = OneHotEncoder(sparse_output=False)

categorical = [
    'sales_channel',
    'trip_type',
    'flight_day',
    'route',
    'booking_origin',
]

# Feature frame: everything except the target column.
x = df.drop(columns=['booking_complete'])

# One-hot encode the categorical variables and build the post-encoding frame.
# The encoded frame must carry the same index as `x`: pd.concat(axis=1) aligns
# rows by index label, and `df` can have gaps in its index once rows with
# missing values have been dropped. With a fresh 0..n-1 index the encoded
# columns would be attached to the wrong rows and NaN rows would appear.
x_enc = enc.fit_transform(x[categorical])
x_enc_df = pd.DataFrame(
    x_enc,
    columns=enc.get_feature_names_out(categorical),
    index=x.index,
)

# The target Series keeps its own name ('booking_complete') through concat, so
# no rename is needed. With the indexes aligned no NaN is introduced here, so
# no placeholder fill is needed either (a string fill would also have turned
# numeric columns and the target into mixed-type object columns).
df_post_enc = pd.concat(
    [x.drop(columns=categorical), x_enc_df, df['booking_complete']],
    axis=1,
)

# --- K-fold evaluation ------------------------------------------------------
# For each fold: refit the classifier on the training rows, print the five
# largest feature importances plus accuracy, F1 and ROC AUC on the held-out
# rows, then draw that fold's ROC curve.
for fold, (train_index, test_index) in enumerate(kF.split(df_post_enc), start=1):

    # kF.split yields positional indices, hence .iloc rather than .loc.
    train = df_post_enc.iloc[train_index]
    test = df_post_enc.iloc[test_index]

    y_train = train['booking_complete']
    x_train = train.drop(columns=['booking_complete'])

    y_test = test['booking_complete']
    x_test = test.drop(columns=['booking_complete'])

    # fit() retrains from scratch, so reusing one estimator across folds does
    # not leak information between them.
    clf.fit(x_train, y_train)

    # (column name, importance) pairs, most important first.
    feature_importance = sorted(
        zip(x_train.columns, clf.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("Most impactful features in descending order:", feature_importance[:5])
    print(f"Fold {fold} accuracy:", clf.score(x_test, y_test))
    print("F1 score:", f1_score(y_test, clf.predict(x_test)))

    # Column 1 of predict_proba is the probability of the second entry in
    # clf.classes_ -- assumed to be the positive class of a 0/1 target.
    fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(x_test)[:, 1])
    auroc = auc(fpr, tpr)
    print("ROC AUC:", auroc)

    plt.plot(fpr, tpr, color='orange', label=f"ROC (AUC = {auroc:.3f})")
    # Diagonal reference line: the ROC of a random classifier.
    plt.plot([0, 1], [0, 1], color='grey', linestyle="--", label="Chance")
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.title(f"Fold {fold} ROC curve")
    plt.legend(loc="lower right")
    plt.show()
