In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier


from kmodes.kmodes import KModes
import copy

In [None]:
df = pd.read_excel("cleaned_dataset1.xlsx")
df.columns

Index(['customer_id', 'test_group', 'proposition', 'propositon_type',
       'clicked_on_proposition', 'card_color', 'age_group', 'province',
       'rsa_car', 'travel_booking_recency', 'trip_booking_recency',
       'travel_booking_frequency', 'trip_booking_frequency', 'car_insurance',
       'travel_insurance', 'other_vehicle_insurance', 'home_insurance',
       'bike_insurance', 'other_insurance', 'legal_assistance_insurance',
       'travel_recency_category', 'trip_recency_category',
       'travel_booking_frequency_category', 'trip_booking_frequency_category'],
      dtype='object')

In [144]:
def prepare_data(df, drop_cols):
  X_df = df.drop(columns=drop_cols)
  X_df = X_df.apply(lambda x: x.astype('category'))
  X_df_encoded = X_df.apply(lambda x: x.cat.codes)

  df_full = copy.deepcopy(df)

  return X_df_encoded, X_df, df_full

In [None]:
to_drop_from_x = ['clicked_on_proposition',
                                     'customer_id',
                                     'travel_booking_recency',
                                     'trip_booking_recency',
                                      'trip_booking_frequency',
                                      'travel_booking_frequency',
                                     'proposition',
                                     'propositon_type',
                                     'test_group']

covariates_df_encoded, covariates_df_original, full_df_original = prepare_data(df, to_drop_from_x)
df_to_output = copy.deepcopy(full_df_original)

In [145]:
#K-modes for only categorical variables
centroids_dict = dict()

for n in [5, ]:#7, 10, 12, 15]:
    kmodes = KModes(n_clusters=n, init='Cao', n_init=10)
    clusters_kmodes = kmodes.fit_predict(covariates_df_encoded)
    df_to_output[f'KModes_clusters_{str(n)}'] = clusters_kmodes

    #Calculate the centroids for clusters
    centroids = kmodes.cluster_centroids_
    centroids_df = pd.DataFrame(data = centroids, columns=covariates_df_encoded.columns)
    category_mappings = dict()
    for col in centroids_df.columns:
        category_mapping = dict(enumerate(covariates_df_original[col].cat.categories))
        centroids_df[col] = centroids_df[col].map(category_mapping)
        category_mappings[col] = category_mapping

    dict_key = f"{n}_clusters"

    ctr_per_cluster = df_to_output.groupby(by = f'KModes_clusters_{n}')[['clicked_on_proposition']].mean().round(4)
    centroids_df['click_rate'] = ctr_per_cluster.iloc[:, 0]

    centroids_dict[dict_key] = centroids_df



#Additional dfs:
covariates_df_encoded_with_kmodes = copy.deepcopy(covariates_df_encoded)
covariates_df_encoded_with_kmodes['cluster'] = df_to_output['KModes_clusters_5']

full_df_with_encoded_X = copy.deepcopy(full_df_original)
for col in covariates_df_encoded.columns:
  full_df_with_encoded_X[col] = covariates_df_encoded[col]

In [None]:
from kneed import KneeLocator
from sklearn.metrics import silhouette_score

cluster_range = [3, 5, 7, 10, 12, 15]

wcss = []
sil_scores = [] 

for k in cluster_range:
    kmodes = KModes(n_clusters=k, init='Cao', n_init=10, verbose=0)
    clusters = kmodes.fit_predict(df_for_kmode)

    wcss.append(kmodes.cost_)

    if k > 1:
        sil_scores.append(silhouette_score(df_for_kmode, clusters))


knee_locator = KneeLocator(cluster_range, wcss, curve="convex", direction="decreasing")
optimal_k_elbow = knee_locator.knee

optimal_k_silhouette = cluster_range[np.argmax(sil_scores)]

print(f"Optimal K (Elbow Method): {optimal_k_elbow}")
print(f"Optimal K (Silhouette Score): {optimal_k_silhouette}")

plt.figure(figsize=(8,5))
plt.plot(cluster_range, wcss, marker='o', linestyle='-')
plt.axvline(optimal_k_elbow, color='r', linestyle='--', label=f'Elbow at K={optimal_k_elbow}')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('WCSS')
plt.legend()
plt.grid(True)
plt.show()

In [138]:
#Create the ML ready data arrrays
def prepare_data_for_forests(full_df, covar_df_encoded, add_clusters = False, deduplicate = False, oversample = False, covariates_df_encoded_with_kmodes = covariates_df_encoded_with_kmodes):
    y = full_df['clicked_on_proposition'].values

    covariates_df_for_ml = covar_df_encoded
    if add_clusters:
        covariates_df_for_ml['cluster'] = covariates_df_encoded_with_kmodes['cluster']

    x = covariates_df_for_ml.values

    if oversample:
      ros = RandomOverSampler()
      x, y = ros.fit_resample(x, y)

    return x, y

In [None]:
#Random Forest Cross Validation

to_drop = ['clicked_on_proposition',
                                     'customer_id',
                                     'travel_booking_recency',
                                     'trip_booking_recency',
                                      'trip_booking_frequency',
                                      'travel_booking_frequency',
                                     'propositon_type',
                                     'test_group']

covariates_df_encoded_for_rf, covariates_df_original_for_rf, full_df_original_for_rf = prepare_data(df, to_drop)
x, y = prepare_data_for_forests(full_df_original_for_rf, covariates_df_encoded_for_rf, oversample = True, add_clusters = False) ####With no oversampling with RandomOverSampler the performance goes down

scoring_binary = ['accuracy', 'precision', 'recall', 'f1']

scoring_multinomial = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted'),
    'roc_auc': 'roc_auc_ovr'
}


rf_model = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,    
    random_state=42,
    #class_weight='balanced'
)


#Actual model
cv_results_rf = cross_validate(rf_model, X = x, y = y, cv=5, scoring=scoring_multinomial, return_train_score=True)

for metric in scoring_multinomial.keys():
   print(f"{metric.capitalize()} (Mean): {np.mean(cv_results_rf[f'test_{metric}']):.2f}")

In [None]:
#Feature Importance
rf_model_new = rf_model
rf_model_new.fit(x, y)

feature_importances = rf_model.feature_importances_

importance_rf_df = pd.DataFrame({
    'Feature': covariates_df_encoded.columns,
    'Importance_RF': feature_importances
}).sort_values(by='Importance_RF', ascending=False)

importance_rf_df

In [None]:
#Split the original dataset so that full set for Causal Forest is 60% and full set for Random Forest is 40%

full_df_original_1, full_df_original_2, covariates_df_encoded_1, covariates_df_encoded_2 = train_test_split(full_df_original,
                                                                                                            covariates_df_encoded, test_size=0.4, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from econml.dr import DRLearner


proposition_categorised = full_df_original_1['proposition'].astype('category').cat.codes
proposition_mapping = dict(enumerate(df['proposition'].astype('category').cat.categories))

ids = full_df_original_1['customer_id'].values  # get the customer ids
thompson_assignment = df['test_group'].astype('category').cat.codes.values

#df['test_group'].astype('category').cat.categories


T = proposition_categorised.values  # Multiclass treatment
Y = full_df_original_1['clicked_on_proposition'].values
X = covariates_df_encoded_1.values



X_train, X_test, Y_train, Y_test, T_train, T_test, id_train, id_test = train_test_split(
    X, Y, T, ids, test_size=0.3, random_state=42
)

In [None]:
#With this approach, for each type of proposition, we set all other propositions to 0 and treat all other proposition as control group - one-vs-all approach
#As a result we get CATEs per test customer for each of the propositions
#And then we can select the highest CATE for each customers (each row) and that would be recommended treatment
import warnings
warnings.filterwarnings("ignore")

cate_estimates = np.zeros((X_test.shape[0], len(np.unique(T))))


#The first loop creates and fits the learners per proposition, and the second predicts the cates per proposition
learners_per_proposition = dict()
for t in np.unique(T):
    binary_T = (T_train == t).astype(int)  # Convert to binary treatment for DRLearner
    proposition_name = proposition_mapping[t]

    dr_learner = DRLearner(
    model_propensity=RandomForestClassifier(n_estimators=200, min_samples_leaf=10, class_weight='balanced'),
    model_regression=RandomForestRegressor(n_estimators=200, min_samples_leaf=10),
    model_final=RandomForestRegressor(n_estimators=200, min_samples_leaf=5),
    )

    learners_per_proposition[proposition_name] = dr_learner.fit(Y_train, binary_T, X=X_train)

In [None]:
#Testing predicting on the centroids:
centr_test = copy.deepcopy(centroids_dict['5_clusters'])

centroids_to_test = pd.DataFrame()

inverted_cat_mapping = dict()
for key in category_mappings.keys():
  inverted_cat_mapping[key] = {v: k for k, v in category_mappings[key].items()}
for col in covariates_df_encoded.columns:
  centroids_to_test[col] = centr_test[col].map(inverted_cat_mapping[col])

centroids_to_test_arr = centroids_to_test.values

In [128]:
#Predicting

def suggest_treatment(learners_per_proposition, X_test, T):
  cate_estimates = np.zeros((X_test.shape[0], len(np.unique(T))))

  for idx, t in enumerate(learners_per_proposition.keys()):
    cate_estimates[:, idx] = learners_per_proposition[t].effect(X_test)

  best_treatment = np.argmax(cate_estimates, axis=1)
  propositions_suggested = [proposition_mapping[prop] for prop in best_treatment]

  results_df = pd.DataFrame({
        'customer_id': id_test,
        'suggested_proposition': propositions_suggested
    })

  return results_df, propositions_suggested


results_df, propositions_suggested = suggest_treatment(learners_per_proposition, X_test, T)

In [129]:
#merge the sets together and extract thompsons only
results_df_to_filter = pd.merge(results_df, df, on='customer_id', how='left') #adjust the column selection for df to get the full dataset with all covariates
df_to_test_approach = results_df_to_filter[results_df_to_filter['test_group'] == 'thompson sampling']

df_to_test_approach_thompson = df_to_test_approach.drop(columns = 'suggested_proposition')

df_to_test_approach_new = df_to_test_approach
df_to_test_approach_new['proposition'] = df_to_test_approach_new['suggested_proposition']
df_to_test_approach_new = df_to_test_approach_new.drop(columns = 'suggested_proposition')

In [None]:
#Fit selected causal forest to the training data

drop_cols = ['clicked_on_proposition',
                                  'customer_id',
                                     'travel_booking_recency',
                                     'trip_booking_recency',
                                      'trip_booking_frequency',
                                      'travel_booking_frequency',
                                     'propositon_type',
                                     'test_group'] #drop everything as originally except for proposition

covariates_encoded_with_proposition, covariates_df_with_proposition, full_df_original = prepare_data(full_df_original_2, drop_cols)

rf_model_selected = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,           
    random_state=42,
    #class_weight='balanced'
)

X_train_rf, Y_train_rf = prepare_data_for_forests(full_df_original_2, covariates_encoded_with_proposition) #here we gotta train on proposition as well

rf_model_selected.fit(X_train_rf, Y_train_rf)

In [155]:
X_df_encoded_test_new, X_df_test_new, df_full_test_new = prepare_data(df_to_test_approach_new, drop_cols)
X_df_encoded_test_thompson, X_df_test_thompson, df_full_test_thompson = prepare_data(df_to_test_approach_thompson, drop_cols)



x_assigned_new, y_assigned_new = prepare_data_for_forests(df_to_test_approach_new, X_df_encoded_test_new)
x_assigned_thompson, y_assigned_thompson = prepare_data_for_forests(df_to_test_approach_thompson, X_df_encoded_test_thompson)



prediction_thompson = rf_model.predict(x_assigned_thompson)
prediction_new = rf_model.predict(x_assigned_new)


predicted_ctr_new = prediction_new.mean()
predicted_ctr_thompson = prediction_thompson.mean()

In [None]:
print(f"Predicted CTR for DRLearner approach is {predicted_ctr_new}")
print(f"Predicted CTR for Thomson sampling approach is {predicted_ctr_thompson}")

0.003956834532374101