## Created by: Joel Nail

#### Goal: achieve the highest possible AUC score for the given dataset

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, roc_auc_score, auc, roc_curve
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
import xgboost as xgb
import catboost as cb
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, BatchNormalization, Dropout, Activation, Flatten

In [5]:
# Load the labelled training data and the unlabelled Kaggle test data.
train_df = pd.read_csv("data/train_final.csv")
test_df = pd.read_csv("data/test_final.csv")

# Features are every column except the target 'Y'; the Kaggle test frame is
# used as-is under a more descriptive name.
X, y = train_df.drop(columns='Y'), train_df['Y']
X_kag_test = test_df

# Hold out 30% of the labelled rows for local evaluation (fixed seed so the
# split is the same on every run).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Writes the positive-class probabilities from a predict_proba array to a submission CSV.
def write_preds_to_csv(target_filename, model_preds_proba, start_id=2604,
                       output_dir="/Users/joelnail/Documents/BDS/Kaggle_Comp/Predictions/"):
    """Write a submission file with the columns "Id" and "Y".

    Parameters
    ----------
    target_filename : str
        Name of the output file WITHOUT the ".csv" extension.
    model_preds_proba : array-like
        Either an (n_samples, 2) array of class probabilities (column 1 is
        taken as the positive-class probability), or a 1-D array that already
        holds the positive-class probabilities.
    start_id : int, default 2604
        Id written for the first prediction; later rows count up by one.
    output_dir : str
        Directory the CSV is written to. The default is the original
        machine-specific location; pass another directory to run elsewhere.

    Raises
    ------
    ValueError
        If ``model_preds_proba`` is neither 1-D nor two-column 2-D.
    """
    import os

    proba = np.asarray(model_preds_proba, dtype=float)
    if proba.ndim == 2 and proba.shape[1] == 2:
        # The value to report is always the second column (probability of
        # class 1); no per-row max/index bookkeeping is needed.
        positive_proba = proba[:, 1]
    elif proba.ndim == 1:
        positive_proba = proba
    else:
        raise ValueError(
            "model_preds_proba must be 1-D or have shape (n_samples, 2); got shape "
            + str(proba.shape)
        )

    pred_df = pd.DataFrame({
        "Id": np.arange(start_id, start_id + len(positive_proba)),
        "Y": positive_proba,
    })
    file_name = target_filename + ".csv"
    pred_df.to_csv(os.path.join(output_dir, file_name), index=False)


Below is the code I used for testing my models prior to submission

In [None]:
# The two best submissions used identical settings except for random_strength:
#   random_strength=0.25 -> best model on the public leaderboard
#   random_strength=0.5  -> best model on the private leaderboard
#                           (unfortunately not the one I selected for my final score - oh well)
# Only one model is built here (the private-leaderboard winner); change
# random_strength to .25 to reproduce the public-leaderboard model.
cat_model = cb.CatBoostClassifier(learning_rate=.01, l2_leaf_reg=1, iterations=1000, depth=5, border_count=33,
    boosting_type='Ordered', random_strength=.5)

# Fit on the training split and score the held-out split.
cat_model.fit(X_train, y_train, verbose=250)
cat_model_preds = cat_model.predict_proba(X_test)

# Column 1 of predict_proba is the probability of the positive class, which is
# what roc_auc_score expects as the score.
cat_preds_proba = cat_model_preds[:, 1]

print(roc_auc_score(y_test, cat_preds_proba))

Below is the code to fit the model to the entire training set and create the prediction file

In [None]:
# Rebuild the chosen CatBoost configuration, fit it on every labelled row, and
# write the Kaggle test-set probabilities to the submission file.
cat_model = cb.CatBoostClassifier(
    learning_rate=.01,
    l2_leaf_reg=1,
    iterations=1000,
    depth=5,
    border_count=33,
    boosting_type='Ordered',
    random_strength=.5,
)

cat_model.fit(X, y, verbose=250)
cat_model_preds = cat_model.predict_proba(X_kag_test)
write_preds_to_csv("best_model_preds", cat_model_preds)

Here's the code I used to find the best CatBoost hyperparameters. Grid Search took too long, so I primarily used Randomized Search

In [None]:
# Untuned estimator that the search clones for each sampled configuration.
cat_model = cb.CatBoostClassifier()

# Broad search space (kept for reference; the search below samples from params2).
params = {
    'depth': list(range(1, 11)),
    'iterations': [250, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'l2_leaf_reg': [1, 3, 5, 10, 100],
    'border_count': [5, 10, 20, 32, 50, 100, 200],
}

# Narrow space: everything pinned to a single value except boosting_type.
params2 = {
    'depth': [5],
    'iterations': [1000],
    'learning_rate': [0.01],
    'l2_leaf_reg': [1],
    'border_count': [32],
    'bootstrap_type': ['MVS'],
    'boosting_type': ['Ordered', 'Plain'],
}

# Exhaustive grid search, left disabled because it took too long to run:
# grid = GridSearchCV(estimator=cat_model, param_grid = params2, cv = 2, scoring="roc_auc")
# grid.fit(X_train, y_train)

# print("RESULTS")
# print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
# print("\n The best score across ALL searched params:\n", grid.best_score_)
# print("\n The best parameters across ALL searched params:\n", grid.best_params_)

# Randomized search over params2, scored by ROC AUC.
rand_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=params2,
    n_iter=20,
    scoring="roc_auc",
    verbose=0,
)
rand_search.fit(X_train, y_train)

print("RESULTS")
print("\n The best estimator across ALL searched params:\n", rand_search.best_estimator_)
print("\n The best score across ALL searched params:\n", rand_search.best_score_)
print("\n The best parameters across ALL searched params:\n", rand_search.best_params_)

Although CatBoost hyperparameter tuning gave me the best results, I did attempt stacking. If I'd had more time (i.e., started earlier haha), I think I could've produced a stacking model that outperformed my best model above.

In [None]:
# Meta-learner that combines the base models' predicted probabilities.
lr = LogisticRegression()

# Base (level-0) learners.
cat_model = cb.CatBoostClassifier(learning_rate=.01, l2_leaf_reg=1, iterations=1000, depth=5, border_count=32, boosting_type='Ordered')
xgb_model = xgb.XGBClassifier(min_child_weight=3, max_depth=4, learning_rate=0.05, gamma=0.2, colsample_bytree=0.7)
xgbrf_model = xgb.XGBRFClassifier()

level0 = list()
#level0.append(('rf', RandomForestClassifier()))
#level0.append(('dt', DecisionTreeClassifier()))
level0.append(('xgb', xgb_model))
level0.append(('xgbrf', xgbrf_model))
level0.append(('cb', cat_model))

stacked = StackingClassifier(estimators=level0, final_estimator=lr, cv=5, stack_method="predict_proba")

# Local check on the held-out split.
stacked.fit(X_train, y_train)
stacked_preds = stacked.predict(X_test)
print(accuracy_score(y_test, stacked_preds))

# The competition metric is AUC, so also report it from the positive-class
# probabilities; accuracy on hard labels alone is not comparable with the AUC
# numbers printed for the other models.
stacked_holdout_proba = stacked.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, stacked_holdout_proba))

# Refit on every labelled row and write the submission file.
stacked.fit(X, y)
stacked_preds = stacked.predict_proba(X_kag_test)
write_preds_to_csv("stack_v7", stacked_preds)

Prior to CatBoost, I tried good ole logistic regression to gauge a baseline - it wasn't very good

In [None]:
# Baseline: L2-regularised logistic regression evaluated on the held-out split.
lr = LogisticRegression(
    penalty='l2',
    C=10,
    random_state=42,
    solver="lbfgs",
    multi_class="multinomial",
    max_iter=10000,
)

lr.fit(X_train, y_train)

lr_preds = lr.predict_proba(X_test)

# Keep only the second entry of each row (probability of class 1).
lr_preds_proba = [row[1] for row in lr_preds]

# Report the AUC as a percentage with two decimals.
lr_auc = roc_auc_score(y_test, lr_preds_proba)
print(format(lr_auc, ".2%"))

I also tried out XGBoost (normal classifier and RF classifier) before moving on to CatBoost which seemed to perform best on this data

In [None]:
# Gradient-boosted trees (XGBoost) evaluated on the held-out split.
model_xgb = xgb.XGBClassifier(
    colsample_bytree=0.5,
    gamma=0,
    learning_rate=0.05,
    max_depth=20,
    reg_lambda=0,
    scale_pos_weight=1,
    subsample=0.8,
)

model_xgb.fit(X_train, y_train)
model_xgb_pred = model_xgb.predict_proba(X_test)

# Keep only the second entry of each row (probability of class 1).
xgb_preds_proba = [row[1] for row in model_xgb_pred]

print(roc_auc_score(y_test, xgb_preds_proba))

In [None]:
# XGBoost random-forest variant evaluated on the held-out split.
model_xgbrf = xgb.XGBRFClassifier(
    colsample_bytree=0.5,
    gamma=1,
    learning_rate=0.01,
    max_depth=20,
    reg_lambda=0,
    scale_pos_weight=1,
    subsample=0.8,
)

model_xgbrf.fit(X_train, y_train)
model_xgbrf_pred = model_xgbrf.predict_proba(X_test)

# Keep only the second entry of each row (probability of class 1).
xgbrf_preds_proba = [row[1] for row in model_xgbrf_pred]

print(roc_auc_score(y_test, xgbrf_preds_proba))

I also tried running a neural network, but it didn't produce great results. I would have liked to play around with it more (especially stacking CatBoost with a Neural Network), but there wasn't enough time to experiment with everything

In [None]:
# this model gets validation accuracy upwards of .8
# Two BatchNorm -> Dense -> ReLU -> Dropout stages followed by a single
# sigmoid output unit, declared as one layer list instead of repeated add() calls.
model = tf.keras.Sequential([
    # stage 1
    BatchNormalization(input_shape=X_train.shape[1:]),  # batch normalization very important for this dataset
    Dense(32, input_shape=X_train.shape[1:]),
    Activation('relu'),
    Dropout(.5),

    # stage 2
    BatchNormalization(input_shape=X_train.shape[1:]),  # batch normalization very important for this dataset
    Dense(64),
    Activation('relu'),
    Dropout(.5),

    # output head
    Flatten(),
    Dense(1),
    Activation('sigmoid'),  # make sure to use sigmoid, not softmax (because it's binary classification)
])

In [None]:
# Binary classification set-up: RMSprop optimiser, binary cross-entropy loss,
# accuracy reported per epoch.
model.compile(
  optimizer=tf.keras.optimizers.RMSprop(learning_rate=.01),
  loss='binary_crossentropy',
  metrics=['accuracy'])

# Train on the training split ONLY. X_test/y_test were carved out of X/y by
# train_test_split, so fitting on the full X, y would let the network see the
# validation rows during training and inflate the validation accuracy.
history = model.fit(
    X_train, y_train,
    #batch_size=batch,
    epochs=10,
    #steps_per_epoch=16,
    validation_data=(X_test, y_test), # we'll use the test set to perform validation
    validation_freq=1, # we want to validate using the test set on each epoch
)

One thing that I didn't really try during the competition was feature engineering. I honestly wasn't sure where to start with feature engineering, and I wasn't sure how much of an AUC increase it would lead to, so I focused on trying different models and hyperparameters