In [50]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

# USING ONLY RAW DATA AND NO FEATURE ENGINEERING

In [43]:
import copy

def _fit_quietly(model, features, target):
    """Fit ``model`` on ``features``/``target``, silencing training logs when possible.

    ``verbose=False`` is tried first; if the estimator's ``fit`` rejects the
    keyword (signalled by ``TypeError``), the fit is retried without it.
    Any other exception is a genuine training failure and is left to propagate.
    """
    try:
        model.fit(features, target, verbose=False)
    except TypeError:
        model.fit(features, target)


def test_model_out_of_the_box(model, model_name):
    """Score an estimator on a hold-out split and write a test-set prediction file.

    The estimator is fitted on 80% of ``./data/raw/train_final.csv`` and its
    ROC AUC on the remaining 20% is printed. A deep copy of the (unfitted)
    estimator is fitted on every training row and used to predict
    ``./data/raw/test_final.csv``.

    Parameters
    ----------
    model : estimator
        Unfitted object exposing ``fit(X, y)`` and ``predict_proba(X)``.
    model_name : str
        Prefix of the output file ``./preds/{model_name}_out_of_the_box.csv``.

    Returns
    -------
    None
        The score is printed and the predictions are written to disk.
    """
    # Copy before fitting so the all-data model starts from the same unfitted state
    model_all_data = copy.deepcopy(model)

    # Load in data
    df = pd.read_csv('./data/raw/train_final.csv', index_col=0)
    df_test = pd.read_csv('./data/raw/test_final.csv', index_col=0)
    x = df.drop('Y', axis=1)
    y = df['Y']
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) # Create split for dev stuff

    _fit_quietly(model, X_train, y_train)
    _fit_quietly(model_all_data, x, y)

    print('Using only train data AUC ROC score:', metrics.roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

    # Write the second predict_proba column for every test row, keyed by the test index
    df_test['Y'] = model_all_data.predict_proba(df_test)[:,1]
    df_test['Y'].to_csv(f'./preds/{model_name}_out_of_the_box.csv')



## XGBoost Out of the box - 0.87046

In [44]:
import xgboost as xgb

# Get an idea of how out of the box submissions will look:
# a default-configured XGBClassifier is scored on the hold-out split and its
# test-set probabilities are written to ./preds/xgb_out_of_the_box.csv
model_xgb = xgb.XGBClassifier()
test_model_out_of_the_box(model_xgb, model_name='xgb')

Using only train data AUC ROC score: 0.874332337973672


## CatBoost Out of the box - 0.89352

In [45]:
import catboost as cb

# Default-configured CatBoost baseline; predictions go to ./preds/cb_out_of_the_box.csv
model_cb = cb.CatBoostClassifier()
test_model_out_of_the_box(model_cb, model_name='cb')

Using only train data AUC ROC score: 0.9237498422845607


## LightGBM Out of the box - 0.86941

In [46]:
import lightgbm as lgbm

# Default-configured LightGBM baseline; predictions go to ./preds/lgbm_out_of_the_box.csv
test_model_out_of_the_box(lgbm.LGBMClassifier(), model_name='lgbm')



Using only train data AUC ROC score: 0.8872650039954578


## Random Forest Out of the Box - 0.77

In [47]:
from sklearn.ensemble import RandomForestClassifier

# Default-configured random forest baseline; predictions go to ./preds/rf_out_of_the_box.csv
test_model_out_of_the_box(RandomForestClassifier(), model_name='rf')

Using only train data AUC ROC score: 0.808386255625184


# Feature Engineering and Re-Testing

In [179]:
def find_categorical(df, max_categories=10):
    """Bucket the columns of ``df`` by their number of distinct non-null values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    max_categories : int, default 10
        A column with fewer than this many distinct values (and not exactly
        one) is treated as categorical; otherwise it is treated as continuous.

    Returns
    -------
    cat : numpy.ndarray
        ``np.array`` built from ``(column, n_unique)`` pairs of the categorical
        columns. Packing the pairs into one array gives names and counts a
        common dtype, so with string column names the counts come back as
        strings.
    const : list
        Names of columns with exactly one distinct value.
    contin : list of tuple
        ``(column, n_unique)`` pairs for the remaining columns.
    """
    cat = []
    const = []
    contin = []
    for c in df.columns:
        # nunique() ignores NaN, same as counting the rows of value_counts()
        n_unique = df[c].nunique()
        if n_unique == 1:
            const.append(c)
        elif n_unique < max_categories:
            cat.append((c, n_unique))
        else:
            contin.append((c, n_unique))

    return np.array(cat), const, contin

def _make_category_encoder(uniques, codes):
    """Build an encoder bound to one column's ``uniques``/``codes`` arrays.

    A factory is used so each returned function keeps its own arrays; a lambda
    created directly inside a loop would read the loop variables at call time
    and therefore always see the last column's values.
    """
    def encode(value):
        hits = np.where(uniques == value)[0]
        # Values not present in ``uniques`` are passed through unchanged
        return codes[hits[0]] if len(hits) > 0 else value
    return encode


def convert_categorical(df, info, onehot=False):
    """Replace each listed column's values with integer codes, in place.

    Codes are the positions of the values in the sorted array of the column's
    unique values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. The listed columns are overwritten on this object.
    info : iterable of (column, n_unique)
        Columns to encode; only the column name of each pair is used.
    onehot : bool, default False
        Placeholder; one-hot encoding is not implemented yet.

    Returns
    -------
    df : pandas.DataFrame
        The same frame that was passed in.
    cache : dict
        Maps each column to ``{'f': encoder, 'arr': codes, 'unique': sorted
        unique values}`` so the same encoding can be re-applied later.
    """
    cache = {}
    for c, _n_unique in info:
        u = np.sort(df[c].unique())
        arr = np.arange(len(u))
        f = _make_category_encoder(u, arr)
        df[c] = df[c].map(f)

        if onehot:
            pass #TODO convert the now categorized column into onehot representations
            #Must store the one hotters for consistency

        cache[c] = {'f': f, 'arr': arr, 'unique': u}

    return df, cache

def train_preprocess(df, onehot=False):
    """Preprocess the training features and record what is needed to repeat it.

    All constant columns but the first are dropped, then the columns flagged
    as categorical by ``find_categorical`` are integer-encoded.

    Returns
    -------
    (DataFrame, dict)
        The processed frame and a cache with keys ``'categ'`` (per-column
        encoders from ``convert_categorical``) and ``'const'`` (the full list
        of constant column names).
    """
    categorical_info, constant_cols, _continuous = find_categorical(df)
    # Keep the first constant column, drop every other one
    redundant_cols = constant_cols[1:]
    encoded, categ_cache = convert_categorical(
        df.drop(redundant_cols, axis=1), categorical_info, onehot
    )
    return encoded, {'categ': categ_cache, 'const': constant_cols}

def test_preprocess(df, cache):

    # Drop all columns except 1 from cache['const']
    df = df.drop(cache['const'][1:], axis=1)
    # Convert all categorical from cache['categ']
    for c in cache['categ']:
        col_cache = cache['categ'][c]
        arr = col_cache['arr']
        u = col_cache['unique']
        df[c] = df[c].map(col_cache['f'])

    return df
    

In [196]:
"""train_df = pd.read_csv('./data/raw/train_final.csv', index_col=0)
test_df = pd.read_csv('./data/raw/test_final.csv', index_col=0)

x = train_df.drop('Y', axis=1)
y = train_df['Y']
pp_train, cache = train_preprocess(x.copy())
pp_test = test_preprocess(test_df.copy(), cache)
x_train, x_test, y_train, y_test = train_test_split(pp_train, y, test_size=0.2, random_state=42) # Create split for dev stuff"""

# Raw training / test data and a fixed 80/20 hold-out split of the raw features
df = pd.read_csv('./data/raw/train_final.csv', index_col = 0)
x = df.drop('Y', axis=1)
y = df['Y']
test_df = pd.read_csv('./data/raw/test_final.csv', index_col=0)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) # Create split for dev stuff

# Preprocessed copies of the full training features and of the test features.
# Later cells read pp_train / pp_test, so they have to be built here for the
# notebook to run top to bottom on a fresh kernel. Copies are passed so the
# raw frames stay unchanged.
pp_train, cache = train_preprocess(x.copy())
pp_test = test_preprocess(test_df.copy(), cache)


## XGBOOST
- Hyperparameter Tuned
- Feature Engineered
- Hyperparameter Tuned + Feature Engineered

In [178]:
# ============== Feature Engineering ==================== <- Failure
import xgboost as xgb

# Requires pp_train / pp_test (outputs of train_preprocess / test_preprocess).
# Score on a hold-out split of the *preprocessed* features; test_size and
# random_state match the raw-data split so the two AUC values are comparable.
pp_x_train, pp_x_test, pp_y_train, pp_y_test = train_test_split(pp_train, y, test_size=0.2, random_state=42)
model = xgb.XGBClassifier().fit(pp_x_train, pp_y_train)
print('Feature Engineered XGBoost AUC score: ', metrics.roc_auc_score(pp_y_test, model.predict_proba(pp_x_test)[:,1]))

# Refit on every preprocessed training row and write the test predictions.
# The predictions go into their own frame: adding a 'Y' column to pp_test would
# change its feature set and make this cell fail when run a second time.
model = xgb.XGBClassifier().fit(pp_train, y)
pd.DataFrame(model.predict_proba(pp_test)[:,1], index=pp_test.index, columns=['Y']).to_csv('./preds/xgb_feature_engineered.csv')

Feature Engineered XGBoost AUC score:  0.874332337973672


In [225]:
# ============== Hyperparameter Tuning ======================
# XGBoost settings under evaluation
params = dict(
    objective='binary:logistic',
    eta=0.04,
    gamma=0,
    max_depth=5,
    min_child_weight=1,
)

# Hold-out score: fit on the training part of the split only
model = xgb.XGBClassifier().set_params(**params)
model.fit(x_train, y_train)
print('Tuned XGBoost AUC score prediction: ', metrics.roc_auc_score(y_test, model.predict_proba(x_test)[:,1]))

# Submission model: same settings, fitted on every training row
model = xgb.XGBClassifier().set_params(**params)
model.fit(x, y)
xg_preds = pd.DataFrame(
    {'Y': model.predict_proba(test_df)[:, 1]},
    index=test_df.index,
)
xg_preds.to_csv('./preds/xgb_tuned.csv')

Tuned XGBoost AUC score prediction:  0.9226563485721495


## CatBoost

In [272]:
# Initial state for the CatBoost parameter search: a first parameter set and a
# best score of 0, so any evaluated candidate replaces it.
params = {'eta': 0.02, 'max_depth': 4, 'n_estimators': 1000}
best_params = params  # both names refer to the same dict object
best_score = 0

In [273]:
# ============== Hyperparameter Tuning ======================
n_folds = 10
# Candidate settings for this round; identical for every repeat, so defined once
params = {
        'eta': 0.01,
        'max_depth':5,
        'n_estimators':1200
    }
print('=================Scores==================')
tot_score = 0
for i in range(n_folds):
    # Repeated random 80/20 splits. Seeding each split with the repeat index
    # makes reruns reproducible and scores every candidate on the same splits,
    # so the comparison against best_score is not driven by split noise.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i) # Create split for dev stuff
    model = cb.CatBoostClassifier()
    model.set_params(**params)
    model.fit(x_train, y_train, verbose=False)
    score = metrics.roc_auc_score(y_test, model.predict_proba(x_test)[:,1])
    tot_score += score
    print(f'\tm{i}: ', score)

# Mean AUC over the repeats decides whether this candidate becomes the new best
mean_score = tot_score / n_folds
print(f'Predicted AUC score: {mean_score}\nBest Previous Score: {best_score}')
if best_score < mean_score:
    print('!!!!!!!!!!!!!!!Found new best parameters!!!!!!!!')
    best_score = mean_score
    best_params = params


	m0:  0.9306317539484621
	m1:  0.9052008863246791
	m2:  0.9108016067062522
	m3:  0.9176688063703583
	m4:  0.9020912938082912
	m5:  0.9058070989589866
	m6:  0.9037531476078181
	m7:  0.9089694486505745
	m8:  0.9040442741592167
	m9:  0.8552892561983472
Predicted AUC score: 0.9044257572732987
Best Score: 0
!!!!!!!!!!!!!!!Found new best parameters!!!!!!!!


In [274]:
# Fit CatBoost with the best parameters found so far on every training row,
# then save the second predict_proba column for each test row.
model = cb.CatBoostClassifier()
model.set_params(**best_params)
model.fit(x, y, verbose=False)
cb_preds = pd.DataFrame(
    {'Y': model.predict_proba(test_df)[:, 1]},
    index=test_df.index,
)
cb_preds.to_csv('./preds/cb_tuned.csv')

## Deep Learning CNN

In [None]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Reload the raw CSVs and rebuild the fixed (random_state=42) 80/20 dev split
df = pd.read_csv('./data/raw/train_final.csv', index_col=0)
test_df = pd.read_csv('./data/raw/test_final.csv', index_col=0)
y = df['Y']
x = df.drop(columns='Y')
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)

In [None]:
# ============== CNN Using Torch ==================
class CNN(nn.Module):
    """Skeleton of a torch model; layers and training logic are still to be written."""

    def __init__(self):
        # super() must be called: ``super.__init__()`` addresses the builtin
        # type itself and raises TypeError, so nn.Module is never initialised.
        super().__init__()
        # Probably want to include parameters here for defining the structure of the neural net

    def forward(self, x):
        """Forward pass (not implemented; returns None)."""
        pass

    def train(self, train_df=True):
        """Fit the network on ``train_df`` (not implemented), or toggle training mode.

        This method shares its name with ``nn.Module.train(mode)``, which
        ``nn.Module.eval()`` calls as ``self.train(False)``. Boolean arguments
        are therefore forwarded to the parent so ``.train()`` / ``.eval()``
        keep working; any other argument is treated as training data.
        """
        if isinstance(train_df, bool):
            return super().train(train_df)
        pass

    def predict(self, df):
        """Predict labels for ``df`` (not implemented; returns None)."""
        pass

    def predict_proba(self, df):
        """Predict class probabilities for ``df`` (not implemented; returns None)."""
        pass


# Ensemble
- Stacked model of each best architecture

In [None]:
# ========== Ensembling =================
# Work-in-progress scaffold: the loop below only re-splits the data and joins
# two existing prediction frames; no model is trained or scored yet.

# Stacking
# Create dataframe of predictions from xgboost, catboost, and neural net
# Train logistic regression, neural net, xgboost, and catboost on
# the stacked predictions (original comment was cut off here — TODO confirm intent)
n_folds = 10
tot_score = 0
for i in range(n_folds):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) # Create split for dev stuff
    #Train best_params catboost
    #Train best_params xgboost
    # Both frames have a single 'Y' column, so the join yields 'Yxg' and 'Ycat'.
    # NOTE(review): xg_preds / cb_preds are built from test_df in earlier cells,
    # so ens_df is indexed by the test rows and has no labels to train a
    # stacker on; per-split predictions are presumably what is wanted here.
    ens_df = xg_preds.join(cb_preds, lsuffix='xg', rsuffix='cat')

    #Train catboost on stacked df

    #Test ensembled model on test data produce AUC score

# tot_score is never incremented in the loop above, so this currently stays 0
tot_score /= n_folds


