In [1]:
# imports
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold, train_test_split
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler # To preprocess data for SVM - greatly improves performance
from sklearn.svm import SVC
from pytorch_tabr import TabRClassifier as TabRClassifier_
from itertools import product
from typing import Tuple, Union, Optional, Callable
from tqdm import tqdm

In [2]:
def SVMClassifier(**hyperparams):
    """Build a two-step pipeline: feature standardisation followed by an SVM.

    All keyword arguments are forwarded unchanged to ``SVC``. The function is
    named like a class so it can sit next to real estimator classes in the
    model list and be looked up through ``__name__``.
    """
    scaler = StandardScaler()
    classifier = SVC(**hyperparams)
    return make_pipeline(scaler, classifier)

In [21]:
class TabRClassifier(TabRClassifier_):
    """TabR classifier with notebook-specific presets and pandas-friendly I/O.

    The parent estimator is driven with NumPy arrays; this wrapper converts
    pandas inputs before delegating, and applies a small preset configuration
    that callers may override through keyword arguments.
    """

    def __init__(self, **kwargs):
        """Create the estimator.

        Parameters
        ----------
        **kwargs
            Forwarded to the parent constructor. Any preset listed below that
            is also given here keeps the caller's value (previously the preset
            silently overwrote it).
        """
        # Settings left disabled by the author:
        # selection_function_name="sparsemax",
        # context_dropout=0.5,
        # context_sample_size=2000,
        # num_embeddings={"type": "PLREmbeddings", "n_frequencies": 32, "frequency_scale": 32, "d_embedding": 32, "lite": False},
        super().__init__(**kwargs)
        # Built per call so instances never share the mutable optimizer_params dict.
        presets = {
            "type_embeddings": "one-hot",
            "device_name": "cpu",
            "optimizer_params": {"lr": 2e-4},
            "d_main": 48,
            "d_multiplier": 1.5,
            "context_dropout": 0,
            "dropout0": 0,
            "dropout1": 0,
            "context_size": 12,
        }
        # NOTE(review): presets are applied after construction, as before; whether the
        # parent derives any state from these fields during __init__ is not visible here.
        for name, value in presets.items():
            if name not in kwargs:
                setattr(self, name, value)

    def fit(
        self,
        X_train: pd.DataFrame,
        y_train: pd.Series,
        **kwargs
    ) -> None:
        """Fit on pandas (or array-like) training data.

        Parameters
        ----------
        X_train
            Feature matrix; converted with ``np.asarray`` before delegation.
        y_train
            Target values; converted with ``np.asarray`` before delegation.
        **kwargs
            Extra arguments for the parent ``fit``. ``max_epochs`` defaults to 1
            and ``batch_size`` to 25, but either may now be overridden instead
            of raising a duplicate-keyword ``TypeError``.
        """
        kwargs.setdefault("max_epochs", 1)
        kwargs.setdefault("batch_size", 25)
        super().fit(X_train=np.asarray(X_train), y_train=np.asarray(y_train), **kwargs)

    def predict(self, X: pd.DataFrame):
        """Return the parent's predictions for ``X`` (pandas or array-like)."""
        return super().predict(X=np.asarray(X))
        

In [4]:
# constants
SEED = 42            # random_state for the cross-validation shuffle
NUM_SPLITS = 10      # number of cross-validation folds
TARGET = "decision"  # name of the label column
MODELS = [XGBClassifier, SVMClassifier, TabRClassifier]

# Wider XGBoost grid, currently disabled:
#     "max_depth": [3, 4, 5, 6, 7, 8],
#     "min_child_weight": [1, 4, 8, 16],
XGBCLASSIFIER_HYPERPARAMETERS = dict(
    max_depth=[3, 4],
    min_child_weight=[1, 4],
)

# Empty grids: these models are evaluated once with their default settings.
SVMCLASSIFIER_HYPERPARAMETERS = dict()
TABRCLASSIFIER_HYPERPARAMETERS = dict()

# Grid lookup keyed by each model factory's __name__.
HYPERPARAMETERS = dict(
    zip(
        (XGBClassifier.__name__, SVMClassifier.__name__, TabRClassifier.__name__),
        (
            XGBCLASSIFIER_HYPERPARAMETERS,
            SVMCLASSIFIER_HYPERPARAMETERS,
            TABRCLASSIFIER_HYPERPARAMETERS,
        ),
    )
)

In [5]:
def one_hot_encode(df, features):
    """Replace each listed column with its one-hot indicator columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    features : iterable of str
        Column names to encode, processed in order.

    Returns
    -------
    pd.DataFrame
        The remaining columns in their original order, followed by the
        indicator columns of each feature, named ``<feature>_<value>``.
    """
    encoded = df
    for column in features:
        indicators = pd.get_dummies(encoded[column], prefix=column)
        # Drop the source column first, then append its indicators on the right.
        encoded = pd.concat([encoded.drop(columns=column), indicators], axis=1)
    return encoded

In [6]:
# load dataset
dataset = pd.read_csv("SpeedDating.csv", index_col=0)

# remove redundant columns: keep only the features used below plus the target
# NOTE(review): several names look misspelled ('sinsere_o', 'ambitous_o',
# 'intellicence_important', 'ambtition_important') - presumably they mirror the
# CSV headers; verify against the file before "correcting" them.
subset = ['gender', 'age', 'age_o', 'race', 'race_o', 'importance_same_race', 'importance_same_religion',
          'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
          'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests', 'attractive_o', 'sinsere_o', 'intelligence_o', 'funny_o',
          'ambitous_o', 'shared_interests_o', 'attractive_important', 'sincere_important', 'intellicence_important', 'funny_important', 'ambtition_important',
          'shared_interests_important', 'attractive', 'sincere', 'intelligence', 'funny', 'ambition', 'attractive_partner', 'sincere_partner',
          'intelligence_partner', 'funny_partner', 'ambition_partner', 'shared_interests_partner',
          'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts',
          'music', 'shopping', 'yoga',
          'interests_correlate', 'expected_happy_with_sd_people', 'expected_num_matches', 'expected_num_interested_in_me',
          'like', 'guess_prob_liked', 'decision']

# Explicit copy so the column assignments below never touch the frame read from disk.
dataset = dataset.loc[:, subset].copy()
# Binary-encode gender: 1.0 where the value is 'female', 0.0 otherwise.
dataset['gender'] = (dataset['gender'] == 'female').astype(float)
dataset = one_hot_encode(dataset, ['race', 'race_o'])
# Convert column by column (the default axis) rather than row by row, which ran
# pd.to_numeric once per record. Unparseable entries become NaN; astype(float)
# turns the boolean indicator columns into 0.0/1.0 so the frame has one dtype.
dataset = dataset.apply(pd.to_numeric, errors='coerce').astype(float)
# NOTE: the column means used for imputation are computed on the full dataset,
# i.e. before any train/test split.
dataset = dataset.fillna(dataset.mean())
print(dataset.head())
X, y = dataset.loc[:, dataset.columns != TARGET], dataset.loc[:, TARGET]
# Majority-class accuracy: the score of always predicting the more frequent label.
# (Previously positives / negatives, a ratio that is not comparable to accuracy.)
BASELINE = max(np.mean(y == 1), np.mean(y == 0))


  dataset = pd.read_csv("SpeedDating.csv", index_col=0)


    gender   age  age_o  importance_same_race  importance_same_religion  \
id                                                                        
1      1.0  21.0   27.0                   2.0                       4.0   
2      1.0  21.0   22.0                   2.0                       4.0   
3      1.0  21.0   22.0                   2.0                       4.0   
4      1.0  21.0   23.0                   2.0                       4.0   
5      1.0  21.0   24.0                   2.0                       4.0   

    pref_o_attractive  pref_o_sincere  pref_o_intelligence  pref_o_funny  \
id                                                                         
1                35.0            20.0                 20.0          20.0   
2                60.0             0.0                  0.0          40.0   
3                19.0            18.0                 19.0          18.0   
4                30.0             5.0                 15.0          40.0   
5                3

In [7]:
def train_test_models(models, hyperparameters, X, y, ret_trained_models=False):
    """Evaluate every hyperparameter combination of every model with stratified k-fold CV.

    Parameters
    ----------
    models : iterable of callables
        Model factories. Each is called with hyperparameter keyword arguments
        and must return an object exposing ``fit(X, y)`` and ``predict(X)``.
        Its ``__name__`` selects the grid and labels the results.
    hyperparameters : dict
        Maps ``factory.__name__`` to ``{parameter_name: [candidate values]}``.
        An empty grid yields a single run with no keyword arguments.
    X : pd.DataFrame
        Feature matrix, indexed positionally per fold.
    y : pd.Series
        Target labels, used for stratification and scoring.
    ret_trained_models : bool, default False
        Also return, for each combination, the model fitted on the last fold.

    Returns
    -------
    dict or (dict, dict)
        ``accuracies`` maps ``(model_name, hyperparams_tuple)`` to the mean
        accuracy over the folds. When ``ret_trained_models`` is true the result
        is ``(accuracies, trained_models)``; otherwise only ``accuracies``.
        (The former return statement yielded ``(accuracies, accuracies)`` in
        the latter case because of operator precedence.)

    Raises
    ------
    KeyError
        If a model's ``__name__`` has no entry in ``hyperparameters``.
    """
    kf = StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True, random_state=SEED)
    # The splitter is seeded, so every call yields the same folds: compute them once.
    folds = list(kf.split(X, y))

    accuracies = {}
    trained_models = {}
    for model_class in models:
        model_name = model_class.__name__
        grid = hyperparameters[model_name]
        for hyperparams in product(*grid.values()):
            kwargs = dict(zip(grid.keys(), hyperparams))
            fold_scores = []
            for train_idx, test_idx in folds:
                # Fresh estimator per fold so no fitted state can carry over between folds.
                model = model_class(**kwargs)
                X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
                X_test, y_test = X.iloc[test_idx, :], y.iloc[test_idx]
                model.fit(X_train, y_train)
                predictions = np.asarray(model.predict(X_test))
                fold_scores.append(np.mean(predictions == np.asarray(y_test)))
            # Average over the folds actually run instead of the global constant.
            accuracies[model_name, hyperparams] = float(np.mean(fold_scores))
            if ret_trained_models:
                trained_models[model_name, hyperparams] = model

    if ret_trained_models:
        return accuracies, trained_models
    return accuracies

In [22]:
# Cross-validate TabR only; the returned (accuracies, trained_models) pair is
# left as the last expression so the notebook displays it.
train_test_models(
    models=[TabRClassifier],
    hyperparameters=HYPERPARAMETERS,
    X=X,
    y=y,
    ret_trained_models=True,
)



 epochs:   0%|          | 0/1 [00:00<?, ?it/s]

 batches:   0%|          | 0/301 [00:00<?, ?it/s]

 epochs:   0%|          | 0/1 [00:00<?, ?it/s]

 batches:   0%|          | 0/301 [00:00<?, ?it/s]

 epochs:   0%|          | 0/1 [00:00<?, ?it/s]

 batches:   0%|          | 0/301 [00:00<?, ?it/s]

 epochs:   0%|          | 0/1 [00:00<?, ?it/s]

 batches:   0%|          | 0/301 [00:00<?, ?it/s]

 epochs:   0%|          | 0/1 [00:00<?, ?it/s]

 batches:   0%|          | 0/301 [00:00<?, ?it/s]

 epochs:   0%|          | 0/1 [00:00<?, ?it/s]

 batches:   0%|          | 0/301 [00:00<?, ?it/s]

 epochs:   0%|          | 0/1 [00:00<?, ?it/s]

 batches:   0%|          | 0/301 [00:00<?, ?it/s]

 epochs:   0%|          | 0/1 [00:00<?, ?it/s]

 batches:   0%|          | 0/301 [00:00<?, ?it/s]

 epochs:   0%|          | 0/1 [00:00<?, ?it/s]

 batches:   0%|          | 0/301 [00:00<?, ?it/s]

 epochs:   0%|          | 0/1 [00:00<?, ?it/s]

 batches:   0%|          | 0/301 [00:00<?, ?it/s]

({('TabRClassifier', ()): 0.7504111741273954},
 {('TabRClassifier',
   ()): TabRClassifier(cat_indices=[], cat_cardinalities=[], bin_indices=[], num_embeddings=None, type_embeddings='one-hot', cat_emb_dims=2, d_main=48, d_multiplier=1.5, encoder_n_blocks=2, predictor_n_blocks=2, mixer_normalization='auto', context_dropout=0, dropout0=0, dropout1=0, normalization='LayerNorm', activation='ReLU', device_name='cpu', optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.0002}, scheduler_fn=None, scheduler_params={}, context_size=12, context_sample_size=None, memory_efficient=False, candidate_encoding_batch_size=None, selection_function_name='softmax', seed=0, verbose=0)})

In [9]:
# Disabled reference code: manual stratified k-fold evaluation of the SVM pipeline, printing per-split accuracy
# kf = StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True, random_state=SEED)
# svm_classifier = make_pipeline(StandardScaler(), SVC())
# s=0
# for train_idx, test_idx in kf.split(X, y):
#     X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
#     X_test, y_test = X.iloc[test_idx, :], y.iloc[test_idx]
#     svm_classifier.fit(X_train, y_train)
#     print('Split accuracy: ', np.mean(svm_classifier.predict(X_test) == np.array(y_test)))
#     s += np.mean(svm_classifier.predict(X_test) == np.array(y_test))
#     print('Accuracy for class 1 [person wanted to match]', np.sum( np.logical_and(svm_classifier.predict(X_test) == 1, svm_classifier.predict(X_test) == y_test))/np.sum(y_test))
# print('Average accuracy:')
# print(s/NUM_SPLITS)

In [10]:
# Disabled reference code: manual stratified k-fold evaluation of XGBClassifier(max_depth=4), printing per-split accuracy
# kf = StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True, random_state=SEED)
# xgboost = XGBClassifier(max_depth=4)
# s=0
# for train_idx, test_idx in kf.split(X, y):
#     X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
#     X_test, y_test = X.iloc[test_idx, :], y.iloc[test_idx]
#     xgboost.fit(X_train, y_train)
#     print('Split accuracy: ', np.mean(xgboost.predict(X_test) == np.array(y_test)))
#     s += np.mean(xgboost.predict(X_test) == np.array(y_test))
#     print('Accuracy for class 1 [person wanted to match]', np.sum( np.logical_and(xgboost.predict(X_test) == 1, xgboost.predict(X_test) == y_test))/np.sum(y_test))
# print('Average accuracy:')
# print(s/NUM_SPLITS)