In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from xgboost import XGBClassifier
from typing import Tuple
from tqdm import tqdm

In [2]:
# constants
SEED = 42
NUM_SPLITS = 10
TARGET = "decision"

In [3]:
def one_hot_encode(df, features):
    for feature in features:
        dummies = pd.get_dummies(df.loc[:, feature], prefix=feature)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(feature, axis=1)
    return df

In [9]:
# load dataset
dataset = pd.read_csv("SpeedDating_.csv", index_col=0)

# remove redundant columns
subset = ['gender', 'age', 'age_o', 'race', 'race_o', 'importance_same_race', 'importance_same_religion',
          'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
          'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests', 'attractive_o', 'sinsere_o', 'intelligence_o', 'funny_o',
          'ambitous_o', 'shared_interests_o', 'attractive_important', 'sincere_important', 'intellicence_important', 'funny_important', 'ambtition_important',
          'shared_interests_important', 'attractive', 'sincere', 'intelligence', 'funny', 'ambition', 'attractive_partner', 'sincere_partner',
          'intelligence_partner', 'funny_partner', 'ambition_partner', 'shared_interests_partner',
          'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts',
          'music', 'shopping', 'yoga',
          'interests_correlate', 'expected_happy_with_sd_people', 'expected_num_matches', 'expected_num_interested_in_me',
          'like', 'guess_prob_liked', 'decision']

dataset = dataset.loc[:, subset]
dataset.loc[:, 'gender'] = (dataset.loc[:, 'gender'] == 'female') # one hot encode gender
dataset = one_hot_encode(dataset, ['race', 'race_o'])
dataset = dataset.apply(pd.to_numeric, errors='coerce', axis=1)
dataset = dataset.fillna(dataset.mean())
print(dataset.head())
X, y = dataset.loc[:, dataset.columns != TARGET], dataset.loc[:, TARGET]


  dataset = pd.read_csv("SpeedDating_.csv", index_col=0)


    gender   age  age_o  importance_same_race  importance_same_religion  \
id                                                                        
1      1.0  21.0   27.0                   2.0                       4.0   
2      1.0  21.0   22.0                   2.0                       4.0   
3      1.0  21.0   22.0                   2.0                       4.0   
4      1.0  21.0   23.0                   2.0                       4.0   
5      1.0  21.0   24.0                   2.0                       4.0   

    pref_o_attractive  pref_o_sincere  pref_o_intelligence  pref_o_funny  \
id                                                                         
1                35.0            20.0                 20.0          20.0   
2                60.0             0.0                  0.0          40.0   
3                19.0            18.0                 19.0          18.0   
4                30.0             5.0                 15.0          40.0   
5                3

In [10]:
# Train and test the model
kf = StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True, random_state=SEED)
xgboost = XGBClassifier(max_depth=4)
s=0
for train_idx, test_idx in kf.split(X, y):
    X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx, :], y.iloc[test_idx]
    xgboost.fit(X_train, y_train)
    print('Split accuracy: ', np.mean(xgboost.predict(X_test) == np.array(y_test)))
    s += np.mean(xgboost.predict(X_test) == np.array(y_test))
    print('Accuracy for class 1 [person wanted to match]', np.sum( np.logical_and(xgboost.predict(X_test) == 1, xgboost.predict(X_test) == y_test))/np.sum(y_test))
print('Average accuracy:')
print(s/NUM_SPLITS)

Split accuracy:  0.8317422434367542
Accuracy for class 1 [person wanted to match] 0.7840909090909091
Split accuracy:  0.8663484486873508
Accuracy for class 1 [person wanted to match] 0.8465909090909091
Split accuracy:  0.8400954653937948
Accuracy for class 1 [person wanted to match] 0.8153409090909091
Split accuracy:  0.8591885441527446
Accuracy for class 1 [person wanted to match] 0.8238636363636364
Split accuracy:  0.8400954653937948
Accuracy for class 1 [person wanted to match] 0.7897727272727273
Split accuracy:  0.8651551312649165
Accuracy for class 1 [person wanted to match] 0.8323863636363636
Split accuracy:  0.8400954653937948
Accuracy for class 1 [person wanted to match] 0.7982954545454546
Split accuracy:  0.8448687350835322
Accuracy for class 1 [person wanted to match] 0.8068181818181818
Split accuracy:  0.8482676224611708
Accuracy for class 1 [person wanted to match] 0.792022792022792
Split accuracy:  0.8327359617682198
Accuracy for class 1 [person wanted to match] 0.81766381