In [62]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.linear_model import SGDClassifier

In [2]:
train_data = pd.read_csv("../data/processed/train_data_baseline.csv")
test_data = pd.read_csv("../data/processed/test_data_baseline.csv")

In [3]:
class filter_add_attributes(BaseEstimator, TransformerMixin):
    '''Custom transformer based on Sklearn's classes.
    Takes in dataframe (train or test) and adds new features and returns
    a filtered version of the original train/test datasets.'''
    def fit(self, X, y=None):
        return self.fit_transform(X)
    def transform(self, X, y=None):
        return self.fit_transform(X)
    def fit_transform(self, X, y=None):
        '''Calculates and adds comment body length and account activity (based on frequency of comment author)
        as features. Returns a new dataframe with the added columns.'''
        data = X.copy()
        data["body_len"] = data.comment_body.apply(lambda x: len(x))
        data["acc_activity"] = data.author_ids.map(data.author_ids.value_counts())
        data["is_premium"] = data.is_premium.astype(int)
        return data.filter(items=["ups", "comment_karma", "link_karma", "is_premium", "comment_age_days", "acc_age_days", "body_len", "acc_activity"], axis=1)

In [4]:
pipeline = Pipeline([
        ('filter_add', filter_add_attributes()),
        ('scaler', StandardScaler()),
    ])

X_train = pipeline.fit_transform(train_data)
X_test = pipeline.transform(test_data)
y_train = train_data["gildings"].to_list()
y_test = test_data["gildings"].to_list()

In [5]:
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

In [6]:
def train_eval(clf, X, y):
    '''Takes in train and train datasets along a model to train and evaluate. Returns f1 and roc-auc scores.'''
    rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=42)
    scores = cross_validate(clf, X, y, cv=rskf, scoring=['f1', 'roc_auc'], n_jobs=-1)
    return scores['test_f1'], scores['test_roc_auc']

Regular SVM doesn't scale well to our particular problem (due to the size of the dataset and limited resources available). So, we will use Stochastic Gradient Descent with hinge loss (equates to SVM) instead. Let's try with the default parameters first:

In [37]:
clf = SGDClassifier(loss="hinge",random_state=42)
f1, roc = train_eval(clf, X_train, y_train)
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}")

F1 score is: 0.07231340451914046
ROC-AUC is: 0.6709049027449308


That's the baseline. Let's try assigning weights to class. As with logistic regression, we will start off with balanced class weights.

In [12]:
clf = SGDClassifier(loss="hinge",random_state=42, class_weight='balanced')
f1, roc = train_eval(clf, X_train, y_train)
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}")

F1 score is: 0.028938602553323745
ROC-AUC is: 0.8750954918662222


In [42]:
def train_eval_weights(X_train, y_train, parameters):
    '''Function to train and evalute SGD Classifier (w/ Cross Validation) Model.'''
    model = SGDClassifier(loss="hinge")
    rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=42)
    grid = GridSearchCV(model, parameters, scoring=['f1', 'roc_auc'], cv=rskf, refit='f1')
    grid.fit(X_train, y_train)
    return grid

In [30]:
parameters = {'class_weight':[{0:1,1:1}, {0:1,1:10}, {0:1,1:100}, {0:1,1:1000}, {0:1,1:10000}, {0:10,1:1}]}
grid_res = train_eval_weights(X_train, y_train, parameters)
print(f"Best results: {grid_res.best_score_} and Best parameters: {grid_res.best_params_}\n")
f1s = grid_res.cv_results_['mean_test_f1']
rocs = grid_res.cv_results_['mean_test_roc_auc']
params = grid_resgrid_res.cv_results_['params']
for f1, roc, param in zip(f1s, rocs, params):
    print(f"F1 Score: {round(f1,4)}, ROC_AUC: {round(roc,4)}, parameters: {param}")

Best results: 0.23172721564368265 and Best parameters: {'class_weight': {0: 1, 1: 10}}

F1 Score: 0.0833, ROC_AUC: 0.6834, parameters: {'class_weight': {0: 1, 1: 1}}
F1 Score: 0.2317, ROC_AUC: 0.8574, parameters: {'class_weight': {0: 1, 1: 10}}
F1 Score: 0.1319, ROC_AUC: 0.8911, parameters: {'class_weight': {0: 1, 1: 100}}
F1 Score: 0.013, ROC_AUC: 0.8804, parameters: {'class_weight': {0: 1, 1: 1000}}
F1 Score: 0.0047, ROC_AUC: 0.8603, parameters: {'class_weight': {0: 1, 1: 10000}}
F1 Score: 0.0271, ROC_AUC: 0.8565, parameters: {'class_weight': {0: 10, 1: 1}}


Great, better than our result from weighted Logistic regression (F1 score: 0.12); comparable ROC-AUC score as well (ROC-AUC: 0.88).

We can also fit with model with several other values for hyperparameters (such as using l1 norm or elastic net). We will also experiment with different values for alpha (regularization strength).

Note: l1_ratio is only used for elasticnet.

In [51]:
parameters = {'penalty':['l2', 'l1', 'elasticnet'], 'l1_ratio': [0.15, 0.30, 0.60, 0.90], 
              'alpha' : [0.0001, 0.001, 0.01] , 'class_weight':[{0:1,1:10}]}
grid_res = train_eval_weights(X_train, y_train, parameters)
print(f"Best results: {grid_res.best_score_} and Best parameters: {grid_res.best_params_}\n")
f1s = grid_res.cv_results_['mean_test_f1']
rocs = grid_res.cv_results_['mean_test_roc_auc']
params = grid_res.cv_results_['params']
for f1, roc, param in zip(f1s, rocs, params):
    print(f"F1 Score: {round(f1,4)}, ROC_AUC: {round(roc,4)}, parameters: {param}\n")

Best results: 0.2954858955870661 and Best parameters: {'alpha': 0.01, 'class_weight': {0: 1, 1: 10}, 'l1_ratio': 0.9, 'penalty': 'l2'}

F1 Score: 0.2436, ROC_AUC: 0.8557, parameters: {'alpha': 0.0001, 'class_weight': {0: 1, 1: 10}, 'l1_ratio': 0.15, 'penalty': 'l2'}

F1 Score: 0.2417, ROC_AUC: 0.8657, parameters: {'alpha': 0.0001, 'class_weight': {0: 1, 1: 10}, 'l1_ratio': 0.15, 'penalty': 'l1'}

F1 Score: 0.2598, ROC_AUC: 0.8838, parameters: {'alpha': 0.0001, 'class_weight': {0: 1, 1: 10}, 'l1_ratio': 0.15, 'penalty': 'elasticnet'}

F1 Score: 0.2489, ROC_AUC: 0.8557, parameters: {'alpha': 0.0001, 'class_weight': {0: 1, 1: 10}, 'l1_ratio': 0.3, 'penalty': 'l2'}

F1 Score: 0.2324, ROC_AUC: 0.8689, parameters: {'alpha': 0.0001, 'class_weight': {0: 1, 1: 10}, 'l1_ratio': 0.3, 'penalty': 'l1'}

F1 Score: 0.2465, ROC_AUC: 0.8722, parameters: {'alpha': 0.0001, 'class_weight': {0: 1, 1: 10}, 'l1_ratio': 0.3, 'penalty': 'elasticnet'}

F1 Score: 0.2405, ROC_AUC: 0.8719, parameters: {'alpha': 0.

We can use these hyperparameter values for sampling methods; we only need to set the class weight and alpha as SGD uses l2 norm (penalty) as default. For comparison, we will also run the models with default alpha value (0.001).

We will start with RandomOverSampler to duplicates records from the minority class. We will use a sampling ratio of 0.1 (i.e. ~10% increase in gilded class).

In [39]:
#Using RandomOverSampler to duplicate records belonging to class 1 (gilded)

random_sampler = RandomOverSampler(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = random_sampler.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = SGDClassifier(loss="hinge", class_weight={0: 1, 1: 10})
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("Random Over Sampling:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}\n")

Random Over Sampling:
F1 score is: 0.7292231443582522
ROC-AUC is: 0.9390245759403417



Let's compare to a model with larger regularization (higher alpha).

In [52]:
#Using RandomOverSampler to duplicate records belonging to class 1 (gilded)

random_sampler = RandomOverSampler(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = random_sampler.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = SGDClassifier(loss="hinge", class_weight={0: 1, 1: 10}, alpha=0.01)
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("Random Over Sampling:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}\n")

Random Over Sampling:
F1 score is: 0.728681741124321
ROC-AUC is: 0.9448275406809963



Not as big of a difference. How about with other oversampling techniques?

We can generate new samples with SMOTE and ADASYN based on existing samples. We will keep the sampling ratio the same for comparison.

In [53]:
#Using SMOTE to generate samples in gilded class

smote = SMOTE(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = SGDClassifier(loss="hinge", class_weight={0: 1, 1: 10})
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("SMOTE:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}\n")

#Using ADASYN to generate samples in gilded class

ada = ADASYN(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = ada.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = SGDClassifier(loss="hinge", class_weight={0: 1, 1: 10})
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("ADASYN:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}")

print("\n Higher Alpha (0.01)\n:")

smote = SMOTE(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = SGDClassifier(loss="hinge", class_weight={0: 1, 1: 10}, alpha=0.01)
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("SMOTE:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}\n")

#Using ADASYN to generate samples in gilded class

ada = ADASYN(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = ada.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = SGDClassifier(loss="hinge", class_weight={0: 1, 1: 10}, alpha=0.01)
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("ADASYN:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}")

SMOTE:
F1 score is: 0.7240862483229289
ROC-AUC is: 0.9522434730514199

ADASYN:
F1 score is: 0.6982751104719969
ROC-AUC is: 0.9456775552753072

 Higher Alpha (0.01)
:
SMOTE:
F1 score is: 0.7205337453810177
ROC-AUC is: 0.9611876119095077

ADASYN:
F1 score is: 0.6887987320458137
ROC-AUC is: 0.9549116887190431


Imbalanced learn also recommends combining oversampling with undersampling the majority class.

Ref: https://imbalanced-learn.readthedocs.io/en/stable/auto_examples/combine/plot_comparison_combine.html

SMOTE can generate noisy samples (ex: when classes cannot be well separated), undersampling allows to clean the noisy data.

In [54]:
smote_tomek = SMOTETomek(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = SGDClassifier(loss="hinge", class_weight={0: 1, 1: 10})
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("SMOTE - Tomek's Link:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}\n")

smote_enn = SMOTEENN(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = SGDClassifier(loss="hinge", class_weight={0: 1, 1: 10})
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("SMOTE - Edited nearest neighbours:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}\n")

print("\n Higher Alpha (0.01)\n:")

smote_tomek = SMOTETomek(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = SGDClassifier(loss="hinge", class_weight={0: 1, 1: 10}, alpha=0.01)
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("SMOTE - Tomek's Link:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}\n")

smote_enn = SMOTEENN(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = SGDClassifier(loss="hinge", class_weight={0: 1, 1: 10}, alpha=0.01)
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("SMOTE - Edited nearest neighbours:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}\n")

SMOTE - Tomek's Link:
F1 score is: 0.7258589025637112
ROC-AUC is: 0.9538878971045017

SMOTE - Edited nearest neighbours:
F1 score is: 0.7588807574452947
ROC-AUC is: 0.9638376085750204


 Higher Alpha (0.01)
:
SMOTE - Tomek's Link:
F1 score is: 0.7211540744014311
ROC-AUC is: 0.9614624304063588

SMOTE - Edited nearest neighbours:
F1 score is: 0.7514398652168174
ROC-AUC is: 0.9674555574777809



As with Logistic regression, SMOTEENN and RandomOverSampler produces the best results (though the results are much closer with SGD). Let's try evaluating on our test set (for both f1 score and accuracy).

In [63]:
random_sampler = RandomOverSampler(sampling_strategy=0.1, random_state=42)
smote = SMOTE(sampling_strategy=0.1, random_state=42)
smote_enn = SMOTEENN(sampling_strategy=0.1, random_state=42)
models = [random_sampler, smote, smote_enn]
model_names = ["Random Oversampling", "SMOTE", "SMOTE ENN"]

for i in range(len(models)):
    parameters = {'class_weight':[{0:1,1:10}]}
    X_resampled, y_resampled = models[i].fit_resample(X_train, y_train)
    grid_res = train_eval_weights(X_resampled, y_resampled, parameters)
    y_preds = grid_res.predict(X_test)
    print(f"{model_names[i]} on test set:")
    print(f"F1 score: {f1_score(y_test, y_preds)}")
    print(f"ROC_AUC score: {roc_auc_score(y_test, y_preds)}")
    print(f"Accuracy score: {accuracy_score(y_test, y_preds)}")
    print("\n")

Random Oversampling on test set:
F1 score: 0.12749315604223702
ROC_AUC score: 0.8189138679038066
Accuracy score: 0.9825700401568774


SMOTE on test set:
F1 score: 0.11395027624309394
ROC_AUC score: 0.8216109394485629
Accuracy score: 0.9799528117626838


SMOTE ENN on test set:
F1 score: 0.0875768757687577
ROC_AUC score: 0.843190859862907
Accuracy score: 0.9710229847341365




Both Random Oversampling and SMOTE has achieved better results (.12 vs 0.06 in Logistic Regression). We will try Random Forests next.