In [19]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import SMOTE, ADASYN

In [2]:
train_data = pd.read_csv("../data/processed/train_data_baseline.csv")
test_data = pd.read_csv("../data/processed/test_data_baseline.csv")

In [4]:
class filter_add_attributes(BaseEstimator, TransformerMixin):
    '''Custom transformer based on Sklearn's classes.
    Takes in dataframe (train or test) and adds new features and returns
    a filtered version of the original train/test datasets.'''
    def fit(self, X, y=None):
        return self.fit_transform(X)
    def transform(self, X, y=None):
        return self.fit_transform(X)
    def fit_transform(self, X, y=None):
        '''Calculates and adds comment body length and account activity (based on frequency of comment author)
        as features. Returns a new dataframe with the added columns.'''
        data = X.copy()
        data["body_len"] = data.comment_body.apply(lambda x: len(x))
        data["acc_activity"] = data.author_ids.map(data.author_ids.value_counts())
        data["is_premium"] = data.is_premium.astype(int)
        return data.filter(items=["ups", "comment_karma", "link_karma", "is_premium", "comment_age_days", "acc_age_days", "body_len", "acc_activity"], axis=1)

In [5]:
pipeline = Pipeline([
        ('filter_add', filter_add_attributes()),
        ('scaler', StandardScaler()),
    ])

X_train = pipeline.fit_transform(train_data)
X_test = pipeline.transform(test_data)
y_train = train_data["gildings"].to_list()
y_test = test_data["gildings"].to_list()

In [6]:
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

Let's establish a baseline model where classifer simply predicts the minority class:

In [80]:
clf = DummyClassifier(strategy="constant", constant=1)
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=42)
scores = cross_validate(clf, X_train, y_train, cv=rskf, scoring=['f1', 'roc_auc']) 
f1, roc = scores['test_f1'], scores['test_roc_auc']
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}")

F1 score is: 0.003879255539090453
ROC-AUC is: 0.5


A good model is one that can perform better than the baseline, in terms of F1 Score. Anything below is worse than a model that simply predicts minority class.

Note that 0.5 ROC-AUC score indicates that it's a random classifier.

In [7]:
def train_eval(clf, X, y):
    '''Takes in train and train datasets along a model to train and evaluate. Returns f1 and roc-auc scores.'''
    rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=42)
    scores = cross_validate(clf, X, y, cv=rskf, scoring=['f1', 'roc_auc'])
    return scores['test_f1'], scores['test_roc_auc']

In [26]:
clf = LogisticRegression()
f1, roc = train_eval(clf, X_train, y_train)
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}")

F1 score is: 0.14332273886338903
ROC-AUC is: 0.8478763439152004


In [7]:
print(train_data.gildings.value_counts())

0    510995
1       995
Name: gildings, dtype: int64
0    127749
1       249
Name: gildings, dtype: int64


Due to the high class imbalance, F1 score is a much better metric to use than just accuracy (since 99% of the data belongs to class 0). We will also have ROC-AUC for comparison.

The low score is to expected. Let's try using the class weight functionality in Sklearn that assigns weights to each class based on their frequency.

In [27]:
clf = LogisticRegression(class_weight='balanced')
f1, roc = train_eval(clf, X_train, y_train)
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}")

F1 score is: 0.0734904371963277
ROC-AUC is: 0.9119952192683487


Balanced class weight results in a worse f1 score; let's try with bunch of different weight values. We will use F1 score for refitting (finding the best parameters) as ROC-AUC seems to be too optimistic.

In [38]:
def train_eval_weights(X_train, y_train, parameters):
    '''Function to train and evalute Logistic Regression (w/ Cross Validation) Model.'''
    model = LogisticRegression()
    rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=42)
    clf = GridSearchCV(model, parameters, scoring=['f1', 'roc_auc'], cv=rskf, refit='f1')
    clf.fit(X_train, y_train)
    return clf

parameters = {'class_weight':[{0:1,1:1}, {0:1,1:10}, {0:1,1:100}, {0:1,1:1000}, {0:1,1:10000}, {0:10,1:1}]}
clf = train_eval_weights(X_train, y_train, parameters)
f1, roc, best =  clf.cv_results_['mean_test_f1'], clf.cv_results_['mean_test_roc_auc'], clf.best_params_
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}")
print(f"Best class weights: {best}")

F1 score is: 0.12137251216239651
ROC-AUC is: 0.8801967569473819
Best class weights: {'class_weight': {0: 1, 1: 10}}


We are making progress, but can we do even better?

Adjusting the weights were not enough, we will have to try different sampling techniques. Imbalanced-learn library will come in handy here.

We will start with RandomOverSampler to duplicates records from the minority class. We will use a sampling ratio of 0.1 (i.e. ~10% increase in gilded class).

Read more: https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html#a-practical-guide

In [46]:
#Using RandomOverSampler to duplicate records belonging to class 1 (gilded)

random_sampler = RandomOverSampler(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = random_sampler.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = LogisticRegression(class_weight={0: 1, 1: 10})
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("Random Over Sampling:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}\n")

Random Over Sampling:
F1 score is: 0.6137350074209578
ROC-AUC is: 0.8983875798873713



We can also generate new samples with SMOTE and ADASYN based on existing samples. We will keep the sampling ratio the same for comparison.

In [44]:
#Using SMOTE to generate samples in gilded class

smote = SMOTE(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = LogisticRegression(class_weight=[{0: 1, 1: 10}])
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("SMOTE:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}\n")

#Using ADASYN to generate samples in gilded class

ada = ADASYN(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = ada.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = LogisticRegression(class_weight=[{0: 1, 1: 10}])
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("ADASYN:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}")

Smote - F1 score is: 0.6097897316663436
Smote - ROC-AUC is: 0.8961593223446982
ADASYN - F1 score is: 0.557028301538202
ADASYN - ROC-AUC is: 0.8835737972917235


Imbalanced learn also recommends combining oversampling with undersampling the majority class.

Ref: https://imbalanced-learn.readthedocs.io/en/stable/auto_examples/combine/plot_comparison_combine.html

SMOTE can generate noisy samples (ex: when classes cannot be well separated), undersampling allows to clean the noisy data.

In [45]:
smote_tomek = SMOTETomek(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = LogisticRegression(class_weight=[{0: 1, 1: 10}])
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("SMOTE - Tomek's Link:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}\n")

smote_enn = SMOTEENN(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = LogisticRegression(class_weight=[{0: 1, 1: 10}])
f1, roc = train_eval(clf, X_resampled, y_resampled)
print("SMOTE - Edited nearest neighbours:")
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}\n")

SMOTE - Tomek's Link:
F1 score is: 0.6117476376281973
ROC-AUC is: 0.8968529733151215

SMOTE - Edited nearest neighbours:
F1 score is: 0.700327337392521
ROC-AUC is: 0.9207147669601824



SMOTEENN and RandomOverSampler produces the best results so far.

Logistic regression predicts the class probabilities for each sample and decides class based on a threshold (default: 0.5). We can evaluate SMOTEENN and RandomOverSampler on our test set and check if a different threshold value produces better results.

Ref: https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/


In [76]:
def train_eval_probs(X, y, X_test):
    '''Takes in train and train datasets along a model to train and evaluate. Returns f1 and roc-auc scores.'''
    rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=42)
    model = LogisticRegressionCV(cv=rskf, class_weight=[{0: 1, 1: 10}])
    model.fit(X, y)
    return model.predict_proba(X_test)[:,1]

def convert_probs(probs, threshold):
    return (probs >= threshold).astype('int')

random_sampler = RandomOverSampler(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = random_sampler.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
probs = train_eval_probs(X_resampled, y_resampled, X_test)

thresholds = np.arange(0, 1, 0.001)
f1_scores = [f1_score(y_test, convert_probs(probs, t)) for t in thresholds]
auc_scores = [roc_auc_score(y_test, convert_probs(probs, t)) for t in thresholds]
ind1 = np.argmax(f1_scores)
ind2 = np.argmax(auc_scores)

print("Random Over Sampling:\n")
print("Maxiziming F1 Score:")
print(f"Threshold: {thresholds[ind1]}, F1 Score: {f1_scores[ind1]}, ROC AUC: {auc_scores[ind1]}")
print("Maxiziming ROC-AUC Score:")
print(f"Threshold: {thresholds[ind2]}, F1 Score: {f1_scores[ind2]}, ROC AUC: {auc_scores[ind2]}")

Random Over Sampling:

Maxiziming F1 Score:
Threshold: 0.9470000000000001, F1 Score: 0.3143350604490501, ROC AUC: 0.681795495628806
Maxiziming ROC-AUC Score:
Threshold: 0.114, F1 Score: 0.06988058381247236, ROC AUC: 0.8011632750856418


In [78]:
smote_enn = SMOTEENN(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
probs = train_eval_probs(X_resampled, y_resampled, X_test)

f1_scores = [f1_score(y_test, convert_probs(probs, t)) for t in thresholds]
auc_scores = [roc_auc_score(y_test, convert_probs(probs, t)) for t in thresholds]
ind1 = np.argmax(f1_scores)
ind2 = np.argmax(auc_scores)

print("SMOTE - Edited nearest neighbours:\n")
print("Maxiziming F1 Score:")
print(f"Threshold: {thresholds[ind1]}, F1 Score: {f1_scores[ind1]}, ROC AUC: {auc_scores[ind1]}")
print("Maxiziming ROC-AUC Score:")
print(f"Threshold: {thresholds[ind2]}, F1 Score: {f1_scores[ind2]}, ROC AUC: {auc_scores[ind2]}")

SMOTE - Edited nearest neighbours:

Maxiziming F1 Score:
Threshold: 0.998, F1 Score: 0.3019431988041854, ROC AUC: 0.7015627029169681
Maxiziming ROC-AUC Score:
Threshold: 0.107, F1 Score: 0.08415217939027463, ROC AUC: 0.8214351900710419


In [20]:
random_sampler = RandomOverSampler(sampling_strategy=0.1, random_state=42)
smote = SMOTE(sampling_strategy=0.1, random_state=42)
smote_enn = SMOTEENN(sampling_strategy=0.1, random_state=42)
models = [random_sampler, smote, smote_enn]
model_names = ["Random Oversampling", "SMOTE", "SMOTE ENN"]

for i in range(len(models)):
    parameters = {'class_weight':[{0:1,1:10}]}
    X_resampled, y_resampled = models[i].fit_resample(X_train, y_train)
    grid_res = train_eval_weights(X_resampled, y_resampled, parameters)
    y_preds = grid_res.predict(X_test)
    print(f"{model_names[i]} on test set:")
    print(f"F1 score: {f1_score(y_test, y_preds)}")
    print(f"ROC_AUC score: {roc_auc_score(y_test, y_preds)}")
    print(f"Accuracy score: {accuracy_score(y_test, y_preds)}")
    print("\n")

Random Oversampling on test set:
F1 score: 0.0638904734740445
ROC_AUC score: 0.8183981729232407
Accuracy score: 0.9615384615384616


SMOTE on test set:
F1 score: 0.060296191819464044
ROC_AUC score: 0.8228175600742684
Accuracy score: 0.9583587243550681


SMOTE ENN on test set:
F1 score: 0.08897775556110973
ROC_AUC score: 0.8434413510604898
Accuracy score: 0.9715229925467586




Better than our baseline model, but still not good enough. We will have to experiment with a more complex model.