In [1]:
import pandas as pd
import seaborn as sns

In [2]:
train_data = pd.read_csv("../data/processed/train_data_baseline.csv")
test_data = pd.read_csv("../data/processed/test_data_baseline.csv")

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class filter_add_attributes(BaseEstimator, TransformerMixin):
    '''Custom transformer based on Sklearn's classes.
    Takes in dataframe (train or test) and adds new features and returns
    a filtered version of the original train/test datasets.'''
    def fit(self, X, y=None):
        return self.fit_transform(X)
    def transform(self, X, y=None):
        return self.fit_transform(X)
    def fit_transform(self, X, y=None):
        '''Calculates and adds comment body length and account activity (based on frequency of comment author)
        as features. Returns a new dataframe with the added columns.'''
        data = X.copy()
        data["body_len"] = data.comment_body.apply(lambda x: len(x))
        data["acc_activity"] = data.author_ids.map(data.author_ids.value_counts())
        data["is_premium"] = data.is_premium.astype(int)
        return data.filter(items=["ups", "comment_karma", "link_karma", "is_premium", "comment_age_days", "acc_age_days", "body_len", "acc_activity"], axis=1)

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
        ('filter_add', filter_add_attributes()),
        ('scaler', StandardScaler()),
    ])

X_train = pipeline.fit_transform(train_data)
X_test = pipeline.transform(test_data)
y_train = train_data["gildings"].to_list()
y_test = test_data["gildings"].to_list()

In [21]:
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
import numpy as np

def train_eval(clf, X, y):
    '''Takes in train and train datasets along a model to train and evaluate. Returns f1 and roc-auc scores.'''
    rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=42)
    scores = cross_validate(clf, X, y, cv=rskf, scoring=['f1', 'roc_auc'])
    return scores['test_f1'], scores['test_roc_auc']

clf = LogisticRegression()
f1, roc = train_eval(clf, X_train, y_train)
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}")

F1 score is: 0.14332273886338903
ROC-AUC is: 0.8478763439152004


In [7]:
print(train_data.gildings.value_counts())

0    510995
1       995
Name: gildings, dtype: int64
0    127749
1       249
Name: gildings, dtype: int64


Due to the high class imbalance, F1 score is a much better metric to use than just accuracy (since 99% of the data belongs to class 0). We will also have ROC-AUC for comparison.

The low score is to expected. Let's try using the class weight functionality in Sklearn that assigns weights to each class based on their frequency.

In [27]:
clf = LogisticRegression(class_weight='balanced')
f1, roc = train_eval(clf, X_train, y_train)
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}")

F1 score is: 0.0734904371963277
ROC-AUC is: 0.9119952192683487


Balanced class weight results in a worse f1 score; let's try with bunch of different weight values. We will use F1 score for refitting (finding the best parameters) as ROC-AUC seems to be too optimistic.

In [38]:
from sklearn.model_selection import GridSearchCV

def train_eval_weights(X_train, y_train):
    '''Function to train and evalute Logistic Regression (w/ Cross Validation) Model.'''
    model = LogisticRegression()
    parameters = {'class_weight':[{0:1,1:1}, {0:1,1:10}, {0:1,1:100}, {0:1,1:1000}, {0:1,1:10000}, {0:10,1:1}]}
    rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=42)
    clf = GridSearchCV(model, parameters, scoring=['f1', 'roc_auc'], cv=rskf, refit='f1')
    clf.fit(X_train, y_train)
    return clf.cv_results_['mean_test_f1'], clf.cv_results_['mean_test_roc_auc'], clf.best_params_

f1, roc, best = train_eval_weights(X_train, y_train)
print(f"F1 score is: {np.mean(f1)}")
print(f"ROC-AUC is: {np.mean(roc)}")
print(f"Best class weights: {best}")

F1 score is: 0.12137251216239651
ROC-AUC is: 0.8801967569473819
Best class weights: {'class_weight': {0: 1, 1: 10}}


We are making progress, but can we do even better?

Adjusting the weights were not enough, we will have to try different sampling techniques. We can use imbalanced-learn library for this. Let's start with oversampling class 1 (gilded) with SMOTE and ADASYN techniques with a 0.1 ratio (~10% increase in gilded class).

Read more: https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html#a-practical-guide

In [44]:
from imblearn.over_sampling import SMOTE, ADASYN

#Using SMOTE to generate samples in gilded class

smote = SMOTE(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = LogisticRegression(class_weight=[{0: 1, 1: 10}])
f1, roc = train_eval(clf, X_resampled, y_resampled)
print(f"Smote - F1 score is: {np.mean(f1)}")
print(f"Smote - ROC-AUC is: {np.mean(roc)}")

#Using ADASYN to generate samples in gilded class

ada = ADASYN(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = ada.fit_resample(X_train, y_train)
assert len(X_resampled) == len(y_resampled)
clf = LogisticRegression(class_weight=[{0: 1, 1: 10}])
f1, roc = train_eval(clf, X_resampled, y_resampled)
print(f"ADASYN - F1 score is: {np.mean(f1)}")
print(f"ADASYN - ROC-AUC is: {np.mean(roc)}")

Smote - F1 score is: 0.6097897316663436
Smote - ROC-AUC is: 0.8961593223446982
ADASYN - F1 score is: 0.557028301538202
ADASYN - ROC-AUC is: 0.8835737972917235
