In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.linear_model import SGDClassifier

In [2]:
train_data = pd.read_csv("../data/processed/train_data_baseline.csv")
test_data = pd.read_csv("../data/processed/test_data_baseline.csv")

In [3]:
class filter_add_attributes(BaseEstimator, TransformerMixin):
    '''Custom transformer based on Sklearn's classes.
    Takes in dataframe (train or test) and adds new features and returns
    a filtered version of the original train/test datasets.'''
    def fit(self, X, y=None):
        return self.fit_transform(X)
    def transform(self, X, y=None):
        return self.fit_transform(X)
    def fit_transform(self, X, y=None):
        '''Calculates and adds comment body length and account activity (based on frequency of comment author)
        as features. Returns a new dataframe with the added columns.'''
        data = X.copy()
        data["body_len"] = data.comment_body.apply(lambda x: len(x))
        data["acc_activity"] = data.author_ids.map(data.author_ids.value_counts())
        data["is_premium"] = data.is_premium.astype(int)
        return data.filter(items=["ups", "comment_karma", "link_karma", "is_premium", "comment_age_days", "acc_age_days", "body_len", "acc_activity"], axis=1)

In [4]:
pipeline = Pipeline([
        ('filter_add', filter_add_attributes()),
        ('scaler', StandardScaler()),
    ])

X_train = pipeline.fit_transform(train_data)
X_test = pipeline.transform(test_data)
y_train = train_data["gildings"].to_list()
y_test = test_data["gildings"].to_list()

In [5]:
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

In [18]:
# Defining the functions to get true positives...false negatives for Sklearn's make_scorer function
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
scoring = {'tp': make_scorer(tp), 'fp': make_scorer(fp), 
           'fn': make_scorer(fn), 'roc_auc': make_scorer(roc_auc_score),
          'f1': make_scorer(f1_score)}

def train(clf, X, y):
    '''Takes in train and train datasets along a model to train and evaluate. Returns f1 and roc-auc scores.
    
    Arguments:
        clf: classifier to use
        X: Dataframe corresponding to training set
        y: Targets corresponding to the training set
    Returns:
        Cross_validate output.
    '''
    rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=42)
    scores = cross_validate(clf, X, y, cv=rskf, scoring=scoring, n_jobs=-1)
    return scores

def calc_metrics(scores):
    '''Takes in cross_validate outputs, gets arrays of true positives, false negatives and false positives 
    (corresponding to class 1; gilded) across k folds and calculates the f1 score. Calculates the mean of
    roc_auc scores from k folds. Returns both f1 and roc_auc scores.
    
    Arguments:
        Cross_validate outputs
    Returns:
        F1 score, calculated using the totals, as well as ROC_AUC score from scores.
    '''
    tps, fns, fps, roc = scores['test_tp'], scores['test_fn'], scores['test_fp'], scores['test_roc_auc']
    tp = np.sum(tps)
    fn = np.sum(fns)
    fp = np.sum(fps)
    f1 = (2*tp)/(2*tp+fn+fp)
    return f1, np.mean(scores['test_roc_auc'])

Regular SVM doesn't scale well to our particular problem (due to the size of the dataset and limited resources available). So, we will use Stochastic Gradient Descent with hinge loss (equates to SVM) instead. Let's try with the default parameters first:

In [7]:
clf = SGDClassifier(loss="hinge",random_state=42)
scores = train(clf, X_train, y_train)
f1, roc = calc_metrics(scores)
print(f"F1 score is: {f1}")
print(f"ROC-AUC is: {roc}")

F1 score is: 0.07898320472083523
ROC-AUC is: 0.5217944162821552


That's the baseline. Let's try assigning weights to class. As with logistic regression, we will start off with balanced class weights.

In [8]:
clf = SGDClassifier(loss="hinge",random_state=42, class_weight='balanced')
scores = train(clf, X_train, y_train)
f1, roc = calc_metrics(scores)
print(f"F1 score is: {f1}")
print(f"ROC-AUC is: {roc}")

F1 score is: 0.018119683335791906
ROC-AUC is: 0.7924024522725649


We will reuse the functions from Logistic Regression notebook to test SGD on various parameters.

In [15]:
def train_grid(X_train, y_train, parameters):
    '''Function to train and find the best hyperparameters for SVM (SGD).
    Uses GridSearch and crossvalidation for tuning.
    
    Arguments:
        X_train: Training set (pandas Dataframe)
        y_train: Targets corresponding to the training set
        parameters: Parameters to test/run GridSearch with.
        
    Returns:
        Classifier, which is used to parse the outputs/metrics.
    '''
    model = SGDClassifier(loss="hinge", random_state=42, max_iter=2000)
    rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=42)
    clf = GridSearchCV(model, parameters, scoring=scoring, cv=rskf, refit='f1', n_jobs=-1)
    clf.fit(X_train, y_train)
    return clf

def calc_metrics_grid(clf):
    '''Arguments:
        Classifier which is used to parse all the relevant outputs
    Returns: 
        F1 score, calculated with True Positives, False Positives and False Negatives across k (k=20) folds
        ROC_AUC score, for the best parameters
    '''
    ind = clf.best_index_
    tp_list = ['split' + str(i) + '_test_tp' for i in range(20)]
    fp_list = ['split' + str(i) + '_test_fp' for i in range(20)]
    fn_list = ['split' + str(i) + '_test_fn' for i in range(20)]
    tps, fps, fns = 0, 0, 0
    for tp in tp_list:
        tps+= clf.cv_results_[tp][ind]
    for fp in fp_list:
        fps+= clf.cv_results_[fp][ind]
    for fn in fn_list:
        fns+= clf.cv_results_[fn][ind]
    f1 = (2*tps)/(2*fps+fns+fps)
    return f1, clf.cv_results_['mean_test_roc_auc'][ind]

In [16]:
parameters = {'class_weight':[{0:1,1:1}, {0:1,1:10}, {0:1,1:100}, {0:10,1:1}]}
clf = train_grid(X_train, y_train, parameters)
best = clf.best_params_
f1, roc = calc_metrics_grid(clf)
print(f"F1 score is: {f1}")
print(f"ROC-AUC is: {roc}")
print(f"Best class weights: {best}")

F1 score is: 0.14700104010169884
ROC-AUC is: 0.6586455337266552
Best class weights: {'class_weight': {0: 1, 1: 10}}


Worse F1 score (0.14 vs 0.21) compared to our results from weighted Logistic regression (same best weights) but comparable ROC-AUC score (0.65 vs 0.64).

SGD uses 2000 iterations (due to no convergence in the next block; just to check if we can get better results).

This may suggest that SVM is not a good model for our problem. However before we make any conclusions, let's 
fit the model with several other values for hyperparameters (such as using l1 norm or elastic net). We will also experiment with different values for alpha (regularization strength).

Note: l1_ratio is only used for elasticnet.

In [17]:
parameters = {'penalty':['l2', 'l1', 'elasticnet'], 'l1_ratio': [0.15, 0.30], 
              'alpha' : [0.0001, 0.001, 0.01, 0.1] , 'class_weight':[{0:1,1:10}]}
clf = train_grid(X_train, y_train, parameters)
best = clf.best_params_
f1, roc = calc_metrics_grid(clf)
print(f"F1 score is: {f1}")
print(f"ROC-AUC is: {roc}")
print(f"Best class weights: {best}")

F1 score is: 0.20868322315526175
ROC-AUC is: 0.6461119840565462
Best class weights: {'alpha': 0.01, 'class_weight': {0: 1, 1: 10}, 'l1_ratio': 0.15, 'penalty': 'l2'}


We can use these hyperparameter values for sampling methods; we only need to set the class weight and alpha as SGD uses l2 norm (penalty) as default. For comparison, we will also run the models with default alpha value (0.001).

We will start with RandomOverSampler to duplicates records from the minority class. We will use a sampling ratio of 0.1 (i.e. ~10% increase in gilded class).

As with Logistic Regression, we will try experimenting with oversampling techniques. Let's go try RandomOverSampler, SMOTE and ADASYN first.

RandomOverSampler duplicates samples belonging to minority class (gilded) while SMOTE and ADASYN creates synthentic samples that are similar to true ones.

In [20]:
random_sampler = RandomOverSampler(sampling_strategy=0.1, random_state=42)
smote = SMOTE(sampling_strategy=0.1, random_state=42)
ada = ADASYN(sampling_strategy=0.1, random_state=42)
models = [random_sampler, smote, ada]
model_names = ["Random OverSampler", "SMOTE", "ADASYN"]
alpha_vals = [0.001, 0.01]

for i in range(len(models)):
    X_resampled, y_resampled = models[i].fit_resample(X_train, y_train)
    assert len(X_resampled) == len(y_resampled)
    for j in range(len(alpha_vals)):
        clf = SGDClassifier(loss="hinge", class_weight={0: 1, 1: 10}, alpha=alpha_vals[j], random_state=42)
        scores = train(clf, X_resampled, y_resampled)
        f1, roc = calc_metrics(scores)
        print(f"{model_names[i]}. {alpha_vals[j]}:")
        print(f"F1 score is: {f1}")
        print(f"ROC-AUC is: {roc}")
        print("\n")

Random OverSampler. 0.001:
F1 score is: 0.7301899218218899
ROC-AUC is: 0.8278357489175485


Random OverSampler. 0.01:
F1 score is: 0.7290762216545105
ROC-AUC is: 0.8203429769984393


SMOTE. 0.001:
F1 score is: 0.7237177851492728
ROC-AUC is: 0.8242999468971522


SMOTE. 0.01:
F1 score is: 0.7205503573713654
ROC-AUC is: 0.8150874355518173


ADASYN. 0.001:
F1 score is: 0.696385671963372
ROC-AUC is: 0.8075312297638717


ADASYN. 0.01:
F1 score is: 0.6887483633729838
ROC-AUC is: 0.7952816823356728




Not as big of a difference between 0.001 and 0.01 as reg strength.

SMOTE can generate noisy samples (ex: when classes cannot be well separated). In such cases, Imbalanced learn recommends combining oversampling with undersampling the majority class. This can be done through SMOTETomek and SMOTEENN.

Ref: https://imbalanced-learn.readthedocs.io/en/stable/auto_examples/combine/plot_comparison_combine.html

In [21]:
smote_tomek = SMOTETomek(sampling_strategy=0.1, random_state=42)
smote_enn = SMOTEENN(sampling_strategy=0.1, random_state=42)
models = [smote_tomek, smote_enn]
model_names = ["SMOTE TOMEK", "SMOTE ENN"]
alpha_vals = [0.001, 0.01]

for i in range(len(models)):
    X_resampled, y_resampled = models[i].fit_resample(X_train, y_train)
    assert len(X_resampled) == len(y_resampled)
    for j in range(len(alpha_vals)):
        clf = SGDClassifier(loss="hinge", class_weight={0: 1, 1: 10}, alpha=alpha_vals[j], random_state=42)
        scores = train(clf, X_resampled, y_resampled)
        f1, roc = calc_metrics(scores)
        print(f"{model_names[i]}. {alpha_vals[j]}:")
        print(f"F1 score is: {f1}")
        print(f"ROC-AUC is: {roc}")
        print("\n")

SMOTE TOMEK. 0.001:
F1 score is: 0.72427746895832
ROC-AUC is: 0.824775953000174


SMOTE TOMEK. 0.01:
F1 score is: 0.7212362485420306
ROC-AUC is: 0.8156752106861175


SMOTE ENN. 0.001:
F1 score is: 0.7594824714797236
ROC-AUC is: 0.8581641569754759


SMOTE ENN. 0.01:
F1 score is: 0.7514495425143278
ROC-AUC is: 0.8330756751889987




As with Logistic regression, SMOTEENN and RandomOverSampler produces the best results (though the results are much closer with SGD; note that SMOTE results are very close as well). Let's try evaluating on our test set (for both f1 score and accuracy) with 0.001 as the regularization strength (default value).

In [22]:
random_sampler = RandomOverSampler(sampling_strategy=0.1, random_state=42)
smote = SMOTE(sampling_strategy=0.1, random_state=42)
smote_enn = SMOTEENN(sampling_strategy=0.1, random_state=42)
models = [random_sampler, smote, smote_enn]
model_names = ["Random Oversampling", "SMOTE", "SMOTE ENN"]

for i in range(len(models)):
    parameters = {'class_weight':[{0:1,1:10}]}
    X_resampled, y_resampled = models[i].fit_resample(X_train, y_train)
    grid_res = train_grid(X_resampled, y_resampled, parameters)
    y_preds = grid_res.predict(X_test)
    print(f"{model_names[i]} on test set:")
    print(f"F1 score: {f1_score(y_test, y_preds)}")
    print(f"ROC_AUC score: {roc_auc_score(y_test, y_preds)}")
    print(f"Balanced accuracy score: {balanced_accuracy_score(y_test, y_preds)}")
    print("\n")

Random Oversampling on test set:
F1 score: 0.12220566318926976
ROC_AUC score: 0.8204287454870794
Balanced accuracy score: 0.8204287454870794


SMOTE on test set:
F1 score: 0.1163039600428113
ROC_AUC score: 0.8179510423630978
Balanced accuracy score: 0.8179510423630978


SMOTE ENN on test set:
F1 score: 0.08817733990147783
ROC_AUC score: 0.8452223755411945
Balanced accuracy score: 0.8452223755411944




All three methods have achieved better F1 scores and comparable ROC_AUC/balanced accuracy scores than Logistic Regression (0.06). We will try Decision Trees next.