### Load data

In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
from itertools import chain
from collections import Counter

In [2]:
# Load the raw splits, indexed by the `ex_id` column
train = pd.read_csv('../data/train.csv', index_col='ex_id')
val = pd.read_csv('../data/dev.csv', index_col='ex_id')
test = pd.read_csv('../data/test_no_label.csv', index_col='ex_id')


def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(path, "rb") as fh:
        return pkl.load(fh)


# Load tokenized data (presumably one token sequence per example -- the
# flattening below treats each entry as an iterable of tokens)
train_data_tokens = _load_pickle("../data/tokens/train_data_tokens.pkl")
val_data_tokens = _load_pickle("../data/tokens/val_data_tokens.pkl")
test_data_tokens = _load_pickle("../data/tokens/test_data_tokens.pkl")

# Flat stream of every training token, used to build the vocabulary
all_train_tokens = list(chain.from_iterable(train_data_tokens))

# Get labels
y_train = train.label.values
y_val = val.label.values
# NOTE(review): the file is named test_no_label.csv -- confirm it really
# carries a `label` column, otherwise this line raises AttributeError.
y_test = test.label.values

# Vocab
def build_vocab(all_tokens, threshold):
    """Build token <-> id lookup tables from a flat sequence of tokens.

    Parameters
    ----------
    all_tokens : iterable of hashable
        Every token occurrence in the corpus (duplicates included).
    threshold : int
        Minimum number of occurrences (inclusive) a token needs in
        ``all_tokens`` to be kept in the vocabulary.

    Returns
    -------
    token2id : dict
        Maps each kept token to its integer index.
    id2token : list
        Kept tokens, positioned so that ``id2token[token2id[t]] == t``.
    """
    # Count the tokens that were actually passed in; the previous version
    # ignored this argument and always counted the global training tokens.
    counts = Counter(all_tokens)
    vocab = [token for token, count in counts.items() if count >= threshold]
    id2token = vocab
    token2id = {token: idx for idx, token in enumerate(vocab)}
    return token2id, id2token

# Tokens occurring fewer than MIN_TOKEN_COUNT times in the training corpus
# are left out of the vocabulary.
MIN_TOKEN_COUNT = 20
token2id, id2token = build_vocab(all_train_tokens, threshold=MIN_TOKEN_COUNT)

In [3]:
# TFIDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy(doc):
    """Identity function: hand ``doc`` back unchanged.

    Supplied to ``TfidfVectorizer`` as both ``preprocessor`` and
    ``tokenizer`` so that neither step alters the input documents.
    """
    return doc

# The vectorizer's own preprocessing and tokenization are replaced with
# identity functions, so each document is consumed exactly as loaded.
# token_pattern=None: the regex is never used once a tokenizer is supplied,
# and leaving the default in place makes scikit-learn emit a UserWarning.
tfidf_vec = TfidfVectorizer(
    lowercase=False,
    preprocessor=dummy,
    tokenizer=dummy,
    token_pattern=None,
    vocabulary=token2id,
)

# Fit the IDF weights on the training split only, then reuse them for val/test
X_train_tfidf = tfidf_vec.fit_transform(train_data_tokens)
X_val_tfidf = tfidf_vec.transform(val_data_tokens)
X_test_tfidf = tfidf_vec.transform(test_data_tokens)

### Evaluation

In [4]:
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.metrics import average_precision_score

def evaluate(model, X=None, y=None):
    """Print evaluation metrics for a fitted binary classifier.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict_proba``, ``predict`` and ``score``.
    X : feature matrix, optional
        Features to evaluate on. When omitted, the notebook's current
        ``X_val_tfidf`` is used.
    y : array-like, optional
        True labels matching ``X``. When omitted, the notebook's current
        ``y_val`` is used.

    Returns
    -------
    None
        Results are printed: ``model.score`` (labelled "Accuracy"), ROC AUC
        and average precision computed from column 1 of ``predict_proba``,
        and the confusion matrix of ``model.predict``.
    """
    # Resolve the defaults at call time. Binding them in the signature would
    # freeze whatever X_val_tfidf / y_val pointed to when this cell was run,
    # silently evaluating on stale data if the features are recomputed later.
    if X is None:
        X = X_val_tfidf
    if y is None:
        y = y_val

    # Score of the second class column, used as the ranking score
    y_scores = model.predict_proba(X)[:, 1]
    print('Accuracy: ', model.score(X, y))
    print('AUC: ', roc_auc_score(y, y_scores))
    print('AP: ', average_precision_score(y, y_scores))
    print('\nConfusion Matrix')
    print(confusion_matrix(y, model.predict(X)))

### Logistic Regression Baseline

In [5]:
from sklearn.linear_model import LogisticRegression

In [6]:
# Baseline: L2-penalised logistic regression fitted on the original
# (non-resampled) TF-IDF training matrix.
lr_base = LogisticRegression(
    penalty='l2',
    C=0.4,
    class_weight=None,
    max_iter=1000,
).fit(X_train_tfidf, y_train)

evaluate(lr_base, X_val_tfidf, y_val)

Accuracy:  0.8981012305807673
AUC:  0.7250515498684347
AP:  0.21535622828032727

Confusion Matrix
[[32245    25]
 [ 3635    13]]


In [7]:
# Dimensions of the TF-IDF training matrix before any resampling
X_train_tfidf.shape

(250874, 14765)

### Random Oversampling

In [8]:
from imblearn.over_sampling import RandomOverSampler
# Random oversampling of the training set with imbalanced-learn's defaults.
# A fixed seed makes the resampled training set (and every metric derived
# from it) reproducible across kernel restarts.
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train_tfidf, y_train)

In [9]:
# Logistic regression (C=0.4) trained on the randomly oversampled data
lr_ros = LogisticRegression(max_iter=1000, C=0.4)
lr_ros.fit(X=X_train_ros, y=y_train_ros)

LogisticRegression(C=0.4, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Score the random-oversampling model on the validation split
evaluate(lr_ros, X=X_val_tfidf, y=y_val)

Accuracy:  0.6830558494348238
AUC:  0.7190135681870077
AP:  0.209044069832062

Confusion Matrix
[[22257 10013]
 [ 1371  2277]]


In [11]:
# Dimensions of the training matrix after random oversampling
X_train_ros.shape

(450110, 14765)

# Oversampling without rescaling

### SMOTE

In [12]:
from imblearn.over_sampling import SMOTE
# SMOTE oversampling of the training set; the seed fixes which synthetic
# samples are generated so the run is reproducible.
X_train_smote, y_train_smote = SMOTE(random_state=42).fit_resample(X_train_tfidf, y_train)

In [13]:
# Logistic regression (C=0.4) trained on the SMOTE-resampled data
lr_smote = LogisticRegression(max_iter=1000, C=0.4)
lr_smote.fit(X=X_train_smote, y=y_train_smote)

LogisticRegression(C=0.4, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
# Score the SMOTE model on the validation split
evaluate(lr_smote, X=X_val_tfidf, y=y_val)

Accuracy:  0.7672476195779275
AUC:  0.6881618405082663
AP:  0.18530665233073135

Confusion Matrix
[[26080  6190]
 [ 2170  1478]]


In [15]:
# Dimensions of the training matrix after SMOTE resampling
X_train_smote.shape

(450110, 14765)

### ADASYN

In [16]:
from imblearn.over_sampling import ADASYN
# ADASYN oversampling of the training set; the seed fixes which synthetic
# samples are generated so the run is reproducible.
X_train_ADASYN, y_train_ADASYN = ADASYN(random_state=42).fit_resample(X_train_tfidf, y_train)

In [17]:
# Logistic regression (C=0.4) trained on the ADASYN-resampled data
lr_ADASYN = LogisticRegression(max_iter=1000, C=0.4)
lr_ADASYN.fit(X=X_train_ADASYN, y=y_train_ADASYN)

LogisticRegression(C=0.4, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
# Score the ADASYN model on the validation split
evaluate(lr_ADASYN, X=X_val_tfidf, y=y_val)

Accuracy:  0.766217495406203
AUC:  0.6876638663157351
AP:  0.18325155219905132

Confusion Matrix
[[26028  6242]
 [ 2155  1493]]


In [19]:
# Dimensions of the training matrix after ADASYN resampling
X_train_ADASYN.shape

(451900, 14765)