### Load data

In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
from itertools import chain
from collections import Counter

In [2]:
# Load data
train = pd.read_csv('../data/train.csv', index_col='ex_id')
val = pd.read_csv('../data/dev.csv', index_col='ex_id')
test = pd.read_csv('../data/test_no_label.csv', index_col='ex_id')

# Load tokenized data
train_data_tokens = pkl.load(open("../data/tokens/train_data_tokens.pkl", "rb"))
val_data_tokens = pkl.load(open("../data/tokens/val_data_tokens.pkl", "rb"))
test_data_tokens = pkl.load(open("../data/tokens/test_data_tokens.pkl", "rb"))

all_train_tokens = list(chain.from_iterable(train_data_tokens))

# Get labels
y_train = train.label.values
y_val = val.label.values
y_test = test.label.values

In [3]:
# Vocab
def build_vocab(all_tokens, threshold):

    c = Counter(all_tokens)
    vocab = [word for count, word in enumerate(Counter(all_train_tokens)) if count >= 10]
    
    id2token = vocab
    token2id = dict(zip(vocab, range(len(vocab))))
    return token2id, id2token

token2id, id2token = build_vocab(all_train_tokens, 10)

In [4]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

def dummy(doc):
    return doc

count_vec = CountVectorizer(lowercase=False, preprocessor=dummy, tokenizer=dummy, vocabulary=token2id)

X_train_count = count_vec.fit_transform(train_data_tokens)
X_val_count = count_vec.transform(val_data_tokens)
X_test_count = count_vec.transform(test_data_tokens)

In [5]:
# TFIDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy(doc):
    return doc

tfidf_vec = TfidfVectorizer(lowercase=False, preprocessor=dummy, tokenizer=dummy, vocabulary=token2id)  

X_train_tfidf = tfidf_vec.fit_transform(train_data_tokens)
X_val_tfidf = tfidf_vec.transform(val_data_tokens)
X_test_tfidf = tfidf_vec.transform(test_data_tokens)

---

### Evaluation

In [6]:
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.metrics import average_precision_score

def evaluate(model, X=X_val_count, y=y_val):
    y_scores = model.predict_proba(X)[:, 1]
    print('Accuracy: ', model.score(X, y))
    print('AUC: ', roc_auc_score(y, y_scores))
    print('AP: ', average_precision_score(y, y_scores))
    print('\nConfusion Matrix')
    print(confusion_matrix(y, model.predict(X)))

### Logistic Regression Baseline

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
lr_0 = LogisticRegression(penalty='l2', C=0.1, class_weight=None, max_iter=1000)
lr_0.fit(X_train_count, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
evaluate(lr_0, X_val_count, y_val)

Accuracy:  0.8972938359596859
AUC:  0.7153645451073453
AP:  0.20165813686684972

Confusion Matrix
[[32193    77]
 [ 3612    36]]


### Random Sample

In [10]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
X_train_ros, y_train_ros = ros.fit_resample(X_train_count, y_train)

In [11]:
lr_ros = LogisticRegression(penalty='l2', C=0.1, class_weight=None, max_iter=1000)
lr_ros.fit(X_train_ros, y_train_ros)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
evaluate(lr_ros)

Accuracy:  0.6541010078512166
AUC:  0.7042944349077683
AP:  0.1910089446621789

Confusion Matrix
[[21123 11147]
 [ 1277  2371]]


In [45]:
X_train_count.shape

(250874, 114041)

In [46]:
X_train_ros.shape

(450110, 114041)

# Oversample after rescale

In [13]:
from sklearn import preprocessing

In [None]:
# X_train_count_scaled = preprocessing.scale(X_train_count, axis=0, with_mean=False)

In [39]:
transformer = preprocessing.MaxAbsScaler(copy=True)
X_train_count_scaled = transformer.fit_transform(X_train_count)

### SMOTE (scaled)

In [40]:
from imblearn.over_sampling import SMOTE, ADASYN
X_train_smote_scaled, y_train_smote_scaled = SMOTE().fit_resample(X_train_count_scaled, y_train)

In [41]:
lr_smote_scaled = LogisticRegression(penalty='l2', C=0.1, class_weight=None, max_iter=1000)
lr_smote_scaled.fit(X_train_smote_scaled, y_train_smote_scaled)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
# MaxAbsScaler
X_val_count_scaled = transformer.transform(X_val_count)
evaluate(lr_smote_scaled, X=X_val_count_scaled)

Accuracy:  0.6605044824322067
AUC:  0.703046602746019
AP:  0.19775028449376478

Confusion Matrix
[[21420 10850]
 [ 1344  2304]]


In [33]:
# Standard scaler withoutmean
evaluate(lr_smote_scaled)

Accuracy:  0.30728325630603043
AUC:  0.6956765558146996
AP:  0.19001864481295583

Confusion Matrix
[[ 7619 24651]
 [  230  3418]]


In [47]:
X_train_smote_scaled.shape

(450110, 114041)

# OLD (without rescale)

### SMOTE

In [21]:
from imblearn.over_sampling import SMOTE, ADASYN
X_train_smote, y_train_smote = SMOTE().fit_resample(X_train_count, y_train)

In [22]:
lr_smote = LogisticRegression(penalty='l2', C=0.1, class_weight=None, max_iter=1000)
lr_smote.fit(X_train_smote, y_train_smote)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
evaluate(lr_smote)

Accuracy:  0.7778272732334762
AUC:  0.679911011598954
AP:  0.18556007964545324

Confusion Matrix
[[26573  5697]
 [ 2283  1365]]


### SMOTE (array)

In [None]:
from imblearn.over_sampling import SMOTE
X_train_smote_2, y_train_smote_2 = SMOTE().fit_resample(X_train_count.toarray(), y_train)

In [None]:
lr_smote_2 = LogisticRegression(penalty='l2', C=0.1, class_weight=None, max_iter=1000)
lr_smote_2.fit(X_train_smote, y_train_smote)

In [None]:
evaluate(lr_smote_2)

### ADASYN

In [24]:
from imblearn.over_sampling import SMOTE, ADASYN

In [25]:
X_train_ADASYN, y_train_ADASYN = ADASYN().fit_resample(X_train_count, y_train)

In [28]:
lr_ADASY = LogisticRegression(penalty='l2', C=0.1, class_weight=None, max_iter=1000)
lr_ADASY.fit(X_train_ADASYN, y_train_ADASYN)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
evaluate(lr_ADASY)

Accuracy:  0.776240325185144
AUC:  0.6784568185648503
AP:  0.18410894162718627

Confusion Matrix
[[26526  5744]
 [ 2293  1355]]


### SMOTENC (失败)

In [35]:
from imblearn.over_sampling import SMOTENC

In [None]:
smote_nc = SMOTENC(categorical_features=np.ones(X_train_count.shape[1], dtype='bool'))
X_train_smote_nc, y_train_smote_nc = smote_nc.fit_resample(X_train_count, y_train)

In [47]:
np.ones(X_train_count.shape[1], dtype='bool')

array([ True,  True,  True, ...,  True,  True,  True])

In [42]:
X_train_count.toarray().shape

(250874, 114041)