### Load data

In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
from itertools import chain
from collections import Counter

In [2]:
# Load the raw splits, indexed by example id
train = pd.read_csv('../data/train.csv', index_col='ex_id')
val = pd.read_csv('../data/dev.csv', index_col='ex_id')
test = pd.read_csv('../data/test_no_label.csv', index_col='ex_id')

# Load tokenized data. Context managers close each file handle as soon as the
# pickle has been read instead of leaving it to the garbage collector.
# NOTE: pickle can execute arbitrary code on load -- only open trusted files.
with open("../data/tokens/train_data_tokens.pkl", "rb") as token_file:
    train_data_tokens = pkl.load(token_file)
with open("../data/tokens/val_data_tokens.pkl", "rb") as token_file:
    val_data_tokens = pkl.load(token_file)
with open("../data/tokens/test_data_tokens.pkl", "rb") as token_file:
    test_data_tokens = pkl.load(token_file)

# Flatten the per-document token lists into one stream for vocabulary counting
all_train_tokens = list(chain.from_iterable(train_data_tokens))

# Get labels
y_train = train.label.values
y_val = val.label.values
# NOTE(review): the file is named test_no_label.csv yet a `label` column is
# read from it -- confirm the column exists and what it contains.
y_test = test.label.values

# Vocab
def build_vocab(all_tokens, threshold):
    """Build a vocabulary of the tokens that occur at least `threshold` times.

    Args:
        all_tokens: iterable of hashable tokens (the flattened corpus).
        threshold: minimum number of occurrences (inclusive) for a token
            to be kept.

    Returns:
        A `(token2id, id2token)` tuple: `id2token` is the list of kept tokens
        in first-seen order, and `token2id` maps each kept token to its index
        in that list.
    """
    # Count the tokens that were passed in, not a module-level global.
    counts = Counter(all_tokens)
    id2token = [token for token, count in counts.items() if count >= threshold]
    token2id = {token: idx for idx, token in enumerate(id2token)}
    return token2id, id2token

# Keep only tokens that appear at least 20 times in the training corpus.
token2id, id2token = build_vocab(all_tokens=all_train_tokens, threshold=20)

In [3]:
# TFIDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy(doc):
    """Identity function: return `doc` exactly as received.

    Used as a no-op stand-in wherever a callable is required but the
    documents must pass through unchanged.
    """
    return doc

# Documents are already token lists, so the preprocessor and tokenizer are
# identity functions, lowercasing is off, and the vocabulary is fixed up front.
tfidf_vec = TfidfVectorizer(
    vocabulary=token2id,
    tokenizer=dummy,
    preprocessor=dummy,
    lowercase=False,
)

# Fit on the training split only, then reuse the fitted vectorizer for val/test.
X_train_tfidf = tfidf_vec.fit_transform(train_data_tokens)
X_val_tfidf, X_test_tfidf = (
    tfidf_vec.transform(split_tokens)
    for split_tokens in (val_data_tokens, test_data_tokens)
)

In [4]:
# Feature matrices: use the TF-IDF representation for every split.
X_train, X_val, X_test = X_train_tfidf, X_val_tfidf, X_test_tfidf

# Targets: the `label` column of each split.
y_train = train["label"].values
y_val = val["label"].values
y_test = test["label"].values

### Evaluation

In [5]:
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.metrics import average_precision_score

def evaluate(model, X=X_val_tfidf, y=y_val):
    """Print accuracy, ROC AUC, average precision and the confusion matrix.

    Args:
        model: fitted classifier exposing `predict_proba`, `predict` and
            `score`.
        X: feature matrix to evaluate on; defaults to the validation TF-IDF
            matrix as it was bound when this function was defined.
        y: true labels for `X`; defaults to the validation labels bound at
            definition time.

    Returns:
        None. All results are printed.
    """
    # Column 1 of predict_proba is used as the ranking score.
    positive_scores = model.predict_proba(X)[:, 1]

    # Each metric is computed lazily, right before it is printed.
    metric_table = (
        ('Accuracy: ', lambda: model.score(X, y)),
        ('AUC: ', lambda: roc_auc_score(y, positive_scores)),
        ('AP: ', lambda: average_precision_score(y, positive_scores)),
    )
    for label, compute in metric_table:
        print(label, compute())

    print('\nConfusion Matrix')
    hard_predictions = model.predict(X)
    print(confusion_matrix(y, hard_predictions))

### Logistic Regression Baseline

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
# Baseline: logistic regression fitted on the full training set.
lr_0 = LogisticRegression(max_iter=1000, C=0.4).fit(X_train, y_train)
lr_0

LogisticRegression(C=0.4, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
# Score the baseline on the default (validation) split.
evaluate(model=lr_0)

Accuracy:  0.8981012305807673
AUC:  0.7250515498684347
AP:  0.21535622828032727

Confusion Matrix
[[32245    25]
 [ 3635    13]]


### Random Downsample

In [9]:
from imblearn.under_sampling import RandomUnderSampler
# Seed the sampler so the downsampled training set, and every metric computed
# from it, is reproducible across kernel restarts.
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

In [10]:
# Same hyper-parameters as the baseline, trained on the randomly undersampled set.
lr_rus = LogisticRegression(max_iter=1000, C=0.4).fit(X_train_rus, y_train_rus)
lr_rus

LogisticRegression(C=0.4, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
# Score the random-undersampling model on the default (validation) split.
evaluate(model=lr_rus)

Accuracy:  0.649507210869202
AUC:  0.7196004050595577
AP:  0.2092126755668793

Confusion Matrix
[[20905 11365]
 [ 1224  2424]]


### NearMiss 1

In [12]:
from imblearn.under_sampling import NearMiss

In [13]:
# Undersample the training set with the NearMiss version 1 heuristic.
nm1 = NearMiss(version=1)
X_train_nm1, y_train_nm1 = nm1.fit_resample(X=X_train, y=y_train)

In [14]:
# Same hyper-parameters as the baseline, trained on the NearMiss-1 sample.
lr_nm1 = LogisticRegression(max_iter=1000, C=0.4).fit(X_train_nm1, y_train_nm1)
lr_nm1

LogisticRegression(C=0.4, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# Score the NearMiss-1 model on the default (validation) split.
evaluate(model=lr_nm1)

Accuracy:  0.4429255526476975
AUC:  0.5981573672182082
AP:  0.14475622277187325

Confusion Matrix
[[13311 18959]
 [ 1050  2598]]


---

### NearMiss 2 (这个跑不出来...)

跑了好几次，每次 kernel 都死掉。

In [None]:
# Undersample the training set with the NearMiss version 2 heuristic.
# NOTE: per the markdown note above, this cell has repeatedly killed the
# kernel -- presumably it runs out of memory; TODO confirm.
nm2 = NearMiss(version=2)
X_train_nm2, y_train_nm2 = nm2.fit_resample(X=X_train, y=y_train)

In [None]:
# Same hyper-parameters as the baseline, trained on the NearMiss-2 sample.
lr_nm2 = LogisticRegression(max_iter=1000, C=0.4).fit(X_train_nm2, y_train_nm2)
lr_nm2

In [None]:
# Score the NearMiss-2 model on the default (validation) split.
evaluate(model=lr_nm2)

### NearMiss 3

In [None]:
# Undersample the training set with the NearMiss version 3 heuristic.
nm3 = NearMiss(version=3)
X_train_nm3, y_train_nm3 = nm3.fit_resample(X=X_train, y=y_train)

In [None]:
# Same hyper-parameters as the baseline, trained on the NearMiss-3 sample.
lr_nm3 = LogisticRegression(max_iter=1000, C=0.4).fit(X_train_nm3, y_train_nm3)
lr_nm3

In [None]:
# Score the NearMiss-3 model on the default (validation) split.
evaluate(model=lr_nm3)