### Load data

In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
from itertools import chain
from collections import Counter

In [2]:
# Load data
train = pd.read_csv('../data/train.csv', index_col='ex_id')
val = pd.read_csv('../data/dev.csv', index_col='ex_id')
test = pd.read_csv('../data/test_no_label.csv', index_col='ex_id')

# Load tokenized data
train_data_tokens = pkl.load(open("../data/tokens/train_data_tokens.pkl", "rb"))
val_data_tokens = pkl.load(open("../data/tokens/val_data_tokens.pkl", "rb"))
test_data_tokens = pkl.load(open("../data/tokens/test_data_tokens.pkl", "rb"))

all_train_tokens = list(chain.from_iterable(train_data_tokens))

# Get labels
y_train = train.label.values
y_val = val.label.values
y_test = test.label.values

In [3]:
# Vocab
def build_vocab(all_tokens, threshold):

    c = Counter(all_tokens)
    vocab = [word for count, word in enumerate(Counter(all_train_tokens)) if count >= 10]
    
    id2token = vocab
    token2id = dict(zip(vocab, range(len(vocab))))
    return token2id, id2token

token2id, id2token = build_vocab(all_train_tokens, 10)

In [4]:
# TFIDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy(doc):
    return doc

tfidf_vec = TfidfVectorizer(lowercase=False, preprocessor=dummy, tokenizer=dummy, vocabulary=token2id)  

X_train_tfidf = tfidf_vec.fit_transform(train_data_tokens)
X_val_tfidf = tfidf_vec.transform(val_data_tokens)
X_test_tfidf = tfidf_vec.transform(test_data_tokens)

### Class Weights

In [5]:
from sklearn.utils.class_weight import compute_class_weight
weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=y_train)

In [6]:
weights

array([0.55736153, 4.85832139])

In [7]:
0.55736153 / 4.85832139

0.11472306693156006

### Evaluation

In [13]:
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.metrics import average_precision_score

def evaluate(model, X=X_val_tfidf, y=y_val, print_result=True):
    y_scores = model.predict_proba(X)[:, 1]
    
    auc_score = roc_auc_score(y, y_scores)
    ap_score = average_precision_score(y, y_scores)
    confusion_mx = confusion_matrix(y, model.predict(X))
    if print_result:
        print('Accuracy: ', model.score(X, y))
        print('AUC: ', auc_score)
        print('AP: ', ap_score)
        print('\nConfusion Matrix')
        print(confusion_mx)
    
    return

### Logistic Regression (class weight)

In [10]:
from sklearn.linear_model import LogisticRegression

lr_cw = LogisticRegression(C=0.4, class_weight='balanced', max_iter=1000)
lr_cw.fit(X_train_tfidf, y_train)

LogisticRegression(C=0.4, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
evaluate(lr_cw, X=X_val_tfidf, y=y_val)

Accuracy:  0.6829444846595022
AUC:  0.7223090008780084
AP:  0.20983320963089463

Confusion Matrix
[[22246 10024]
 [ 1364  2284]]
