In [1]:
import pandas as pd, numpy as np
import re, string
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [2]:
data = pd.read_feather('/home/nlashkarashvili/Documents/toxic_comments/data/comments.feather')

In [3]:
def tokenize(comment): 
    return re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])').sub(r' \1 ', comment).split()

def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

def get_mdl(y):
    log = np.log(pr(1,y) / pr(0,y))
    model = LogisticRegression(C=2., dual=False)
    x_tr = x.multiply(log)
    return model.fit(x_tr, y), log

In [4]:
stfold = StratifiedKFold(n_splits=5, shuffle=True)
train_l = list()
test_l = list()
cnt = 0
for train_index, test_index in stfold.split(data['comment'], data['label']):
    cnt += 1
    train = data.iloc[train_index]
    train_x, train_y = train['comment'], train['label']
    test = data.iloc[test_index]
    test_x, test_y = test['comment'], test['label']
    n = train.shape[0]
    vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
                   min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
                   smooth_idf=1, sublinear_tf=1 )
    trn_term_doc = vec.fit_transform(train_x)
    test_term_doc = vec.transform(test_x)
    x = trn_term_doc
    test_x = test_term_doc
    #auc score
    model, log = get_mdl(train_y)
    t_preds = model.predict_proba(x)[:, 1]
    preds = model.predict_proba(test_x.multiply(log))[:,1]
    train_auc = roc_auc_score(train_y, t_preds)
    test_auc = roc_auc_score(test_y, preds)
    
    #acc score
    t_preds = model.predict(x)
    preds = model.predict(test_x.multiply(log))
    train_acc = roc_auc_score(train_y, t_preds)
    test_acc = roc_auc_score(test_y, preds)
    train_l.append([train_auc, train_acc])
    test_l.append([test_auc, test_acc])

train_l = np.array(train_l)
test_l = np.array(test_l)

In [5]:
print('train auc:', np.mean(train_l[:, 0]), '+/-',  np.std(train_l[:, 0]))
print('train acc:', np.mean(train_l[:, 1]), '+/-', np.std(train_l[:, 1]))
print('test auc:', np.mean(test_l[:, 0]), '+/-', np.std(test_l[:, 0]))
print('test acc:', np.mean(test_l[:, 1]), '+/-',np.std(test_l[:, 1]))

train auc: 0.8735964313617121 +/- 0.006420402749834755
train acc: 0.6805239521040019 +/- 0.01513150829559083
test auc: 0.8851960131396087 +/- 0.007305758090887333
test acc: 0.8159775855515387 +/- 0.008168510842160217
