In [10]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics

In [18]:
# Read the train and test splits from the local data/ directory.
train = pd.read_csv('data/train.csv')
# The first column of the test file is discarded by name right after loading
# (presumably a saved index column -- confirm against the CSV).
test = pd.read_csv('data/test.csv').pipe(
    lambda frame: frame.drop(columns=[frame.columns[0]])
)

In [4]:
# Add a 'clean' column: 1 minus the row-wise maximum over the label columns,
# i.e. 1 when every label is 0 and 0 as soon as any label is 1 (labels are 0/1
# according to the summary below).
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
has_any_label = train[list_classes].max(axis=1)
train['clean'] = 1 - has_any_label
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# tokenizer used when building the n-gram bag of words
import re, string

# One capturing group that matches a single punctuation character: every ASCII
# punctuation mark plus a handful of typographic symbols.
# string.punctuation is passed through re.escape() because it is interpolated
# into a character class: unescaped, its backslash would escape the following
# ']' and a literal backslash would never be matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into tokens, treating each punctuation mark as its own token.

    Every character matched by ``re_tok`` is surrounded with spaces, then the
    string is split on whitespace.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; empty list for empty/blank input.
    """
    spaced = re_tok.sub(r' \1 ', s)
    return spaced.split()


In [6]:
# feature engineering, using tf-idf
n = train.shape[0]
vec = TfidfVectorizer(
    ngram_range=(1, 2),          # unigrams and bigrams
    tokenizer=tokenize,
    min_df=3, max_df=0.9,
    strip_accents='unicode',
    # Real booleans instead of 1: recent scikit-learn releases validate
    # parameter types and reject integers for these flags.
    use_idf=True, smooth_idf=True, sublinear_tf=True,
)
# NOTE(review): the vocabulary and IDF weights are fitted on train AND test
# comments. Since the test labels are used for evaluation further down, this
# lets test-set text statistics influence the features -- consider fitting on
# train only.
all_comments = pd.concat([train['comment_text'], test['comment_text']])
vec.fit(all_comments)
# Transform each split separately with the shared, already-fitted vectorizer.
trn_term_doc = vec.transform(train['comment_text'])
test_term_doc = vec.transform(test['comment_text'])

In [7]:
# naive bayes equation
def pr(y_i, y):
    """Add-one-smoothed per-feature value for the rows labelled ``y_i``.

    Reads the module-level feature matrix ``x`` (it is not a parameter), so
    ``x`` must be assigned before the first call.

    Parameters
    ----------
    y_i : label value to select (the notebook calls this with 0 and 1).
    y : array of labels, compared element-wise against ``y_i`` to pick rows of ``x``.

    Returns
    -------
    (column sums of the selected rows of ``x`` + 1) / (number of selected rows + 1)
    """
    in_class = (y == y_i)
    feature_totals = x[in_class].sum(0)
    return (feature_totals + 1) / (in_class.sum() + 1)

In [8]:
# Short aliases for the TF-IDF matrices; `x` is the global that pr() reads.
x, test_x = trn_term_doc, test_term_doc

In [9]:
# for each label: fit a linear SVM on naive-Bayes-weighted features and
# record its test-set predictions (one column of `preds` per label)
preds = np.zeros((len(test), len(list_classes)))

for i, j in enumerate(list_classes):
    print('fit', j)
    train_y = train[j].values
    test_y = test[j].values
    # log of the ratio between the smoothed per-feature values of the
    # positive (label == 1) and negative (label == 0) rows
    r = np.log(pr(1,train_y) / pr(0,train_y))
    svm = LinearSVC(C=4, dual=False)
    # element-wise product of the features with r; each split is weighted
    # once and the result reused for fitting, predicting and scoring
    train_x_nb = x.multiply(r)
    test_x_nb = test_x.multiply(r)
    svm.fit(train_x_nb, train_y)
    # NOTE: predict() returns hard class labels, so `preds` holds 0/1
    # decisions rather than continuous scores.
    preds[:,i] = svm.predict(test_x_nb)
    # the recorded cell output lists train accuracy before test accuracy
    print("train accuracy :", svm.score(train_x_nb, train_y))
    print("test accuracy :", svm.score(test_x_nb, test_y))


fit toxic
train accuracy : 0.9996866598567409
test accuracy : 0.9302241956161471
fit severe_toxic
train accuracy : 0.99983079632264
test accuracy : 0.9905881617210218
fit obscene
train accuracy : 0.9997430610825275
test accuracy : 0.9627747725211845
fit threat
train accuracy : 0.9999498655770785
test accuracy : 0.9968887777117663
fit insult
train accuracy : 0.9996741262510105
test accuracy : 0.9608673900128201
fit identity_hate
train accuracy : 0.9999373319713482
test accuracy : 0.9904005503267566


In [20]:
# Ground-truth labels, selected by name so the column order is guaranteed to
# match the columns of `preds` (which are filled in `list_classes` order).
y_test = test[list_classes]

# Integer label matrix for the classification metrics. The builtin `int`
# replaces `np.int`, which has been removed from NumPy and raises
# AttributeError on current releases.
preds_int = np.round(preds).astype(int)

# NOTE(review): `preds` is filled from svm.predict(), i.e. hard 0/1 labels, so
# the ROC AUC and the ranking metrics below are computed on binary decisions;
# continuous scores (e.g. decision_function) would be needed for a true
# ranking evaluation.
print("roc auc score ", metrics.roc_auc_score(y_test, preds))
print("accuracy ", metrics.accuracy_score(y_test, preds_int))
print("precision score ", metrics.precision_score(y_test, preds_int,  average='weighted'))
print("recall score ", metrics.recall_score(y_test, preds_int,  average='weighted'))
print("f1 score ", metrics.f1_score(y_test, preds_int, average='weighted'))

print("coverage error ", metrics.coverage_error(y_test, preds))
print("label ranking average precision score ", metrics.label_ranking_average_precision_score(y_test, preds))
print("label ranking loss ", metrics.label_ranking_loss(y_test, preds))


roc auc score  0.7395866638623074
accuracy  0.8813045245614584
precision score  0.6276575337329509
recall score  0.6578547763666482
f1 score  0.6337277433800093
coverage error  0.43058378412182235
label ranking average precision score  0.9709949787894754
label ranking loss  0.04027899377755542
