In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [3]:
def check_metrics(true_val, pred_val):
    print("Accuracy: ", accuracy_score(true_val, pred_val))
    print("Roc_Auc: ", roc_auc_score(true_val, pred_val))
    print("Precision: ", precision_score(true_val, pred_val), " - процент правильно токсичных")
    print("Recall: ", recall_score(true_val, pred_val), " - процент выбранных токсичных")

In [4]:
data = pd.read_csv("toxic_train.csv")

In [5]:
test_data = pd.read_csv("toxic_test.csv")

In [6]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,comment_text,is_toxic
0,0,Thank you for understanding. I think very high...,False
1,1,:Dear god this site is horrible.,False
2,2,"""::: Somebody will invariably try to add Relig...",False
3,3,""" \n\n It says it right there that it IS a typ...",False
4,4,""" \n\n == Before adding a new product to the l...",False


In [7]:
data.shape

(52061, 3)

In [8]:
train_texts = np.array(data['comment_text'])
test_texts = np.array(test_data['comment_text'])

In [9]:
for enum in range(train_texts.__len__()):
    train_texts[enum] = train_texts[enum].lower()
    train_texts[enum] = re.sub("[^0-9a-zA-Z]+", ' ', train_texts[enum])
    train_texts[enum] =  re.sub(' +', ' ', train_texts[enum])

In [10]:
train_texts[0]

'explanation why the edits made under my username hardcore metallica fan were reverted they weren t vandalisms just closure on some gas after i voted at new york dolls fac and please don t remove the template from the talk page since i m retired now 89 205 38 27'

In [11]:
for enum in range(test_texts.__len__()):
    test_texts[enum] = test_texts[enum].lower()
    test_texts[enum] = re.sub("[^0-9a-zA-Z]+", ' ', test_texts[enum])
    test_texts[enum] =  re.sub(' +', ' ', test_texts[enum])

In [12]:
test_texts[0]

'thank you for understanding i think very highly of you and would not revert without discussion '

In [13]:
train_target = np.array(data['is_toxic']).astype(int) * 2 - 1 
test_target = np.array(test_data['is_toxic']).astype(int) * 2 - 1

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
vect = CountVectorizer(min_df=100)

In [16]:
train_matrix = vect.fit_transform(train_texts)

In [17]:
test_matrix = vect.transform(test_texts)

In [18]:
from optimization import GDClassifier

In [19]:
classifier = GDClassifier(tolerance=0.000001,max_iter=1000,step_alpha=0.2, step_beta=0,l2_coef = 1)

In [20]:
%time classifier.fit(train_matrix, train_target)

CPU times: user 269 ms, sys: 55.5 ms, total: 324 ms
Wall time: 323 ms


In [21]:
%time res = classifier.predict(test_matrix)

CPU times: user 3.08 ms, sys: 2.9 ms, total: 5.99 ms
Wall time: 4.61 ms


In [22]:
check_metrics(test_target, res)

Accuracy:  0.7566744051073708
Roc_Auc:  0.6548771331436418
Precision:  0.6613418530351438  - процент правильно токсичных
Recall:  0.3978856319077367  - процент выбранных токсичных


In [23]:
%time probabilities = classifier.predict_proba(test_matrix)

CPU times: user 2.28 ms, sys: 1.24 ms, total: 3.52 ms
Wall time: 2.37 ms


In [24]:
ans = (probabilities > 0.5) * 2 - 1

In [25]:
check_metrics(test_target, ans)

Accuracy:  0.7689591797252854
Roc_Auc:  0.6622221127945984
Precision:  0.7132053519488074  - процент правильно токсичных
Recall:  0.39275989107800735  - процент выбранных токсичных


In [89]:
for i in range(31, 51):
    classifier = GDClassifier(step_alpha = i / 10, l2_coef = 0)
    classifier.fit(train_matrix, train_target)
    print("Current step_alpha: ", i)
    check_metrics(test_target, classifier.predict(test_matrix))

Current step_alpha:  31
Accuracy:  0.797736506094022
Roc_Auc:  0.8373993963657087
Precision:  0.6068429237947123  - процент правильно токсичных
Recall:  0.9375300336376742  - процент выбранных токсичных
Current step_alpha:  32
Accuracy:  0.7975430450764172
Roc_Auc:  0.8375335062445163
Precision:  0.6064589586999275  - процент правильно токсичных
Recall:  0.9384911100432485  - процент выбранных токсичных
Current step_alpha:  33
Accuracy:  0.7964306442251886
Roc_Auc:  0.8368276148354957
Precision:  0.6049752270850537  - процент правильно токсичных
Recall:  0.9388114688451065  - процент выбранных токсичных
Current step_alpha:  34
Accuracy:  0.795898626426775
Roc_Auc:  0.8366283311513842
Precision:  0.6042031523642732  - процент правильно токсичных
Recall:  0.9394521864488227  - процент выбранных токсичных
Current step_alpha:  35
Accuracy:  0.795269878119559
Roc_Auc:  0.8362234211982771
Precision:  0.6033737914009463  - процент правильно токсичных
Recall:  0.9396123658497517  - процент выб

In [77]:
classifier = GDClassifier(step_alpha = 0.33, l2_coef = 0)

In [78]:
%time classifier.fit(train_matrix, train_target)

CPU times: user 14.8 s, sys: 2.42 s, total: 17.2 s
Wall time: 17.2 s


In [79]:
check_metrics(test_target, classifier.predict(test_matrix))

Accuracy:  0.8361868833430064
Roc_Auc:  0.8344001527813834
Precision:  0.6902478017585931  - процент правильно токсичных
Recall:  0.829889476213359  - процент выбранных токсичных


In [73]:
check_metrics(test_target, classifier.predict(test_matrix))

Accuracy:  0.8361868833430064
Roc_Auc:  0.8339911309687845
Precision:  0.6907051282051282  - процент правильно токсичных
Recall:  0.8284478616049976  - процент выбранных токсичных


In [69]:
check_metrics(test_target, classifier.predict(test_matrix))

Accuracy:  0.8359934223254014
Roc_Auc:  0.8335798784307932
Precision:  0.6906417112299466  - процент правильно токсичных
Recall:  0.8274867851994233  - процент выбранных токсичных


In [105]:
cl_r = GDClassifier(step_alpha = 2.1, l2_coef = 0)
cl_r.fit(train_matrix, train_target)

In [106]:
check_metrics(test_target, cl_r.predict(test_matrix))

Accuracy:  0.8610466241052428
Roc_Auc:  0.8462530289172261
Precision:  0.75037147102526  - процент правильно токсичных
Recall:  0.8089059746916547  - процент выбранных токсичных


In [109]:
w = cl_r.get_weights()

In [110]:
cl_r_2 = GDClassifier(w_0 = w, step_alpha=2.1, l2_coef)

array([ 0.02002737, -0.22318307, -0.02371039, ...,  0.56831814,
        0.17425659, -0.02310614])

In [104]:
for i in range(11):
    classifier = GDClassifier(step_alpha = 1.5, step_beta=i / 10, l2_coef = 0)
    classifier.fit(train_matrix, train_target)
    print("Step Beta: ", i / 10)
    check_metrics(test_target, classifier.predict(test_matrix))

Step Beta:  0.0
Accuracy:  0.8574675952795512
Roc_Auc:  0.8509609582104228
Precision:  0.7313307130825379  - процент правильно токсичных
Recall:  0.8345346788403012  - процент выбранных токсичных
Step Beta:  0.1
Accuracy:  0.837734571483846
Roc_Auc:  0.8305550147988653
Precision:  0.6990077177508269  - процент правильно токсичных
Recall:  0.8124299215120936  - процент выбранных токсичных
Step Beta:  0.2
Accuracy:  0.8402495647127104
Roc_Auc:  0.8414003688288022
Precision:  0.6933701657458563  - процент правильно токсичных
Recall:  0.8443056222969726  - процент выбранных токсичных
Step Beta:  0.3
Accuracy:  0.8348326562197718
Roc_Auc:  0.8313850444112455
Precision:  0.6899516389038152  - процент правильно токсичных
Recall:  0.8226814031715521  - процент выбранных токсичных
Step Beta:  0.4
Accuracy:  0.8301412265428516
Roc_Auc:  0.8234345559939209
Precision:  0.6860607712222374  - процент правильно токсичных
Recall:  0.8065032836777191  - процент выбранных токсичных
Step Beta:  0.5
Accur

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

In [33]:
cl = GradientBoostingClassifier()

In [34]:
%time cl.fit(train_matrix, train_target)

CPU times: user 14.2 s, sys: 579 ms, total: 14.8 s
Wall time: 14.9 s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [35]:
%time res = cl.predict(test_matrix)

CPU times: user 37.6 ms, sys: 3.51 ms, total: 41.1 ms
Wall time: 40.3 ms


In [36]:
check_metrics(test_target, res)

Accuracy:  0.8519055910234088
Roc_Auc:  0.7878961039981491
Precision:  0.8428540633757275  - процент правильно токсичных
Recall:  0.6263014576325484  - процент выбранных токсичных


In [34]:
from sklearn.linear_model import LogisticRegression

In [35]:
cl = LogisticRegression()

In [36]:
%time cl.fit(train_matrix, train_target)



CPU times: user 2.43 s, sys: 20.7 ms, total: 2.45 s
Wall time: 2.45 s




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [39]:
%time res = cl.predict(test_matrix)

CPU times: user 3.9 ms, sys: 1.66 ms, total: 5.56 ms
Wall time: 3.58 ms


In [40]:
check_metrics(test_target, res)

Accuracy:  0.8788450377248984
Roc_Auc:  0.8519573266893674
Precision:  0.8088235294117647  - процент правильно токсичных
Recall:  0.7840781675476534  - процент выбранных токсичных


In [50]:
classifier = GDClassifier(step_alpha=1, step_beta=0,
                 tolerance=0.00001, max_iter=100, l2_coef = 1)

In [51]:
classifier.fit(train_matrix, train_target)

In [68]:
w = classifier.get_weights()

In [69]:
w

array([-5.75103400e-04, -5.11659052e-04, -6.87462368e-04, ...,
       -1.00598986e-04, -1.95125313e-05,  2.02239601e-04])

In [58]:
mul = test_matrix * w


In [59]:
proba = 1 / (1 + np.exp(-1 * mul))

In [60]:
max(proba), min(proba)

(1.0, 2.26107839656812e-11)

In [41]:
from oracles import BinaryLogistic

In [42]:
oracle = BinaryLogistic(l2_coef=1)

In [61]:
type(test_matrix)

scipy.sparse.csr.csr_matrix

In [66]:
np.sum(test_matrix.toarray() * w, axis = 1)

20676

In [73]:
res = np.sign(test_matrix * w)

In [74]:
res[res == 0] = 1

In [76]:
check_metrics(test_target, res)

Accuracy:  0.7572064229057844
Roc_Auc:  0.6554399917722857
Precision:  0.6629363176125767  - процент правильно токсичных
Recall:  0.3985263495114528  - процент выбранных токсичных


In [73]:
sub = train_matrix * w

In [75]:
np.log(1 + np.exp(-1 * sub))

array([0.93895189, 0.73192902, 1.01481378, ..., 0.71199044, 0.98857926,
       0.76294273])

In [77]:
np.logaddexp(0, -1 * sub)

array([0.93895189, 0.73192902, 1.01481378, ..., 0.71199044, 0.98857926,
       0.76294273])