In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [3]:
def check_metrics(true_val, pred_val):
    print("Accuracy: ", accuracy_score(true_val, pred_val))
    print("Roc_Auc: ", roc_auc_score(true_val, pred_val))
    print("Precision: ", precision_score(true_val, pred_val), " - процент правильно токсичных")
    print("Recall: ", recall_score(true_val, pred_val), " - процент выбранных токсичных")

In [4]:
data = pd.read_csv("toxic_train.csv")

In [5]:
test_data = pd.read_csv("toxic_test.csv")

In [6]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,comment_text,is_toxic
0,0,Thank you for understanding. I think very high...,False
1,1,:Dear god this site is horrible.,False
2,2,"""::: Somebody will invariably try to add Relig...",False
3,3,""" \n\n It says it right there that it IS a typ...",False
4,4,""" \n\n == Before adding a new product to the l...",False


In [7]:
data.shape

(52061, 3)

In [8]:
train_texts = np.array(data['comment_text'])
test_texts = np.array(test_data['comment_text'])

In [9]:
for enum in range(train_texts.__len__()):
    train_texts[enum] = train_texts[enum].lower()
    train_texts[enum] = re.sub("[^0-9a-zA-Z]+", ' ', train_texts[enum])
    train_texts[enum] =  re.sub(' +', ' ', train_texts[enum])

In [10]:
train_texts[0]

'explanation why the edits made under my username hardcore metallica fan were reverted they weren t vandalisms just closure on some gas after i voted at new york dolls fac and please don t remove the template from the talk page since i m retired now 89 205 38 27'

In [11]:
for enum in range(test_texts.__len__()):
    test_texts[enum] = test_texts[enum].lower()
    test_texts[enum] = re.sub("[^0-9a-zA-Z]+", ' ', test_texts[enum])
    test_texts[enum] =  re.sub(' +', ' ', test_texts[enum])

In [12]:
test_texts[0]

'thank you for understanding i think very highly of you and would not revert without discussion '

In [13]:
train_target = np.array(data['is_toxic']).astype(int) * 2 - 1 
test_target = np.array(test_data['is_toxic']).astype(int) * 2 - 1

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
vect = CountVectorizer(min_df=100)

In [16]:
train_matrix = vect.fit_transform(train_texts)

In [17]:
test_matrix = vect.transform(test_texts)

In [18]:
from optimization import GDClassifier

In [50]:
classifier = GDClassifier(tolerance=0.000001,max_iter=1000,step_alpha=10, step_beta=0,l2_coef = 0)

In [51]:
%time classifier.fit(train_matrix, train_target)

[0. 0. 0. ... 0. 0. 0.]
CPU times: user 15 s, sys: 2.79 s, total: 17.8 s
Wall time: 18 s


In [52]:
%time res = classifier.predict(test_matrix)

CPU times: user 4.24 ms, sys: 13.7 ms, total: 17.9 ms
Wall time: 17.3 ms


In [53]:
check_metrics(test_target, res)

Accuracy:  0.8562584639195202
Roc_Auc:  0.8159188426144328
Precision:  0.7897254207263065  - процент правильно токсичных
Recall:  0.7140797693416626  - процент выбранных токсичных


In [49]:
%time probabilities = classifier.predict_proba(test_matrix)

CPU times: user 2.62 ms, sys: 1.07 ms, total: 3.69 ms
Wall time: 2.13 ms


In [24]:
ans = (probabilities > 0.5) * 2 - 1

In [25]:
check_metrics(test_target, ans)

Accuracy:  0.7689591797252854
Roc_Auc:  0.6622221127945984
Precision:  0.7132053519488074  - процент правильно токсичных
Recall:  0.39275989107800735  - процент выбранных токсичных


In [89]:
for i in range(31, 51):
    classifier = GDClassifier(step_alpha = i / 10, l2_coef = 0)
    classifier.fit(train_matrix, train_target)
    print("Current step_alpha: ", i)
    check_metrics(test_target, classifier.predict(test_matrix))

Current step_alpha:  31
Accuracy:  0.797736506094022
Roc_Auc:  0.8373993963657087
Precision:  0.6068429237947123  - процент правильно токсичных
Recall:  0.9375300336376742  - процент выбранных токсичных
Current step_alpha:  32
Accuracy:  0.7975430450764172
Roc_Auc:  0.8375335062445163
Precision:  0.6064589586999275  - процент правильно токсичных
Recall:  0.9384911100432485  - процент выбранных токсичных
Current step_alpha:  33
Accuracy:  0.7964306442251886
Roc_Auc:  0.8368276148354957
Precision:  0.6049752270850537  - процент правильно токсичных
Recall:  0.9388114688451065  - процент выбранных токсичных
Current step_alpha:  34
Accuracy:  0.795898626426775
Roc_Auc:  0.8366283311513842
Precision:  0.6042031523642732  - процент правильно токсичных
Recall:  0.9394521864488227  - процент выбранных токсичных
Current step_alpha:  35
Accuracy:  0.795269878119559
Roc_Auc:  0.8362234211982771
Precision:  0.6033737914009463  - процент правильно токсичных
Recall:  0.9396123658497517  - процент выб

In [77]:
classifier = GDClassifier(step_alpha = 0.33, l2_coef = 0)

In [78]:
%time classifier.fit(train_matrix, train_target)

CPU times: user 14.8 s, sys: 2.42 s, total: 17.2 s
Wall time: 17.2 s


In [79]:
check_metrics(test_target, classifier.predict(test_matrix))

Accuracy:  0.8361868833430064
Roc_Auc:  0.8344001527813834
Precision:  0.6902478017585931  - процент правильно токсичных
Recall:  0.829889476213359  - процент выбранных токсичных


In [73]:
check_metrics(test_target, classifier.predict(test_matrix))

Accuracy:  0.8361868833430064
Roc_Auc:  0.8339911309687845
Precision:  0.6907051282051282  - процент правильно токсичных
Recall:  0.8284478616049976  - процент выбранных токсичных


In [69]:
check_metrics(test_target, classifier.predict(test_matrix))

Accuracy:  0.8359934223254014
Roc_Auc:  0.8335798784307932
Precision:  0.6906417112299466  - процент правильно токсичных
Recall:  0.8274867851994233  - процент выбранных токсичных


In [29]:
cl_r = GDClassifier(step_alpha = 2.1, l2_coef = 0)
cl_r.fit(train_matrix, train_target)

[0. 0. 0. ... 0. 0. 0.]


In [30]:
check_metrics(test_target, cl_r.predict(test_matrix))

Accuracy:  0.8610466241052428
Roc_Auc:  0.8462530289172261
Precision:  0.75037147102526  - процент правильно токсичных
Recall:  0.8089059746916547  - процент выбранных токсичных


In [31]:
w = cl_r.get_weights()

In [32]:
cl_r_2 = GDClassifier(step_alpha=2.1, l2_coef = 0)
cl_r_2.fit(train_matrix, train_target, w_0 = w)

[ 0.02002737 -0.22318307 -0.02371039 ...  0.56831814  0.17425659
 -0.02310614]


In [33]:
check_metrics(test_target, cl_r_2.predict(test_matrix))

Accuracy:  0.858434900367576
Roc_Auc:  0.8568802046860349
Precision:  0.7260703572402509  - процент правильно токсичных
Recall:  0.8529553099471407  - процент выбранных токсичных


In [34]:
w_2 = cl_r_2.get_weights()

In [35]:
np.allclose(w, w_2)

False

In [36]:
cl_r_3 = GDClassifier(step_alpha=2.1, l2_coef = 0)
cl_r_3.fit(train_matrix, train_target, w_0 = w_2)

[ 0.08704158 -0.37899983 -0.07573311 ...  0.50381537  0.25829648
 -0.04235682]


In [37]:
check_metrics(test_target, cl_r_3.predict(test_matrix))

Accuracy:  0.8585799961307796
Roc_Auc:  0.8575294956000272
Precision:  0.7256288239292998  - процент правильно токсичных
Recall:  0.8548774627582892  - процент выбранных токсичных


In [38]:
w_3 = cl_r_3.get_weights()

In [39]:
cl_r_4 = GDClassifier(step_alpha=2.1, l2_coef = 0)
cl_r_4.fit(train_matrix, train_target, w_0 = w_3)

[ 0.13374433 -0.48172998 -0.14774491 ...  0.47159356  0.26003006
 -0.04916996]


In [40]:
check_metrics(test_target, cl_r_4.predict(test_matrix))

Accuracy:  0.8525343393306248
Roc_Auc:  0.8603797467047941
Precision:  0.7048486403283736  - процент правильно токсичных
Recall:  0.8801858081050777  - процент выбранных токсичных


In [47]:
cl_r_5 = GDClassifier(step_alpha=2.1, l2_coef = 0)
cl_r_5.fit(train_matrix, train_target, w_0 = cl_r_4.get_weights())

[ 0.17726896 -0.54809413 -0.21616379 ...  0.46717748  0.2495427
 -0.05223828]


In [48]:
check_metrics(test_target, cl_r_5.predict(test_matrix))

Accuracy:  0.8408783130199264
Roc_Auc:  0.8593477698555951
Precision:  0.6766359612393827  - процент правильно токсичных
Recall:  0.9059746916546532  - процент выбранных токсичных


In [46]:
check_metrics(test_target, cl_r_5.predict(test_matrix))

Accuracy:  0.8075546527374734
Roc_Auc:  0.8416141855223725
Precision:  0.6214852972740932  - процент правильно токсичных
Recall:  0.9275989107800737  - процент выбранных токсичных


In [52]:
long_alg = GDClassifier(step_alpha=2.1,tolerance=0, max_iter = 10000, l2_coef = 0)

In [53]:
long_alg.fit(train_matrix, train_target)

[0. 0. 0. ... 0. 0. 0.]


In [54]:
check_metrics(test_target, long_alg.predict(test_matrix))

Accuracy:  0.8573708647707486
Roc_Auc:  0.8570724466026768
Precision:  0.7226277372262774  - процент правильно токсичных
Recall:  0.8563190773666507  - процент выбранных токсичных


In [104]:

for i in range(11):
    classifier = GDClassifier(step_alpha = 1.5, step_beta=i / 10, l2_coef = 0)
    classifier.fit(train_matrix, train_target)
    print("Step Beta: ", i / 10)
    check_metrics(test_target, classifier.predict(test_matrix))

Step Beta:  0.0
Accuracy:  0.8574675952795512
Roc_Auc:  0.8509609582104228
Precision:  0.7313307130825379  - процент правильно токсичных
Recall:  0.8345346788403012  - процент выбранных токсичных
Step Beta:  0.1
Accuracy:  0.837734571483846
Roc_Auc:  0.8305550147988653
Precision:  0.6990077177508269  - процент правильно токсичных
Recall:  0.8124299215120936  - процент выбранных токсичных
Step Beta:  0.2
Accuracy:  0.8402495647127104
Roc_Auc:  0.8414003688288022
Precision:  0.6933701657458563  - процент правильно токсичных
Recall:  0.8443056222969726  - процент выбранных токсичных
Step Beta:  0.3
Accuracy:  0.8348326562197718
Roc_Auc:  0.8313850444112455
Precision:  0.6899516389038152  - процент правильно токсичных
Recall:  0.8226814031715521  - процент выбранных токсичных
Step Beta:  0.4
Accuracy:  0.8301412265428516
Roc_Auc:  0.8234345559939209
Precision:  0.6860607712222374  - процент правильно токсичных
Recall:  0.8065032836777191  - процент выбранных токсичных
Step Beta:  0.5
Accur

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

In [33]:
cl = GradientBoostingClassifier()

In [34]:
%time cl.fit(train_matrix, train_target)

CPU times: user 14.2 s, sys: 579 ms, total: 14.8 s
Wall time: 14.9 s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [35]:
%time res = cl.predict(test_matrix)

CPU times: user 37.6 ms, sys: 3.51 ms, total: 41.1 ms
Wall time: 40.3 ms


In [36]:
check_metrics(test_target, res)

Accuracy:  0.8519055910234088
Roc_Auc:  0.7878961039981491
Precision:  0.8428540633757275  - процент правильно токсичных
Recall:  0.6263014576325484  - процент выбранных токсичных


In [34]:
from sklearn.linear_model import LogisticRegression

In [35]:
cl = LogisticRegression()

In [36]:
%time cl.fit(train_matrix, train_target)



CPU times: user 2.43 s, sys: 20.7 ms, total: 2.45 s
Wall time: 2.45 s




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [39]:
%time res = cl.predict(test_matrix)

CPU times: user 3.9 ms, sys: 1.66 ms, total: 5.56 ms
Wall time: 3.58 ms


In [40]:
check_metrics(test_target, res)

Accuracy:  0.8788450377248984
Roc_Auc:  0.8519573266893674
Precision:  0.8088235294117647  - процент правильно токсичных
Recall:  0.7840781675476534  - процент выбранных токсичных


In [50]:
classifier = GDClassifier(step_alpha=1, step_beta=0,
                 tolerance=0.00001, max_iter=100, l2_coef = 1)

In [51]:
classifier.fit(train_matrix, train_target)

In [68]:
w = classifier.get_weights()

In [69]:
w

array([-5.75103400e-04, -5.11659052e-04, -6.87462368e-04, ...,
       -1.00598986e-04, -1.95125313e-05,  2.02239601e-04])

In [58]:
mul = test_matrix * w


In [59]:
proba = 1 / (1 + np.exp(-1 * mul))

In [60]:
max(proba), min(proba)

(1.0, 2.26107839656812e-11)

In [41]:
from oracles import BinaryLogistic

In [42]:
oracle = BinaryLogistic(l2_coef=1)

In [61]:
type(test_matrix)

scipy.sparse.csr.csr_matrix

In [66]:
np.sum(test_matrix.toarray() * w, axis = 1)

20676

In [73]:
res = np.sign(test_matrix * w)

In [74]:
res[res == 0] = 1

In [76]:
check_metrics(test_target, res)

Accuracy:  0.7572064229057844
Roc_Auc:  0.6554399917722857
Precision:  0.6629363176125767  - процент правильно токсичных
Recall:  0.3985263495114528  - процент выбранных токсичных


In [73]:
sub = train_matrix * w

In [75]:
np.log(1 + np.exp(-1 * sub))

array([0.93895189, 0.73192902, 1.01481378, ..., 0.71199044, 0.98857926,
       0.76294273])

In [77]:
np.logaddexp(0, -1 * sub)

array([0.93895189, 0.73192902, 1.01481378, ..., 0.71199044, 0.98857926,
       0.76294273])

In [24]:
test_matrix[[1, 2, 3]

<3x2337 sparse matrix of type '<class 'numpy.int64'>'
	with 80 stored elements in Compressed Sparse Row format>

In [23]:
from optimization import SGDClassifier

In [32]:
stohastic = SGDClassifier(step_alpha=2.1, l2_coef = 0, batch_size= int(0.7 * train_matrix.shape[0]))

In [33]:
%time stohastic.fit(train_matrix, train_target)

0.6999865542344558
1.3999731084689115
2.0999596627033674
2.799946216937823
3.4999327711722787
4.199919325406735
4.8999058796411905
5.599892433875646
6.299878988110102
6.9998655423445575
7.699852096579013
8.39983865081347
9.099825205047926
9.799811759282383
10.49979831351684
11.199784867751296
11.899771421985752
12.599757976220209
13.299744530454666
13.999731084689122
14.699717638923579
15.399704193158035
16.09969074739249
16.799677301626947
17.499663855861403
18.19965041009586
18.899636964330316
19.599623518564773
20.29961007279923
20.999596627033686
21.699583181268142
22.3995697355026
23.099556289737055
23.799542843971512
24.49952939820597
25.199515952440425
25.89950250667488
26.599489060909338
27.299475615143795
27.99946216937825
28.699448723612708
29.399435277847164
30.09942183208162
30.799408386316077
31.499394940550534
32.19938149478499
32.899368049019444
33.5993546032539
34.29934115748836
34.99932771172281
35.69931426595727
36.399300820191726
37.09928737442618
37.79927392866064
3

312.1940031885639
312.89398974279834
313.5939762970328
314.2939628512672
314.99394940550167
315.6939359597361
316.39392251397055
317.093909068205
317.79389562243944
318.4938821766739
319.1938687309083
319.89385528514276
320.5938418393772
321.29382839361165
321.9938149478461
322.69380150208053
323.393788056315
324.0937746105494
324.79376116478386
325.4937477190183
326.19373427325274
326.8937208274872
327.59370738172163
328.29369393595607
328.9936804901905
329.69366704442496
330.3936535986594
331.09364015289384
331.7936267071283
332.4936132613627
333.19359981559717
333.8935863698316
334.59357292406605
335.2935594783005
335.99354603253494
336.6935325867694
337.3935191410038
338.09350569523826
338.7934922494727
339.49347880370715
340.1934653579416
340.89345191217603
341.5934384664105
342.2934250206449
342.99341157487936
343.6933981291138
344.39338468334824
345.0933712375827
345.79335779181713
346.4933443460516
347.193330900286
347.89331745452046
348.5933040087549
349.29329056298934
349.993

629.2879122567663
629.9878988110007
630.6878853652352
631.3878719194696
632.087858473704
632.7878450279385
633.4878315821729
634.1878181364074
634.8878046906418
635.5877912448763
636.2877777991107
636.9877643533451
637.6877509075796
638.387737461814
639.0877240160485
639.7877105702829
640.4876971245174
641.1876836787518
641.8876702329862
642.5876567872207
643.2876433414551
643.9876298956896
644.687616449924
645.3876030041584
646.0875895583929
646.7875761126273
647.4875626668618
648.1875492210962
648.8875357753307
649.5875223295651
650.2875088837995
650.987495438034
651.6874819922684
652.3874685465029
653.0874551007373
653.7874416549718
654.4874282092062
655.1874147634406
655.8874013176751
656.5873878719095
657.287374426144
657.9873609803784
658.6873475346129
659.3873340888473
660.0873206430817
660.7873071973162
661.4872937515506
662.1872803057851
662.8872668600195
663.587253414254
664.2872399684884
664.9872265227228
665.6872130769573
666.3871996311917
667.0871861854262
667.787172739660

951.2817272046098
951.9817137588442
952.6817003130786
953.3816868673131
954.0816734215475
954.781659975782
955.4816465300164
956.1816330842508
956.8816196384853
957.5816061927197
958.2815927469542
958.9815793011886
959.6815658554231
960.3815524096575
961.081538963892
961.7815255181264
962.4815120723608
963.1814986265953
963.8814851808297
964.5814717350642
965.2814582892986
965.981444843533
966.6814313977675
967.3814179520019
968.0814045062364
968.7813910604708
969.4813776147053
970.1813641689397
970.8813507231741
971.5813372774086
972.281323831643
972.9813103858775
973.6812969401119
974.3812834943463
975.0812700485808
975.7812566028152
976.4812431570497
977.1812297112841
977.8812162655186
978.581202819753
979.2811893739874
979.9811759282219
980.6811624824563
981.3811490366908
982.0811355909252
982.7811221451597
983.4811086993941
984.1810952536285
984.881081807863
985.5810683620974
986.2810549163319
986.9810414705663
987.6810280248008
988.3810145790352
989.0810011332696
989.780987687504

In [34]:
stohastic.get_weights()

array([ 0.04796764, -0.30325198, -0.04798111, ...,  0.54783208,
        0.2262439 , -0.03046244])

In [35]:
check_metrics(test_target, stohastic.predict(test_matrix))

Accuracy:  0.819452505320178
Roc_Auc:  0.8492728317990104
Precision:  0.638919636927164  - процент правильно токсичных
Recall:  0.9245555021624219  - процент выбранных токсичных


In [33]:
from sklearn.linear_model import SGDClassifier as SGDC

In [34]:
model = SGDC()

In [35]:
model.fit(train_matrix, train_target)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [37]:
check_metrics(test_target, model.predict(test_matrix))

Accuracy:  0.8807312826465468
Roc_Auc:  0.8607162366477352
Precision:  0.7979176526265973  - процент правильно токсичных
Recall:  0.810187409899087  - процент выбранных токсичных


In [45]:
train_matrix.shape[1]

2337

In [36]:
import nltk

In [37]:
from nltk.corpus import stopwords

In [42]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maksimkornilov/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [44]:
stopwords.

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r