In [1]:
import numpy as np
import pandas as pd

import re

import matplotlib.pyplot as plt

from oracles import BinaryLogistic
from optim import GDClassifier

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
train = pd.read_csv('toxic_train.csv')
test = pd.read_csv('toxic_test.csv')

In [3]:
train.tail()

Unnamed: 0.1,Unnamed: 0,comment_text,is_toxic
52056,159494,"""\n\n our previous conversation \n\nyou fuckin...",True
52057,159514,YOU ARE A MISCHIEVIOUS PUBIC HAIR,True
52058,159541,Your absurd edits \n\nYour absurd edits on gre...,True
52059,159546,"""\n\nHey listen don't you ever!!!! Delete my e...",True
52060,159554,and i'm going to keep posting the stuff u dele...,True


In [4]:
train.head()

Unnamed: 0.1,Unnamed: 0,comment_text,is_toxic
0,0,Explanation\nWhy the edits made under my usern...,False
1,1,D'aww! He matches this background colour I'm s...,False
2,2,"Hey man, I'm really not trying to edit war. It...",False
3,3,"""\nMore\nI can't make any real suggestions on ...",False
4,4,"You, sir, are my hero. Any chance you remember...",False


1. Предобработка данных

In [5]:

train_target = np.array(train['is_toxic']).astype(int)
test_target = np.array(test['is_toxic']).astype(int)

train_target

array([0, 0, 0, ..., 1, 1, 1])

In [6]:
del train['is_toxic']
del test['is_toxic']
train.head()

Unnamed: 0.1,Unnamed: 0,comment_text
0,0,Explanation\nWhy the edits made under my usern...
1,1,D'aww! He matches this background colour I'm s...
2,2,"Hey man, I'm really not trying to edit war. It..."
3,3,"""\nMore\nI can't make any real suggestions on ..."
4,4,"You, sir, are my hero. Any chance you remember..."


In [7]:
train_text = np.array(train['comment_text'].tolist())
test_text = np.array(test['comment_text'].tolist())

In [8]:
# приведение к нижнему регистру
for i, elem in enumerate(train_text):
    train_text[i] = elem.lower()
for i, elem in enumerate(test_text):
    test_text[i] = elem.lower()


In [9]:
# удаление лишних символов

train_text = [re.sub('[^[a-z0-9]',' ', i) for i in train_text]
test_text = [re.sub('[^[a-z0-9]',' ', i) for i in test_text]

2. Преобразование выборки

In [10]:
vectorizer = CountVectorizer(min_df=0.3)
train_set = vectorizer.fit_transform(train_text)
test_set = vectorizer.transform(test_text)

type(train_set)

scipy.sparse.csr.csr_matrix

In [11]:
train_target.shape

(52061,)

3. Исследование поведения градиентного спуска

3.0 Работа с параметрами по умолчанию

In [12]:
clf = GDClassifier(loss_function='logistic')
history = clf.fit(train_set, train_target, trace=True)
answer = clf.predict(test_set)
print('accuracy = ' + str(np.sum(np.abs(answer - test_target)) / len(answer)))
print('function = ' + str(clf.get_objective(test_set, answer)))
print('time = ' + str(sum(history['time'])))

20676
accuracy = 0.6980557167730702
function = 0.10624973634571983
time = 3.0745821639999864


In [13]:
clf1 = GDClassifier(step_alpha = 0.01, loss_function='logistic', tolerance = 0.1)
history = clf1.fit(train_set, train_target, trace=True)
answer = clf1.predict(test_set)
print('accuracy = ' + str(np.sum(np.abs(answer - test_target)) / len(answer)))
print('function = ' + str(clf.get_objective(test_set, answer)))
print('time = ' + str(sum(history['time'])))

20676
accuracy = 0.6980557167730702
function = 0.10624973634571983
time = 0


3.1 Изменяемый параметр - step_alpha

In [None]:
accuracy_array = np.array([])
time_array = np.array([])
func_array = np.array([])

step = 0
while step <= 5:
    clf = GDClassifier(step_alpha=step, loss_function='logistic')
    history = clf.fit(train_set, train_target, trace=True)
    answer = clf.predict(test_set)
    time_array = np.append(time_array, sum(history['time']))
    func_array = np.append(func_array, clf.get_objective(test_set, answer))
    accuracy_array = np.append(accuracy_array, np.sum(np.abs(answer - test_target)) / len(answer))
    step += 0.005

20676
20676


In [None]:
accuracy_array

3.2 Изменяемый параметр - step_beta

In [None]:
accuracy_array = np.array([])
time_array = np.array([])
func_array = np.array([])

step = 0
while step <= 50:
    clf = GDClassifier(step_beta=step, loss_function='logistic')
    history = clf.fit(train_set, train_target, trace=True)
    answer = clf.predict(test_set)
    time_array = np.append(time_array, sum(history['time']))
    func_array = np.append(func_array, clf.get_objective(test_set, answer))
    accuracy_array = np.append(accuracy_array, np.sum(np.abs(answer - test_target)) / len(answer))
    step += 0.1

In [None]:
func_array

3.3 Изменяемый параметр - начальное приближение

In [None]:
# почитать, как это вообще выбирают

4. Исследование поведения стохастического градиентного спуска

5. Сравнение методов

6. Лемматизация

7. 