# 1. Подготовка

In [1]:
!pip install catboost
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from tqdm import notebook
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC



In [2]:
# Mount Google Drive under /content/drive (Google Colab only); the dataset
# is read from this mount point further below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Never truncate long cell values when a DataFrame is displayed.
pd.options.display.max_colwidth = None

## Загрузка данных.

In [4]:
# Load the comments dataset from the mounted Google Drive.
# NOTE(review): the path is specific to this Colab/Drive layout.
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/toxic_comments.csv')

## Ознакомление.

In [5]:
# Column names, non-null counts, dtypes and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


## Приведение всех значений в нижний регистр.

In [6]:
# Lower-case every comment before any further processing.
df['text'] = df['text'].str.lower()

## Проверка и удаление дубликатов.

In [7]:
# Report the number of fully duplicated rows, drop them while renumbering
# the index from zero, then report the count again as a sanity check.
print(df.duplicated().sum())
df = df.drop_duplicates(ignore_index=True)
print(df.duplicated().sum())

45
0


##### Дубликаты удалены.

## Удаление лишних символов.

In [8]:
# Turn newlines and tabs into single spaces, then trim both ends.
# regex=True is passed explicitly: since pandas 2.0 the default is
# regex=False, under which the raw string r'\n' would be searched for as a
# literal backslash followed by "n" and real newlines would be left in place.
df['text'] = df['text'].str.replace(r'[\n\t]', ' ', regex=True).str.strip()

In [9]:
def _letters_and_spaces(text):
    """Keep only alphabetic characters and spaces; collapse repeated spaces.

    Args:
        text: A single comment as a string.

    Returns:
        The comment with every non-letter, non-space character removed and
        the remaining words joined by single spaces.
    """
    kept = ''.join(ch for ch in text if ch.isalpha() or ch == ' ')
    return ' '.join(kept.split())


# Build the cleaned column in one pass and assign it back as a whole.
# Writing through df['text'][i] is chained assignment: it raises
# SettingWithCopyWarning and does not modify df under pandas Copy-on-Write.
df['text'] = [_letters_and_spaces(text) for text in notebook.tqdm(df['text'])]

HBox(children=(FloatProgress(value=0.0, max=159526.0), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  





## Проверка баланса классов.

In [10]:
# Count each class once and reuse the numbers for all three messages.
toxic_count = (df['toxic'] == 1).sum()
clean_count = (df['toxic'] == 0).sum()
print('Положительный класс:', toxic_count)
print('Отрицательный класс:', clean_count)
print('Отрицательных в {} раз больше положительных'.format(round(clean_count / toxic_count, 5)))

Положительный класс: 16210
Отрицательный класс: 143316
Отрицательных в 8.84121 раз больше положительных


##### Наблюдается дисбаланс классов. Чтобы не увеличивать и без того большую выборку, буду указывать class_weight='balanced' при обучении моделей.

## Создание target.

In [11]:
# The comment text is the only feature; the toxicity flag is the label.
features, target = df['text'], df['toxic']

## Разбиение на train, valid, test.

In [12]:
# Step 1: hold out 40% of the data, leaving 60% for training.
x_train, x_to_split, y_train, y_to_split = train_test_split(
    features,
    target,
    test_size=0.4,
    random_state=12345,
)
# Step 2: cut the held-out part in half -> 20% validation, 20% test.
x_valid, x_test, y_valid, y_test = train_test_split(
    x_to_split,
    y_to_split,
    test_size=0.5,
    random_state=12345,
)

In [13]:
# The intermediate hold-out split is no longer needed.
del x_to_split, y_to_split
# Print the shape of every split, one per line, in the order
# train / valid / test (features first, then labels).
print(
    *(part.shape for part in (x_train, y_train, x_valid, y_valid, x_test, y_test)),
    sep='\n',
)

(95715,)
(95715,)
(31905,)
(31905,)
(31906,)
(31906,)


## Создание массива TF-IDF для обучения регрессии.

In [14]:
# Fetch the NLTK stop-word corpus (a no-op when it is already up to date)
# and keep the English stop words as a set.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Fit TF-IDF on the training texts only and reuse the fitted vectorizer for
# the validation and test texts, so no information from them reaches the
# vocabulary or the IDF weights.
# stop_words is converted to a list: scikit-learn validates this parameter as
# 'english', a list, or None, and newer versions reject a set.
count_tf_idf = TfidfVectorizer(stop_words=list(stop_words))
tfidf_train = count_tf_idf.fit_transform(x_train)
tfidf_valid = count_tf_idf.transform(x_valid)
tfidf_test = count_tf_idf.transform(x_test)
# The raw texts are no longer needed once the sparse matrices exist.
del x_train, x_valid, x_test, features, target

# 2. Обучение

## LogisticRegression.

In [16]:
#solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
#c = [1,5,10,20,30]

In [17]:
#%%time
#result = pd.DataFrame(columns = ['solver','c','f1_score'])
#for i in solver:
#    for k in c:
#        model = LogisticRegression(solver=i, C=k,random_state=12345, class_weight='balanced', n_jobs=-1).fit(tfidf_train, y_train)
#        answers = model.predict(tfidf_valid)
#        f1 = f1_score(y_valid, answers)
#        result = result.append({'solver':i,'c':k,'f1_score':round(f1,5)}, ignore_index=True)
#print(result.sort_values('f1_score', ascending=False))

##### Лучшая модель LogisticRegression: solver='sag', C=20. F1_score = 0.77399

## LightGBM.

In [18]:
#num_leaves = [31,40,50,60]
#min_child_samples=[20,30,40,50]
#max_depth =[10,20,30,50,100,150]

In [19]:
#%%time
#acc = pd.DataFrame(columns = ['num_leaves','max_depth','min_child_samples','f1_score'])
#counter = 0
#for i in max_depth:
#    for j in min_child_samples:
#        for k in num_leaves:
#            gbm = lgb.LGBMClassifier(max_depth=i,min_child_samples=j,num_leaves=k,random_state=12345, n_jobs=-1,class_weight='balanced').fit(tfidf_train, y_train)
#            answers_gbm = gbm.predict(tfidf_valid, num_iteration=gbm.best_iteration_)
#            f1 = f1_score(y_valid, answers_gbm)
#            acc = acc.append({'num_leaves':k,'max_depth':i, 'min_child_samples':j,'f1_score':round(f1,5)}, ignore_index=True)
#            print(counter)
#            counter += 1
#print(acc.sort_values('f1_score', ascending=False))

##### Лучшая модель LightGBM - num_leaves=60, max_depth=100, min_child_samples=10, f1_score=0.72898. Модель не достигла необходимой точности.

## Catboost.

In [20]:
#iterations = [1000,1500]
#depth = [3,7,10]

In [21]:
#%%time
#acc1 = pd.DataFrame(columns = ['depth','iterations','f1_score'])
#counter = 0
#for i in iterations:
#    for j in depth:
#        cbc = CatBoostClassifier(iterations=i, depth=j, random_state=12345, verbose=500,class_weights=[0.1, 9]).fit(tfidf_train, y_train)
#        answers_cbr = cbc.predict(tfidf_valid)
#        f1 = f1_score(y_valid, answers_cbr)
#        acc1 = acc1.append({'iterations':i, 'depth':j,'f1_score':round(f1,5)}, ignore_index=True)
#        print(counter)
#        counter += 1
#print(acc1.sort_values('f1_score', ascending=False))

##### Обучить модели CatBoost не удалось — JupyterHub аварийно завершал работу.

## XGBoost.

In [22]:
#booster = ['gbtree','gblinear']
#max_depth = [10,15,20,30,40]

In [23]:
#%%time
#acc = pd.DataFrame(columns = ['booster','max_depth','f1_score'])
#for i in booster:
#    for j in max_depth:
#        xgbc = xgb.XGBClassifier(scale_pos_weight=8.85,booster=i,max_depth=j,random_state=12345,n_jobs=-1).fit(tfidf_train, y_train)
#        answers_xgc = xgbc.predict(tfidf_valid)
#        f1 = f1_score(y_valid, answers_xgc)
#        acc = acc.append({'booster':i, 'max_depth':j,'f1_score':round(f1,5)}, ignore_index=True)
#print(acc.sort_values('f1_score', ascending=False))

##### Лучшая модель XGBoost - booster='gbtree', max_depth=30, f1_score=0.71620. Модель не достигла необходимой точности.

## KNeighborsClassifier.

In [24]:
#n_neighbors = [5,10,15]
#weights = ['uniform', 'distance']
#leaf_size = [30,40,50]

In [25]:
#%%time
#accknn = pd.DataFrame(columns = ['n_neighbors','weights','leaf_size','f1_score'])
#for i in weights:
#    for j in n_neighbors:
#        for k in leaf_size:
#            knn = KNeighborsClassifier(weights=i, n_neighbors=j, leaf_size=k, n_jobs=-1).fit(tfidf_train, y_train)
#            answers_knn = knn.predict(tfidf_valid)
#            f1 = f1_score(y_valid, answers_knn)
#            accknn = accknn.append({'weights':i, 'n_neighbors':j,'leaf_size':k,'f1_score':round(f1,5)}, ignore_index=True)
#print(accknn.sort_values('f1_score', ascending=False))

##### Лучшая модель KNeighborsClassifier - weights='distance', n_neighbors=15, leaf_size=50, f1_score=0.67931. Модель не достигла необходимой точности.

## Проверка на test - выборке.

In [27]:
# Train the selected LogisticRegression configuration on the training
# split and score it once on the held-out test split.
model = LogisticRegression(
    solver='sag',
    C=20,
    max_iter=2000,
    class_weight='balanced',
    random_state=12345,
    n_jobs=-1,
)
model.fit(tfidf_train, y_train)
answers = model.predict(tfidf_test)
f1 = f1_score(y_test, answers)
print('F1 score:', round(f1, 5))

F1 score: 0.76251




# 3. Выводы

1. Была проведена загрузка датасета с токсичными комментариями.
2. Датасет очищен от посторонних символов.
3. Был рассчитан признак TF-IDF.
4. Был выявлен дисбаланс классов; при обучении моделей он учитывался через веса классов.
5. Были обучены модели LogisticRegression, LightGBM, XGBoost и KNeighborsClassifier; обучить CatBoost не удалось.
6. Требуемой метрики f1 score больше 0,75 достигла модель LogisticRegression.
7. Это было протестировано на тестовой выборке.
8. По всей видимости, именно LogisticRegression лучше всего работает с подобными векторами. С этим и связаны её наилучшие показатели.  
9. Скорее всего с использованием лемматизации показатели были бы выше.