In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import *

Load the train and test data (used below to build a sample dummy submission):

In [2]:
# Load the train and test splits from CSV files in the working directory.
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

File with answers:

In [4]:
# NOTE(review): this reads the same path that `test` is loaded from, and the
# later `y_test['toxic']` lookup raises KeyError -- presumably a separate
# answers file was intended here; confirm the correct path.
y_test = pd.read_csv('test_data.csv')

In [5]:
# Display one random row to inspect which columns the loaded frame has.
y_test.sample()

Unnamed: 0,comment_id,comment
3115,3115,"не играл в мгс, так что кефир, лично для меня ..."


In [6]:
# Keep only the labels as a plain list. Re-binding `y_test` makes this cell
# non-idempotent: on a second run `y_test` is already a list and the
# ['toxic'] lookup fails.
# NOTE(review): raises KeyError when the loaded file has no 'toxic' column.
y_test = list(y_test['toxic'])

KeyError: 'toxic'

In [None]:
# Dummy submission: one random 0/1 label per test row.
# `randint` was never imported, so the original cell raised NameError. A
# seeded NumPy generator is used instead so the dummy file is reproducible.
rng = np.random.default_rng(42)
sample_submission = pd.DataFrame({
    'comment_id': range(len(test)),  # positional row numbers, as in the original cell
    'toxic': rng.integers(0, 2, size=len(test)),  # upper bound is exclusive -> values in {0, 1}
})

sample_submission.head()

In [None]:
# Write the dummy submission to disk; index=False omits the DataFrame index column.
sample_submission.to_csv('sample_submission.csv', index=False)

In [None]:
# Cast the label column to int in place (bracket access instead of attribute access).
train['toxic'] = train['toxic'].astype(int)

In [None]:
# Training target: the 'toxic' column of the training frame.
y_train = train['toxic']

## baseline 1: no preprocessing + bow -> 4

In [None]:
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vec = CountVectorizer(ngram_range=(1, 1)) # build a bag-of-words over single words (unigrams)
# Fit the vocabulary on the raw training comments and vectorize them in one step.
bow = vec.fit_transform(train['comment'])

In [None]:
# Peek at the first ten entries of the fitted vocabulary mapping.
list(vec.vocabulary_.items())[:10]

In [None]:
# Logistic regression on the bag-of-words features; random_state is fixed and
# max_iter is set explicitly to 500.
clf = LogisticRegression(random_state=42, max_iter=500)
clf.fit(bow, y_train)

In [None]:
# Vectorize the raw test comments with the already-fitted vocabulary and predict.
pred = clf.predict(vec.transform(test['comment']))
# scikit-learn metrics take (y_true, y_pred). The original call passed them
# swapped, which exchanges precision and recall in the printed report.
print(classification_report(y_test, pred))

In [None]:
# Arguments in scikit-learn's (y_true, y_pred) order; accuracy is symmetric,
# so the value is the same as with the arguments reversed.
accuracy_score(y_test, pred)

In [None]:
# Submission frame: positional row number as comment_id, model prediction as toxic.
baseline_4 = pd.DataFrame({
    'comment_id': range(len(test)),
    'toxic': pred,
})

baseline_4.head()

In [None]:
# Write the baseline-1 submission to disk; index=False omits the DataFrame index column.
baseline_4.to_csv('baseline_4.csv', index=False)

## baseline 2: preprocessing + bow -> 5

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import re
from pymorphy2 import MorphAnalyzer
from functools import lru_cache
from nltk.corpus import stopwords

# Single shared morphological analyzer; lemmatize_word uses it as its default.
m = MorphAnalyzer()
# Matches runs of Cyrillic or Latin letters. "Ёё" is listed explicitly because
# it lies outside the А-я range, and A-Za-z replaces the original A-z, which
# also matched the punctuation characters between "Z" and "a" ([ \ ] ^ _ `).
regex = re.compile("[А-Яа-яЁёA-Za-z]+")

def words_only(text, regex=regex):
    """Return the lowercase alphabetic tokens found in ``text``.

    Args:
        text: Raw comment. Non-string values (e.g. NaN from a missing cell)
            produce an empty list.
        regex: Compiled pattern whose matches are the tokens.

    Returns:
        list[str]: Matches of ``regex`` in ``text.lower()``, in order of
        appearance.
    """
    # Explicit type check instead of a bare ``except``: missing values are
    # still tolerated, but unrelated errors are no longer silently swallowed.
    if not isinstance(text, str):
        return []
    return regex.findall(text.lower())

In [None]:
# The original 128-entry cache is far smaller than a corpus vocabulary, so most
# lookups evicted each other; a much larger bound keeps repeated words cached.
@lru_cache(maxsize=100_000)
def lemmatize_word(token, pymorphy=m):
    """Return the normal form of the first parse that ``pymorphy`` gives for ``token``.

    Args:
        token: A single word (must be hashable, since results are memoized).
        pymorphy: Analyzer exposing ``parse``; defaults to the shared ``m``.

    Returns:
        The ``normal_form`` attribute of the first parse result.
    """
    return pymorphy.parse(token)[0].normal_form

def lemmatize_text(text):
    """Lemmatize each token in ``text`` (an iterable of tokens) and return the lemmas as a list."""
    return list(map(lemmatize_word, text))


# Russian stop words from NLTK, stored as a frozenset so that each membership
# test in remove_stopwords is a hash lookup instead of a scan over a list.
mystopwords = frozenset(stopwords.words('russian'))
def remove_stopwords(lemmas, stopwords = mystopwords):
    """Drop stop words and short tokens from ``lemmas``.

    Args:
        lemmas: Iterable of lemma strings.
        stopwords: Container of words to remove; defaults to ``mystopwords``.

    Returns:
        list[str]: Lemmas longer than three characters that are not in
        ``stopwords``, in their original order.
    """
    return [w for w in lemmas if len(w) > 3 and w not in stopwords]

def clean_text(text):
    """Tokenize, lemmatize and filter ``text``; return the kept lemmas joined by single spaces."""
    kept = remove_stopwords(lemmatize_text(words_only(text)))
    return ' '.join(kept)

In [None]:
from tqdm import tqdm

# Clean every training comment (progress bar over the input column) and
# store the result next to the raw text.
lemmas = [clean_text(comment) for comment in tqdm(train['comment'], total=len(train))]

train['lemmas'] = lemmas
train.sample(5)

In [None]:
# Same cleaning as for the training comments, applied to the test comments.
lemmas_test = [clean_text(comment) for comment in tqdm(test['comment'], total=len(test))]

test['lemmas'] = lemmas_test

In [None]:
# Bag-of-words over the lemmatized text; ngram_range=(1, 2) keeps unigrams and bigrams.
vec = CountVectorizer(ngram_range=(1, 2))
bow = vec.fit_transform(train['lemmas'])

clf = LogisticRegression(random_state=42, max_iter=500)
clf.fit(bow, y_train)

# Vectorize the test lemmas with the fitted vocabulary, then predict.
bow_test = vec.transform(test['lemmas'])
pred = clf.predict(bow_test)
# Arguments in scikit-learn's (y_true, y_pred) order; accuracy is symmetric.
accuracy_score(y_test, pred)

In [None]:
# Submission frame: positional row number as comment_id, model prediction as toxic.
baseline_5 = pd.DataFrame({
    'comment_id': range(len(test)),
    'toxic': pred,
})

baseline_5.to_csv('baseline_5.csv', index=False)

## baseline 3: preproc + fasttext -> 6

In [None]:
import fasttext

In [None]:
# Write the training set in fastText's supervised format: "__label__<y> <text>".
# The encoding is set explicitly because the text is Cyrillic and the platform
# default encoding is not UTF-8 on every OS.
with open('train_ft.txt', 'w', encoding='utf-8') as f:
    for text, label in zip(train['lemmas'], train['toxic']):
        f.write(f'__label__{label} {text.lower()}\n')

In [None]:
# Write the test set in fastText's supervised format: "__label__<y> <text>".
# `test` has no 'toxic' column (the original lookup raised KeyError); the test
# labels are held in `y_test`, which the accuracy_score calls also use.
# int() keeps the label text identical to the training file, where the label
# column was cast to int. The encoding is explicit because the text is Cyrillic.
with open('test_ft.txt', 'w', encoding='utf-8') as f:
    for text, label in zip(test['lemmas'], y_test):
        f.write(f'__label__{int(label)} {text.lower()}\n')

In [None]:
# Train a supervised fastText classifier on the labelled training file, then
# evaluate it on the labelled test file.
classifier = fasttext.train_supervised('train_ft.txt')
result = classifier.test('test_ft.txt')
# `result` is indexed positionally; per the labels printed below:
# [0] number of examples, [1] precision at 1, [2] recall at 1.
print('P@1:', result[1])
print('R@1:', result[2])
print('Number of examples:', result[0])

In [None]:
# The first element of predict()'s result holds the predicted labels, one
# sequence per input text.
top_labels = classifier.predict(list(test['lemmas']))[0]
# The last character of each top label string is the class digit.
pred = [int(labels[0][-1]) for labels in top_labels]

accuracy_score(list(y_test), pred)

In [None]:
# Submission frame: positional row number as comment_id, model prediction as toxic.
baseline_6 = pd.DataFrame({
    'comment_id': range(len(test)),
    'toxic': pred,
})

baseline_6.to_csv('baseline_6.csv', index=False)