In [84]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer 


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ekaterinakastaleva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [85]:
pd.set_option("display.max_colwidth", None)
toxic_comments = pd.read_csv("/Users/ekaterinakastaleva/russian_comments_from_2ch_pikabu.csv")

In [86]:
toxic_comments.head()

Unnamed: 0,comment,toxic,translated
0,"Верблюдов-то за что? Дебилы, бл...\n",1.0,"Camels, for what? Morons, bl ..."
1,"Хохлы, это отдушина затюканого россиянина, мол, вон, а у хохлов еще хуже. Если бы хохлов не было, кисель их бы придумал.\n",1.0,"Ukrainians, this is an outlet vent zatyukanogo Russians, they say, out, and Ukrainians worse. If there were no Ukrainians, jelly would have invented them."
2,Собаке - собачья смерть\n,1.0,Dog - Dog Death
3,"Страницу обнови, дебил. Это тоже не оскорбление, а доказанный факт - не-дебил про себя во множественном числе писать не будет. Или мы в тебя верим - это ты и твои воображаемые друзья?\n",1.0,"Refresh the page, moron. This is also not an insult, but a proven fact - I will not write a non-moron to myself in the plural. Or do we believe in you - are you and your imaginary friends?"
4,"тебя не убедил 6-страничный пдф в том, что Скрипалей отравила Россия? Анализировать и думать пытаешься? Ватник что ли?)\n",1.0,did not convince you of the 6-page pdf that Skripale was poisoned by Russia? Are you trying to analyze and think? Padded jacket or what?)


In [87]:
toxic_comments.columns=[ "comment", "toxic","translated"]
#I dont need the translated-column, because of the bad translation
toxic_comments=toxic_comments[["comment","toxic"]]
toxic_comments.columns

Index(['comment', 'toxic'], dtype='object')

In [88]:
mystem = Mystem()
russian_stopwords = stopwords.words("russian")
other_exclusions = ["#ff", "ff", "rt"]
russian_stopwords.extend(other_exclusions)

In [89]:
stemmer = SnowballStemmer("russian")

In [105]:
def preprocessing(string):
    space_pattern = '\s+'
    url = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', string)
    parsed_text = re.sub(url, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

In [106]:
preprocessing("Это моя почта kashtaliowa@gmail.com и мой сайт http://wwww.")

'Это моя почта kashtaliowa.com и мой сайт '

In [92]:
mystem = Mystem()
russian_stopwords = stopwords.words("russian")


In [93]:
def tokenize(text, threshold = 1):
    threshold = threshold
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens if token !=" "\
             and token.strip() not in punctuation]
    text = " ".join([word for word in tokens if len(word) > threshold])
    return re.sub('[0-9]+', '', text)

In [94]:
tokenize('Я сегодня был в магазине.') #function tokenize works well

'сегодня быть магазин'

In [95]:
only_toxic_comments = toxic_comments[toxic_comments["toxic"]==1.0]
print(len(only_toxic_comments))
only_toxic_comments[5:10]

4826


Unnamed: 0,comment,toxic
5,Для каких стан является эталоном современная система здравоохранения РФ? Для Зимбабве? Ты тупой? хохлы\n,1.0
7,УПАД Т! ТАМ НЕЛЬЗЯ СТРОИТЬ! ТЕХНОЛОГИЙ НЕТ! РАЗВОРУЮТ КАК ВСЕГДА! УЖЕ ТРЕЩИНАМИ ПОШ Л! ТУПЫЕ КИТАЗЫ НЕ МОГУТ НИЧЕГО НОРМАЛЬНО СДЕЛАТЬ!\n,1.0
8,"Ебать тебя разносит, шизик.\n",1.0
9,"Обосрался, сиди обтекай\n",1.0
10,"Зачем ты пишешь хуйню, дегенерат? Поцелуй в губы ! поцелую в засос.\n",1.0


In [96]:
toxic_comments['toxic'].value_counts()

0.0    9586
1.0    4826
Name: toxic, dtype: int64

In [97]:
normal_comments = toxic_comments[toxic_comments["toxic"]==0.0]

In [98]:
np.sum(toxic_comments.isnull().any(axis=1))

0

In [99]:
#I have now two classes: normal_comments and only_toxic_comments
normal_comments = normal_comments.iloc[:int(4800)]
print("There are " + str(len(normal_comments)) + " comments with non toxic meaning.")
only_toxic_comments = only_toxic_comments.iloc[:int(4800)]
print("There are " + str(len(only_toxic_comments)) + " comments with toxic meaning.")

There are 4800 comments with non toxic meaning.
There are 4800 comments with toxic meaning.


In [100]:
toxic_dataset = pd.concat([only_toxic_comments, normal_comments])
print(len(toxic_dataset))

9600


In [101]:
#TF-IDF verctorarization
#tokenizer?
vectorizer = TfidfVectorizer(
tokenizer = tokenize,
preprocessor = preprocessing,
ngram_range = (1, 3),
stop_words = russian_stopwords,
use_idf = True,
smooth_idf = False,
norm = None,
decode_error = 'replace',
max_features = 10000,
max_df = 1, #max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
min_df = 1) #min_df = 5 means "ignore terms that appear in less than 5 documents".

In [102]:
tfidf = vectorizer.fit_transform(toxic_dataset).toarray()



In [103]:
tfidf.T.shape

(23, 2)

In [83]:
print(tfidf)

[[1.69314718 1.69314718 1.69314718 1.69314718 1.69314718 0.
  0.         3.38629436 1.69314718 1.69314718 1.69314718 1.69314718
  1.69314718 1.69314718 1.69314718 1.69314718 0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         1.69314718
  1.69314718 0.         0.         0.         0.         0.
  0.         0.         0.         0.         1.69314718 1.69314718
  1.69314718 1.69314718 1.69314718 1.69314718 1.69314718]]
