In [1]:
import nltk
import gensim.downloader as api
import pymorphy2
import multiprocessing
import pandas as pd
from nltk.corpus import stopwords
import string
from gensim.models import Word2Vec
import json
import gensim
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

morph = pymorphy2.MorphAnalyzer()
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DjGle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DjGle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
dataset = json.load(open("custom_dataset.json", "rb"))
test_dataset = pd.read_csv("C://Users//DjGle//Desktop//Gleb//aiijc//EduRu//test.csv")

In [3]:
texts = list(dataset.keys())
classes = list(dataset.values())

texts_for_w2v = texts + list(test_dataset["task"])

In [4]:
stopWordsRu = set(stopwords.words("russian"))
punctuation = set(string.punctuation + "—" + "«" + "»")


def validateWord(word):
    if word in stopWordsRu or word in punctuation:
        return False
    allSymValid = False

    for sym in word:
        if not sym in punctuation:
            allSymValid = True
            break

    return allSymValid


def prepareText(text):
    text.replace("ё", "е")
    text.replace("\n", " ")

    out = []
    for word in nltk.word_tokenize(text):
        word = word.strip()
        if not validateWord(word):
            continue
        out.append(morph.parse(word.lower())[0].normal_form)
    return out

In [5]:
texts_for_train = [prepareText(text) for text in texts_for_w2v]
texts_for_valid = [prepareText(text) for text in texts]

In [6]:
model = gensim.models.Word2Vec(texts_for_train, window=10, min_count=1)

In [7]:
model.train(texts_for_train, total_examples=len(texts_for_train), epochs=250)

(203808558, 207603500)

In [15]:
model.wv.similarity("литература", "поэзия")

0.5219052

In [209]:
most_similar_words = {category: model.wv.most_similar(category, topn=10)
                      for category in ["литература", "музыка", "спорт", "животное"]}

In [210]:
def mean_word_distance(word, theme):
    distance = model.distances(word, most_similar_words[theme])
    distance.sort()
    return distance[:min(len(distance), 3)].mean()

In [198]:
def mean_sentence_similar(sentence, theme):
    return np.array([mean_word_similar(word, theme) for word in sentence]).mean()

In [199]:
def predict(sentence):
    predicts_dict = {k: mean_sentence_similar(sentence, k) for k in most_similar_words.keys()}
    predicts_dict = {v: k for k, v in predicts_dict.items()}
    predict = predicts_dict[min(list(predicts_dict))]
    if predict == "животное":
        predict = "животные"
    return predict

In [30]:
vectorizer = CountVectorizer(max_features=1500, min_df=1, stop_words=stopwords.words('russian'))
X = vectorizer.fit_transform([" ".join(text) for text in texts_for_valid]).toarray()

tfidf = TfidfTransformer()
X = tfidf.fit_transform(X).toarray()

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, classes, test_size=0.1, random_state=42)

In [32]:
classifier = RandomForestClassifier(n_estimators=50, random_state=42)
classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=50, random_state=42)

In [33]:
y_pred = classifier.predict(X_test)

In [34]:
(y_pred == y_test).mean()

0.7339791356184798

In [38]:
sample_sub = pd.read_csv("sample_submission.csv")
test_df = pd.read_csv("test.csv")

In [171]:
predicts = []
for i, text in enumerate(list(test_df["task"])):
    text = prepareText(text)
    predicts.append(predict(text))

In [172]:
sample_sub["category"] = predicts

In [173]:
sample_sub.to_csv("sub2.csv")