In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from natasha import Segmenter, Doc, MorphVocab, NewsEmbedding, NewsMorphTagger, NewsSyntaxParser
from razdel import tokenize
from sklearn import model_selection

In [2]:
df = pd.read_excel('leto.xls', usecols=[0,1])
df.head()

Unnamed: 0,Rating,Content
0,5,It just works!
1,4,–í —Ü–µ–ª–æ–º —É–¥–æ–±–Ω–æ–Ω–æ–µ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–µ...–∏–∑ –º–∏–Ω—É—Å–æ–≤ —Ö–æ—Ç—è...
2,5,–û—Ç–ª–∏—á–Ω–æ –≤—Å–µ
3,5,–°—Ç–∞–ª –∑–∞–≤–∏—Å–∞—Ç—å –Ω–∞ 1% —Ä–∞–±–æ—Ç—ã –∞–Ω—Ç–∏–≤–∏—Ä—É—Å–∞. –î–∞–ª—å—à–µ ...
4,5,"–û—á–µ–Ω—å —É–¥–æ–±–Ω–æ, —Ä–∞–±–æ—Ç–∞–µ—Ç –±—ã—Å—Ç—Ä–æ."


In [3]:
df = df.dropna()

In [4]:
df.loc[df['Rating'] <= 3, 'Rating'] = 0
df.loc[df['Rating'] > 3, 'Rating'] = 1

In [9]:
train_X, test_X, train_y, test_y = model_selection.train_test_split(df['Content'], df['Rating'], train_size=0.8)
print(train_X.shape, test_X.shape, train_y.shape, test_y.shape)

(16524,) (4132,) (16524,) (4132,)


In [6]:
emoticon_list = {
    'üëç': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üòé': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üòÅ': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üòä': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üòÄ': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üòá': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üòç': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üòô': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üòâ': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üî•': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üëå': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üòò': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üëè': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üôå': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üíã': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üí™': ' –Ω—Ä–∞–≤–∏—Ç—Å—è ',
    'üëé': ' –ø–ª–æ—Ö–æ–µ ',
    'üòñ': ' –ø–ª–æ—Ö–æ–µ ',
    'üòë': ' –ø–ª–æ—Ö–æ–µ ',
    'üòà': ' –ø–ª–æ—Ö–æ–µ ',
    'üò°': ' –ø–ª–æ—Ö–æ–µ ',
    'üôÖ': ' –ø–ª–æ—Ö–æ–µ ',
    ':\)': ' ',
    ':\(': ' ',
    ';\)': ' '
}

In [7]:
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

In [10]:
sw = set(get_stop_words("ru"))
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt).lower()
    for emoji, word in emoticon_list.items():
        txt = txt.replace(emoji, word)
    txt = re.sub(r'[^\w\s]', ' ', txt)
    txt = re.sub(r'[^–∞-—è–ê-–Øa-zA-Z]', ' ', txt)
    txt = ' '.join([w for w in txt.split() if len(w) > 1])
    txt = [morpher.parse(word)[0].normal_form for word in txt.split()]# if word not in sw]
    return " ".join(txt)

train_X = train_X.apply(preprocess_text)
test_X = test_X.apply(preprocess_text)

In [11]:
train_corpus = " ".join(train_X)

In [12]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)
len(tokens)

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1108)>


125117

In [13]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [14]:
max_words = 200
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]


In [15]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [16]:
import numpy as np
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [17]:
max_len = 30

train_X = np.asarray([text_to_sequence(text, max_len) for text in train_X], dtype=np.int32)
test_X = np.asarray([text_to_sequence(text, max_len) for text in test_X], dtype=np.int32)
