In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from natasha import Segmenter, Doc, MorphVocab, NewsEmbedding, NewsMorphTagger, NewsSyntaxParser
from razdel import tokenize
from sklearn import model_selection

In [2]:
df = pd.read_excel('leto.xls', usecols=[0,1])
df.head()

Unnamed: 0,Rating,Content
0,5,It just works!
1,4,В целом удобноное приложение...из минусов хотя...
2,5,Отлично все
3,5,Стал зависать на 1% работы антивируса. Дальше ...
4,5,"Очень удобно, работает быстро."


In [3]:
df = df.dropna()

In [4]:
df.loc[df['Rating'] <= 3, 'Rating'] = 0
df.loc[df['Rating'] > 3, 'Rating'] = 1

In [9]:
train_X, test_X, train_y, test_y = model_selection.train_test_split(df['Content'], df['Rating'], train_size=0.8)
print(train_X.shape, test_X.shape, train_y.shape, test_y.shape)

(16524,) (4132,) (16524,) (4132,)


In [6]:
emoticon_list = {
    '👍': ' нравится ',
    '😎': ' нравится ',
    '😁': ' нравится ',
    '😊': ' нравится ',
    '😀': ' нравится ',
    '😇': ' нравится ',
    '😍': ' нравится ',
    '😙': ' нравится ',
    '😉': ' нравится ',
    '🔥': ' нравится ',
    '👌': ' нравится ',
    '😘': ' нравится ',
    '👏': ' нравится ',
    '🙌': ' нравится ',
    '💋': ' нравится ',
    '💪': ' нравится ',
    '👎': ' плохое ',
    '😖': ' плохое ',
    '😑': ' плохое ',
    '😈': ' плохое ',
    '😡': ' плохое ',
    '🙅': ' плохое ',
    ':\)': ' ',
    ':\(': ' ',
    ';\)': ' '
}

In [7]:
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

In [10]:
sw = set(get_stop_words("ru"))
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt).lower()
    for emoji, word in emoticon_list.items():
        txt = txt.replace(emoji, word)
    txt = re.sub(r'[^\w\s]', ' ', txt)
    txt = re.sub(r'[^а-яА-Яa-zA-Z]', ' ', txt)
    txt = ' '.join([w for w in txt.split() if len(w) > 1])
    txt = [morpher.parse(word)[0].normal_form for word in txt.split()]# if word not in sw]
    return " ".join(txt)

train_X = train_X.apply(preprocess_text)
test_X = test_X.apply(preprocess_text)

In [11]:
train_corpus = " ".join(train_X)

In [12]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)
len(tokens)

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1108)>


125117

In [13]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [14]:
max_words = 200
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]


In [15]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [16]:
import numpy as np
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [17]:
max_len = 30

train_X = np.asarray([text_to_sequence(text, max_len) for text in train_X], dtype=np.int32)
test_X = np.asarray([text_to_sequence(text, max_len) for text in test_X], dtype=np.int32)
