In [1]:
import pandas as pd
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, LSTM, GRU, MaxPool1D
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard
from keras.metrics import categorical_crossentropy
from keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv('otziv.csv', sep=';')
df = df[['Rating', 'Content']]
df

Unnamed: 0,Rating,Content
0,5,It just works!
1,4,В целом удобноное приложение...из минусов хотя...
2,5,Отлично все
3,5,Стал зависать на 1% работы антивируса. Дальше ...
4,5,"Очень удобно, работает быстро."
...,...,...
20654,1,"Ну и шляпа,с роот правами бесполезная прога,ра..."
20655,5,Ок
20656,4,Доволен
20657,1,"Песопаснасть, рут ни нужын"


In [3]:
df_train = df[:15000]
df_train

Unnamed: 0,Rating,Content
0,5,It just works!
1,4,В целом удобноное приложение...из минусов хотя...
2,5,Отлично все
3,5,Стал зависать на 1% работы антивируса. Дальше ...
4,5,"Очень удобно, работает быстро."
...,...,...
14995,5,Люблю сбербанк
14996,3,Хорошее приложение
14997,5,Всё огонь!
14998,1,"Плохо пишет,что в телефоне есть рут,а его срод..."


In [4]:
df_val = df[15000:]
df_val

Unnamed: 0,Rating,Content
15000,5,Топчик.
15001,5,Супер!!!
15002,5,Прекрасно. Мне нравиться
15003,5,Удобно управлять счетами
15004,4,Очень медленно работает.
...,...,...
20654,1,"Ну и шляпа,с роот правами бесполезная прога,ра..."
20655,5,Ок
20656,4,Доволен
20657,1,"Песопаснасть, рут ни нужын"


In [5]:
max_words = 10000
max_len = 40
num_classes = 1

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [6]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['Content'] = df_train['Content'].apply(preprocess_text)
df_val['Content'] = df_val['Content'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['Content'] = df_train['Content'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val['Content'] = df_val['Content'].apply(preprocess_text)


In [7]:
train_corpus = " ".join(df_train['Content'])
train_corpus = train_corpus.lower()

In [8]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [10]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [11]:
tokens_filtered_top[:10]

['приложение',
 'удобно',
 'работать',
 'удобный',
 'отлично',
 'нравиться',
 'хороший',
 'отличный',
 'супер',
 'телефон']

In [12]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [13]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [14]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in df_train['Content']], dtype=np.int32)
x_val = np.asarray([text_to_sequence(text, max_len) for text in df_val['Content']], dtype=np.int32)

In [15]:
x_train.shape

(15000, 40)

In [16]:
max_len

40

In [17]:
x_train[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,  101, 3525, 3526,  118,  168,  116,
         88, 1187,  477, 3527,   15,  537,  458])

In [18]:
df_train['Rating'] = df_train['Rating']-1
df_val['Rating'] = df_val['Rating']-1
df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['Rating'] = df_train['Rating']-1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val['Rating'] = df_val['Rating']-1


Unnamed: 0,Rating,Content
0,4,it just works
1,3,целое удобноной приложениеиз минус хотеть боль...
2,4,отлично
3,4,зависать 1 работа антивирус ранее пользоваться...
4,4,удобно работать быстро
...,...,...
14995,4,любить сбербанк
14996,2,хороший приложение
14997,4,огонь
14998,0,плохо пишетчто телефон рута сроду сдесьнебыть


In [19]:
df_val

Unnamed: 0,Rating,Content
15000,4,топчик
15001,4,супер
15002,4,нравиться
15003,4,удобно управлять счёт
15004,3,медленно работать
...,...,...
20654,0,шляпас роот право бесполезный прогаразрабыв ох...
20655,4,около
20656,3,довольный
20657,0,песопаснастя рута нужын


In [20]:
df_train

Unnamed: 0,Rating,Content
0,4,it just works
1,3,целое удобноной приложениеиз минус хотеть боль...
2,4,отлично
3,4,зависать 1 работа антивирус ранее пользоваться...
4,4,удобно работать быстро
...,...,...
14995,4,любить сбербанк
14996,2,хороший приложение
14997,4,огонь
14998,0,плохо пишетчто телефон рута сроду сдесьнебыть


In [21]:
num_classes = 5
y_train = keras.utils.to_categorical(df_train['Rating'], num_classes)
y_val = keras.utils.to_categorical(df_val['Rating'], num_classes)

In [22]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(100))
model.add(Dense(50))
model.add(Dense(10))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 128)           1280000   
                                                                 
 conv1d (Conv1D)             (None, 38, 128)           49280     
                                                                 
 activation (Activation)     (None, 38, 128)           0         
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 100)               12900     
                                                                 
 dense_1 (Dense)             (None, 50)                5050      
                                                        

In [24]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [25]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20


In [26]:
score = model.evaluate(x_val, y_val, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.7485322952270508
Test accuracy: 0.7453613877296448


In [27]:
lstm = Sequential()
lstm.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len, trainable=True, mask_zero=True))
lstm.add(LSTM(128, recurrent_dropout=0.2))
lstm.add(Dense(50))
lstm.add(Dense(10))
lstm.add(Activation("relu"))
lstm.add(Dense(num_classes))
lstm.add(Activation('softmax'))

In [28]:
lstm.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 40, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense_4 (Dense)             (None, 50)                6450      
                                                                 
 dense_5 (Dense)             (None, 10)                510       
                                                                 
 activation_2 (Activation)   (None, 10)                0         
                                                                 
 dense_6 (Dense)             (None, 5)                 55        
                                                                 
 activation_3 (Activation)   (None, 5)                

In [29]:
lstm.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [30]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')


history = lstm.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20


In [31]:
score_lstm = lstm.evaluate(x_val, y_val, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score_lstm[0])
print('Test accuracy:', score_lstm[1])



Test score: 0.8566953539848328
Test accuracy: 0.729987621307373


In [32]:
gru_cnn = Sequential()
gru_cnn.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
gru_cnn.add(Conv1D(128, 3, activation='relu'))
gru_cnn.add(MaxPool1D(3))
gru_cnn.add(GRU(128, recurrent_dropout=0.2))
gru_cnn.add(Dense(50))
gru_cnn.add(Dense(10))
gru_cnn.add(Activation("relu"))
gru_cnn.add(Dense(num_classes))
gru_cnn.add(Activation('softmax'))

In [33]:
gru_cnn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 40, 128)           1280000   
                                                                 
 conv1d_1 (Conv1D)           (None, 38, 128)           49280     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 12, 128)          0         
 )                                                               
                                                                 
 gru (GRU)                   (None, 128)               99072     
                                                                 
 dense_7 (Dense)             (None, 50)                6450      
                                                                 
 dense_8 (Dense)             (None, 10)                510       
                                                      

In [34]:
gru_cnn.summary()
gru_cnn.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 40, 128)           1280000   
                                                                 
 conv1d_1 (Conv1D)           (None, 38, 128)           49280     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 12, 128)          0         
 )                                                               
                                                                 
 gru (GRU)                   (None, 128)               99072     
                                                                 
 dense_7 (Dense)             (None, 50)                6450      
                                                                 
 dense_8 (Dense)             (None, 10)                510       
                                                      

In [35]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')


history = gru_cnn.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20


In [36]:
score_gru_cnn = gru_cnn.evaluate(x_val, y_val, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score_gru_cnn[0])
print('Test accuracy:', score_gru_cnn[1])



Test score: 0.8607456088066101
Test accuracy: 0.7250397801399231


Обучил 3 модели: обычную CNN на 3 промежуточных слоя Dense, lstm + добавил 2 слоя промежуточных и комбинированную модель GRU с пулингом и 2мя Dense слоями.</br>
Как ни удивительно лидирует обычная Cnn по accuracy, возможно это связано с переобучением последующих 2х сетей. Lstm пробовал обучать без Dense, метрика значительно не растет. Первая модель получается все равно сильнее