In [1]:
import numpy as np
import pandas as pd

In [2]:
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking, Flatten
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from keras.callbacks import TensorBoard
from keras.losses import categorical_crossentropy

In [6]:
from nltk.corpus import stopwords
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

In [7]:
from google.colab import files
uploaded = files.upload()

Saving leto.csv to leto.csv


In [24]:
df_row = pd.read_csv("leto.csv")
df_row = df_row.rename(columns = {'Rating':'class', 'Content':'text'})
df_train = df_row.drop(columns=['Date'])

df_train["class"] = np.where(df_train["class"] >= 4, 1, 0)
df_test = df_train.iloc[0:4000]
df_test = df_test[['text', 'class']]
df_train = df_train[['text', 'class']]

df_train = df_train.dropna()
df_test = df_test.dropna()

df_train.head()

Unnamed: 0,text,class
0,It just works!,1
1,В целом удобноное приложение...из минусов хотя...,1
2,Отлично все,1
3,Стал зависать на 1% работы антивируса. Дальше ...,1
4,"Очень удобно, работает быстро.",1


**Предобработка**

In [9]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['text'] = df_train['text'].apply(preprocess_text)
df_test['text'] = df_test['text'].apply(preprocess_text)

In [10]:
text_corpus_train = df_train['text'].values
text_corpus_test = df_test['text'].values

In [11]:
tokenizer = Tokenizer(num_words=None,
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_test = tokenizer.texts_to_sequences(text_corpus_test)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_test = pad_sequences(sequences_test, maxlen=training_length)


In [12]:
y_train = df_train['class'].values
y_test = df_test['class'].values

**CNN**

In [21]:
model = Sequential()

model.add(Embedding(input_dim=word_count,
                    output_dim=256,
                    input_length=training_length))
model.add(Conv1D(256, 3))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Conv1D(256, 3))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(GlobalMaxPool1D())
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          batch_size=512,
          epochs=3,
          verbose=1,
          validation_split=0.1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa51b49a410>

**RNN**

In [23]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=256,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(SimpleRNN(256))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=3,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa5179e6bf0>

**LSTM** + **CNN**

In [25]:
model = Sequential()

model.add(Embedding(input_dim=word_count,
                    output_dim=256,
                    input_length=training_length,
                    trainable=True,
                    mask_zero=True))
model.add(LSTM(256, return_sequences=True))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Conv1D(256, 3))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          batch_size=512,
          epochs=3,
          verbose=1,
          validation_split=0.1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa5129b5ab0>

**GRU + CNN**

In [27]:
model = Sequential()

model.add(Embedding(input_dim=word_count,
                    output_dim=256,
                    input_length=training_length,
                    trainable=True,
                    mask_zero=True))
model.add(GRU(256, return_sequences=True))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Conv1D(256, 3))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          batch_size=512,
          epochs=3,
          verbose=1,
          validation_split=0.1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa51c2c2c80>

**SUMMARY**

CNN (accuracy) = 0.9051

RNN (accuracy) = 0.9353

LSTM + CNN (accuracy) = 0.9439

GRU + CNN (accuracy) = 0.9439