# Рекуррентные блоки

1. построить свёрточную архитектуру
2. построить различные архитектуры с RNN
3. построить совместные архитектуры CNN -> RNN и/или (RNN -> CNN)
4. сделать выводы что получилось лучше

In [1]:
# pip install pymorphy2

In [2]:
# pip install stop-words

In [3]:
# Импорт библиотек

import pandas as pd
import numpy as np
import re
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from string import punctuation
# import annoy
from gensim.models import Word2Vec, FastText

In [4]:
# Загрузка данных Google Colab

# from google.colab import files
# files.upload()

In [5]:
# Загрузка данных

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_val = pd.read_csv("val.csv")

df_train.head()

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретви...,0
3,3,RT @epupybobv: Хочется котлету по-киевски. Зап...,1
4,4,@KarineKurganova @Yess__Boss босапопа есбоса н...,1


In [6]:
# Препроцессинг текста

sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(text):
    text = str(text)
    text = re.sub(r'RT*', '', text)
    text = re.sub(r'@[\w]*', ' ', text)
    text = "".join(word for word in text if word not in exclude)
    text = text.lower()
    text = re.sub("не\s", "не", text)
    text = [morpher.parse(word)[0].normal_form for word in text.split() if word not in sw]
    
    return " ".join(text)

df_train['text'] = df_train['text'].apply(preprocess_text)
df_val['text'] = df_val['text'].apply(preprocess_text)
df_test['text'] = df_test['text'].apply(preprocess_text)

df_train.head()

Unnamed: 0,id,text,class
0,0,неуезжаааааааать ❤ нехотеть уезжать,0
1,1,ребята девчата кино любовь завтра вотэтолюбовь,1
2,2,ненавидеть пробка ретвит,0
3,3,хотеться котлета покиевск запретный плод,1
4,4,босапоп есбосой небояться мороз,1


In [7]:
# Импорт библиотек для построения сети

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking, Flatten
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
# from keras.objectives import categorical_crossentropy
from keras.callbacks import EarlyStopping  

In [8]:
# Создание корпусов

text_corpus_train = df_train['text'].values
text_corpus_valid = df_val['text'].values
text_corpus_test = df_test['text'].values

In [9]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)
sequences_test = tokenizer.texts_to_sequences(text_corpus_test)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)

y_train = df_train['class'].values
y_val = df_val['class'].values

In [10]:
# RNN - Recurrent neural network - Рекуррентная нейронная сеть

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(SimpleRNN(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [11]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.5493119955062866
Test accuracy: 0.7361019253730774


In [12]:
results = []

results.append(['RNN', score[0], score[1]])

In [13]:
# LSTM - Long short-term memory - Нейронная сеть долгой краткосрочной памяти

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(LSTM(64, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [14]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['LSTM', score[0], score[1]])



Test score: 0.5334282517433167
Test accuracy: 0.7433760762214661


In [15]:
# GRU - Gated Recurrent Units - Управляемые рекуррентные блоки

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(GRU(64, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [16]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['GRU', score[0], score[1]])



Test score: 0.5367250442504883
Test accuracy: 0.7438169717788696


In [17]:
# CNN - Convolutional neural network - Сверточная нейронная сеть

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [18]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['CNN', score[0], score[1]])



Test score: 0.529025137424469
Test accuracy: 0.7375126481056213


In [19]:
# RNN + CNN

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(SimpleRNN(64, recurrent_dropout=0.2, return_sequences="True"))
model.add(Conv1D(64, 3, activation="linear"))
model.add(Conv1D(64, 1, activation="linear")) 
model.add(Flatten())                      
model.add(Dropout(0.5)) 
model.add(Dense(1, activation="sigmoid"))      


model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [20]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['RNN + CNN', score[0], score[1]])



Test score: 0.5558750033378601
Test accuracy: 0.7341180443763733


In [21]:
# LSTM + CNN

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0)) 
model.add(LSTM(64,  recurrent_dropout=0.2, return_sequences="True")) 
model.add(Conv1D(64, 3, activation="linear"))
model.add(Conv1D(64, 1, activation="linear")) 
model.add(Flatten())                      
model.add(Dropout(0.5)) 
model.add(Dense(1, activation="sigmoid"))      


model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [22]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['LSTM + CNN', score[0], score[1]])



Test score: 0.5437334179878235
Test accuracy: 0.7373804450035095


In [23]:
# GRU + CNN

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0)) 
model.add(GRU(64,  recurrent_dropout=0.2, return_sequences="True")) 
model.add(Conv1D(64, 3, activation="linear"))
model.add(Conv1D(64, 1, activation="linear")) 
model.add(Flatten())                      
model.add(Dropout(0.5)) 
model.add(Dense(1, activation="sigmoid"))      


model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [24]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['GRU + CNN', score[0], score[1]])



Test score: 0.5476436614990234
Test accuracy: 0.7364546060562134


In [25]:
results_df = pd.DataFrame(results, columns = ['Model', 'Test score', 'Test accuracy'])
results_df

Unnamed: 0,Model,Test score,Test accuracy
0,RNN,0.549312,0.736102
1,LSTM,0.533428,0.743376
2,GRU,0.536725,0.743817
3,CNN,0.529025,0.737513
4,RNN + CNN,0.555875,0.734118
5,LSTM + CNN,0.543733,0.73738
6,GRU + CNN,0.547644,0.736455


При совместном использовании сверточных и рекуррентных слоев качество отличается незначительно, но скорость работы такой сети сильно ниже.