## Введение в обработку естественного языка

### Урок 8. Рекуррентные нейронные сети RNN LSTM GRU

На вебинаре мы говорили, что долгое время CNN и RNN архитектуры были конкурирующими, выяснить какая архитектура больше подходит для задачи сентимент анализа на данных с вебинара

1. построить свёрточную архитектуру
2. построить различные архитектуры с RNN
3. построить совместные архитектуры CNN -> RNN и/или (RNN -> CNN)
4. сделать выводы что получилось лучше

In [4]:
!pip install stop_words
!pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import numpy as np

import pandas as pd

from string import punctuation

from stop_words import get_stop_words

from pymorphy2 import MorphAnalyzer

import re

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPool1D, Dense, Flatten, Dropout, AveragePooling2D, Activation, MaxPooling2D
from tensorflow.keras.layers import  BatchNormalization, Concatenate, Masking, SimpleRNN, LSTM, GRU
from tensorflow.keras import layers
from tensorflow.keras import utils
from tensorflow.keras import models
from tensorflow.keras import callbacks
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.layers import TextVectorization

from keras.callbacks import TensorBoard 
from keras.callbacks import EarlyStopping 

In [6]:
df_train = pd.read_csv("train.csv")

df_test = pd.read_csv("test.csv")

df_val = pd.read_csv("val.csv")

In [7]:
df_train.head()

Unnamed: 0,id,text,class
0,0,"@alisachachka не уезжаааааааай. :(❤ я тоже не хочу, чтобы ты уезжала.",0
1,1,"RT @GalyginVadim: Ребята и девчата!\nВсе в кино!!! ""Вот Это Любовь!""\nСегодня! Завтра! И потом!)))))\n#вотэтолюбовь",1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретвит((((( RT,0
3,3,RT @epupybobv: Хочется котлету по-киевски. Запретный плод. :),1
4,4,@KarineKurganova @Yess__Boss босапопа есбоса не боится мороза и .......)),1


In [8]:
df_train.shape, df_test.shape, df_val.shape

((181467, 3), (22684, 2), (22683, 3))

In [9]:
sw = set(get_stop_words("ru"))

exclude = set(punctuation)

morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['text'] = df_train['text'].apply(preprocess_text)

df_val['text'] = df_val['text'].apply(preprocess_text)

df_test['text'] = df_test['text'].apply(preprocess_text)

In [11]:
text_corpus_train = df_train['text'].values

text_corpus_valid = df_val['text'].values

text_corpus_test = df_test['text'].values

In [13]:
y_train = df_train['class'].values

y_val = df_val['class'].values

In [14]:
train_data = tf.data.Dataset.from_tensor_slices((df_train['text'], y_train))

valid_data = tf.data.Dataset.from_tensor_slices((df_val['text'], y_val))

train_data = train_data.batch(64)

valid_data = valid_data.batch(64)

max_len = max([len(i.split()) for i in text_corpus_train]) 

vectorize_layer = tf.keras.layers.TextVectorization(output_mode='int',
                                                    output_sequence_length=max_len)

text_data = train_data.map(lambda x, y: x) 

vectorize_layer.adapt(text_data)

In [15]:
vectorize_layer.vocabulary_size()

258109

In [16]:
max_features = 258108 

In [17]:
model = Sequential()

model.add(tf.keras.Input(shape=(1,), dtype=tf.string))

model.add(vectorize_layer)

model.add(Embedding(input_dim=max_features,   
                    output_dim=30,        
                    input_length=max_len, 
                    trainable=True,
                    mask_zero=True))

model.add(Masking(mask_value=0.0))

model.add(SimpleRNN(64))

model.add(Dense(64, activation='relu'))

model.add(Dropout(0.5))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 27)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 27, 30)            7743240   
                                                                 
 masking (Masking)           (None, 27, 30)            0         
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                6080      
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                        

In [19]:
early_stopping=EarlyStopping(monitor='val_loss')  

history = model.fit(train_data,
                    batch_size=512,
                    epochs=10,
                    verbose=1, 
                    validation_data=valid_data,             
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [20]:
score = model.evaluate(valid_data, batch_size=512, verbose=1)

print('\n')

print('Test score:', score[0])

print('Test accuracy:', score[1])



Test score: 0.6887406706809998
Test accuracy: 0.7152934074401855


In [21]:
total_scores_df = pd.DataFrame( data=np.array([['SimpleRNN'], [score[0]], [score[1]]]).T, columns=['name', 'test_loss', 'test_accuracy'])

In [22]:
total_scores_df.head()

Unnamed: 0,name,test_loss,test_accuracy
0,SimpleRNN,0.6887406706809998,0.7152934074401855


In [23]:
model = Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
model.add(Embedding(input_dim=max_features,   
                    output_dim=30,         
                    input_length=max_len,  
                    trainable=True,
                    mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(LSTM(64, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  

history = model.fit(train_data,
                    batch_size=512,
                    epochs=10,
                    verbose=1, 
                    validation_data=valid_data,
                    callbacks=[early_stopping])



Epoch 1/10
Epoch 2/10


In [24]:
score = model.evaluate(valid_data, batch_size=512, verbose=1)



In [25]:
new_row = {'name':'LSTM', 'test_loss':score[0], 'test_accuracy':score[1]}

total_scores_df = total_scores_df.append(new_row, ignore_index=True)

total_scores_df.head()

Unnamed: 0,name,test_loss,test_accuracy
0,SimpleRNN,0.6887406706809998,0.7152934074401855
1,LSTM,0.62894,0.74174


In [26]:
model = Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
model.add(Embedding(input_dim=max_features,  
                    output_dim=30,         
                    input_length=max_len,  
                    trainable=True,
                    mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(GRU(64, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  

history = model.fit(train_data,
                    batch_size=512,
                    epochs=10,
                    verbose=1, 
                    validation_data=valid_data,
                    callbacks=[early_stopping])



Epoch 1/10
Epoch 2/10


In [27]:
score = model.evaluate(valid_data, batch_size=512, verbose=1)

new_row = {'name':'GRU', 'test_loss':score[0], 'test_accuracy':score[1]}

total_scores_df = total_scores_df.append(new_row, ignore_index=True)

total_scores_df.head()



Unnamed: 0,name,test_loss,test_accuracy
0,SimpleRNN,0.6887406706809998,0.7152934074401855
1,LSTM,0.62894,0.74174
2,GRU,0.69312,0.50474


In [28]:
model = Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
model.add(Embedding(input_dim=max_features,  
                    output_dim=30,         
                    input_length=max_len, 
                    trainable=True,
                    mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  

history = model.fit(train_data,
                    batch_size=512,
                    epochs=10,
                    verbose=1, 
                    validation_data=valid_data,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [29]:
score = model.evaluate(valid_data, batch_size=512, verbose=1)

new_row = {'name':'CNN', 'test_loss':score[0], 'test_accuracy':score[1]}

total_scores_df = total_scores_df.append(new_row, ignore_index=True)

total_scores_df.head()



Unnamed: 0,name,test_loss,test_accuracy
0,SimpleRNN,0.6887406706809998,0.7152934074401855
1,LSTM,0.62894,0.74174
2,GRU,0.69312,0.50474
3,CNN,0.5512,0.74598


In [30]:
model = Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
model.add(Embedding(input_dim=max_features,   
                    output_dim=30,        
                    input_length=max_len, 
                    trainable=True,
                    mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(LSTM(64,  recurrent_dropout=0.2, return_sequences="True")) 
model.add(Conv1D(32, 3, activation="linear"))
model.add(Flatten())                      
model.add(Dropout(0.5)) 
model.add(Dense(1, activation="sigmoid"))    

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  

history = model.fit(train_data,
                    batch_size=512,
                    epochs=10,
                    verbose=1, 
                    validation_data=valid_data,
                    callbacks=[early_stopping])



Epoch 1/10
Epoch 2/10


In [31]:
score = model.evaluate(valid_data, batch_size=512, verbose=1)

new_row = {'name':'LSTM+CNN', 'test_loss':score[0], 'test_accuracy':score[1]}

total_scores_df = total_scores_df.append(new_row, ignore_index=True)

total_scores_df.head()



Unnamed: 0,name,test_loss,test_accuracy
0,SimpleRNN,0.6887406706809998,0.7152934074401855
1,LSTM,0.62894,0.74174
2,GRU,0.69312,0.50474
3,CNN,0.5512,0.74598
4,LSTM+CNN,0.59858,0.7417


In [32]:
model = Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
model.add(Embedding(input_dim=max_features,  
                    output_dim=30,       
                    input_length=max_len,  
                    trainable=True,
                    mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(Conv1D(32, 3, activation="linear"))
model.add(LSTM(64,  recurrent_dropout=0.2, ))                     
model.add(Dropout(0.5)) 
model.add(Dense(1, activation="sigmoid"))    

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  

history = model.fit(train_data,
                    batch_size=512,
                    epochs=10,
                    verbose=1, 
                    validation_data=valid_data,
                    callbacks=[early_stopping])



Epoch 1/10
Epoch 2/10


In [33]:
score = model.evaluate(valid_data, batch_size=512, verbose=1)

new_row = {'name':'CNN+LSTM', 'test_loss':score[0], 'test_accuracy':score[1]}

total_scores_df = total_scores_df.append(new_row, ignore_index=True)

total_scores_df



Unnamed: 0,name,test_loss,test_accuracy
0,SimpleRNN,0.6887406706809998,0.7152934074401855
1,LSTM,0.62894,0.74174
2,GRU,0.69312,0.50474
3,CNN,0.5512,0.74598
4,LSTM+CNN,0.59858,0.7417
5,CNN+LSTM,0.62485,0.73169


In [34]:
total_scores_df[['test_loss', 'test_accuracy']] = total_scores_df[['test_loss', 'test_accuracy']].astype(float)

In [35]:
total_scores_df.sort_values('test_accuracy', ascending=False)

Unnamed: 0,name,test_loss,test_accuracy
3,CNN,0.5512,0.74598
1,LSTM,0.62894,0.74174
4,LSTM+CNN,0.59858,0.7417
5,CNN+LSTM,0.62485,0.73169
0,SimpleRNN,0.68874,0.71529
2,GRU,0.69312,0.50474


**Наилучший результат имеет модель CNN. Чуть менее точна модель LSTM. При этом обучение модели LSTM заняло всего одну эпоху (4 секунды) против двух эпох (56 секунд) у модели CNN.**