# Тема "Рекуррентные блоки"

1. построить свёрточную архитектуру
2. построить различные архитектуры с RNN
3. построить совместные архитектуры CNN -> RNN и/или (RNN -> CNN)
4. сделать выводы что получилось лучше

In [1]:
import pandas as pd
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
df_val = pd.read_csv("data/val.csv")

In [2]:
df_train.head()

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретви...,0
3,3,RT @epupybobv: Хочется котлету по-киевски. Зап...,1
4,4,@KarineKurganova @Yess__Boss босапопа есбоса н...,1


In [3]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['text'] = df_train['text'].apply(preprocess_text)
df_val['text'] = df_val['text'].apply(preprocess_text)
df_test['text'] = df_test['text'].apply(preprocess_text)

In [4]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
from keras.losses import categorical_crossentropy
from keras.callbacks import EarlyStopping  

In [5]:
text_corpus_train = df_train['text'].values
text_corpus_valid = df_val['text'].values
text_corpus_test = df_test['text'].values

In [6]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)
sequences_test = tokenizer.texts_to_sequences(text_corpus_test)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)



In [7]:
y_train = df_train['class'].values
y_val = df_val['class'].values

In [8]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(SimpleRNN(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [9]:
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [10]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.5780249238014221
Test accuracy: 0.7356169819831848


#### Сверточная сеть

In [11]:
model2 = Sequential()
model2.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model2.add(Masking(mask_value=0.0))
model2.add(Conv1D(64, 3))
model2.add(Activation("relu"))
model2.add(GlobalMaxPool1D())
model2.add(Dense(1, activation='sigmoid'))

model2.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [12]:
X_train.shape

(181467, 27)

In [13]:
history2 = model2.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

score = model2.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

Epoch 1/10
Epoch 2/10


Test score: 0.5663742423057556
Test accuracy: 0.7370718121528625


#### Совместные архитектуры

In [14]:
model3 = Sequential()
model3.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model3.add(Masking(mask_value=0.0))

model3.add(Conv1D(64, 3))

model3.add(SimpleRNN(64, return_sequences=True))
model3.add(Dense(64, activation='relu'))
model3.add(Dropout(0.5))

model3.add(Conv1D(64, 3))
model3.add(Activation("relu"))
model3.add(GlobalMaxPool1D())

model3.add(Dense(1, activation='sigmoid'))

model3.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [15]:
history3 = model3.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

score = model3.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

Epoch 1/10
Epoch 2/10


Test score: 0.5722575783729553
Test accuracy: 0.7354847192764282


In [16]:
model4 = Sequential()
model4.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model4.add(Masking(mask_value=0.0))
model4.add(SimpleRNN(64, return_sequences=True))
model4.add(Dense(64, activation='relu'))
model4.add(Dropout(0.5))

model4.add(Conv1D(64, 3))
model4.add(Activation("relu"))
model4.add(GlobalMaxPool1D())

model4.add(Dense(1, activation='sigmoid'))

model4.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
history4 = model4.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

score = model4.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

Epoch 1/10
Epoch 2/10


Test score: 0.5670084357261658
Test accuracy: 0.7390997409820557


In [25]:
model5 = Sequential()
model5.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model5.add(Masking(mask_value=0.0))

model5.add(Conv1D(64, 3))
model5.add(SimpleRNN(64))
model5.add(Dense(64, activation='relu'))
model5.add(Dropout(0.5))
model5.add(Dense(1, activation='sigmoid'))

model5.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [26]:
history5 = model5.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

score = model5.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

Epoch 1/10
Epoch 2/10


Test score: 0.5796382427215576
Test accuracy: 0.7320901155471802


Выводы:
1) У меня все решения почему-то отработали примерно одинаково и какой-то прямо принципиальной разницы в метриках я не увидел.

2) Сети с RNN учатся дольше всего