<a href="https://colab.research.google.com/github/MattWroclaw/neural-networks/blob/main/07_rnn/03_simple_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Simple RNN

In [1]:
import numpy as np
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

In [2]:
!wget https://storage.googleapis.com/esmartdata-courses-files/ann-course/reviews.zip
!unzip -q reviews.zip

--2024-10-24 17:06:25--  https://storage.googleapis.com/esmartdata-courses-files/ann-course/reviews.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.253.115.207, 172.253.122.207, 172.253.63.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.253.115.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42878657 (41M) [application/x-zip-compressed]
Saving to: ‘reviews.zip’


2024-10-24 17:06:28 (23.2 MB/s) - ‘reviews.zip’ saved [42878657/42878657]



In [3]:
data_dir = './reviews'
train_dir = os.path.join(data_dir, 'train')

train_texts = []
train_labels = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            train_texts.append(f.read())
            f.close()
            if label_type == 'neg':
                train_labels.append(0)
            else:
                train_labels.append(1)

In [4]:
test_dir = os.path.join(data_dir, 'test')

test_texts = []
test_labels = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            test_texts.append(f.read())
            f.close()
            if label_type == 'neg':
                test_labels.append(0)
            else:
                test_labels.append(1)

In [5]:
maxlen = 100   # skracamy recenzje do 100 słów
num_words = 10000    # 10000 najczęściej pojawiających się słów
embedding_dim = 100

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_texts)

In [6]:
sequences = tokenizer.texts_to_sequences(train_texts)
print(sequences[:3])

[[5340, 2103, 242, 233, 288, 2, 13, 3, 52, 3930, 434, 13, 28, 4, 24, 1123, 9259, 26, 7693, 3, 4287, 9092, 4, 105, 2, 35, 26, 13, 2718, 5, 3, 367, 30, 219, 28, 55, 8, 24, 110, 18, 1, 105, 26, 314, 9457, 852, 170, 4138, 8, 2706, 4, 8422, 236, 13, 52, 8, 1, 2813, 4, 1, 102, 847, 3070, 13, 3, 4588, 1583, 57, 44, 5340, 2128, 3, 8288, 236, 2615, 3815, 13, 157, 384, 177, 2, 2753, 9458, 6, 14, 14, 91, 8289, 91, 695, 982, 6, 6792, 8, 3, 6436, 93, 7, 7, 2, 187, 1, 1853, 323, 384, 5674, 3, 3559, 209, 3, 375, 1425, 31, 2, 34, 5514, 65, 365, 6, 8, 366, 724, 8, 3, 3022, 5, 294, 16, 65, 5515, 33, 3931, 5, 170, 1166, 16, 254, 82, 2, 180, 89, 137, 14, 4252, 85, 1, 4, 65, 149, 862, 95, 1, 104, 1838, 377, 5, 41, 1, 4, 65, 375, 7, 7, 8, 2706, 4, 5456, 1381, 2, 1, 2425, 815, 26, 66, 30, 24, 937, 188, 1918, 5, 199, 110, 5, 11, 497, 16, 2070, 3606, 1, 111, 1157, 3, 70, 4716, 6437, 16, 102, 34, 25, 3366, 70, 570, 8423, 2, 5171, 12, 23, 41, 34, 33, 63, 23, 6887, 39, 901, 695, 299, 396, 804, 1032, 3, 75, 799, 7

In [7]:
# skracamy recenzje do pierwszych 100 słów
train_data = pad_sequences(sequences, maxlen=maxlen)
train_data.shape

(25000, 100)

In [8]:
train_labels = np.asarray(train_labels)
train_labels

array([0, 0, 0, ..., 1, 1, 1])

In [9]:
# przemieszanie próbek
indices = np.arange(train_data.shape[0])
np.random.shuffle(indices)
train_data = train_data[indices]
train_labels = train_labels[indices]

train_data.shape

(25000, 100)

In [10]:
# podział na zbiór treningowy i walidacyjny
training_samples = 15000
validation_samples = 10000

X_train = train_data[:training_samples]
y_train = train_labels[:training_samples]
X_val = train_data[training_samples: training_samples + validation_samples]
y_val = train_labels[training_samples: training_samples + validation_samples]

In [11]:
from tensorflow.keras.layers import SimpleRNN, LSTM

In [12]:
model = Sequential()
model.add(Embedding(10000, 32))
model.add(SimpleRNN(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [13]:
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [14]:
history = model.fit(X_train,
                    y_train,
                    batch_size=32,
                    epochs=5,
                    validation_data=(X_val, y_val))

Epoch 1/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 27ms/step - accuracy: 0.6603 - loss: 0.6067 - val_accuracy: 0.8113 - val_loss: 0.4253
Epoch 2/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 28ms/step - accuracy: 0.8507 - loss: 0.3653 - val_accuracy: 0.7938 - val_loss: 0.4770
Epoch 3/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 29ms/step - accuracy: 0.8985 - loss: 0.2725 - val_accuracy: 0.8333 - val_loss: 0.3928
Epoch 4/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 29ms/step - accuracy: 0.9307 - loss: 0.1923 - val_accuracy: 0.8133 - val_loss: 0.4261
Epoch 5/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 29ms/step - accuracy: 0.9551 - loss: 0.1359 - val_accuracy: 0.8280 - val_loss: 0.4653


In [16]:
def plot_hist(history):
    import pandas as pd
    import plotly.graph_objects as go
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['accuracy'], name='accuracy', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_accuracy'], name='val_accuracy', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='accuracy vs. val accuracy', xaxis_title='Epoki', yaxis_title='accuracy', yaxis_type='log')
    fig.show()

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['loss'], name='loss', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_loss'], name='val_loss', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='loss vs. val loss', xaxis_title='Epoki', yaxis_title='loss', yaxis_type='log')
    fig.show()

plot_hist(history)

In [17]:
plot_hist(history)