In [None]:
import numpy as np
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

In [None]:
train_dir = './reviews/train'

train_texts = []
train_labels = []

# Read every *.txt review from the 'neg' and 'pos' sub-directories.
# Labels: 'neg' -> 0, anything else ('pos') -> 1.
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir returns names in arbitrary order; sorting makes the sample
    # order (and anything derived from it) reproducible across machines.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue  # ignore anything that is not a review file
        # The context manager closes the handle even if read() raises
        # (e.g. a decoding error), which the manual open()/close() did not.
        with open(os.path.join(dir_name, fname), encoding="utf-8") as f:
            train_texts.append(f.read())
        train_labels.append(0 if label_type == 'neg' else 1)

In [None]:
test_dir = './reviews/test'

test_texts = []
test_labels = []

# Read every *.txt review from the 'neg' and 'pos' sub-directories.
# Labels: 'neg' -> 0, anything else ('pos') -> 1.
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    # os.listdir returns names in arbitrary order; sorting keeps the sample
    # order stable between runs.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue  # ignore anything that is not a review file
        # The context manager closes the handle even if read() raises
        # (e.g. a decoding error), which the manual open()/close() did not.
        with open(os.path.join(dir_name, fname), encoding="utf-8") as f:
            test_texts.append(f.read())
        test_labels.append(0 if label_type == 'neg' else 1)

In [None]:
# Sanity check: show the first 10 raw review texts.
train_texts[:10]

In [None]:
# First 10 labels; 'neg' files are read first, so these are expected to be 0.
train_labels[:10]

In [None]:
# Last 10 labels; 'pos' files are read last, so these are expected to be 1.
train_labels[-10:]

In [None]:
maxlen = 100   # every review is cut/padded to 100 tokens
num_words = 10000    # vocabulary cap passed to Tokenizer: only the most frequent words are kept
embedding_dim = 100   # output size of the Embedding layer in the baseline model

# Build the word -> index vocabulary from the training texts only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_texts)

In [None]:
# Show the first 20 (index, word) pairs of the fitted vocabulary.
list(tokenizer.index_word.items())[:20]

In [None]:
# Encode each training review as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(train_texts)
print(sequences[:3])

In [None]:
# word -> index mapping built by fit_on_texts.
# NOTE(review): this count looks like every distinct word seen, not just the
# num_words kept when encoding — confirm against the Tokenizer docs.
word_index = tokenizer.word_index
print(f'{len(word_index)} unikatowych słów.')

In [None]:
# Force every review to exactly maxlen tokens (shorter ones are zero-padded).
# NOTE: pad_sequences defaults to padding='pre' and truncating='pre', so a
# longer review keeps its LAST maxlen tokens, not the first ones.
train_data = pad_sequences(sequences, maxlen=maxlen)
train_data.shape

In [None]:
# Inspect the first 3 padded/truncated index sequences.
train_data[:3]

In [None]:
# Convert the list of 0/1 labels to a NumPy array so it can be reordered
# with an index array when the samples are shuffled.
train_labels = np.asarray(train_labels)
train_labels

In [None]:
# Shuffle the samples. The reviews were loaded class by class ('neg' first,
# then 'pos'), so the positional train/validation slices need a mixed order.
# A seeded generator makes the permutation, and therefore the split and the
# reported metrics, reproducible under Restart & Run All.
# Caveat: re-running this cell permutes the already shuffled data again.
rng = np.random.default_rng(42)
indices = rng.permutation(train_data.shape[0])
# The same index array is applied to both arrays so each review keeps its label.
train_data = train_data[indices]
train_labels = train_labels[indices]

train_data.shape

In [None]:
# Split the shuffled data into a training part and a validation part.
training_samples = 15000
validation_samples = 10000

# Validation rows start right after the training rows.
val_end = training_samples + validation_samples

X_train, y_train = train_data[:training_samples], train_labels[:training_samples]
X_val, y_val = train_data[training_samples:val_end], train_labels[training_samples:val_end]

In [None]:
# Baseline model: learned word embeddings, flattened and fed to a small dense head.
# Embedding(input_dim, output_dim): vocabulary size and embedding vector size.
model = Sequential([
    Embedding(num_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),  # single probability for the binary label
])
model.summary()

In [None]:
# Binary 0/1 target -> binary cross-entropy loss; accuracy is tracked per epoch.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the baseline; `history` keeps the per-epoch metrics used for plotting.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_val, y_val),
)

In [None]:
def plot_hist(history):
    """Plot training vs. validation curves for accuracy and for loss.

    Two separate plotly figures are built and shown, accuracy first and
    then loss. Each is 1000x500 and uses a logarithmic y-axis.

    Args:
        history: object exposing ``history`` (mapping of metric name to
            per-epoch values with the keys 'accuracy', 'val_accuracy',
            'loss' and 'val_loss') and ``epoch`` (the epoch indices),
            e.g. the value returned by ``model.fit``.
    """
    import pandas as pd
    import plotly.graph_objects as go

    frame = pd.DataFrame(history.history)
    frame['epoch'] = history.epoch

    for metric in ('accuracy', 'loss'):
        figure = go.Figure()
        # Training curve first, then its validation counterpart.
        for column in (metric, f'val_{metric}'):
            figure.add_trace(
                go.Scatter(x=frame['epoch'], y=frame[column], name=column, mode='markers+lines')
            )
        figure.update_layout(
            width=1000,
            height=500,
            title=f'{metric} vs. val {metric}',
            xaxis_title='Epoki',
            yaxis_title=metric,
            yaxis_type='log',
        )
        figure.show()

# Plot the learning curves of the model trained most recently.
plot_hist(history)

In [None]:
# Encode the held-out reviews with the tokenizer fitted on the training texts
# and pad them exactly like the training data.
# A dedicated name is used so the training `sequences` are not overwritten;
# otherwise re-running the training pad_sequences cell would silently pad
# test reviews against training labels.
test_sequences = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(test_sequences, maxlen=maxlen)
y_test = np.asarray(test_labels)

# Reports the loss followed by the compiled metrics (accuracy here).
model.evaluate(X_test, y_test, verbose=0)

# Simple RNN

In [None]:
from tensorflow.keras.layers import SimpleRNN, LSTM

In [None]:
# Recurrent variant: 32-dimensional embeddings fed to a 16-unit SimpleRNN.
# The embedding's input_dim must match the tokenizer's vocabulary cap, so it
# reuses num_words instead of repeating the literal 10000.
model = Sequential()
model.add(Embedding(num_words, 32))
model.add(SimpleRNN(16))
model.add(Dense(1, activation='sigmoid'))  # single probability for the binary label
model.summary()

In [None]:
# Same training setup as the baseline: RMSprop, binary cross-entropy, accuracy.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the SimpleRNN model; `history` is overwritten with this run's metrics.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

In [None]:
# Plot the learning curves of the model trained most recently.
plot_hist(history)

In [None]:
# LSTM variant: 32-dimensional embeddings fed to a 16-unit LSTM.
# The embedding's input_dim must match the tokenizer's vocabulary cap, so it
# reuses num_words instead of repeating the literal 10000.
model = Sequential()
model.add(Embedding(num_words, 32))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))  # single probability for the binary label
model.summary()

In [None]:
# Same training setup as the earlier models: RMSprop, binary cross-entropy, accuracy.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the LSTM model with a larger batch (64) for 5 epochs.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_val, y_val),
)

In [None]:
# Rebuild the LSTM model from scratch (fresh weights) and train it for a
# shorter schedule: batch size 32, 3 epochs.
# The embedding's input_dim must match the tokenizer's vocabulary cap, so it
# reuses num_words instead of repeating the literal 10000.
model = Sequential()
model.add(Embedding(num_words, 32))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))  # single probability for the binary label
model.summary()

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, batch_size=32, epochs=3, validation_data=(X_val, y_val))

In [None]:
# Encode and pad the held-out reviews the same way as the training data.
# A dedicated name is used so the training `sequences` are not overwritten;
# otherwise re-running the training pad_sequences cell would silently pad
# test reviews against training labels.
test_sequences = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(test_sequences, maxlen=maxlen)
y_test = np.asarray(test_labels)

# Evaluates whichever model was built last; reports the loss followed by the
# compiled metrics (accuracy here).
model.evaluate(X_test, y_test, verbose=0)