In [3]:
# Imports

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from collections import Counter
from string import punctuation


In [4]:
# Read the raw review texts and the raw labels from disk

with open("reviews.txt", mode="r") as reviews_file:
    reviews = reviews_file.read()

with open("labels.txt", mode="r") as labels_file:
    labels = labels_file.read()

In [5]:
# Text preprocessing: strip punctuation, split into reviews, tokenize

# Delete every character listed in string.punctuation in a single pass.
full_text = reviews.translate(str.maketrans("", "", punctuation))

# Each line of the cleaned text is treated as one review.
reviews_split = full_text.split("\n")

# Re-join with spaces so that a whitespace split yields the token stream
# of the whole corpus.
full_text = " ".join(reviews_split)
words = full_text.split()

In [6]:
# Build the vocabulary and the word -> integer lookup table

# Number of occurrences of each token in the corpus.
counts = Counter(words)

# Most frequent words first; the sort is stable, so words with equal counts
# keep the order in which they were first seen.
ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
vocab = [word for word, _ in ranked]

# Ids start at 1, so 0 is never assigned to a word.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

In [7]:
# Encode every review as the list of integer ids of its words

reviews_ints = []
for review in reviews_split:
    encoded = [vocab_to_int[token] for token in review.split()]
    reviews_ints.append(encoded)

In [8]:
# Turn the raw label text into a 0/1 integer array

# One label per line of the labels file.
labels_split = labels.split("\n")

# "positive" -> 1, anything else (including an empty line) -> 0.
is_positive = [label == "positive" for label in labels_split]
labels = np.array(is_positive, dtype=int)

In [9]:
# Filtering reviews (length 0), keeping labels aligned with the reviews

# Positions of the reviews that contain at least one token.
keep_idx = [idx for idx, review in enumerate(reviews_ints) if len(review) > 0]

# Select reviews and labels by the SAME positions. Merely truncating the
# label array to the new length only stays correct when every empty review
# sits at the very end; an empty review anywhere else would shift all later
# labels onto the wrong review.
reviews_ints = [reviews_ints[idx] for idx in keep_idx]
labels = labels[keep_idx]

# Every remaining review must have exactly one label.
assert len(labels) == len(reviews_ints), "reviews and labels are out of sync"

In [10]:
# Bring every encoded review to the same fixed length

seq_len = 200

# Shorter reviews are padded at the end, longer ones are cut at the end,
# so each row of `features` holds exactly `seq_len` ids.
features = pad_sequences(
    reviews_ints,
    maxlen=seq_len,
    truncating="post",
    padding="post",
)

In [11]:
# Split the data into train, validation and test sets
# NOTE(review): the split is positional (no shuffling) — confirm that the
# input files are not ordered by class.

split_frac = 0.7

# First `split_frac` of the rows go to training ...
split_idx = int(len(features) * split_frac)
# ... and the remainder is halved between validation and test.
test_idx = int((len(features) - split_idx) * 0.5)
val_end = split_idx + test_idx

train_x, train_y = features[:split_idx], labels[:split_idx]
val_x, val_y = features[split_idx:val_end], labels[split_idx:val_end]
test_x, test_y = features[val_end:], labels[val_end:]

In [12]:
# Model construction

embed_size = 300   # dimensionality of each word vector
lstm_size = 256    # units per LSTM layer

model = Sequential([
    # Explicit input spec: one row of `seq_len` word ids per review.
    # This replaces the `input_length` argument of Embedding, which is
    # deprecated in Keras 3, while keeping the static input shape.
    tf.keras.Input(shape=(seq_len,), dtype="int32"),
    # +1 because word ids start at 1 and id 0 is the padding value.
    Embedding(input_dim=len(vocab) + 1, output_dim=embed_size),
    # Convolution over windows of 3 consecutive word vectors, then
    # pooling with window 2 to shorten the sequence fed to the LSTMs.
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    # Recurrent stack; both layers emit the full sequence so it can be
    # reduced by the global max pooling below.
    Bidirectional(LSTM(lstm_size, return_sequences=True)),
    Dropout(0.5),
    LSTM(lstm_size, return_sequences=True),
    # Collapse the time axis: keep the maximum of each feature.
    GlobalMaxPooling1D(),
    # Dense classifier head with dropout after every hidden layer.
    Dense(128, activation="relu"),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dropout(0.5),
    Dense(32, activation="relu"),
    Dropout(0.5),
    # Single sigmoid unit for the binary label.
    Dense(1, activation="sigmoid")
])



In [13]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric

model.compile(
    loss="binary_crossentropy",
    metrics=["accuracy"],
    optimizer=Adam(learning_rate=0.001),
)

In [14]:
# Fit the model on the training split, monitoring the validation split

batch_size = 500
epochs = 15

# `history` keeps the object returned by fit() for later inspection.
history = model.fit(
    x=train_x,
    y=train_y,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(val_x, val_y),
)

Epoch 1/15
35/35 ━━━━━━━━━━━━━━━━━━━━ 237s 7s/step - accuracy: 0.5005 - loss: 0.6940 - val_accuracy: 0.5491 - val_loss: 0.6913
Epoch 2/15
35/35 ━━━━━━━━━━━━━━━━━━━━ 210s 6s/step - accuracy: 0.5259 - loss: 0.6956 - val_accuracy: 0.5189 - val_loss: 0.6868
Epoch 3/15
 6/35 ━━━━━━━━━━━━━━━━━━━━ 2:34 5s/step - accuracy: 0.5571 - loss: 0.6770

In [15]:
# Evaluate the trained model on the held-out test split

# With one compiled metric, evaluate() yields the loss followed by accuracy.
test_metrics = model.evaluate(
    x=test_x,
    y=test_y,
    batch_size=batch_size,
    verbose=1,
)
test_loss, test_acc = test_metrics

print(f"Test accuracy: {test_acc:.3f}")

8/8 ━━━━━━━━━━━━━━━━━━━━ 16s 2s/step - accuracy: 0.7670 - loss: 0.9983
Test accuracy: 0.764
