In [None]:
import os

import numpy as np

from utils.Tokenizer import Tokenizer

# Locations of the raw IMDB dataset and the GloVe vectors, given relative to
# the current working directory.
imdb_dir = './resources/aclImdb'
glove_dir = './resources/glove.6B'

# Seed NumPy's global RNG so the np.random calls in later cells are repeatable.
seed = 123  # Random seed
np.random.seed(seed)

In [None]:
# Load the raw IMDB training reviews and derive their sentiment labels.
# Reviews are read from <imdb_dir>/train/{neg,pos}/*.txt; every file in 'neg'
# is labelled 0 and every file in 'pos' is labelled 1.

train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir returns entries in arbitrary order; sort them so the sample
    # order (and therefore the seeded shuffle applied later) is reproducible.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, which differs between operating systems.
        with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
            texts.append(f.read())
        labels.append(0 if label_type == 'neg' else 1)

In [None]:
# Tokenize the raw IMDB texts with the project's Tokenizer (utils.Tokenizer)
# and convert every review into a sequence.
# NOTE(review): how Tokenizer caps the vocabulary and how texts2seqs pads or
# truncates is defined in utils.Tokenizer, which is not visible here — confirm
# against that module.

maxlen = 100  # Passed as seq_len to texts2seqs; presumably the length of each output sequence
max_words = 10000  # Passed to Tokenizer(); presumably the vocabulary size limit

tokenizer = Tokenizer(max_words)
tokenizer.fit_on_texts(texts)
seqs = tokenizer.texts2seqs(texts, seq_len=maxlen)

# Word -> index mapping exposed by the tokenizer; len() of it is reported below.
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

In [None]:
# Shuffle the samples, then carve out the training and validation subsets.

training_samples = 200  # Thanks to GloVe, fewer samples are enough
validation_samples = 10000

data = np.asarray(seqs)
labels = np.asarray(labels)
print(f'Shape of data tensor: {data.shape}')
print(f'Shape of labels tensor: {labels.shape}')

# Apply one shared random permutation so each sequence stays aligned with
# its label after shuffling.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data, labels = data[indices], labels[indices]

# The first `training_samples` rows form the training set; the next
# `validation_samples` rows form the validation set.
train_part = slice(None, training_samples)
val_part = slice(training_samples, training_samples + validation_samples)
x_train, y_train = data[train_part], labels[train_part]
x_val, y_val = data[val_part], labels[val_part]

print(f'{x_train[0]=}')

In [None]:
# Parse the GloVe word-embeddings file into a {word: vector} lookup.
# Each non-empty line is split on whitespace: the first field is the word and
# the remaining fields are parsed as float32 vector components.

embedding_index = {}
# Open with an explicit UTF-8 encoding so decoding does not depend on the
# platform's default codec (the vocabulary is not limited to ASCII words).
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of failing
            # with IndexError on values[0].
            continue
        word = values[0]
        embedding_vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = embedding_vector
print(f'Found {len(embedding_index)} word vectors.')

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Rows stay all-zero for words without a GloVe vector.

embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    # word_index is not guaranteed to be ordered by index, so out-of-range
    # entries are skipped one by one rather than stopping at the first one.
    # Indices >= max_words have no row in the matrix; writing to them would
    # raise an IndexError.
    if i < max_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

print(f'Shape of embedding matrix: {embedding_matrix.shape}')

In [None]:
# Model definition

