In [1]:

import numpy as np
import pandas as pd
import random

import pickle

import re
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

!pip install contractions
from contractions import contractions_dict
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter, defaultdict

import torch
from torch.utils.data import Dataset, Subset, DataLoader
import torch.nn.functional as F

from gensim.models import Word2Vec, KeyedVectors



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jmanu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jmanu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jmanu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jmanu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




# Text Processing

In [None]:
# Sample text to test on the main functions, each sentence is a special case
sample_texts = ["Capital LeTTers arounD",
                "contraction's didn't you're won't can't",
                "carch3 @ @keep num#ber keep! did%nt (keep)  ^^^^keep u.s.",
                "123 adios23 d2do5 3not",
                "you them a are not"]

In [2]:
# Convert to lowercase
def lowercase_text(raw_text):
    lowercase_text = [text.lower() for text in raw_text]
    return lowercase_text

In [None]:
# Test lower case
lowercase_text(sample_texts)

NameError: name 'sample_texts' is not defined

In [3]:
# Expand contractions
def expand_contractions(text):
    cleaned_text = []
    for sentence in text:
        expanded_text = [contractions_dict.get(word, word) for word in sentence.split()]
        cleaned_text.append(' '.join(expanded_text))
    return cleaned_text

In [None]:
# Test contraction expand
expand_contractions(sample_texts)

['capital letters around',
 "contraction's did not you are will not cannot",
 'carch3 @ @keep num#ber keep! did%nt (keep) ^^^^keep',
 '123 adios23 d2do5 3not',
 'you them a are not']

In [6]:
# Remove characters from the text and words with characters in them
# It keeps words with characters at the beggining or at the end
def remove_characters(raw_text):
    cleaned_text = []
    for text in raw_text:
        new_words = []
        words = text.split()
        for word in words:
            # Check if special characters are present in the word
            if re.search(r'[^a-zA-Z0-9]', word):
                # Check if they are at the start or end
                if word[0].isalnum() and word[-1].isalnum():
                    continue  # Special character inside the word
                else:
                    # Remove special characters from start and end
                    word = re.sub(r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$', '', word)
            new_words.append(word)
        cleaned_text.append(' '.join(new_words))
    return cleaned_text


In [None]:
# Test the function
remove_characters(sample_texts)

['Capital LeTTers arounD',
 '',
 'carch3  keep keep keep keep u.s',
 '123 adios23 d2do5 3not',
 'you them a are not']

In [7]:
# Remove numbers or words with numbers
def remove_numbers(raw_text):
    cleaned_text =  [' '.join(re.sub(r'\b\w*\d\w*\b', '', text).split()) for text in raw_text]
    return cleaned_text

In [None]:
# Test the function
remove_numbers(sample_texts)

['Capital LeTTers arounD',
 "contraction's didn't you're won't can't",
 '@ @keep num#ber keep! did%nt (keep) ^^^^keep u.s.',
 '',
 'you them a are not']

In [8]:
# Remove Stop Words
def remove_stopwords(raw_text):
    stop_words = set(stopwords.words('english'))
    cleaned_text = []

    for sentence in raw_text:
        expanded_text = [word for word in sentence.split() if word not in stop_words]
        cleaned_text.append(' '.join(expanded_text))
    return cleaned_text

In [None]:
# Sample remove stopwords
remove_stopwords(sample_texts)

['Capital LeTTers arounD',
 "contraction's can't",
 'carch3 @ @keep num#ber keep! did%nt (keep) ^^^^keep u.s.',
 '123 adios23 d2do5 3not',
 '']

In [9]:
# Now we are going to do lemmantization, before this, we want to understand the word context on sentence
# We want to know if the word represents: adjetive, adverbe, verb, noun

# Convert the part-of-speech naming scheme from the treebank to the wordnet format
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Lemantaze with a word tag: adjetive, adverbe, verb, noun
def lemmatize_sentence(tagged_tokens):
    lemmatizer = WordNetLemmatizer()
    finish_text = []

    for sentence in tagged_tokens:
        lemmatized_sentence = []

        for word, tag in sentence:
            wntag = get_wordnet_pos(tag)
            lemmatized_sentence.append(lemmatizer.lemmatize(word, pos=wntag))

        finish_text.append(lemmatized_sentence)
    return finish_text

In [10]:
# Build a function that will do each process
# The function will return the process text, with tokenized words on each sentence
def clean_text(text):
    lowr_text = lowercase_text(text)
    contraction_text = expand_contractions(lowr_text)
    char_text = remove_characters(contraction_text)
    number_text = remove_numbers(char_text)
    stopw_text = remove_stopwords(number_text)
    tag_tokenized_text = [pos_tag(word_tokenize(text)) for text in stopw_text]
    final_text = lemmatize_sentence(tag_tokenized_text)
    return final_text


In [None]:
# Test on sample text
clean_text(sample_texts)

[['capital', 'letter', 'around'],
 ['can', 'not'],
 ['keep', 'keep', 'keep', 'keep', 'u.s'],
 [],
 []]

# Prepare Data

In [None]:
# Mount to Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Connect to the folder where I have the code
import os
!pwd
os.chdir('/content/drive/MyDrive/UNISA - Machine Learning/Project')
!pwd

/content
/content/drive/MyDrive/UNISA - Machine Learning/Project


In [11]:
# Import the data I will use
df_train = pd.read_csv('./Genre Classification Dataset/train_data.txt',sep=' ::: ',header=None,engine='python', names=['Title','Genre','Description'])
df_test = pd.read_csv('./Genre Classification Dataset/test_data_solution.txt',sep=' ::: ',header=None,engine='python', names=['Title','Genre','Description'])

In [12]:
# look at the sapes
print("Train:", df_train.shape)
print("Test:", df_test.shape)

Train: (54214, 3)
Test: (54200, 3)


In [13]:
# I'm going to merge them and do the splits myself
df = pd.concat([df_train, df_test])

In [14]:
# Get the top 10 most used movie genres
df['Genre'].value_counts()[0:10].index

Index(['drama', 'documentary', 'comedy', 'short', 'horror', 'thriller',
       'action', 'western', 'reality-tv', 'family'],
      dtype='object')

In [15]:
# Is a really unbalanced dataset, I will work with the top 10 classes and make an even split, I will take off short and keep adventure instead.
df['Genre'].value_counts()

drama          27225
documentary    26192
comedy         14893
short          10145
horror          4408
thriller        3181
action          2629
western         2064
reality-tv      1767
family          1567
adventure       1550
music           1462
romance         1344
sci-fi          1293
adult           1180
crime           1010
animation        996
sport            863
talk-show        782
fantasy          645
mystery          637
musical          553
biography        529
history          486
game-show        387
news             362
war              264
Name: Genre, dtype: int64

In [16]:
# Create list for the genres I will use to clasiffy
genres = ['drama', 'documentary', 'comedy', 'adventure', 'horror', 'thriller',
       'action', 'western', 'reality-tv', 'family']

In [17]:
# Short the dataframe for the top 10 most used genres
df = df[df["Genre"].isin(genres)]
df = df.reset_index(drop=True)

In [18]:
# Now I will balance the dataset to 1550 samples
# Create a new df
df_balanced= pd.DataFrame()

# Loop through each genre and sample 1550 rows
for genre in df['Genre'].unique():
    genre_subset = df[df['Genre'] == genre]
    samples = genre_subset.sample(n=min(len(genre_subset), 1550), random_state=42)
    df_balanced = pd.concat([df_balanced, samples])

# Reset the index of the new DataFrame
df_balanced = df_balanced.reset_index(drop=True)

In [19]:
# Check if it worked
df_balanced["Genre"].value_counts()

drama          1550
thriller       1550
documentary    1550
comedy         1550
reality-tv     1550
horror         1550
action         1550
adventure      1550
western        1550
family         1550
Name: Genre, dtype: int64

In [20]:
# Get the descriptions
corpus = df_balanced[["Description"]].values

In [21]:
# Clean all descriptions
process_corpus = []
for sample in corpus:
    process_corpus.append(clean_text(sample)[0])

In [26]:
# Have a look at a sample
print("Original:", df_balanced.iloc[0,2],"Length:",  len(df_balanced.iloc[0,2].split()))
print("Process:", process_corpus[0], "Length:", len(process_corpus[0]))

Original: An American Squad of GI's capture one Chinese POW and while they try to get back to their command post, they are killed one-by-one until only the POW is left. During this ordeal, the POW becomes close to his captors to the point he is heart-broken when they are killed and he is left to return home to his wife and baby. Length: 62
Process: ['american', 'squad', 'capture', 'one', 'chinese', 'pow', 'try', 'get', 'back', 'command', 'post', 'kill', 'pow', 'leave', 'ordeal', 'pow', 'become', 'close', 'captor', 'point', 'kill', 'leave', 'return', 'home', 'wife', 'baby'] Length: 26


In [None]:
# Look at average description word length per genre
movies = {}
for genre in genres:
    total = 0
    for i in df_balanced[df_balanced["Genre"] == genre].index:
        total += len(process_corpus[i])
    movies[genre] = round(total/1550, 0)

# Print
movies

{'drama': 58.0,
 'documentary': 58.0,
 'comedy': 52.0,
 'adventure': 50.0,
 'horror': 57.0,
 'thriller': 53.0,
 'action': 53.0,
 'western': 66.0,
 'reality-tv': 49.0,
 'family': 53.0}

In [27]:
# Now we can do the splits for the data
# Number of classes
num_classes = 10

# Number of samples per class
samples_per_class = 1550

# Splits for train, validate, and test
train_split = int(0.5 * samples_per_class)
val_split = int(0.3 * samples_per_class)
test_split = 1550 - train_split - val_split

# Total Samples per Class
print("Train:", train_split)
print("Val:", val_split)
print("Test:", test_split)

Train: 775
Val: 465
Test: 310


In [28]:
# Create lists to hold indices for each set
train_indices, val_indices, test_indices = [], [], []

# Set random state
rng = np.random.default_rng(seed=42)

# Split indices for each class
for class_index in range(num_classes):
    start_index = class_index * samples_per_class
    indices = list(range(start_index, start_index + samples_per_class))
    rng.shuffle(indices)
    train_indices.extend(indices[:train_split])
    val_indices.extend(indices[train_split:train_split + val_split])
    test_indices.extend(indices[train_split + val_split:])

# Final lengths per class
print("Train:", len(train_indices))
print("Val:", len(val_indices))
print("Test:", len(test_indices))

Train: 7750
Val: 4650
Test: 3100


In [29]:
# Create a full vocabulary
flatten_tokens = [word for sentence in process_corpus for word in sentence]
flatten_tokens = flatten_tokens + genres
word_counts = Counter(flatten_tokens)
vocab = {word: idx for idx, (word, _) in enumerate(word_counts.items())}
idx_to_word = {idx: word for word, idx in vocab.items()}

In [30]:
len(vocab)

49732

Courpus with less words - *not used*

In [None]:
# remove non frequent words
remove_words = [word for word, count in word_counts.items() if count < 5]
len(remove_words)

35972

In [None]:
# Placeholder for unknown words
placeholder = "<UNKNOWN>"
processed_corpus = []

# Update corups
for sentence in process_corpus:
    new_sentence = [word if word not in remove_words else placeholder for word in sentence]
    processed_corpus.append(new_sentence)


In [None]:
with open('./processed_corpus.pkl', 'wb') as f:
    pickle.dump(processed_corpus, f)

In [None]:
with open('my_list.pkl', 'rb') as f:
    processed_corpus = pickle.load(f)

In [None]:
# Update vocabulary
flatten_tokens = [word for sentence in processed_corpus for word in sentence]
flatten_tokens = flatten_tokens + genres
word_counts = Counter(flatten_tokens)
vocab = {word: idx for idx, (word, _) in enumerate(word_counts.items())}
idx_to_word = {idx: word for word, idx in vocab.items()}

In [None]:
# Total words in new vocabulary
len(vocab)

13762

Training Corpus - *Currently full corpus for traing the word2vec*

In [None]:
# Creating a training corpus with the training indices
train_corpus = [process_corpus[index] for index in train_indices]

# Word2vec

1.- Gensim - Word2vec

In [31]:
# Create the Word2Vec model
word2vec_g = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)

In [32]:
# Train the model on our corpus
# Model trains fast, no need to send to device
word2vec_g.build_vocab(process_corpus)
word2vec_g.train(process_corpus, total_examples=word2vec_g.corpus_count, epochs=20)


KeyboardInterrupt: 

In [35]:
# Find words similar to 'american'
similar_words = word2vec_g.wv.most_similar('american')
print(similar_words)

[('african', 0.583674430847168), ('floridian', 0.5626319050788879), ('korean', 0.5395526885986328), ('expat', 0.5301021933555603), ('british', 0.5290839076042175), ('giannini', 0.5182136297225952), ('arab', 0.517073392868042), ('america', 0.5146300196647644), ('aboriginal', 0.5141898989677429), ('asian', 0.5075013637542725)]


In [34]:
# Look at the vector representation of the word
word2vec_g.wv['american']

array([ 0.6352159 ,  0.22406244, -1.5482981 , -1.0203586 ,  1.0783644 ,
        0.38738745,  1.308556  , -2.4453192 , -1.4578745 , -1.1502886 ,
       -0.17273381, -2.8118844 ,  1.3380165 , -3.726514  , -1.4046497 ,
       -2.1051283 , -1.6193353 ,  0.29824474,  0.37169066, -0.46930778,
       -0.9469707 ,  1.7773088 ,  0.05977992, -1.1591473 , -1.5467453 ,
        1.2519972 , -0.9482009 , -0.64813644, -2.8115933 , -0.5376743 ,
        1.3203669 ,  0.45369166, -0.96778727, -0.21026962,  0.5115575 ,
       -0.3134072 , -0.8402765 , -0.36334297,  0.8413578 , -1.6149932 ,
        0.25404006, -0.29458848,  0.7217553 ,  2.1788254 ,  0.19984618,
        0.61873984,  0.75563824,  0.05571314, -0.43087366, -2.1158206 ,
       -0.07347316, -1.5492835 ,  1.30679   , -3.2067845 , -2.6090105 ,
        2.7399142 ,  0.297596  ,  0.5840321 , -3.9002209 , -0.503715  ,
       -0.6403259 , -0.5064867 ,  0.19869536, -1.7935792 , -0.14431955,
        0.9113393 , -0.29766744, -0.5748825 , -0.49926427,  0.95

In [None]:
# Comparing the sizes
print("Vocabulary:", len(vocab))
print("Model Voc:", len(word2vec_g.wv.key_to_index))

Vocabulary: 49732
Model Voc: 49732


In [None]:
# Now I will create a word to vector dictonary
word_vector_word2vec_g = {word: torch.tensor(word2vec_g.wv[word], dtype=torch.float64) for word in word2vec_g.wv.key_to_index}


In [None]:
# Let's look at a vector
word_vector_word2vec_g['american']

tensor([-0.0017,  0.0517, -0.0601, -0.1193,  0.0810, -0.0496,  0.1552, -0.1221,
        -0.1558, -0.1227, -0.0630, -0.2072,  0.0954, -0.1832, -0.0859, -0.0925,
        -0.0982,  0.0189,  0.0013, -0.0910, -0.1946,  0.0706,  0.0363, -0.0395,
        -0.0470,  0.1158, -0.0036, -0.0930, -0.2082, -0.0980,  0.1134, -0.0618,
         0.0077,  0.0110,  0.0144, -0.0236, -0.0692, -0.0868,  0.0715, -0.1377,
         0.0241, -0.0865,  0.1088,  0.1833,  0.0884,  0.1197,  0.0771, -0.0391,
        -0.0354, -0.1800,  0.0297, -0.0776,  0.0789, -0.1959, -0.1693,  0.2475,
         0.0293, -0.0040, -0.1872, -0.0751,  0.0331,  0.0013,  0.0260,  0.0571,
         0.0355,  0.0668,  0.0119, -0.0027, -0.0550,  0.0474, -0.1716,  0.0815,
         0.0351, -0.0857,  0.0347,  0.1434, -0.0383,  0.0140, -0.0289,  0.1766,
        -0.0040, -0.0828, -0.2161,  0.0448,  0.0115,  0.1626,  0.0422, -0.1833,
         0.0167,  0.1673, -0.0160, -0.0841, -0.0659, -0.0414, -0.0339, -0.0357,
        -0.0113,  0.0326,  0.0860,  0.07

In [None]:
# Save the embeddings
with open('./word_vector_word2vec_g.pkl', 'wb') as f:
    pickle.dump(word_vector_word2vec_g, f)

In [33]:
# Load embeddings
with open('./word_vector_word2vec_g.pkl', 'rb') as f:
    word_vector_word2vec_g = pickle.load(f)

2.- Manuel - Word2vec

SKIPGRAM

In [None]:
# Get a training data
window_size = 2
training = []

for sentence in process_corpus:
    indices = [vocab[word] for word in sentence]
    for i in range(len(indices)):
        target = indices[i]
        context = indices[max(i - window_size, 0) : i] + \
                  indices[i + 1 : i + window_size + 1]
        for ctx_word in context:
            training.append((target, ctx_word))

# Shuffle the training pairs
random.shuffle(training)


In [None]:
# Size of training data
len(training)

3320216

Minimise Training Data - *Not used*

In [None]:
training_clean = []

# Dictionary to keep track of word occurrences
word_occurrences = defaultdict(int)

for target, ctx_word in training:
    if word_occurrences[target] < 500:
        training_clean.append((target, ctx_word))
        word_occurrences[target] += 1

In [None]:
# Final lenght of the data
len(training_clean)

1902373

In [None]:
# Create a costum dataset for our project
class Test_skipgram(Dataset):
    def __init__(self, training):
        self.training = training

    def __len__(self):
        return len(self.training)

    def __getitem__(self, idx):
        input = torch.tensor(self.training[idx][0])
        output = torch.tensor(self.training[idx][1])
        return input, output

In [None]:
# Create a dataset
train_dataset = Test_skipgram(training)

In [None]:
# Create a batch loader
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)

In [36]:
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim = 100):
        super(SkipGramModel, self).__init__()
        # Embeding
        self.embeddings = nn.Embedding(
            num_embeddings = vocab_size,
            embedding_dim = embedding_dim)

        # Linear
        self.linear = nn.Linear(
            in_features = embedding_dim,
            out_features = vocab_size)

    def forward(self, input_word):
        embed = self.embeddings(input_word)
        out = self.linear(embed)
        return out

In [37]:
# configure device, use 'cpu' if cuda is not available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cpu


In [38]:
# Create the model and move it to the GPU if available
wrod2vec_skp = SkipGramModel(vocab_size = len(vocab)).to(device)

In [39]:
# Calculate the total number of parameters
total_params = sum(p.numel() for p in wrod2vec_skp.parameters())

# Calculate the number of trainable parameters
trainable_params = sum(p.numel() for p in wrod2vec_skp.parameters() if p.requires_grad)

print(f"Total Parameters: {total_params}")
print(f"Trainable Parameters: {trainable_params}")

Total Parameters: 9996132
Trainable Parameters: 9996132


In [None]:
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(wrod2vec_skp.parameters(), lr=0.01, weight_decay=5e-4)

In [None]:
# Define number of epochs
num_epochs = 10

# Training loop
for epoch in tqdm(range(num_epochs), desc="Training"):
    # Training parametres for each epoch
    wrod2vec_skp.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    # Loop for each batch of the training
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = wrod2vec_skp(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Calculate training accuracy for the current batch
        _, predicted = torch.max(outputs, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    # Update the learning rate using the scheduler
    #scheduler.step()

    # Calculate training accuracy for the entire epoch
    train_accuracy = 100 * correct_train / total_train

    # Calculate learning rate for the epoch
    #current_lr = scheduler.get_last_lr()[0]

    # # Validation phase
    # model.eval()
    # val_loss = 0.0
    # correct_val = 0
    # total_val = 0

    # # Disable gradient calculation
    # with torch.no_grad():
    #     for inputs, labels in val_loader:
    #         inputs, labels = inputs.to(device), labels.to(device)
    #         outputs = model(inputs)
    #         loss = criterion(outputs, labels)
    #         val_loss += loss.item()

    #         _, predicted = torch.max(outputs, 1)
    #         total_val += labels.size(0)
    #         correct_val += (predicted == labels).sum().item()

    # # Calculate validation statistics
    # val_accuracy = 100 * correct_val / total_val

    # Save the model every five epochs
    if (epoch + 1) % 5 == 0:
        model_checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': wrod2vec_skp.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': running_loss
        }
        model_save_path = f'wrod2vec_skp_checkpoint_epoch_{epoch + 1}.pt'
        torch.save(model_checkpoint, model_save_path)

    # Print training and validation statistics
    tqdm.write(f'\nEpoch {epoch + 1}/{num_epochs}, '
               #f'Learning Rate: {current_lr:.4f}\n'
               f'Training Loss: {running_loss/len(train_loader):.4f}, '
               f'Training Accuracy: {train_accuracy:.2f}%\n'
              # f'Validation Loss: {val_loss/len(val_loader):.4f}, '
              # f'Validation Accuracy: {val_accuracy:.2f}%\n\n'
               )



print('Finished Training')

Training:  10%|█         | 1/10 [02:17<20:35, 137.26s/it]


Epoch 1/10, Training Loss: 10.8329, Training Accuracy: 0.19%



Training:  20%|██        | 2/10 [04:31<18:04, 135.61s/it]


Epoch 2/10, Training Loss: 10.5761, Training Accuracy: 0.76%



Training:  30%|███       | 3/10 [06:47<15:48, 135.48s/it]


Epoch 3/10, Training Loss: 10.3837, Training Accuracy: 0.84%



Training:  40%|████      | 4/10 [09:01<13:30, 135.16s/it]


Epoch 4/10, Training Loss: 10.2397, Training Accuracy: 0.85%



Training:  50%|█████     | 5/10 [11:18<11:18, 135.71s/it]


Epoch 5/10, Training Loss: 10.1321, Training Accuracy: 0.86%



Training:  60%|██████    | 6/10 [13:33<09:02, 135.52s/it]


Epoch 6/10, Training Loss: 10.0509, Training Accuracy: 0.86%



Training:  70%|███████   | 7/10 [15:47<06:45, 135.16s/it]


Epoch 7/10, Training Loss: 9.9888, Training Accuracy: 0.84%



Training:  80%|████████  | 8/10 [18:11<04:35, 137.85s/it]


Epoch 8/10, Training Loss: 9.9405, Training Accuracy: 0.83%



Training:  90%|█████████ | 9/10 [20:39<02:21, 141.03s/it]


Epoch 9/10, Training Loss: 9.9025, Training Accuracy: 0.82%



Training: 100%|██████████| 10/10 [22:54<00:00, 137.50s/it]


Epoch 10/10, Training Loss: 9.8722, Training Accuracy: 0.81%

Finished Training





In [42]:
# Load the saved parametres
checkpoint = torch.load('./wrod2vec_skp_checkpoint_epoch_10.pt', map_location= device)

# Load state dict into the model
wrod2vec_skp.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [None]:
def find_closest_words(model, word, vocab, idx_to_word, device, n=10):
    if word not in vocab:
        return "Word not in vocabulary."

    # Get the index of the word and its embedding
    word_idx = vocab[word]
    word_embedding = model.embeddings(torch.tensor([word_idx], device=device)).detach()

    # Move all embeddings to the same device as word_embedding
    all_embeddings = model.embeddings.weight.data.to(device)

    # Compute cosine similarities
    cos_similarities = torch.matmul(all_embeddings, word_embedding.t()).squeeze()

    # Move the tensor to CPU before converting to NumPy array
    top_n_indices = cos_similarities.cpu().argsort(descending=True).numpy().tolist()
    closest_words = [idx_to_word[idx] for idx in top_n_indices if idx != word_idx][:n]

    return closest_words


In [None]:
# Test on word similarity
chosen_word = 'american'
closest_words = find_closest_words(wrod2vec_skp, chosen_word, vocab, idx_to_word, device, n=10)
print(f"Closest words to '{chosen_word}': {closest_words}")


Closest words to 'american': ['taekwondo', 'unknowingly', 'hamid', 'kazim', 'guaritore', 'flattering', 'goa', 'superstition', 'strada', 'malia']


In [None]:
# Now we create a dictionary for the word and embedding
word_vector_wrod2vec_skp = {}

for word in vocab:
  word_num = vocab[word]
  word_vector_wrod2vec_skp[word] = wrod2vec_skp.embeddings(torch.tensor(word_num, device = device))


In [None]:
# Print size
len(word_vector_wrod2vec_skp)

49732

In [43]:
# Save the embeddings
with open('./word_vector_wrod2vec_skp.pkl', 'wb') as f:
    pickle.dump(word_vector_wrod2vec_skp, f)

NameError: name 'word_vector_wrod2vec_skp' is not defined

In [56]:
# Load embeddings
word_vector_wrod2vec_skp = {}

with open('./word_vector_wrod2vec_skp.pkl', 'rb') as f:
    word_vector_word2vec_skp = torch.load(f, map_location=torch.device('cpu'))

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

 3. Manuel - CBOW *I won't be using this model*

In [None]:
# Create a training data for the CBOW only taking in cosideration when is 4 context words for a target
def create_cbow_training_data(processed_corpus, vocab, window_size=2):
    batch_input, batch_output = [], []

    for sentence in processed_corpus:
        sentence_ids = [vocab[word] for word in sentence if word in vocab]
        if len(sentence_ids) < window_size * 2 + 1:
            continue

        for idx in range(window_size, len(sentence_ids) - window_size):
            # Get the context words
            context = sentence_ids[idx - window_size:idx] + sentence_ids[idx + 1:idx + window_size + 1]
            # Get the target word
            target = sentence_ids[idx]
            batch_input.append(context)
            batch_output.append(target)

    return torch.tensor(batch_input, dtype=torch.long), torch.tensor(batch_output, dtype=torch.long)



In [None]:
# Example of Training data
training_CBOW = []

cbow_input, cbow_output = create_cbow_training_data(process_corpus, vocab, window_size)

print("Input (Context Words):", cbow_input)
print("Output (Target Words):", cbow_output)

Input (Context Words): tensor([[   0,    1,    3,    4],
        [   1,    2,    4,    5],
        [   2,    3,    5,    6],
        ...,
        [ 521,  173, 2231, 3495],
        [ 173, 4973, 3495, 1501],
        [4973, 2231, 1501,   85]])
Output (Target Words): tensor([   2,    3,    4,  ..., 4973, 2231, 3495])


In [None]:
# Create a costum dataset for our project
class Test_CBOW(Dataset):
    def __init__(self, input, output):
        self.input = input
        self.output = output

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        input = torch.tensor(self.input[idx])
        output = torch.tensor(self.output[idx])
        return input, output

In [None]:
# Create a dataset
train_CBOW = Test_CBOW(cbow_input, cbow_output)

In [None]:
# Create a batch loader
train_loader_CBOW = DataLoader(train_CBOW, batch_size=128, shuffle=True, drop_last=True)

In [None]:
EMBED_DIMENSION = 100
EMBED_MAX_NORM = 1

class CBOW_Model(nn.Module):
    def __init__(self, vocab_size: int):
        super(CBOW_Model, self).__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=EMBED_DIMENSION,
            max_norm=EMBED_MAX_NORM,
        )
        self.linear = nn.Linear(
            in_features=EMBED_DIMENSION,
            out_features=vocab_size,
        )
    def forward(self, inputs_):
        x = self.embeddings(inputs_)
        x = x.mean(axis=1)
        x = self.linear(x)
        return x

In [None]:
# Create the model and move it to the GPU if available
word2vec_CBOW = CBOW_Model(vocab_size = len(vocab)).to(device)

In [None]:
# Calculate the total number of parameters
total_params = sum(p.numel() for p in word2vec_CBOW.parameters())

# Calculate the number of trainable parameters
trainable_params = sum(p.numel() for p in word2vec_CBOW.parameters() if p.requires_grad)

print(f"Total Parameters: {total_params}")
print(f"Trainable Parameters: {trainable_params}")

Total Parameters: 9996132
Trainable Parameters: 9996132


In [None]:
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(word2vec_CBOW.parameters(), lr=0.01, weight_decay=5e-4)

In [None]:
# Define number of epochs
num_epochs = 3

# Training loop
for epoch in tqdm(range(num_epochs), desc="Training"):
    # Training parametres for each epoch
    word2vec_CBOW.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    # Loop for each batch of the training
    for inputs, labels in train_loader_CBOW:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = word2vec_CBOW(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Calculate training accuracy for the current batch
        _, predicted = torch.max(outputs, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    # Update the learning rate using the scheduler
    #scheduler.step()

    # Calculate training accuracy for the entire epoch
    train_accuracy = 100 * correct_train / total_train

    # Calculate learning rate for the epoch
    #current_lr = scheduler.get_last_lr()[0]

    # # Validation phase
    # model.eval()
    # val_loss = 0.0
    # correct_val = 0
    # total_val = 0

    # # Disable gradient calculation
    # with torch.no_grad():
    #     for inputs, labels in val_loader:
    #         inputs, labels = inputs.to(device), labels.to(device)
    #         outputs = model(inputs)
    #         loss = criterion(outputs, labels)
    #         val_loss += loss.item()

    #         _, predicted = torch.max(outputs, 1)
    #         total_val += labels.size(0)
    #         correct_val += (predicted == labels).sum().item()

    # # Calculate validation statistics
    # val_accuracy = 100 * correct_val / total_val

    # Save the model every five epochs
    if (epoch + 1) % 3 == 0:
        model_checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': word2vec_CBOW.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': running_loss
        }
        model_save_path = f'word2vec_CBOW_checkpoint_epoch_{epoch + 1}.pt'
        torch.save(model_checkpoint, model_save_path)

    # Print training and validation statistics
    tqdm.write(f'\nEpoch {epoch + 1}/{num_epochs}, '
               #f'Learning Rate: {current_lr:.4f}\n'
               f'Training Loss: {running_loss/len(train_loader):.4f}, '
               f'Training Accuracy: {train_accuracy:.2f}%\n'
              # f'Validation Loss: {val_loss/len(val_loader):.4f}, '
              # f'Validation Accuracy: {val_accuracy:.2f}%\n\n'
               )



print('Finished Training')

  input = torch.tensor(self.input[idx])
  output = torch.tensor(self.output[idx])
Training:  33%|███▎      | 1/3 [00:42<01:24, 42.19s/it]


Epoch 1/3, Training Loss: 2.5726, Training Accuracy: 0.40%



Training:  67%|██████▋   | 2/3 [01:28<00:44, 44.68s/it]


Epoch 2/3, Training Loss: 2.5617, Training Accuracy: 0.59%



Training: 100%|██████████| 3/3 [02:14<00:00, 44.97s/it]


Epoch 3/3, Training Loss: 2.5512, Training Accuracy: 0.59%

Finished Training





In [None]:
# Load the saved parametres
checkpoint = torch.load('./word2vec_CBOW_checkpoint_epoch_3.pt')

# Load state dict into the model
word2vec_CBOW.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [None]:
# Test on word similarity
chosen_word = 'american'
closest_words = find_closest_words(word2vec_CBOW, chosen_word, vocab, idx_to_word, device, n=10)
print(f"Closest words to '{chosen_word}': {closest_words}")

Closest words to 'american': ['thinking', 'depopulate', 'lithuania', 'memories', 'cardplayer', 'geetas', 'babygirl', 'abhishek', 'circumstance', 'ouija']


In [None]:
# Now we create a dictionary for the word and embedding
word_vector_word2vec_CBOW = {}

for word in vocab:
  word_num = vocab[word]
  word_vector_word2vec_CBOW[word] = word2vec_CBOW.embeddings(torch.tensor(word_num, device = device))

In [None]:
# Print size
len(word_vector_word2vec_CBOW)

49732

In [None]:
# Save the embeddings
with open('./word_vector_word2vec_CBOW.pkl', 'wb') as f:
    pickle.dump(word_vector_word2vec_CBOW, f)

In [57]:
# Load embeddings
with open('./word_vector_word2vec_CBOW.pkl', 'rb') as f:
    word_vector_word2vec_CBOW = pickle.load(f)

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

4 - Pre-Trained word2vec

Glove - 100

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = './glove.6B.100d.txt'
word2vec_output_file = './glove.6B.100d.word2vec.txt'

glove2word2vec(glove_input_file, word2vec_output_file)

  glove2word2vec(glove_input_file, word2vec_output_file)


(400000, 100)

In [None]:
# Load the model with Gensim
model_glove = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [None]:
# Initialize a new Word2Vec model
word2vec_glove_100 = Word2Vec(vector_size=100, min_count=1)

# Add corpus vocabulary
word2vec_glove_100.build_vocab(process_corpus)

# Build vocabulary from the GloVe model
word2vec_glove_100.build_vocab([list(model_glove.key_to_index.keys())], update=True)


In [None]:
# Transfer learning for words in GloVe
total_vocab = len(word2vec_glove_100.wv)
for word, index in model_glove.key_to_index.items():
    if word in word2vec_glove_100.wv.key_to_index:
        word2vec_glove_100.wv.vectors[word2vec_glove_100.wv.key_to_index[word]] = model_glove[word]

In [None]:
# Retrain model on project corpus
word2vec_glove_100.train(process_corpus, total_examples=len(process_corpus), epochs=10, word_count=1)

(8533040, 8533040)

In [None]:
# Find words similar to 'american'
similar_words = word2vec_glove_100.wv.most_similar('american')
print(similar_words)

[('america', 0.7897357940673828), ('canadian', 0.7711366415023804), ('americans', 0.7692500352859497), ('british', 0.7675219178199768), ('african', 0.7663124799728394), ('native', 0.7617824077606201), ('u.s.', 0.752103865146637), ('australian', 0.7327556610107422), ('nation', 0.7215123176574707), ('u.s', 0.715589165687561)]


In [None]:
# Now I will create a word to vector dictonary for project vocabulary
word_vector_word2vec_glove_100 = {word: torch.tensor(word2vec_glove_100.wv[word], dtype=torch.float64) for word in vocab}


In [None]:
# Let's look at a vector
word_vector_word2vec_glove_100['american']

tensor([ 0.4821,  2.1065,  2.7069,  0.0364,  1.0853, -1.8900, -0.5241, -4.1762,
        -0.9483,  0.4180, -0.8020, -1.3401,  0.9504,  1.4267, -1.7116,  1.0932,
         2.9092, -0.4862, -1.4452,  0.7147,  2.1533,  1.2674,  2.4156, -0.8985,
         1.2497, -0.5533,  0.2356, -3.6768,  1.4475, -0.0148, -0.7305,  1.3082,
        -0.0871, -0.8859,  0.2175, -0.8065, -1.3111,  2.8696, -0.0825,  2.1021,
        -3.3625, -2.0109, -0.1507,  2.1596,  1.3362, -0.3025, -0.2484, -0.1010,
        -0.4143, -3.1531, -0.7171,  0.7964,  0.8004,  1.5582,  0.5978, -6.1966,
        -1.1580, -0.8228,  7.1553,  1.3803,  1.1359,  2.3849,  0.2537, -2.0827,
         1.3523, -1.2840, -0.4707,  1.5434, -0.3084, -0.3154, -0.3006, -0.1315,
        -2.2160,  0.4938,  0.5986,  1.8397,  1.3205, -0.0694, -3.7630, -1.3379,
        -0.0533,  0.5773, -0.3365,  1.6439, -3.6063, -1.4586, -1.0522,  0.5624,
         0.4609, -1.8788,  0.3404, -1.5877, -0.4120, -0.7619, -4.0528, -0.2314,
        -2.3075, -0.4341,  0.6466,  2.64

In [None]:
# Save the embeddings
with open('./word_vector_word2vec_glove_100.pkl', 'wb') as f:
    pickle.dump(word_vector_word2vec_glove_100, f)

In [244]:
# Load embeddings
with open('./word_vector_word2vec_glove_100.pkl', 'rb') as f:
    word_vector_word2vec_glove_100 = pickle.load(f)

# RNN Classification

Create the Different Datasets for the Classification

In [490]:
# I will need to ensure that the lenghts of the descriptions are relatively similar 
totals_len = {}

# Initialize dictionary
for genre in df_balanced['Genre'].unique():
    totals_len[genre] = 0

for sentence, genre in zip(process_corpus, df_balanced['Genre']):
    if 5 <= len(sentence) <= 100:
        totals_len[genre] += 1

totals_len

{'drama': 1374,
 'thriller': 1412,
 'documentary': 1367,
 'comedy': 1435,
 'reality-tv': 1450,
 'horror': 1370,
 'action': 1413,
 'adventure': 1446,
 'western': 1259,
 'family': 1413}

In [496]:
# I want to only keep movie descriptions between 20 to 60 words
# I will need this variables
filtered_descriptions = []
filtered_genres = []
index = []
i = 0

# Initialize a dictionary to count genres
genre_counts = {}

# Iterate through each description and its corresponding genre
for description, genre in zip(process_corpus, df_balanced['Genre']):
    # Check if the length of the description is between ?
    if 5 <= len(description) <= 100:
        # Update genre count and check if less than minimum per genre
        if genre_counts.get(genre, 0) < 1259:
            filtered_descriptions.append(description)
            filtered_genres.append(genre)
            genre_counts[genre] = genre_counts.get(genre, 0) + 1
            index.append(i)
    i+=1

In [497]:
# Create a new DF that I will use for later with the values for the classification
df_RNN = df_balanced.iloc[index]
df_RNN.reset_index(drop=True)

Unnamed: 0,Title,Genre,Description
0,A Tear for My Enemy/In'gan Kwa Chonjang (1984),drama,An American Squad of GI's capture one Chinese ...
1,Second Spring (????),drama,"After a series of unusual encounters, it is ob..."
2,Hold the Sun (2009),drama,A film about our ability and inability to conn...
3,The Young One (1960),drama,Game warden Miller lives on an isolated island...
4,Beauregard (2009),drama,"1961, in French Savoie. Pierre Hautefort, a te..."
...,...,...,...
12585,Bhatukali (2014),family,You can choose your friends but not your famil...
12586,"""Shab e Zindgi"" (2014)",family,Miriam is a middle-class girl who was married ...
12587,"""Bitworld"" (2010)",family,An incredibly interactive intergalactic televi...
12588,All at Sea (1970),family,"Douglas is on an ""educational"" cruise and has ..."


In [498]:
# How many movies we now have 
genre_counts

{'drama': 1259,
 'thriller': 1259,
 'documentary': 1259,
 'comedy': 1259,
 'reality-tv': 1259,
 'horror': 1259,
 'action': 1259,
 'adventure': 1259,
 'western': 1259,
 'family': 1259}

In [499]:
# Now we can do the splits for the data
# Number of classes
num_classes = 10

# Number of samples per class
samples_per_class = 1259

# Splits for train, validate, and test
train_split = int(0.5 * samples_per_class)
val_split = int(0.3 * samples_per_class)
test_split = samples_per_class - train_split - val_split

# Total Samples per Class
print("Train:", train_split)
print("Val:", val_split)
print("Test:", test_split)

Train: 629
Val: 377
Test: 253


In [500]:
# Create lists to hold indices for each set
train_indices, val_indices, test_indices = [], [], []

# Set random state
rng = np.random.default_rng(seed=42)

# Split indices for each class
for class_index in range(num_classes):
    start_index = class_index * samples_per_class
    indices = list(range(start_index, start_index + samples_per_class))
    rng.shuffle(indices)
    train_indices.extend(indices[:train_split])
    val_indices.extend(indices[train_split:train_split + val_split])
    test_indices.extend(indices[train_split + val_split:])

# Final lengths per class
print("Train:", len(train_indices))
print("Val:", len(val_indices))
print("Test:", len(test_indices))

Train: 6290
Val: 3770
Test: 2530


In [501]:
# Have a look if they are well distributed 
print(Counter([filtered_genres[index] for index in train_indices]))
print(Counter([filtered_genres[index] for index in val_indices]))
print(Counter([filtered_genres[index] for index in test_indices]))

Counter({'drama': 629, 'thriller': 629, 'documentary': 629, 'comedy': 629, 'reality-tv': 629, 'horror': 629, 'action': 629, 'adventure': 629, 'western': 629, 'family': 629})
Counter({'drama': 377, 'thriller': 377, 'documentary': 377, 'comedy': 377, 'reality-tv': 377, 'horror': 377, 'action': 377, 'adventure': 377, 'western': 377, 'family': 377})
Counter({'drama': 253, 'thriller': 253, 'documentary': 253, 'comedy': 253, 'reality-tv': 253, 'horror': 253, 'action': 253, 'adventure': 253, 'western': 253, 'family': 253})


In [502]:
# I'm going to need to change the number of the classes from 0-9
genre_to_index = {genre: index for index, genre in enumerate(genres)}
index_to_genre = {index: genre for genre, index in genre_to_index.items()}

genre_to_index

{'drama': 0,
 'documentary': 1,
 'comedy': 2,
 'adventure': 3,
 'horror': 4,
 'thriller': 5,
 'action': 6,
 'western': 7,
 'reality-tv': 8,
 'family': 9}

In [503]:
# Create a costum dataset for our RNN, It will process the corpus according to the embedding and it will padd the sentence
class Classification_Dataset(Dataset):
    def __init__(self, corpus, genres, genre_to_index, embeddings, vocab, max_length = 25):
        self.corpus = corpus
        self.genres = genres
        self.genre_to_index = genre_to_index
        self.embeddings = embeddings
        self.vocab = vocab
        self.idx_to_word = idx_to_word
        self.max_length = max_length

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, idx):
        # Genre with new label
        genre = self.genre_to_index[self.genres[idx]]

        # Description with each word as a Vector
        sentence = self.corpus[idx]
        embedded_sentence = [self.embeddings[words].float() for words in sentence]
        #embedded_sentence = [self.embeddings[words] for words in sentence]

        # Padding 
        if len(embedded_sentence) < self.max_length:
            embedded_sentence += [torch.zeros(100) for _ in range(self.max_length - len(embedded_sentence))]
        elif len(embedded_sentence) > self.max_length:
            embedded_sentence = embedded_sentence[:self.max_length]

        # Convert to tensor
        embedded_tensor = torch.stack(embedded_sentence)

        
        return embedded_tensor, genre


In [504]:
# Add the data into a dataset
dataset = Classification_Dataset(filtered_descriptions, filtered_genres, genre_to_index, 
                                 embeddings = word_vector_word2vec_glove_100, vocab = vocab)

In [505]:
# Look at one sample 
print(dataset[0])
print(dataset[0][0].size())


(tensor([[ 0.4821,  2.1065,  2.7069,  ..., -0.4341,  0.6466,  2.6474],
        [ 0.1053,  0.3194,  0.3410,  ...,  0.2035,  0.5191, -0.0380],
        [-0.7778, -1.9939,  0.6586,  ...,  0.0871, -0.0582,  0.6966],
        ...,
        [ 0.1267,  1.0687,  1.7751,  ...,  3.3020,  0.0348, -1.7682],
        [ 1.6057,  0.7916,  3.3139,  ...,  4.3016,  2.5812, -1.3123],
        [ 1.0933, -1.1769,  0.9713,  ...,  1.0247,  1.0844, -1.9717]]), 0)
torch.Size([25, 100])


In [506]:
# Define the batch size
bs = 32

# Create Subset datasets for each split
train_dataset = Subset(dataset, train_indices)
validate_dataset = Subset(dataset, val_indices)
test_dataset = Subset(dataset, test_indices)

# Create DataLoader objects for each split, with drop_last to ensure all batches have a size of 64 (Small amount of data loss)
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, drop_last=True)
val_loader = DataLoader(validate_dataset, batch_size=bs, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=True, drop_last=True)

In [507]:
class RNNClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(RNNClassifier, self).__init__()

        self.rnn = nn.RNN(input_dim, hidden_dim, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # x = x.float()
        output, hidden = self.rnn(x)
        hidden = hidden[-1, :, :]
        out = self.fc(hidden)
        return out

In [508]:
RNN_class = RNNClassifier(100, 256, 10).to(device)

In [513]:
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(RNN_class.parameters(), lr=0.0001, weight_decay=5e-4)

In [514]:
# Define number of epochs
num_epochs = 10

# Training loop
for epoch in tqdm(range(num_epochs), desc="Training"):
    # Training parametres for each epoch
    RNN_class.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    # Loop for each batch of the training
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = RNN_class(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Calculate training accuracy for the current batch
        _, predicted = torch.max(outputs, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    # Update the learning rate using the scheduler
    #scheduler.step()

    # Calculate training accuracy for the entire epoch
    train_accuracy = 100 * correct_train / total_train

    # Calculate learning rate for the epoch
    #current_lr = scheduler.get_last_lr()[0]

    # Validation phase
    RNN_class.eval()
    val_loss = 0.0
    correct_val = 0
    total_val = 0

    # Disable gradient calculation
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = RNN_class(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    # Calculate validation statistics
    val_accuracy = 100 * correct_val / total_val

    # Save the model every five epochs
    # if (epoch + 1) % 5 == 0:
    #     model_checkpoint = {
    #         'epoch': epoch + 1,
    #         'model_state_dict': RNN_class.state_dict(),
    #         'optimizer_state_dict': optimizer.state_dict(),
    #         'loss': running_loss
    #     }
    #     model_save_path = f'RNN_gensim_checkpoint_epoch_{epoch + 1}.pt'
    #     torch.save(model_checkpoint, model_save_path)

    # Print training and validation statistics
    tqdm.write(f'\nEpoch {epoch + 1}/{num_epochs}, '
               #f'Learning Rate: {current_lr:.4f}\n'
               f'Training Loss: {running_loss/len(train_loader):.4f}, '
               f'Training Accuracy: {train_accuracy:.2f}%\n'
               f'Validation Loss: {val_loss/len(val_loader):.4f}, '
               f'Validation Accuracy: {val_accuracy:.2f}%\n\n'
               )



print('Finished Training')

Training:  10%|█         | 1/10 [00:07<01:08,  7.66s/it]


Epoch 1/10, Training Loss: 1.4492, Training Accuracy: 49.28%
Validation Loss: 1.6377, Validation Accuracy: 42.31%




Training:  20%|██        | 2/10 [00:14<00:59,  7.42s/it]


Epoch 2/10, Training Loss: 1.4483, Training Accuracy: 49.38%
Validation Loss: 1.6386, Validation Accuracy: 42.12%




Training:  30%|███       | 3/10 [00:22<00:52,  7.51s/it]


Epoch 3/10, Training Loss: 1.4497, Training Accuracy: 49.33%
Validation Loss: 1.6387, Validation Accuracy: 42.12%




Training:  40%|████      | 4/10 [00:29<00:43,  7.33s/it]


Epoch 4/10, Training Loss: 1.4479, Training Accuracy: 49.31%
Validation Loss: 1.6372, Validation Accuracy: 42.25%




Training:  50%|█████     | 5/10 [00:36<00:35,  7.17s/it]


Epoch 5/10, Training Loss: 1.4467, Training Accuracy: 49.54%
Validation Loss: 1.6380, Validation Accuracy: 42.04%




Training:  60%|██████    | 6/10 [00:43<00:28,  7.19s/it]


Epoch 6/10, Training Loss: 1.4485, Training Accuracy: 49.36%
Validation Loss: 1.6380, Validation Accuracy: 42.23%




Training:  70%|███████   | 7/10 [00:50<00:21,  7.15s/it]


Epoch 7/10, Training Loss: 1.4454, Training Accuracy: 49.57%
Validation Loss: 1.6368, Validation Accuracy: 42.31%




Training:  80%|████████  | 8/10 [00:57<00:14,  7.02s/it]


Epoch 8/10, Training Loss: 1.4468, Training Accuracy: 49.31%
Validation Loss: 1.6367, Validation Accuracy: 41.93%




Training:  90%|█████████ | 9/10 [01:04<00:06,  6.86s/it]


Epoch 9/10, Training Loss: 1.4456, Training Accuracy: 49.47%
Validation Loss: 1.6355, Validation Accuracy: 42.09%




Training: 100%|██████████| 10/10 [01:11<00:00,  7.12s/it]


Epoch 10/10, Training Loss: 1.4445, Training Accuracy: 49.38%
Validation Loss: 1.6381, Validation Accuracy: 42.20%


Finished Training



