In [11]:
#!pip install datasets
#!pip install gensim


## Part 0 Dataset Preparation

In [12]:
from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']


## Part 1 Preparing Word Embeddings

In [3]:
import gensim.downloader as api

# List all available pre-trained models
available_models = api.info()['models'].keys()
print(available_models)


dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])


### Loading 'word2vec-google-news-300' model

In [15]:
# Load Google's pre-trained Word2Vec model (300-dimensional vectors)
word2vec_model = api.load('word2vec-google-news-300')


In [5]:
# Check the length of the dataset
print(len(train_dataset))

# View the first sample in the dataset
first_sample = train_dataset[0]
for i, key in enumerate(first_sample):
    print(f"Element {i}: {key}")

for key, value in first_sample.items():
    print(f"'{key}' : {value}")


8530
Element 0: text
Element 1: label
'text' : the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
'label' : 1


### (a) What is the size of the vocabulary formed from your training data?

In [6]:
from collections import Counter

vocab_counter = Counter()

for sentence in train_dataset['text']:  
    vocab_counter.update(sentence.split())  # Split sentences into words

# Extract vocabulary
vocab = list(vocab_counter.keys())
#print(vocab)
print("Size of training data vocabulary: " + str(len(vocab)))

Size of training data vocabulary: 18951


In [26]:
from collections import Counter
import re

vocab_counter = Counter()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = text.split()  
    return tokens

for sentence in train_dataset['text']:  
    tokens = preprocess_text(sentence)
    vocab_counter.update(tokens)  

# Extract vocabulary
vocab = list(vocab_counter.keys())
#print(vocab)
print("Size of training data vocabulary: " + str(len(vocab)))

Size of training data vocabulary: 18223


### (b) How many OOV words exist in your training data?

In [27]:
oov_words = []

for word in vocab:
    if word not in word2vec_model:
        oov_words.append(word)  # Use pre-trained Word2Vec vector
        
print("Number of OOV words in training data: " + str(len(oov_words)))
print("Sample of 10 OOV words: " + str(oov_words[:10]))

Number of OOV words in training data: 3334
Sample of 10 OOV words: ['to', '21st', 'centurys', 'and', 'a', 'jeanclaud', 'damme', 'segal', 'of', 'cowriterdirector']


### (c) Strategy to mitigate limitation of OOV words

Each OOV word can be represented as a bag of character N-grams. Embeddings are then generated based on these N-grams. By representing words as combinations of n-grams, FastText can generalize better across similar words. For example, if the model has seen "apple," it can infer meaningful representations for related words like "apples," "applet," and even misspellings or variations.FastText’s n-gram approach can capture semantic similarities between words that share similar character patterns. For instance, "cat" and "cats" will share common n-grams, leading to embeddings that are close to each other in the vector space. For instance:
The word "unhappiness" can be broken down into n-grams like "un," "happi," "ness," etc.
This enables FastText to understand that "unhappy" and "happiness" share a common root, even if those specific words were not seen during training.

In [21]:
from gensim.models import FastText

# Load the pre-trained FastText model (English) from Gensim's API
fasttext_model = api.load('fasttext-wiki-news-subwords-300')

In [33]:
import numpy as np
import re
from collections import Counter

# Load your models and define vocab

vocab = ['<PAD>', '<UNK>'] + vocab  # Adding a padding token and a unk token

embedding_dim = 300
embedding_matrix = np.zeros((len(vocab), embedding_dim))  # Initialize embedding matrix

# Create a clean_word-to-index dictionary for your vocabulary
word_to_idx = { word : idx for idx, word in enumerate(vocab)}

oov_random_embeds = []

# Fill the embedding matrix
for word, idx in word_to_idx.items():
    if word in word2vec_model:
        embedding_matrix[idx] = word2vec_model[word]  # Use Word2Vec vector
    elif word in fasttext_model:
        embedding_matrix[idx] = fasttext_model[word]  # Use FastText vector
    else:
        embedding_matrix[idx] = np.random.normal(size=(embedding_dim,))  # Random vector for OOV words
        oov_random_embeds.append(word)

        
# Check for OOV words
print("Number of OOV words in training data: " + str(len(oov_random_embeds)))
print("Some OOV words:", oov_random_embeds[:10])  # Print the first 10 OOV words

Number of OOV words in training data: 2679
Some OOV words: ['<PAD>', '<UNK>', 'jeanclaud', 'damme', 'cowriterdirector', 'tolkiens', 'middleearth', 'tootepid', 'wisegirls', 'familyoriented']


## Part 2 Model Training & Evaluation - RNN

In [14]:
#print("Shape of word2vec vectors : " + str(word2vec_model['great'].shape))

Shape of word2vec vectors : (300,)


In [35]:
def find_seq_length(sentence):
    sentencelist = sentence.split()
    return len(sentencelist)

def encode(sentence):
    encoded_list = []
    tokens = preprocess_text(sentence)
    for token in tokens: # truncate? [:max_sequence_length]
        if token in word_to_idx:
            encoded_list.append(word_to_idx[token])
        else:
            encoded_list.append(1)# index of <UNK>

    # padding : .append(0)
    return encoded_list

maxlength = 0
minlength = float('inf')
total_length = 0
for i in range(len(train_dataset)):
    cur = find_seq_length(train_dataset[i]['text'])
    total_length += cur
    if cur > maxlength:
        maxlength = cur
    if cur < minlength:
        minlength = cur

print("Max sequence length: " + str(maxlength))
print("Min sequence length: " + str(minlength))
print("Average sequence length: " + str(total_length/len(train_dataset)))

print(encode(train_dataset[0]['text']))

Max sequence length: 59
Min sequence length: 1
Average sequence length: 20.99284876905041
[12, 13, 14, 15, 16, 17, 12, 18, 19, 20, 21, 22, 23, 24, 25, 16, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]


In [34]:
import torch
import torch.nn as nn
import torch.optim as optim

print("Shape of word2vec vectors : " + str(word2vec_model['great'].shape))

# Hyperparameters
batch_size = 32     
time_size = max length ?
feature_size =  word2vec_model['great'].shape[0]
#hidden_size = 4 * 300   
output_size = 1    # Output size (1 value)
learning_rate = 0.001



SyntaxError: invalid syntax (2262991417.py, line 9)