In [15]:
#!pip install datasets
#!pip install gensim


## Part 0 Dataset Preparation

In [58]:
from datasets import load_dataset

# Load the "rotten_tomatoes" dataset and bind each of its three splits to its own name.
dataset = load_dataset("rotten_tomatoes")
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)


## Part 1 Preparing Word Embeddings

In [17]:
import gensim.downloader as api

# Query the gensim downloader catalogue and show the names of the pre-trained models it lists.
available_models = api.info()["models"].keys()
print(available_models)


dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])


### Loading 'word2vec-google-news-300' model

In [52]:
# Fetch the pre-trained Google News Word2Vec vectors (300 dimensions, per the model name)
# through the gensim downloader.
word2vec_model = api.load("word2vec-google-news-300")


In [39]:
# Number of examples in the training split.
print(len(train_dataset))

# Take the first training example and show its structure:
# first the field names with their position, then each field alongside its value.
first_sample = train_dataset[0]

for position, field in enumerate(first_sample):
    print(f"Element {position}: {field}")

for field, content in first_sample.items():
    print(f"'{field}' : {content}")


8530
Element 0: text
Element 1: label
'text' : the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
'label' : 1


### (a) What is the size of the vocabulary formed from your training data?

In [43]:
from collections import Counter

# Tally every whitespace-delimited token across the training sentences.
vocab_counter = Counter(
    token
    for sentence in train_dataset['text']
    for token in sentence.split()
)

# The distinct tokens (in first-seen order) form the vocabulary.
vocab = list(vocab_counter)
print(f"Size of training data vocabulary: {len(vocab)}")

Size of training data vocabulary: 18951


### (b) How many OOV words exist in your training data?

In [59]:
# A vocabulary word is out-of-vocabulary (OOV) when the Word2Vec model has no entry for it.
oov_words = [token for token in vocab if token not in word2vec_model]

print(f"Number of OOV words in training data: {len(oov_words)}")
print(f"Sample of 10 OOV words: {oov_words[:10]}")

Number of OOV words in training data: 4585
Sample of 10 OOV words: ['to', '21st', "century's", '"', 'and', 'a', ',', 'jean-claud', 'damme', 'segal']


### (c) Strategy to mitigate limitation of OOV words

Each OOV word can be represented as a bag of character n-grams, and its embedding is then generated from the embeddings of those n-grams. By representing words as combinations of n-grams, FastText can generalize better across similar words. For example, if the model has seen "apple", it can infer meaningful representations for related words like "apples", "applet", and even misspellings or variations. FastText's n-gram approach can also capture semantic similarities between words that share similar character patterns: "cat" and "cats" share common n-grams, so their embeddings end up close to each other in the vector space. For instance:
- The word "unhappiness" can be broken down into n-grams like "un", "happi", "ness", etc.
- This enables FastText to understand that "unhappy" and "happiness" share a common root, even if those specific words were not seen during training.

In [None]:
from gensim.models import FastText

# Fetch the pre-trained English FastText vectors (wiki-news, 300 dimensions per the model name)
# through the gensim downloader.
# NOTE(review): the object returned here may be a plain word-vector table without subword
# (n-gram) data, in which case it cannot produce vectors for words it has never seen and
# lookups of such words would raise KeyError -- TODO confirm against the gensim-data listing.
fasttext_model = api.load("fasttext-wiki-news-subwords-300")

In [None]:
import numpy as np
from collections import Counter

embedding_dim = 300  # Since Word2Vec uses 300-dimensional vectors
embedding_matrix = np.zeros((len(vocab), embedding_dim))  # One row per vocabulary word

# Create a word-to-index dictionary for your vocabulary
word_to_idx = {word: idx for idx, word in enumerate(vocab)}

# Seeded generator so that fallback vectors are identical on every run of this cell.
fallback_rng = np.random.default_rng(0)
# Words for which neither pre-trained model could supply a vector.
unresolved_words = []

# Fill the embedding matrix: prefer Word2Vec, then FastText, then a random vector.
for word, idx in word_to_idx.items():
    if word in word2vec_model:
        embedding_matrix[idx] = word2vec_model[word]  # Use pre-trained Word2Vec vector
        continue
    try:
        embedding_matrix[idx] = fasttext_model[word]  # Use FastText vector
    except KeyError:
        # The FastText lookup fails for words it has no vector for; without this guard a
        # single such word would abort the whole loop and leave the matrix half-filled.
        embedding_matrix[idx] = fallback_rng.normal(size=embedding_dim)
        unresolved_words.append(word)

print(f"Words missing from both models (random vector used): {len(unresolved_words)}")