# Datos y Preprocesamiento

Vamos a usar el dataset de IMDB para clasificación de reseñas de películas, el objetivo del mismo es detectar si una reseña tiene sentimiento **positivo** o **negativo**.

Descarguen el dataset de este [link](https://drive.google.com/file/d/1i0bBI4p80AxsLgnWcXkxVT65AahIzePu/view?usp=sharing) y subanlo a una carpeta **data** en la raiz de su drive personal.


In [2]:
import pandas as pd
# from google.colab import drive
# drive.mount("/content/drive")

# ! cp "/content/drive/My Drive/data/IMDB_Dataset.zip" .
# ! unzip -q IMDB_Dataset.zip
# ! rm IMDB_Dataset.zip
# ! ls

In [3]:
import re
import time
from itertools import chain
from bs4 import BeautifulSoup
from collections import Counter

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import accuracy_score
# from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('wordnet')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

torch.manual_seed(42)
torch.backends.cudnn.deterministic = True

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


cuda:0


In [4]:
imdb_data = pd.read_csv("IMDB Dataset.csv")

#sentiment count
print(imdb_data.columns)
imdb_data['sentiment'].value_counts()

# Convert positive and negative into binary classes (1-0)
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()

sentiment_data = lb.fit_transform(imdb_data["sentiment"])
imdb_data['sentiment'] = sentiment_data

Index(['review', 'sentiment'], dtype='object')


In [9]:
imdb_data.head(2)

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode you...,1
1,wonderful little production filming technique ...,1


In [5]:
def strip_html(text):
  soup = BeautifulSoup(text, "html.parser")
  return soup.get_text()


def remove_between_square_brackets(text):
  return re.sub('\[[^]]*\]', '', text)


def remove_special_characters(text):
  pattern = r'[^a-zA-z\s]'
  text = re.sub(pattern,'',text)
  return text


def low_level_preproc(text):
  text = strip_html(text)
  text = remove_between_square_brackets(text)
  text = remove_special_characters(text)
  return text

#Apply function on review column
imdb_data['review'] = imdb_data['review'].apply(low_level_preproc)

  soup = BeautifulSoup(text, "html.parser")


In [6]:
all_stopwords = set(stopwords.words("english"))

def remove_stop_words(full_text_line):
  tokens = full_text_line.split()
  tokens = [tok for tok in tokens if tok not in all_stopwords]

  return " ".join(tokens)


def lemmatize(text):
  wnl= WordNetLemmatizer()
  lemas = [wnl.lemmatize(word) for word in text.split()]

  return " ".join(lemas)


def high_level_preproc(text):
  text = remove_stop_words(text)
  return lemmatize(text)


#Apply function on review column
imdb_data['review'] = imdb_data['review'].str.lower()
imdb_data['review'] = imdb_data['review'].apply(high_level_preproc)

In [7]:
#split the dataset  
#train dataset
train_reviews = imdb_data.review[:40000]
train_sentiments = imdb_data.sentiment[:40000]

#test dataset
test_reviews = imdb_data.review[40000:]
test_sentiments = imdb_data.sentiment[40000:]


print("Train set:", train_reviews.shape, train_sentiments.shape)
print("Test set:", test_reviews.shape, test_sentiments.shape)

Train set: (40000,) (40000,)
Test set: (10000,) (10000,)


# Vocabulario y Encoding

Vamos a crear un volcabulario para el problema, de este modo podemos representar cada palabra con un entero único. Esto nos va a permitir representar una review como una lista de ints (que luego el modelo va a mapear a word embeddings!).

Una cosa a tener en cuenta es que vamos a querer agregar padding a nuestros inputs (para que todas las reviews tengan el mismo largo), para esto vamos a usar el 0, por lo que las palabras de nuestro vocabulario deben empezar en 1.



In [11]:
count = Counter(chain(*(train_reviews.str.split())))
count

Counter({'movie': 79356,
         'film': 71546,
         'one': 41970,
         'like': 31753,
         'time': 23465,
         'good': 23088,
         'character': 21910,
         'get': 19516,
         'even': 19479,
         'story': 19330,
         'would': 19320,
         'make': 18857,
         'see': 18704,
         'really': 18250,
         'scene': 16515,
         'much': 15126,
         'well': 14957,
         'people': 14384,
         'great': 14185,
         'bad': 14153,
         'also': 13909,
         'show': 13493,
         'first': 13325,
         'dont': 13272,
         'way': 13153,
         'thing': 12811,
         'made': 12389,
         'could': 12103,
         'think': 12044,
         'life': 11456,
         'go': 11415,
         'know': 11219,
         'watch': 10932,
         'love': 10731,
         'many': 10612,
         'seen': 10454,
         'actor': 10439,
         'plot': 10427,
         'two': 10390,
         'never': 10322,
         'say': 10221,
    

In [31]:
import operator
def make_vocab(all_texts, max_vocab_size,ignore_top=0, oov_token="<OOV>"):
  # Count the number of occurrences of each word
  count = Counter(chain(*(all_texts.str.split())))

  # Create vocab containing max_vocab_size tokens
  sorted_values = sorted(count.items(),key = operator.itemgetter(1),reverse = True)[ignore_top:max_vocab_size+ignore_top]
  # Add the out of vocabulary at the end
  words = [word for word, count in sorted_values]
  words.append(oov_token)
  
  # Map from word to int index in vocab
  vocab_to_int = {word: i for i, word in enumerate(words)}

  return vocab_to_int

In [53]:
VOCAB_SIZE = 50_000
vocab_mapping = make_vocab(train_reviews,VOCAB_SIZE,0)

Ahora vamos a implementar una funcion que transforma un string con la review en una lista de enteros con la posiscion de cada una de nuestras palabras en el vocabulario. Si una palabra no está en el vocabulario usamos el indice para`"<OOV>"`

In [40]:
def get_review_features(text, word_to_idx):
  default_value = word_to_idx["<OOV>"]
  indices = [word_to_idx.get(word,default_value) for word in text.split()]
  return indices

In [41]:
res = get_review_features("this is a test", vocab_mapping)
res

[50000, 50000, 50000, 1730]

Lo siguiente es implementar padding de las sentencias, si bien los modelos son capaces de trabajar con secuencias de cualquier largo queremos que el tiempo de entrenamiento e inferencia esté controlado y no dependa del largo de los inputs. Como una pequeña optimizacion vamos a hacer **left padding**, es decir, agregar 0s a la izquierda de una secuencia hasta alcanzar `max_sequence_length` elementos.

Agregar ceros a la izquierda ayuda a los modelos a aprender de los datos ya que la informacion valiosa aparece al final de la secuencia y no tiene que recordar 3 palabras en luego de haber visto 100 ceros..

Ejemplo, la secuencia

`[117, 18, 128]`

 Quedaría:

`[0, 0, 0, 0, 0, 0, 0, 117, 18, 128]`

En lugar de 

`[117, 18, 128, 0, 0, 0, 0, 0, 0, 0] ` Forzando al modelo a recordar los 3 primeros inputs para poder predecir algo.


In [43]:
def pad_features(review_ints, sequence_length):
  if len(review_ints)> sequence_length:
    review_ints = review_ints[-sequence_length:]
  else:
    while len(review_ints)<sequence_length:
      review_ints.insert(0,0)
  return review_ints

In [44]:
def get_review_representation(review_text, word_to_idx, max_sequence_length):
  return pad_features(get_review_features(review_text, word_to_idx), max_sequence_length)

# Transformando los textos a vectores

In [46]:
MAX_SEQUENCE_LENGTH = 100

train_vectors = train_reviews.apply(lambda x: get_review_representation(x, vocab_mapping, MAX_SEQUENCE_LENGTH))
test_vectors = test_reviews.apply(lambda x: get_review_representation(x, vocab_mapping, MAX_SEQUENCE_LENGTH))

train_vectors = np.array([vec for vec in train_vectors])
test_vectors = np.array([vec for vec in test_vectors])

# Codigo de entrenamiento e inferencia 

Same old..


In [47]:
def train_epoch(training_model, loader, criterion, optim):
    training_model.train()
    epoch_loss = 0.0
    all_labels = []
    all_predictions = []
    
    for data, labels in loader:
      all_labels.extend(labels.numpy())  

      optim.zero_grad()

      predictions = training_model(data.to(DEVICE))
      all_predictions.extend(torch.argmax(predictions, dim=1).cpu().numpy())

      loss = criterion(predictions, labels.to(DEVICE))
      
      loss.backward()
      optim.step()

      epoch_loss += loss.item()

    return epoch_loss / len(loader), accuracy_score(all_labels, all_predictions) * 100


def validation_epoch(val_model, loader, criterion):
    val_model.eval()
    epoch_loss = 0.0
    all_labels = []
    all_predictions = []
    
    with torch.no_grad():
      for data, labels in loader:
        all_labels.extend(labels.numpy())  

        predictions = val_model(data.to(DEVICE))
        all_predictions.extend(torch.argmax(predictions, dim=1).cpu().numpy())

        loss = criterion(predictions, labels.to(DEVICE))

        epoch_loss += loss.item()

    return epoch_loss / len(loader), accuracy_score(all_labels, all_predictions) * 100
  

def train_model(model, train_loader, test_loader, criterion, optim, number_epochs):
  train_history = []
  test_history = []
  accuracy_history = []

  for epoch in range(number_epochs):
      start_time = time.time()

      train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
      train_history.append(train_loss)
      print("Training epoch {} | Loss {:.6f} | Accuracy {:.2f}% | Time {:.2f} seconds"
            .format(epoch + 1, train_loss, train_acc, time.time() - start_time))

      start_time = time.time()
      test_loss, acc = validation_epoch(model, test_loader, criterion)
      test_history.append(test_loss)
      accuracy_history.append(acc)
      print("Validation epoch {} | Loss {:.6f} | Accuracy {:.2f}% | Time {:.2f} seconds"
            .format(epoch + 1, test_loss, acc, time.time() - start_time))

# Modelo


![Alt text](image.png)

![Alt text](image-1.png)

In [48]:
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(SentimentRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.perceptron = nn.Linear(in_features = hidden_dim, out_features = 2)

    def forward(self, x):
        out = self.embedding(x)
        out, _ = self.rnn(out)
        out = self.perceptron(out[:, -1, :])
        return out

# Entrenamiento

In [50]:
BATCH_SIZE = 10
train_targets = torch.Tensor(train_sentiments.to_numpy()).long()
train_dataset = TensorDataset(torch.LongTensor(train_vectors), train_targets) 
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, pin_memory=True, num_workers=2)

test_targets = torch.Tensor(test_sentiments.to_numpy()).long()
test_dataset = TensorDataset(torch.LongTensor(test_vectors), test_targets) 
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, pin_memory=True, num_workers=2)

In [51]:
loss_function = nn.CrossEntropyLoss().to(DEVICE)
BATCH_SIZE = 32

![Alt text](image-2.png)

In [55]:
# Instanciar un modelo
# Crear optimizador
# Entrenar (Una RNN normal debería poder superar 60% de accuracy)
import math 
embedding_dim = math.ceil(math.sqrt(VOCAB_SIZE))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = SentimentRNN(vocab_size=VOCAB_SIZE+2, embedding_dim=embedding_dim, hidden_dim=32).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_model(model,train_dataloader,test_dataloader,loss_function,optimizer,number_epochs = 10)

Training epoch 1 | Loss 0.595995 | Accuracy 68.07% | Time 68.71 seconds
Validation epoch 1 | Loss 0.536517 | Accuracy 75.17% | Time 6.55 seconds
Training epoch 2 | Loss 0.477146 | Accuracy 78.62% | Time 65.78 seconds
Validation epoch 2 | Loss 0.489867 | Accuracy 78.88% | Time 6.64 seconds
Training epoch 3 | Loss 0.411018 | Accuracy 82.51% | Time 65.78 seconds
Validation epoch 3 | Loss 0.501884 | Accuracy 78.17% | Time 6.23 seconds
Training epoch 4 | Loss 0.367547 | Accuracy 84.86% | Time 66.64 seconds
Validation epoch 4 | Loss 0.515024 | Accuracy 76.02% | Time 6.69 seconds
Training epoch 5 | Loss 0.335022 | Accuracy 86.49% | Time 77.51 seconds
Validation epoch 5 | Loss 0.522353 | Accuracy 75.43% | Time 14.87 seconds
Training epoch 6 | Loss 0.297850 | Accuracy 88.30% | Time 84.71 seconds
Validation epoch 6 | Loss 0.505308 | Accuracy 78.08% | Time 10.97 seconds
Training epoch 7 | Loss 0.267434 | Accuracy 89.75% | Time 80.66 seconds
Validation epoch 7 | Loss 0.506495 | Accuracy 78.32% | T

# Mejoras en el modelo



1. Podemos mejorar la performance si usamos una GRU o una LSTM ?
2. Que pasa si usamos celdas **bidireccionales** ?
3. Que pasa si aumentamos el numero de **capas** de nuestras celdas recurrentes 
?
4. Y si usamos vectores preentrenados (W2V, GloVe) ?




## Respuestas
1 Sí, es muy probable que la performance mejore al utilizar GRU o LSTM en lugar de una RNN simple. Las RNN tradicionales tienen problemas con las dependencias a largo plazo debido al fenómeno llamado "vanishing gradient", lo que dificulta su entrenamiento. Tanto las celdas GRU como LSTM están diseñadas para abordar este problema y, por lo general, funcionan mejor en tareas secuenciales.

LSTM: Introduce una celda de memoria y tres puertas (entrada, olvido y salida) que regulan el flujo de información en la celda. Esto permite que las LSTM mantengan la información a lo largo de secuencias más largas y sean menos susceptibles al problema del "vanishing gradient".

GRU: Es una variante simplificada de LSTM con dos puertas (restablecer y actualizar). Aunque tiene menos parámetros que LSTM, ha demostrado ser efectiva en varias tareas y, en algunos casos, puede superar a las LSTM.

2. Las celdas bidireccionales pueden mejorar la performance al procesar la secuencia en ambas direcciones (de principio a fin y de fin a principio). Esto permite que la red tenga información sobre el contexto pasado y futuro de un punto específico en la secuencia, lo cual puede ser útil para comprender mejor el significado. Sin embargo, las celdas bidireccionales también duplican el número de parámetros, lo que puede aumentar el tiempo de entrenamiento y el riesgo de sobreajuste si no se cuenta con suficientes datos.

3. Aumentar el número de capas (apilando múltiples RNNs, LSTMs o GRUs) puede permitir que la red capture patrones más complejos en los datos. Sin embargo, también puede aumentar el riesgo de sobreajuste y hacer que el entrenamiento sea más lento. Es importante encontrar un equilibrio y capaz utilizar técnicas de regularización (como dropout) al aumentar el número de capas.

4. Usar vectores preentrenados como Word2Vec o GloVe puede mejorar significativamente la performance, especialmente si el conjunto de datos es pequeño. Estos vectores capturan información semántica y relaciones entre palabras basadas en grandes volúmenes de texto. Al utilizar estos embeddings preentrenados, estamos introduciendo un conocimiento previo sobre el lenguaje en nuestro modelo, lo que puede ayudar a obtener mejores resultados más rápidamente y con menos datos.

In [57]:
class SentimentGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(SentimentGRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.perceptron = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        out = self.embedding(x)
        out, _ = self.rnn(out)
        out = self.perceptron(out[:, -1, :])
        return out
    
model = SentimentGRU(vocab_size=VOCAB_SIZE+2, embedding_dim=embedding_dim, hidden_dim=32).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_model(model,train_dataloader,test_dataloader,loss_function,optimizer,number_epochs = 10)

Training epoch 1 | Loss 0.475825 | Accuracy 76.81% | Time 68.19 seconds
Validation epoch 1 | Loss 0.376184 | Accuracy 83.79% | Time 6.08 seconds
Training epoch 2 | Loss 0.266361 | Accuracy 89.45% | Time 65.88 seconds
Validation epoch 2 | Loss 0.347870 | Accuracy 85.42% | Time 7.82 seconds
Training epoch 3 | Loss 0.152545 | Accuracy 94.57% | Time 71.90 seconds
Validation epoch 3 | Loss 0.416668 | Accuracy 84.45% | Time 6.81 seconds
Training epoch 4 | Loss 0.090434 | Accuracy 96.88% | Time 66.08 seconds
Validation epoch 4 | Loss 0.483416 | Accuracy 85.56% | Time 4.74 seconds
Training epoch 5 | Loss 0.060602 | Accuracy 98.02% | Time 60.61 seconds
Validation epoch 5 | Loss 0.536048 | Accuracy 85.49% | Time 4.62 seconds
Training epoch 6 | Loss 0.033737 | Accuracy 98.95% | Time 67.55 seconds
Validation epoch 6 | Loss 0.672954 | Accuracy 85.11% | Time 9.46 seconds
Training epoch 7 | Loss 0.028840 | Accuracy 99.07% | Time 66.45 seconds
Validation epoch 7 | Loss 0.654796 | Accuracy 84.54% | Tim

In [58]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.perceptron = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        out = self.embedding(x)
        out, (h, c) = self.rnn(out)
        out = self.perceptron(out[:, -1, :])
        return out
    
model = SentimentLSTM(vocab_size=VOCAB_SIZE+2, embedding_dim=embedding_dim, hidden_dim=32).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_model(model,train_dataloader,test_dataloader,loss_function,optimizer,number_epochs = 10)

Training epoch 1 | Loss 0.484305 | Accuracy 76.14% | Time 68.29 seconds
Validation epoch 1 | Loss 0.400998 | Accuracy 83.17% | Time 6.44 seconds
Training epoch 2 | Loss 0.285268 | Accuracy 88.60% | Time 65.15 seconds
Validation epoch 2 | Loss 0.355354 | Accuracy 84.63% | Time 5.19 seconds
Training epoch 3 | Loss 0.193484 | Accuracy 92.76% | Time 64.63 seconds
Validation epoch 3 | Loss 0.402408 | Accuracy 85.44% | Time 6.36 seconds
Training epoch 4 | Loss 0.125357 | Accuracy 95.62% | Time 68.43 seconds
Validation epoch 4 | Loss 0.451305 | Accuracy 84.93% | Time 7.20 seconds
Training epoch 5 | Loss 0.087493 | Accuracy 97.00% | Time 66.13 seconds
Validation epoch 5 | Loss 0.525775 | Accuracy 84.97% | Time 7.55 seconds
Training epoch 6 | Loss 0.057502 | Accuracy 98.09% | Time 68.11 seconds
Validation epoch 6 | Loss 0.600508 | Accuracy 85.26% | Time 6.13 seconds
Training epoch 7 | Loss 0.042363 | Accuracy 98.63% | Time 69.91 seconds
Validation epoch 7 | Loss 0.688051 | Accuracy 84.93% | Tim

In [59]:
class SentimentBiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(SentimentBiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.perceptron = nn.Linear(2*hidden_dim, 2)  # Doble tamaño para bidireccional

    def forward(self, x):
        out = self.embedding(x)
        out, (h, c) = self.rnn(out)
        out = self.perceptron(out[:, -1, :])
        return out
    
model = SentimentBiLSTM(vocab_size=VOCAB_SIZE+2, embedding_dim=embedding_dim, hidden_dim=32).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_model(model,train_dataloader,test_dataloader,loss_function,optimizer,number_epochs = 10)

Training epoch 1 | Loss 0.498269 | Accuracy 76.08% | Time 64.73 seconds
Validation epoch 1 | Loss 0.423175 | Accuracy 82.22% | Time 7.69 seconds
Training epoch 2 | Loss 0.308496 | Accuracy 87.75% | Time 67.95 seconds
Validation epoch 2 | Loss 0.372486 | Accuracy 84.68% | Time 8.17 seconds
Training epoch 3 | Loss 0.204193 | Accuracy 92.40% | Time 67.87 seconds
Validation epoch 3 | Loss 0.394148 | Accuracy 85.32% | Time 4.92 seconds
Training epoch 4 | Loss 0.142069 | Accuracy 94.96% | Time 66.34 seconds
Validation epoch 4 | Loss 0.442744 | Accuracy 84.69% | Time 5.64 seconds
Training epoch 5 | Loss 0.092446 | Accuracy 96.81% | Time 66.30 seconds
Validation epoch 5 | Loss 0.500590 | Accuracy 85.13% | Time 5.08 seconds
Training epoch 6 | Loss 0.067785 | Accuracy 97.73% | Time 66.02 seconds
Validation epoch 6 | Loss 0.578212 | Accuracy 85.26% | Time 4.82 seconds
Training epoch 7 | Loss 0.045564 | Accuracy 98.57% | Time 66.21 seconds
Validation epoch 7 | Loss 0.649421 | Accuracy 85.05% | Tim

In [None]:
class SentimentMultiLayerLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(SentimentMultiLayerLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.perceptron = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        out = self.embedding(x)
        out, (h, c) = self.rnn(out)
        out = self.perceptron(out[:, -1, :])
        return out
model = SentimentMultiLayerLSTM(vocab_size=VOCAB_SIZE+2, embedding_dim=embedding_dim, hidden_dim=32).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_model(model,train_dataloader,test_dataloader,loss_function,optimizer,number_epochs = 10)

In [60]:
import requests
import zipfile
import os

# URL de Word2Vec
url = "http://vectors.nlpl.eu/repository/20/40.zip"

response = requests.get(url, stream=True)
filename = "word2vec.zip"
with open(filename, 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)

with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall("word2vec_folder")

word2vec_path = os.path.join("word2vec_folder", "model.bin")

print(f"Word2Vec file is located at: {word2vec_path}")

In [None]:
#%pip install gensim
from gensim.models import KeyedVectors

word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

embedding_matrix = torch.zeros((len(vocab_mapping), word2vec_model.vector_size))
for i, word in enumerate(vocab_mapping):
    if word in word2vec_model:
        embedding_matrix[i] = torch.tensor(word2vec_model[word])

In [None]:
class SentimentPretrainedEmbeddings(nn.Module):
    def __init__(self, pretrained_embeddings, hidden_dim):
        super(SentimentPretrainedEmbeddings, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        self.rnn = nn.LSTM(pretrained_embeddings.shape[1], hidden_dim, batch_first=True)
        self.perceptron = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        out = self.embedding(x)
        out, (h, c) = self.rnn(out)
        out = self.perceptron(out[:, -1, :])
        return out

model = SentimentPretrainedEmbeddings(pretrained_embeddings=embedding_matrix, hidden_dim=32).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_model(model,train_dataloader,test_dataloader,loss_function,optimizer,number_epochs = 10)