In [1]:
import torch
import pandas as pd
import string
from torch.utils.data import Dataset, DataLoader
import pickle
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [3]:
class Word2Vec(torch.nn.Module):
    """Two affine layers: vocabulary space -> embedding space -> vocabulary space.

    Parameters are stored as raw tensors (not ``nn.Linear`` modules):

    * ``w1`` (embedding_dim, voc_size) and ``b1`` (embedding_dim,) form the encoder.
    * ``w2`` (voc_size, embedding_dim) and ``b2`` (voc_size,) form the decoder.

    All four are initialised from a standard normal distribution.

    Args:
        voc_size: Size of the last dimension of the input and of the output.
        embedding_dim: Size of the intermediate (embedding) representation.
    """

    def __init__(self, voc_size, embedding_dim) -> None:
        super().__init__()

        # Encoder weights and bias (nn.Parameter enables gradients by default).
        self.w1 = torch.nn.Parameter(torch.randn(embedding_dim, voc_size))
        self.b1 = torch.nn.Parameter(torch.randn(embedding_dim))

        # Decoder weights and bias.
        self.w2 = torch.nn.Parameter(torch.randn(voc_size, embedding_dim))
        self.b2 = torch.nn.Parameter(torch.randn(voc_size))

    def forward(self, x):
        """Encode ``x`` into the embedding space and decode it back.

        Args:
            x: Tensor whose last dimension has size ``voc_size``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``voc_size``.
        """
        encoded = torch.matmul(x, self.w1.t()) + self.b1
        decoded = torch.matmul(encoded, self.w2.t()) + self.b2
        return decoded

In [4]:
class IMDBDatasetSLTCWithEmbedding(Dataset):
    """IMDB reviews served as fixed-length sequences of pretrained word embeddings.

    Each item is a pair ``(x, y)``:

    * ``x`` has shape ``(max_len, embedding_dim)``. Row ``i`` is
      ``w1[:, idx_i] + b1`` for the i-th kept token of the review, which equals
      ``one_hot(idx_i) @ w1.T + b1``. Positions past the end of the review are
      padding and equal ``b1`` (a zero one-hot vector pushed through the same
      affine map).
    * ``y`` has shape ``(1,)`` and is ``1.`` when the ``sentiment`` column is
      ``'positive'``, otherwise ``0.``.

    Preprocessing of a review: strip ``string.punctuation``, lower-case, split
    on whitespace, drop stop words, drop tokens that are not in the vocabulary,
    keep at most ``max_len`` tokens.

    Args:
        file_path: CSV file with ``review`` and ``sentiment`` columns.
        voc_path: Pickled iterable of vocabulary words; a word's position is its index.
        train: Accepted for interface compatibility; currently unused.
        DEVICE: Device on which the embedding weights are loaded and on which
            the returned tensors live.
        max_len: Number of token positions in every returned sequence.
        stop_words_path: Text file with one stop word per line.
        w1_path: ``torch.save``-d encoder weight of shape ``(embedding_dim, voc_size)``.
        b1_path: ``torch.save``-d encoder bias of shape ``(embedding_dim,)``.
    """

    def __init__(self, file_path, voc_path, train = True, DEVICE = 'cpu', max_len = 30,
                 stop_words_path = './NLPUtils/english.txt',
                 w1_path = './trained_models/word2vecW1.pth',
                 b1_path = './trained_models/word2vecB1.pth') -> None:
        super(IMDBDatasetSLTCWithEmbedding, self).__init__()
        # Remember the requested device instead of relying on a module-level global.
        self.device = DEVICE
        self.max_len = max_len

        with open(stop_words_path, 'r', encoding='utf-8') as f:
            self.stop_words = [stop_word.replace('\n', '') for stop_word in f.readlines()]
        # Set copy gives O(1) membership tests; the list attribute is kept for callers.
        self._stop_word_set = frozenset(self.stop_words)
        # Translation table that deletes every ASCII punctuation character; built once.
        self._punct_table = str.maketrans('', '', string.punctuation)

        self.df = pd.read_csv(file_path, encoding='utf-8')

        with open(voc_path, 'rb') as f:
            # NOTE(security): pickle.load can execute arbitrary code; only use
            # vocabulary files from a trusted source.
            self.voc = pickle.load(f)
        self.word2idx = {word:idx for idx, word in enumerate(self.voc)}
        self.idx2word = {idx:word for idx, word in enumerate(self.voc)}

        # map_location lets weights saved on a GPU load on a CPU-only machine
        # and puts them directly on the requested device.
        self.w1 : torch.Tensor = torch.load(w1_path, map_location=self.device)
        self.w1.requires_grad = False
        self.b1 : torch.Tensor = torch.load(b1_path, map_location=self.device)
        self.b1.requires_grad = False

    def __len__(self):
        """Return the number of reviews (rows of the CSV)."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(x, y)`` for the review at positional ``index``.

        Returns:
            x: ``(max_len, embedding_dim)`` tensor of token embeddings, padded with ``b1``.
            y: ``(1,)`` tensor, ``1.`` for a positive review and ``0.`` otherwise.
        """
        x_idx_list = self._encode(self.df['review'].iloc[index])[:self.max_len]

        # Start from the padding value: a zero one-hot row maps to exactly b1.
        x = self.b1.repeat(self.max_len, 1)
        if x_idx_list:
            idx = torch.tensor(x_idx_list, dtype=torch.long, device=self.w1.device)
            # Row lookup replaces the dense one-hot matmul: one_hot(i) @ w1.T == w1.T[i].
            x[:len(x_idx_list)] = self.w1.T[idx] + self.b1
        x = x.to(self.device)

        label = 1. if self.df['sentiment'].iloc[index] == 'positive' else 0.
        y = torch.tensor(data=[label], device=self.device)
        return x, y

    def _encode(self, x_sentence) -> list:
        """Turn a raw review string into a list of vocabulary indices.

        Stop words and tokens missing from the vocabulary are dropped.
        """
        cleaned = x_sentence.translate(self._punct_table).lower()
        return [
            self.word2idx[token]
            for token in self.tokenize(cleaned)
            if token not in self._stop_word_set and token in self.word2idx
        ]

    def tokenize(self, x_sentence) -> list:
        """Split a sentence into whitespace-separated tokens."""
        tokens = x_sentence.split()
        return tokens

In [5]:
class Classifier(torch.nn.Module):
    """Mean-pooling binary classifier over a sequence of embeddings.

    The input is averaged over dimension 1, passed through a single linear
    layer with one output unit, and squashed with a sigmoid.

    Args:
        freeze_trainedModel: Accepted for interface compatibility; it currently
            has no effect on the module.
        embedding_dim: Size of the last dimension of the input. Defaults to
            300, the value that was previously hard-coded.
    """

    def __init__(self, freeze_trainedModel = False, embedding_dim = 300) -> None:
        super(Classifier, self).__init__()
        self.linear1 = torch.nn.Linear(embedding_dim, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x:torch.Tensor) -> torch.Tensor:
        """Return one probability-like score in (0, 1) per input sequence.

        Args:
            x: Tensor whose last dimension has size ``embedding_dim``; for a
                ``(batch, seq_len, embedding_dim)`` input the mean is taken
                over the sequence positions.

        Returns:
            Tensor of shape ``(batch, 1)`` for a 3-D input.
        """
        x = x.mean(dim = 1)
        x = self.linear1(x)
        return self.sigmoid(x)

In [6]:
# Locations of the review CSV and of the pickled vocabulary.
file_path = './basic_dataset/IMDB/IMDB Dataset.csv'
voc_path = './basic_dataset/IMDB/voc.data'

# Dataset of embedded reviews, then a loader that serves it in batches of 32.
imdbDataset = IMDBDatasetSLTCWithEmbedding(
    file_path=file_path,
    voc_path=voc_path,
    DEVICE=DEVICE,
)
imdbDataLoder = DataLoader(dataset=imdbDataset, batch_size=32)

In [7]:
# Sanity check: fetch the first sample, an (embedded review, label) tensor pair.
imdbDataset[0]

(tensor([[-0.3144,  0.0205, -0.0961,  ..., -0.2552,  0.2956, -0.1508],
         [ 0.1272, -0.2211, -0.0283,  ...,  0.0989, -0.0653, -0.0479],
         [-0.1842,  0.2885, -0.1961,  ..., -0.1633,  0.1446,  0.1732],
         ...,
         [ 0.1751,  0.2198, -0.0398,  ..., -0.1864, -0.0751, -0.2856],
         [-0.2499,  0.4340,  0.1951,  ...,  0.7701, -0.1244, -0.2998],
         [ 0.0557, -0.0204,  0.0038,  ...,  0.0275,  0.0430,  0.0873]],
        device='cuda:0'),
 tensor([1.], device='cuda:0'))

In [8]:
# Build the classifier and move its parameters to the selected device.
model = Classifier()
model = model.to(DEVICE)
# Mean-squared error between the sigmoid output and the 0/1 label.
loss_fn = torch.nn.MSELoss()
# Adam over every classifier parameter, learning rate 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [9]:
# One pass over the data: forward, loss, backward, parameter update.
for x, y in tqdm(imdbDataLoder):
    # Clear gradients left over from the previous batch; without this,
    # backward() keeps adding to .grad and every step uses the running sum.
    optimizer.zero_grad()
    pred = model(x)
    loss = loss_fn(pred, y)
    loss.backward()
    optimizer.step()


 11%|█         | 166/1563 [02:09<18:06,  1.29it/s]


KeyboardInterrupt: 