In [22]:
import torch
import pandas as pd
import string
from torch.utils.data import Dataset, DataLoader
import pickle
import numpy as np

In [23]:
class Word2Vec(torch.nn.Module):
    """Encoder/decoder pair made of two plain affine maps.

    The encoder projects an input whose last dimension is ``voc_size`` down
    to ``embedding_dim`` features; the decoder projects those features back
    up to ``voc_size`` scores. No activation is applied anywhere.

    Args:
        voc_size: Size of the last dimension of the input and of the output.
        embedding_dim: Size of the intermediate representation.
    """

    def __init__(self, voc_size, embedding_dim) -> None:
        super().__init__()

        # All tensors are sampled from a standard normal. The draw order
        # (w1, b1, w2, b2) is kept so a fixed seed yields the same weights.
        # Encoder: (embedding_dim, voc_size) weight and (embedding_dim,) bias.
        encoder_weight = torch.randn(embedding_dim, voc_size)
        encoder_bias = torch.randn(embedding_dim)
        self.w1 = torch.nn.Parameter(encoder_weight)
        self.b1 = torch.nn.Parameter(encoder_bias)

        # Decoder: (voc_size, embedding_dim) weight and (voc_size,) bias.
        decoder_weight = torch.randn(voc_size, embedding_dim)
        decoder_bias = torch.randn(voc_size)
        self.w2 = torch.nn.Parameter(decoder_weight)
        self.b2 = torch.nn.Parameter(decoder_bias)

    def forward(self, x):
        """Apply the encoder and then the decoder to ``x``.

        Args:
            x: Tensor whose last dimension equals ``voc_size``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``voc_size``.
        """
        hidden = torch.matmul(x, self.w1.t()) + self.b1
        scores = torch.matmul(hidden, self.w2.t()) + self.b2
        return scores

In [24]:
class IMDBDatasetSLTC(Dataset):
    """Map-style dataset that turns review texts into lists of vocabulary indices.

    Each item is produced by stripping ASCII punctuation from the ``review``
    column, lower-casing, splitting on whitespace, dropping stop words and
    looking every remaining token up in the pickled vocabulary.

    Args:
        file_path: Path to a CSV file that contains a ``review`` column.
        voc_path: Path to a pickled iterable of vocabulary words; a word's
            position in that iterable becomes its index.
        train: Currently unused; kept so existing call sites keep working.
        stop_words_path: Path to a UTF-8 text file with one stop word per
            line. Defaults to the location that was previously hard-coded.
    """

    # Translation table that deletes every ASCII punctuation character.
    # Built once here instead of on every ``__getitem__`` call.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, file_path, voc_path, train=True,
                 stop_words_path='./NLPUtils/english.txt') -> None:
        super().__init__()
        with open(stop_words_path, 'r', encoding='utf-8') as f:
            # Strip surrounding whitespace and skip blank lines so stray
            # spaces in the file cannot prevent a stop word from matching.
            self.stop_words = [line.strip() for line in f if line.strip()]
        # Set copy for O(1) membership tests while filtering tokens.
        self._stop_word_set = frozenset(self.stop_words)

        self.df = pd.read_csv(file_path, encoding='utf-8')

        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only point voc_path at vocabulary files you created or trust.
        with open(voc_path, 'rb') as f:
            self.voc = pickle.load(f)
        self.word2idx = {word: idx for idx, word in enumerate(self.voc)}
        self.idx2word = dict(enumerate(self.voc))

    def __len__(self) -> int:
        """Return the number of rows (reviews) in the loaded CSV."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the vocabulary indices of the review at ``index``.

        Args:
            index: Row label into the ``review`` column.

        Returns:
            list[int]: One index per remaining token, in sentence order.
            The indices are not converted to one-hot vectors yet.

        Raises:
            KeyError: If a remaining token is missing from the vocabulary.
        """
        x_sentence = self.df['review'][index].translate(self._PUNCT_TABLE).lower()
        # NOTE(review): punctuation is removed from the review but not from
        # the stop words, so entries containing punctuation (e.g. an
        # apostrophe) can never match a token - confirm this is intended.
        x_tokens = [
            token for token in self.tokenize(x_sentence)
            if token not in self._stop_word_set
        ]
        # TODO: x_idx has not been converted to one-hot yet.
        x_idx = [self.word2idx[word] for word in x_tokens]
        return x_idx

    def tokenize(self, x_sentence) -> list:
        """Split ``x_sentence`` on runs of whitespace and return the tokens."""
        return x_sentence.split()



In [28]:
# Locations of the IMDB review CSV and of the pickled vocabulary.
file_path = './basic_dataset/IMDB/IMDB Dataset.csv'
voc_path = './basic_dataset/IMDB/voc.data'

# Build the dataset; the constructor reads these files immediately.
imdbDataset = IMDBDatasetSLTC(
    file_path=file_path,
    voc_path=voc_path,
)

In [27]:
# word2vec = torch.load('./trained_models/word2vec.pth')