In [1]:
import torch
import torchtext
from torch.utils.data import Dataset, DataLoader
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
IMDB_file = './basic_dataset/IMDB/IMDB Dataset.csv'

In [3]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

class IMDBDataset(Dataset):
    def __init__(self, csv_file, MAX_LEN = 100):
        self.dataframe = pd.read_csv(csv_file)
        self.tokenizer = get_tokenizer("basic_english")
        self.voc = build_vocab_from_iterator(self.yield_tokens(self.dataframe.review), specials=['<unk>'])
        self.voc.set_default_index(self.voc['<unk>'])
        self.text_pipeline = lambda x : self.fixed_size(max_len = MAX_LEN, voc = self.voc(self.tokenizer(x)))
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, index):
        # 應該想辦法轉換成tensor
        if(self.dataframe.sentiment[index] == 'positive'):
            y = torch.tensor(data=1)
        else:
            y = torch.tensor(data=0)
        x = torch.tensor(self.text_pipeline(self.dataframe.review[index]))
        return x, y


    def yield_tokens(self, data_iter):
        for text in data_iter:
            yield self.tokenizer(text)

    @staticmethod
    def fixed_size(max_len, voc = []):
        if len(voc) > max_len:
            return voc[:max_len]
        while len(voc) < max_len:
            voc.append(0)
        return voc

In [4]:
dataset = IMDBDataset(IMDB_file, MAX_LEN=200)
IMDBDataloader = DataLoader(dataset=dataset, batch_size=128)

In [5]:
x,_ = dataset[26]

print(x)

tensor([    1,  2603,     9,    40,  4264,   900,     3,     5, 11776,  1214,
           91,    28,    69,     1,  3863,   343,     6,     5,  1561,   486,
            3,    21,    90,    91,    34,     6,     5,    60,  1026,   172,
            2,    13,     9, 29374,  2041,     6,    54,    57,    33,  3167,
           50,   404,  3257, 16283,    72, 15361, 12331,     2,    13,     9,
        25329,    36,   169,     3,  4039,  1150,  2235,     7,   764,    91,
        17579,     4,  6325,     4,  1822,   146,   370,  1850,     2,    13,
            9,     1,   125,    20,     6,     1,   339,     7,  1317,     2,
           12,     8,   145,   341,  1904,  5145,    49,    13,    22,     3,
         1678,    42,    37,   438,     4,    64,  2374,     7,   868,   184,
          109,     4,    85,   120,     2,     7, 16529,  6507,    13,    22,
            9,     7,   694,     1,   228,     2,    13,    20,   206,    33,
            5,  8599,     3,     5,  2981,   121,   715,   106, 

In [6]:
for x,y in IMDBDataloader:
    print(y.shape)
    print(x.shape)
    break


torch.Size([128])
torch.Size([128, 200])


In [7]:
data_loader = DataLoader(dataset=dataset, batch_size=128)

In [8]:

from torch import nn

class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class) -> None:
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()
        
    def init_weight(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
    
    def forward(self, text):
        x = self.embedding(text)
        x = self.relu(x)
        x = self.fc(x)
        return self.softmax(x)

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [10]:
tcModel = TextClassificationModel(len(dataset.voc), embed_dim= 256, num_class=2).to(device)

In [11]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(tcModel.parameters(), lr=1e-1)

In [12]:
tcModel.train()
for epoch in range(100):
    total_acc, total_count = 0, 0
    correct = 0.
    for x,y in data_loader:
        x, y= x.to(device), y.to(device)
        optimizer.zero_grad()
        pred = tcModel(x)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    acc = correct/len(dataset)
    print(acc)


0.55702
0.59708
0.61166
0.617
0.62196
0.62674
0.63092
0.63472
0.63856
0.64164
0.64498
0.64822
0.6505
0.65282
0.65482
0.65716
0.6597
0.6621
0.66478
0.66678
0.66902
0.67098
0.67274
0.67434
0.67612
0.67798
0.67966
0.68214
0.68364
0.68592
0.68762
0.68984
0.69178
0.69354
0.69492
0.69614
0.69752
0.69894
0.70018
0.7019
0.70312
0.70448
0.7059
0.70664
0.70818
0.70942
0.71036
0.71172
0.713
0.7142
0.71544
0.7167
0.71772
0.71864
0.71978
0.7207
0.7216
0.7226
0.72402
0.72502
0.72606
0.727
0.72824
0.72874
0.72948
0.73028
0.73112
0.73172
0.7328
0.73354
0.7345
0.73562
0.73626
0.73734
0.73836
0.73888
0.73928
0.73984
0.74044
0.74142
0.74178
0.74274
0.74334
0.74392
0.74438
0.74508
0.74596
0.7465
0.74742
0.74792
0.74866
0.74934
0.75006
0.75064
0.75144
0.75234
0.75288
0.75342
0.75414
0.75464


In [13]:
tcModel.eval()
correct = 0
for i,(x,y) in enumerate(data_loader):
    x, y= x.to(device), y.to(device)
    pred = tcModel(x)
    correct += (pred.argmax(1) == y).type(torch.float).sum().item()
acc = correct/len(dataset)

In [14]:
print(acc)

0.75572


In [15]:
text_pipline = dataset.text_pipeline

In [24]:

pred = tcModel(torch.tensor(text_pipline("Seriously what is wrong with critics I loved this retailing")).view(size=(1, 200)).to(device=device))
print(pred)
if pred.argmax(1) ==1:
    print('Positive')
else:
    print('negative')

tensor([[0.4936, 0.5064]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Positive


In [28]:
def show_Result(commend:str):
    pred = tcModel(torch.tensor(text_pipline(commend)).view(size=(1, 200)).to(device=device))
    print(pred)
    if pred.argmax(1) ==1:
        print('Positive')
    else:
        print('negative')

In [29]:
# groundTruth good
show_Result("Seriously what is wrong with critics I loved this retailing")
show_Result("It's funny! It's rhythmic!! It's spectacular!!!")
# groundTruth bad
show_Result("Disney strikes back with yet another live-action remake disaster that soils the name of the original movies. Aladdin (2019) is a confused mess. It not only adds nothing remarkable on to the original to justify it's existence, but Aladdin is a movie that only works in animation, because in live-action it's identity is just lost. The CGI is abominable. Despite a few scenes where it's OK, the CGI overall is shockingly bad for a Disney film and it's even more depressing that they sunk so much money into it. The Genie specifically is one of the worst looking CGI characters I've seen in some time. And more on the Genie; I've seen a lot of people say that they like Will Smith's Genie, and while I will commend Smith for not trying to imitate Robin Williams' Genie and for doing his on thing, it's just not working for me. They also butchered a lot of the songs, specifically 'Friend Like Me' and 'Prince Ali'. Not only can Will Smith not sing to save his life and the songs have been 69'd by autotune, but they've also added some new lines and lyrics into them as well as changing their flow and energy at times. While it's fine to add some differences from the original, this is the wrong kind of different. No one wanted the songs to be changed and the new lyrics are OK at best and shockingly bad at worst. The flow and tone of the songs has also been changed and that's probably due to Will Smith's singing, because what made Robin Williams' Genie great is the sheer amount of incredible energy he gave which is also what made 'Friend Like Me' and 'Prince Ali' so fun and catchy. Here, Will Smith completely butchers these songs, especially Prince Ali' because of how slow and boring it is compared to the original. Finally, the movie is just pointless. It doesn't justify it's existence at all. Why would I want to watch a boring and slow version of the original with none of the colourful, expressive animation and humour when I can watch the original, at ANY TIME, which has the expressive animation and humour? It's a pointless and unoriginal bore of a film that I will not be seeing again.")
show_Result("What's to say. Before even filming we knew Guy Ritchie was obviously the wrong choice to direct the film. His style doesn't quite lend itself to this type of film. Miscast Mena Massoud didn't bring any real character to the role of Aladdin he was simply okay/ functional. Will Smith is as usual charismatic and had difficult shoes to fill after Robin Williams' take on the genie but he was seriously let down by the VFX in blue genie mode and a weak script. Another miscast Naomi Scott gave a good performance but I always felt i was looking at Indian and not Middle Eastern. In fact the film IS very bollywood with the Jasmin, Aladdin dance off Indian Style (and to Indian music) then Russian dancing(?) plus other times an Indian soundtrack. And i'm not even going to talk about that sorry excuse for a villain. No, no i am not. For a Disney movie set in large Kingdom it sure does feel quite quite small. It looked cheap too. Weird. Ultimately a dreadfully executed cash grab of a movie that no one asked for. Thanks D. Thankfully this is exactly what i expected having seen the trailers.")

tensor([[0.4936, 0.5064]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Positive
tensor([[0.2516, 0.7484]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Positive
tensor([[0.8335, 0.1665]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
negative
tensor([[0.9267, 0.0733]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
negative
