In [4]:
import torch
import torch.nn.functional as F
import torchtext
import time
import random
import pandas as pd
import numpy as np
import os

In [5]:
# Single seed shared by every random number generator this notebook uses.
RANDOM_SEED = 1234

# Seed Python's `random`, NumPy and PyTorch with the same value.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [7]:
# Vocabulary / batching
Vocab_size = 20000   # passed as `max_size` when the text vocabulary is built
batch_size = 128
num_classes = 2      # number of output logits of the classifier

# Model dimensions
embedding_dim = 128
hidden_dim = 256

# Optimisation
learning_rate = 5e-3
num_epoch = 1

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# DOWNLOAD DATASETS

In [None]:
import wget

url = "https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz"

# Store the archive under ../data, the directory that
# pd.read_csv('../data/movie_data.csv') reads from further down.
archive_path = "../data/movie_data.csv.gz"
os.makedirs(os.path.dirname(archive_path), exist_ok=True)

# Skip the download when the archive is already on disk so the cell can be
# re-run cheaply.
if not os.path.exists(archive_path):
    wget.download(url, archive_path)

In [None]:
import gzip
import shutil

# Decompress inside ../data so the CSV ends up at the exact path that
# pd.read_csv('../data/movie_data.csv') opens later (the original used ./data).
with gzip.open('../data/movie_data.csv.gz', 'rb') as f_in, \
        open('../data/movie_data.csv', 'wb') as f_out:
    # Stream the bytes across instead of loading the whole file into memory.
    shutil.copyfileobj(f_in, f_out)

In [9]:
# Load the reviews CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/movie_data.csv')
df.head(n=5)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [10]:
import spacy
import en_core_web_sm

# Field for the class label; label indices are produced as torch.long.
label = torchtext.legacy.data.LabelField(dtype=torch.long)

# Field for the review text, tokenized by spaCy with the 'en_core_web_sm' model.
text = torchtext.legacy.data.Field(
    tokenizer_language='en_core_web_sm',
    tokenize="spacy",
)

In [11]:
# (attribute name, Field) pairs: the review text first, then its label.
fields = [
    ('TEXT_COLUMN_NAME', text),
    ('LABEL_COLUMN_NAME', label),
]

# Read the CSV (skipping its header row) into a torchtext dataset.
dataset = torchtext.legacy.data.TabularDataset(
    path="../data/movie_data.csv",
    format='csv',
    fields=fields,
    skip_header=True,
)

# SPLIT DATASET INTO TRAIN/VALIDATION/TEST

In [12]:
# random.seed() returns None, so passing its result as `random_state` only
# worked through the seeding side effect. Seed explicitly, then hand the
# resulting generator state to split().
random.seed(RANDOM_SEED)
train_data, test_data = dataset.split(
    split_ratio=[0.75, 0.25],
    random_state=random.getstate()
)
print(f'Num train_data: {len(train_data)}')
print(f'Num test_data: {len(test_data)}')

Num train_data: 37500
Num test_data: 12500


In [13]:
# NOTE: `train_data` is rebound here, so re-running this cell on its own
# splits the already reduced training set a second time.
# random.seed() returns None; seed first and pass the real generator state.
random.seed(RANDOM_SEED)
train_data, valid_data = train_data.split(
    split_ratio=[0.8, 0.2],
    random_state=random.getstate()
)
print(f'Num train_data: {len(train_data)}')
# This line reports the validation split (it was previously labelled "test_data").
print(f'Num valid_data: {len(valid_data)}')

Num train_data: 30000
Num test_data: 7500


In [14]:
# Show the attributes stored on the first training example.
first_example = train_data.examples[0]
print(vars(first_example))

{'TEXT_COLUMN_NAME': ['Tatie', 'Danielle', 'is', 'all', 'about', 'a', 'ghastly', 'old', 'hag', 'who', 'torments', 'her', 'loving', 'and', 'oblivious', 'family', 'out', 'of', 'sheer', 'spite', '.', 'There', "'s", 'a', 'bit', 'of', 'subtext', 'that', 'might', 'be', 'about', 'France', "'s", 'colonial', 'past', 'but', 'it', "'s", 'mostly', 'just', 'Danielle', 'doing', 'the', 'sorts', 'of', 'things', '(', 'like', 'deliberately', 'abandoning', 'a', 'small', 'child', 'in', 'a', 'park', ')', 'that', 'would', 'soon', 'have', 'a', 'man', 'picking', 'up', 'his', 'teeth', 'with', 'broken', 'fingers', '.', 'Sadly', ',', 'that', 'does', "n't", 'happen', 'here', '.', 'It', 'looks', 'good', 'and', 'the', 'acting', 'is', 'fine', 'and', 'there', "'s", 'nothing', 'really', 'wrong', 'with', 'the', 'concept', 'but', 'it', "'s", 'just', 'so', 'SMUG', '.', 'God', ',', 'does', 'this', 'movie', 'love', 'itself', '.', 'Pity', 'it', 'is', "n't", 'nearly', 'as', 'clever', 'or', 'as', 'funny', 'as', 'it', 'thinks'

# BUILD VOCABULARY

In [15]:
# Both vocabularies are built from the training split only; the text
# vocabulary is capped at Vocab_size entries via `max_size`.
label.build_vocab(train_data)
text.build_vocab(train_data, max_size=Vocab_size)

vocabulary_size = len(text.vocab)
class_count = len(label.vocab)
print(f'Vocabulary size: {vocabulary_size}')
print(f'Number of classes: {class_count}')

Vocabulary size: 20002
Number of classes: 2


In [16]:
# The 100 most frequent tokens with their counts
top_tokens = text.vocab.freqs.most_common(100)
print(top_tokens)

[('the', 345114), (',', 326502), ('.', 282237), ('a', 186104), ('and', 185901), ('of', 171857), ('to', 159083), ('is', 128952), ('in', 104713), ('I', 93719), ('it', 91379), ('that', 82954), ('"', 76054), ("'s", 73553), ('this', 72321), ('-', 63183), ('/><br', 60606), ('was', 59986), ('with', 51064), ('as', 50939), ('movie', 50624), ('for', 50129), ('film', 46325), ('The', 45111), ('but', 41497), ('(', 39423), ('on', 39203), ("n't", 39079), (')', 38849), ('you', 36996), ('are', 35769), ('not', 34598), ('have', 33677), ('his', 32998), ('be', 32010), ('he', 29049), ('one', 28984), ('!', 26263), ('at', 26033), ('by', 25948), ('all', 25218), ('an', 24994), ('who', 24270), ('from', 23348), ('like', 23213), ('they', 23092), ('so', 21345), ('or', 20369), ('about', 20182), ("'", 20158), ('has', 20113), ('out', 20028), ('her', 19982), ('It', 19784), ('just', 19559), ('do', 18820), ('?', 17890), ('some', 17050), ('good', 16977), ('more', 16569), ('very', 15921), ('would', 15732), ('up', 15550), (

In [17]:
# First 10 entries of the index-to-token table
leading_tokens = text.vocab.itos[0:10]
print(leading_tokens)
# '<unk>' and '<pad>' are part of the vocabulary, which is why its size is
# 20002 rather than 20000

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']


In [18]:
# Look up the integer index assigned to a token
and_index = text.vocab.stoi['and']
print(and_index)

6


In [19]:
label_vocab = label.vocab
# Mapping from class label to integer index
print(label_vocab.stoi)
# How often each class label occurs
print(label_vocab.freqs)

defaultdict(None, {'1': 0, '0': 1})
Counter({'1': 15067, '0': 14933})


# DEFINE DATALOADER

In [20]:
# One iterator per split; `sort_key` is the token count of a review.
(train_dataloader,
 valid_dataloader,
 test_dataloader) = torchtext.legacy.data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_size=batch_size,
    sort_key=lambda example: len(example.TEXT_COLUMN_NAME),
    sort_within_batch=False,
)

In [21]:
# Print the tensor sizes of the first batch of every split.
for header, loader in (
    ('Train', train_dataloader),
    ('\nValidations', valid_dataloader),
    ('\nTest', test_dataloader),
):
    print(header)
    for batch in loader:
        print(f'Text matrix size: {batch.TEXT_COLUMN_NAME.size()}')
        print(f'Target vector size: {batch.LABEL_COLUMN_NAME.size()}')
        # Only the first batch is needed to see the shapes.
        break

Train
Text matrix size: torch.Size([1070, 128])
Target vector size: torch.Size([128])

Validations
Text matrix size: torch.Size([53, 128])
Target vector size: torch.Size([128])

Test
Text matrix size: torch.Size([50, 128])
Target vector size: torch.Size([128])


# BUILD MODEL RNN

In [22]:
class RNN(torch.nn.Module):
    """Sequence classifier: embedding -> LSTM -> linear layer.

    Args:
        input_dim: number of rows of the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of logits produced per sequence.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim)
        self.fullyconnected = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return logits of shape [batch_size, output_dim].

        Args:
            text: integer tensor of token indices, [sentence_length, batch_size].
        """
        embedded = self.embedding(text)
        # The LSTM returns (output, (hidden, cell)); only the final hidden
        # state is used. hidden: [1, batch_size, hidden_dim].
        _, (hidden, _) = self.rnn(embedded)
        # Index the last layer instead of squeezing in place: this drops the
        # leading axis without mutating the tensor returned by the LSTM.
        return self.fullyconnected(hidden[-1])

In [23]:
# Build the classifier sized to the text vocabulary and move it to `device`.
model = RNN(
    input_dim=len(text.vocab),
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=num_classes,
).to(device)

# Adam over all model parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# TRAINING

In [24]:
def compute_accuracy(model, data_loader, device):
    """Return the percentage of examples that `model` classifies correctly.

    Args:
        model: module mapping a feature batch to logits with the classes on
            dimension 1.
        data_loader: iterable yielding (features, targets) pairs.
        device: device each batch is moved to before the forward pass.

    Returns:
        0-dim float tensor holding the accuracy in percent.

    Raises:
        ValueError: if `data_loader` yields no examples.
    """
    # Evaluate in eval mode, then put the model back in whatever mode the
    # caller had it in.
    was_training = model.training
    model.eval()
    correct_pred, num_examples = 0, 0
    try:
        with torch.no_grad():
            for features, targets in data_loader:
                features = features.to(device)
                targets = targets.to(device)
                logits = model(features)
                prediction_labels = torch.argmax(logits, dim=1)

                num_examples += targets.size(0)
                correct_pred += (prediction_labels == targets).sum()
    finally:
        model.train(was_training)

    # Without this guard an empty loader fails on `int.float()`.
    if num_examples == 0:
        raise ValueError('data_loader yielded no examples')
    return correct_pred.float() / num_examples * 100

In [None]:
start_time = time.time()
for epoch in range(num_epoch):
    model.train()
    for batch_idx, batch_data in enumerate(train_dataloader):
        # Named `features`, not `text`: rebinding `text` here would replace the
        # torchtext Field whose vocabulary predict_sentiment() looks up later.
        features = batch_data.TEXT_COLUMN_NAME.to(device)
        labels = batch_data.LABEL_COLUMN_NAME.to(device)

        # forward pass
        logits = model(features)
        loss = F.cross_entropy(logits, labels)

        # clear stale gradients, then backpropagate
        optimizer.zero_grad()
        loss.backward()

        # update weights
        optimizer.step()

        # log every 50th batch
        if not batch_idx % 50:
            print(f'Epoch: {epoch+1:03d}/{num_epoch:03d} | '
                f'Batch: {batch_idx:03d}/{len(train_dataloader):03d} | '
                f'Loss: {loss: .4f}')

    # accuracy on the training and validation splits after each epoch
    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
            f'{compute_accuracy(model, train_dataloader, device): .2f}'
            f'\nvalid accuracy: '
            f'{compute_accuracy(model, valid_dataloader, device):.2f}%')
    print(f'Time elapse: {(time.time() - start_time)/60:.2f} min')
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_dataloader, device): .2f}%')

# torch.save does not create missing directories, so make sure the target
# directory exists before writing the model.
model_path = '../Movie Sentimentation/model/model1.pth'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model, model_path)

In [29]:
import spacy

# Inference runs on the CPU. Note that this rebinds the notebook-wide `device`.
device = torch.device('cpu')

# NOTE(security): torch.load unpickles the whole model object; only load
# files you trust.
model = torch.load(
    '../Movie Sentimentation/model/model1.pth',
    map_location=device,
)

# Blank English pipeline; predict_sentiment() uses its tokenizer.
nlp = spacy.blank("en")
def predict_sentiment(model, sentence):
    """Return the model's probability that `sentence` is a positive review.

    Args:
        model: classifier taking a [sentence_length, 1] tensor of token
            indices and returning logits of shape [1, num_classes].
        sentence: raw text to classify.

    Returns:
        Python float: softmax probability of the positive class.

    Raises:
        ValueError: if `sentence` contains no tokens.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError('sentence must contain at least one token')

    indexed = [text.vocab.stoi[t] for t in tokenized]
    # [sentence_length] -> [sentence_length, 1]: add the batch axis.
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)

    # Look the class index up instead of hard-coding 0: the label vocabulary
    # assigns indices itself ('1' currently maps to 0).
    # Assumes label '1' means "positive" in this dataset -- TODO confirm.
    positive_index = label.vocab.stoi['1']

    model.eval()
    # No gradients are needed for a prediction.
    with torch.no_grad():
        prediction = torch.nn.functional.softmax(model(tensor), dim=1)
    return prediction[0][positive_index].item()


# Probability of the positive class for a favourable review.
print('Probability positive:')
positive_probability = predict_sentiment(model, "This is a good movie I have ever seen.")
print(positive_probability)

# Probability of the negative class (1 - positive) for an unfavourable review.
# NOTE(review): "bac" looks like a typo for "bad" -- confirm before changing,
# since it alters the printed score.
print('Probability negative:')
positive_probability = predict_sentiment(model, "This is a very bac movie I have ever seen")
print(1-positive_probability)

Probability positive:
0.710697591304779
Probability negative:
0.2259497046470642
