In [1]:
# this mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# 請輸入資料夾之所在位置
FOLDERNAME = '我的 筆記型電腦 MSI/Research data/stanCodeML/L16'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/Othercomputers/{}'.format(FOLDERNAME))

# Get to the folder we are at
%cd drive/Othercomputers/$FOLDERNAME/

Mounted at /content/drive
/content/drive/Othercomputers/我的 筆記型電腦 MSI/Research data/stanCodeML/L16


In [2]:
import torch
import torch.nn.functional as F
import torchtext
import random

In [3]:
# List the current working directory; movie_data.csv (loaded further down) should appear here.
%ls

[0m[01;34m__MACOSX[0m/    [01;34mmnist_train[0m/    My_project.ipynb  sentiment.ipynb
[01;34mmnist_test[0m/  movie_data.csv  NLP.ipynb


In [4]:
# --- Reproducibility -------------------------------------------------------
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)  # seeds PyTorch's global RNG

# --- Data ------------------------------------------------------------------
CORPUS_SIZE = 10_000    # passed as max_size when the text vocabulary is built
BATCH_SIZE = 128        # examples per mini-batch

# --- Optimisation ----------------------------------------------------------
LEARNING_RATE = 5e-3    # Adam step size
NUM_EPOCHS = 13         # full passes over the training split

# --- Model -----------------------------------------------------------------
EMBEDDING_DIM = 300     # size of each token embedding vector
HIDDEN_DIM = 256        # size of the LSTM hidden state
NUM_CLASSES = 2         # number of output scores produced by the classifier

In [5]:
# Set to False to force CPU execution even when CUDA is available.
USE_GPU = True

# Floating-point precision used for tensors in this notebook.
dtype = torch.float32

# Use the GPU only when it is both requested and actually present.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')

# Intended interval (in batches) between training-loss log lines.
# NOTE(review): the training loop further down uses its own literal interval.
print_every = 100

print('using device:', device)

using device: cuda


In [6]:
# Define Your tokenizer
from torchtext.legacy.data import Field, LabelField

In [None]:
# Text field: raw strings are tokenized by spaCy's small English model.
eng_tokenizer = Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
)

# Label field for the classification target.
label_tokenizer = LabelField()

# (attribute name, field) pairs handed to TabularDataset.
# NOTE(review): presumably this order must match the CSV column order -- verify.
fields = [
    ('TEXT', eng_tokenizer),
    ('LABEL', label_tokenizer),
]

In [8]:
# Build Train/Val/Test Data
from torchtext.legacy.data import TabularDataset

In [None]:
# Load the whole CSV (header row skipped) using the (name, field) pairs in `fields`.
dataset = TabularDataset(path='movie_data.csv', format='csv', skip_header=True, fields=fields)

# random.seed() returns None, so passing its result as random_state only worked
# through the seeding side effect. Seed explicitly and hand over the resulting
# RNG state instead; the shuffles (and therefore the splits) are unchanged.
random.seed(RANDOM_SEED)
train, test = dataset.split(split_ratio=[0.95, 0.05], random_state=random.getstate())

# Re-seed so the second split starts from the same RNG state as before,
# then carve the validation set out of the remaining training data.
random.seed(RANDOM_SEED)
train, val = train.split(split_ratio=[0.7, 0.3], random_state=random.getstate())

In [None]:
# Report how many examples ended up in each split.
for split_name, split_data in (('train', train), ('val', val), ('test', test)):
    print(f'Number of {split_name}: {len(split_data)}')

In [None]:
# Index All Tokens

In [None]:
# Build both vocabularies from the training split only, so no tokens or labels
# from validation/test data leak into the index.
eng_tokenizer.build_vocab(train, max_size=CORPUS_SIZE)
label_tokenizer.build_vocab(train)

# The vocabulary lives on the field's `.vocab` attribute; the Field object
# itself has no length, so len(eng_tokenizer) raises TypeError.
# NOTE(review): the size may exceed CORPUS_SIZE by the special tokens -- confirm.
print(len(eng_tokenizer.vocab))

In [None]:
# Build Train/Val/Test Mini-batches
from torchtext.legacy.data import BucketIterator

In [None]:
from os import supports_follow_symlinks
# Create one iterator per split.
#  * `batch_size` applies the same size to every split; `batch_sizes` expects a
#    per-split sequence and fails when given a single int.
#  * The sort key must read the `TEXT` attribute declared in `fields`;
#    examples have no `TEST` attribute.
train_loader, val_loader, test_loader = BucketIterator.splits(
    (train, val, test),
    batch_size=BATCH_SIZE,
    sort_within_batch=False,
    sort_key=lambda x: len(x.TEXT),
    device=device,
)

In [None]:
# train_batch = next(iter(train_loader))
# print("Training:", train_batch.TEXT.shape)

# val_batch = next(iter(val_loader))
# print("Validation:", val_batch.TEXT.size())

In [None]:
class RNN(torch.nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Length of each token embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of scores produced per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        # Token id -> dense vector lookup.
        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        # batch_first=True: inputs and outputs are [batch, seq, feature].
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Maps the chosen LSTM output vector to class scores.
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class scores of shape [batch size, output dim].

        Args:
            x: Integer tensor of token ids, shape [batch size, sentence length].
        """
        token_vectors = self.embedding(x)
        # token_vectors: [batch size, sentence length, embedding dim]

        # The (h_n, c_n) state tuple is discarded; only per-step outputs are used.
        per_step, _ = self.lstm(token_vectors)
        # per_step: [batch size, sentence length, hidden dim]

        # Keep the output at the final sequence position.
        # NOTE(review): if batches are padded, this position may be a pad token.
        last_step = per_step[:, -1]
        # last_step: [batch size, hidden dim]

        return self.fc(last_step)

In [None]:
# Re-seed right before construction so the initial weights are reproducible.
torch.manual_seed(RANDOM_SEED)

# Build the classifier (embedding table sized to the text vocabulary) and
# move it to the selected device.
model = RNN(
    input_dim=len(eng_tokenizer.vocab),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=NUM_CLASSES,
).to(device)

# Adam over every model parameter.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of ``model`` over ``data_loader``, in percent.

    Args:
        model: Module called on each transposed feature batch; expected to
            return one row of class scores per example. It is switched to
            eval mode and left in that mode.
        data_loader: Iterable whose items unpack into ``(features, targets)``.
            ``features`` is transposed (``.T``) before the forward pass;
            ``targets`` holds one class index per example.
        device: Device that features and targets are moved to.

    Returns:
        A 0-dim float tensor: ``100 * correct / total``.

    Raises:
        ValueError: If ``data_loader`` yields no examples (the accuracy is
            undefined; previously this surfaced as an obscure AttributeError).
    """
    model.eval()
    correct_pred, num_examples = 0, 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for features, targets in data_loader:
            features = features.T.to(device)
            targets = targets.to(device)

            scores = model(features)
            # Index of the highest score along the class dimension.
            _, predicted_labels = torch.max(scores, 1)

            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()

    if num_examples == 0:
        raise ValueError('compute_accuracy: data_loader yielded no examples')

    return correct_pred.float() / num_examples * 100

In [None]:
# Train for NUM_EPOCHS passes over the training split, logging the loss every
# 50 batches and the train/validation accuracy after every epoch.
for epoch in range(NUM_EPOCHS):
    # compute_accuracy() puts the model in eval mode, so re-enable training
    # mode at the start of each epoch.
    model.train()
    num_batches = len(train_loader)

    for step, batch in enumerate(train_loader):
        # Transpose the text tensor before feeding the batch_first model.
        text = batch.TEXT.T.to(device)
        labels = batch.LABEL.to(device)

        # Clear stale gradients, then forward pass and loss.
        optimizer.zero_grad()
        scores = model(text)
        loss = F.cross_entropy(scores, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Periodic progress line.
        if step % 50 == 0:
            print(f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                  f'Batch {step:03d}/{num_batches:03d} | '
                  f'Loss: {loss:.4f}')

    # End-of-epoch evaluation (training split first, then validation).
    train_acc = compute_accuracy(model, train_loader, device)
    valid_acc = compute_accuracy(model, val_loader, device)
    print(f'training accuracy: {train_acc:.2f}%'
          f'\nvalid accuracy: {valid_acc:.2f}%')

# Final held-out evaluation once training has finished.
test_acc = compute_accuracy(model, test_loader, device)
print(f'Test accuracy: {test_acc:.2f}%')