In [None]:
# Mount Google Drive into the Colab runtime; force_remount=True re-mounts
# even if the drive is already mounted at this path.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Get to the folder we are at
FOLDERNAME = 'Colab\ Notebooks/SC201_L17/imdb'
%cd drive/MyDrive/$FOLDERNAME/

/content/drive/MyDrive/Colab Notebooks/SC201_L17/imdb


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd

In [None]:
# Reproducibility: fix the PyTorch RNG seed and ask cuDNN for deterministic
# kernels (benchmark mode is turned off as well).
SEED = 42
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(SEED)

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# Reading in our file
data = pd.read_csv('IMDBDataset.csv')  # text corpora are commonly organized as a CSV file

In [None]:
# Pull the raw review text and the sentiment column out of the DataFrame
reviews = data['review']
labels = data['sentiment']

In [None]:
# Encode the sentiment strings as integers: 'positive' -> 1, 'negative' -> 0.
# A new Series is bound instead of mutating in place, so the column held by
# `data` is left untouched and the cell stays safe to re-run. The explicit
# astype avoids relying on pandas' implicit object -> int downcast in replace().
labels = labels.replace({'positive': 1, 'negative': 0}).astype('int64')

In [None]:
labels

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [None]:
# Substrings to strip from the reviews. Everything is replaced by a single
# space except the two quote characters, which are replaced by the empty string.
_space_patterns = ['<br />', '--', '.', ',', '!', '?', ')', '(', ';', ':', '*', '~', '_']
_drop_patterns = ["'", '"']
patterns = _space_patterns + _drop_patterns
replacements = [' '] * len(_space_patterns) + [''] * len(_drop_patterns)

In [None]:
def preprocessing(reviews, patterns, replacements):
  """Lower-case every review and apply the substring replacements in order.

  Args:
    reviews: iterable of review strings (a pandas Series or a plain list).
    patterns: substrings to search for.
    replacements: replacement strings, paired positionally with `patterns`.

  Returns:
    The cleaned reviews. A pandas Series input yields a new Series with the
    same index and name; any other input yields a list. The input itself is
    not modified, which avoids pandas' SettingWithCopy warning.
  """
  print(len(reviews))
  cleaned = []
  for review in reviews:
    review = review.lower()  # case-insensitive
    for pattern, replacement in zip(patterns, replacements):
      # Re-bind `review` so each replacement builds on the previous one.
      review = review.replace(pattern, replacement)
    cleaned.append(review)
  if isinstance(reviews, pd.Series):
    return pd.Series(cleaned, index=reviews.index, name=reviews.name)
  return cleaned

In [None]:
# Clean every review with the pattern/replacement pairs defined above
reviews = preprocessing(reviews, patterns, replacements)

50000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews[i] = review


In [None]:
reviews

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [None]:
# Train/validation split sizes (in reviews) and the maximum number of tokens
# kept per review before the <start>/<end> markers are added.
num_train, num_val = 35000, 15000
longest_num_tokens = 250

In [None]:
def indexing_tokens():
  """Build the token -> integer id vocabulary from the training reviews.

  Reads the module-level `reviews` and `num_train`. Ids 0-3 are reserved for
  the special tokens; every other whitespace-separated token receives the
  next free id in order of first appearance. Only the first `num_train`
  reviews are scanned (validation text must not leak into the vocabulary).

  Returns:
    dict mapping token string to integer id.
  """
  vocab = {'<start>':0, '<end>':1, '<pad>':2, '<unk>':3}
  for row in range(num_train):
    for word in reviews[row].split():
      # len(vocab) is always the next unused id because ids are dense from 0.
      vocab.setdefault(word, len(vocab))
  return vocab

In [None]:
def get_data(indices, longest_line_tokens, mode='train'):
  """Convert reviews into fixed-length lists of token ids plus their labels.

  Reads the module-level `reviews`, `labels`, `num_train` and `num_val`.

  Args:
    indices: dict mapping token -> id; must contain '<start>', '<end>',
      '<pad>' and '<unk>'.
    longest_line_tokens: number of token slots per review; longer reviews are
      truncated and shorter ones are padded with the '<pad>' id.
    mode: 'train' selects the first `num_train` reviews; any other value
      selects the following `num_val` reviews.

  Returns:
    (data, Y): `data` is a list of id lists, each of length
    longest_line_tokens + 2, laid out as
    [<start>, token ids..., <pad>..., <end>]; `Y` holds the matching labels.
  """
  if mode == 'train':
    sample_range = range(num_train)
  else:
    sample_range = range(num_train, num_train + num_val)

  start_id, end_id = indices['<start>'], indices['<end>']
  pad_id, unk_id = indices['<pad>'], indices['<unk>']

  data = []
  Y = []
  for i in sample_range:
    tokens_lst = reviews[i].split()[:longest_line_tokens]
    # Tokens missing from the vocabulary map to <unk> (in practice this only
    # happens for validation text, since the vocabulary is built from train).
    encoded = [indices.get(token, unk_id) for token in tokens_lst]
    encoded.extend([pad_id] * (longest_line_tokens - len(encoded)))
    # Prepend the <start> id explicitly; the previous list.insert call had its
    # (position, value) arguments swapped and only worked because <start> == 0.
    data.append([start_id] + encoded + [end_id])
    Y.append(labels[i])
  return data, Y

In [None]:
# Build the vocabulary from the training split, then encode both splits
indices = indexing_tokens()
training_data, training_labels = get_data(indices, longest_num_tokens, mode='train')
val_data, val_labels = get_data(indices, longest_num_tokens, mode='val')

In [None]:
# Report the split sizes and the vocabulary size
print(f'Number of training: {len(training_data)}')
print(f'Number of validation: {len(val_data)}')
print(f'Length of corpus: {len(indices)}')

Number of training: 35000
Number of validation: 15000
Length of corpus: 313078


In [None]:
# Create tensors of train & val
train_tensor = torch.tensor(training_data)
train_labels_tensor = torch.tensor(training_labels)
val_tensor = torch.tensor(val_data)
# The validation label tensor must come from val_labels (it was previously
# built from training_labels by mistake).
val_labels_tensor = torch.tensor(val_labels)

In [None]:
# Sanity-check the shapes of the encoded splits
print(f'Train Tensor: {train_tensor.shape}')
print(f'Val Tensor: {val_tensor.shape}')

Train Tensor: torch.Size([35000, 252])
Val Tensor: torch.Size([15000, 252])


In [None]:
# Model / training hyperparameters.
# The vocabulary size and sequence length are derived from the data so they
# cannot drift out of sync when preprocessing or the padding length changes.
num_embeddings = len(indices)  # corpus size: number of distinct token ids
embedding_dim = 300
hidden_dim = 256
sequence_len = longest_num_tokens + 2  # token slots + <start> + <end>
output_dim = 2
print_every = 400
batch_size = 32

In [None]:
class MyModel(nn.Module):
  """Embedding -> LSTM -> linear classifier over token-id sequences.

  Args:
    num_embeddings: size of the embedding table (number of token ids).
    embedding_dim: number of features learned per token.
    hidden_dim: size of the LSTM hidden state.
    output_dim: number of output scores per sequence.
  """

  def __init__(self, num_embeddings, embedding_dim, hidden_dim, output_dim):
    super().__init__()
    # Each token id is mapped to `embedding_dim` learned features.
    self.embedding_layer = nn.Embedding(num_embeddings, embedding_dim)
    # The LSTM turns the per-token features into `hidden_dim` features;
    # batch_first=True means inputs are laid out as (batch, seq, feature).
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """Return the class scores computed from the last time step's output."""
    # x holds token ids, (batch, seq_len); the lookup appends a feature axis.
    embedded = self.embedding_layer(x)
    # The final hidden/cell states returned by the LSTM are not needed here.
    sequence_features, _ = self.lstm(embedded)
    last_step = sequence_features[:, -1, :]
    return self.fc(last_step)

In [None]:
# Instantiate the classifier and move it to the device selected earlier.
# .to(device) (instead of a hard-coded .cuda()) keeps the notebook runnable
# on CPU-only runtimes and consistent with where the batches are sent.
model = MyModel(num_embeddings, embedding_dim, hidden_dim, output_dim)
model = model.to(device)

In [None]:
# Mini-batch loaders. Inputs and labels use separate loaders that are zipped
# together later; neither shuffles, so batches stay aligned.
mini_trains = DataLoader(dataset=train_tensor, batch_size=batch_size)
mini_train_labels = DataLoader(dataset=training_labels, batch_size=batch_size)

mini_vals = DataLoader(dataset=val_tensor, batch_size=batch_size)
mini_val_labels = DataLoader(dataset=val_labels, batch_size=batch_size)

In [None]:
# Peek at the first input batch to confirm its shape
iterator = iter(mini_trains)
print(next(iterator).shape)

# Peek at the first label batch to confirm its shape
iterator = iter(mini_train_labels)
print(next(iterator).shape)

torch.Size([32, 252])
torch.Size([32])


In [None]:
# Training Procedure
def train(num_epoch, model, mini_trains, mini_train_labels, mini_vals, mini_val_labels, device, loss_function, optimizer):
  """Run the optimisation loop and periodically report validation accuracy.

  Args:
    num_epoch: number of passes over the training loaders.
    model: module being optimised.
    mini_trains / mini_train_labels: aligned iterables of input and label batches.
    mini_vals / mini_val_labels: aligned validation iterables, forwarded to
      evaluate_predictor.
    device: device each batch is moved to before the forward pass.
    loss_function: callable taking (scores, labels) and returning the loss.
    optimizer: optimizer that updates the model's parameters.

  Every `print_every` iterations (a module-level setting) the current batch
  loss is printed and evaluate_predictor is called.
  """
  for epoch in range(num_epoch):
    for step, (x, y) in enumerate(zip(mini_trains, mini_train_labels)):
      # evaluate_predictor switches the model to eval mode, so training mode
      # is re-enabled at the start of every iteration.
      model.train()
      inputs, targets = x.to(device), y.to(device)
      loss = loss_function(model(inputs), targets)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      if step % print_every == 0:
        print(f'Epoch {epoch+1}: {loss.item()}', end='/ ')
        evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device)

In [None]:
# Evaluate Procedure
def evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device):
  """Print the classification accuracy of `model` over the given batches.

  Args:
    model: module to evaluate; it is switched to eval mode.
    epoch: current epoch index (accepted for the caller's convenience, unused).
    mini_vals: iterable of input batches.
    mini_val_labels: iterable of label batches aligned with `mini_vals`.
    device: device each batch is moved to before the forward pass.

  Accuracy is the number of correct predictions divided by the number of
  samples actually iterated, so it stays correct for any loader rather than
  depending on the size of a module-level dataset. 'Acc: nan' is printed when
  no samples are provided.
  """
  model.eval()
  with torch.no_grad():
    acc_count = 0
    total = 0
    for x, y in zip(mini_vals, mini_val_labels):
      x = x.to(device)
      y = y.to(device)
      scores = model(x)
      predictions = scores.max(1)[1]  # index of the highest score per row
      acc_count += predictions.eq(y).sum().item()
      total += y.size(0)
    accuracy = acc_count / total if total else float('nan')
    print(f'Acc: {accuracy}')

In [None]:
# Cross-entropy over the output scores, optimised with Adam
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Start training
train(
  num_epoch=5,
  model=model,
  mini_trains=mini_trains,
  mini_train_labels=mini_train_labels,
  mini_vals=mini_vals,
  mini_val_labels=mini_val_labels,
  device=device,
  loss_function=loss_function,
  optimizer=optimizer,
)

Epoch 1: 0.11371156573295593/ Acc: 0.8623333333333333
Epoch 1: 0.08962731063365936/ Acc: 0.8625333333333334
Epoch 1: 0.10662756860256195/ Acc: 0.8637333333333334
Epoch 2: 0.011988739483058453/ Acc: 0.8607333333333334
Epoch 2: 0.011653845198452473/ Acc: 0.8609333333333333
Epoch 2: 0.012542111799120903/ Acc: 0.8569333333333333
Epoch 3: 0.0035609127953648567/ Acc: 0.8624666666666667
Epoch 3: 0.010496099479496479/ Acc: 0.8582
Epoch 3: 0.0026615476235747337/ Acc: 0.8575333333333334
Epoch 4: 0.003228145884349942/ Acc: 0.8596
Epoch 4: 0.0027841413393616676/ Acc: 0.8572
Epoch 4: 0.0020611633080989122/ Acc: 0.8516666666666667
Epoch 5: 0.0020190016366541386/ Acc: 0.8470666666666666
Epoch 5: 0.0025503612123429775/ Acc: 0.8480666666666666
Epoch 5: 0.0014897454530000687/ Acc: 0.8380666666666666


In [None]:
# Prediction
# label = {0: 'negative', 1: 'positive'}
def predict(text):
    """Classify one encoded review with the module-level `model`.

    Args:
        text: sequence of token ids for a single review (for example one row
            of the encoded data returned by get_data).

    Returns:
        int: index of the highest-scoring class.
    """
    # Make sure inference never runs in training mode, whatever mode the
    # model was left in by the training loop.
    model.eval()
    with torch.no_grad():
        # Create the tensor directly on the target device.
        text = torch.tensor(text, device=device)
        text = text.unsqueeze(dim=0)  # add a leading batch dimension
        output = model(text)
        return output.max(1)[1].item()

In [None]:
# Spot-check the model on the first few validation reviews: print each review
# with its true label and the model's prediction.
test_size=8
test_slice = slice(num_train, num_train+test_size)
# Re-index from 0 so positions line up with the rows of val_data.
test_reviews = reviews[test_slice].reset_index(drop=True)
test_labels = labels[test_slice].reset_index(drop=True)

for i, (review_text, true_label) in enumerate(zip(test_reviews, test_labels)):
  print(review_text)
  print(f'真實類別: {true_label}')

  pred = predict(val_data[i])
  print(f'預測類別: {pred}')
  print()

just don't bother. i thought i would see a movie with great supspense and action.<br /><br />but it grows boring and terribly predictable after the interesting start. in the middle of the film you have a little social drama and all tension is lost because it slows down the speed. towards the end the it gets better but not really great. i think the director took this movie just too serious. in such a kind of a movie even if u don't care about the plot at least you want some nice action. i nearly dozed off in the middle/main part of it. rating 3/10.<br /><br />derboiler.
真實類別: 0
預測類別: 0

be careful with this one. once you get yer mitts on it, it'll change the way you look at kung-fu flicks. you will be yearning a plot from all of the kung-fu films now, you will be wanting character depth and development, you will be craving mystery and unpredictability, you will demand dynamic camera work and incredible backdrops. sadly, you won't find all of these aspects together in one kung-fu movie, 