In [None]:
!mv ./kaggle.json /root/.kaggle/
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
!unzip ./imdb-dataset-of-50k-movie-reviews.zip

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from collections import Counter

In [None]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

In [None]:
# Load the raw reviews and hold out 20% of the rows for validation.
df = pd.read_csv('./IMDB Dataset.csv')
# random_state pins the shuffle so the split (and everything derived from the
# training part, e.g. the vocabulary) is identical after Restart & Run All.
# stratify keeps the sentiment class ratio the same in both parts.
train_data, val_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment'],
)
print(len(train_data), len(val_data))

In [None]:
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

# `nlp` is a blank English pipeline; only its vocab is used here.
nlp = English()
# Build a bare Tokenizer on that vocab, without passing any extra
# tokenization rules to the constructor.
tokenizer = Tokenizer(nlp.vocab)

In [None]:
# Smoke test: run the tokenizer on a short sample sentence.
sample_text = 'this is a test.'
tokens = tokenizer(sample_text)

In [None]:
# Upper bound on the number of distinct tokens kept when the vocabulary is built.
vocab_size = 8_000

# Flat list holding the text of every token of every training review
# (duplicates included, so it can be frequency-counted afterwards).
all_tokens = []
for review in tqdm(train_data['review']):
  tokens = tokenizer(review)
  all_tokens += [tok.text for tok in tokens]

In [None]:
# Frequency table over all training tokens.
count = Counter(all_tokens)
# Keep the `vocab_size` most frequent tokens, most frequent first.
top_entries = count.most_common(vocab_size)
tokens, counts = zip(*top_entries)
# Token -> id, where the id is the token's frequency rank (0 = most frequent).
vocab = dict(zip(tokens, range(len(tokens))))
# Every out-of-vocabulary token shares one extra id placed after the kept tokens.
vocab['<unk>'] = len(vocab)

In [None]:
# Spot-check the vocabulary: print the id of the unknown marker and of the token 'I'.
for probe in ('<unk>', 'I'):
  print(vocab[probe])

In [None]:
class IMDBDataset(Dataset):
  """Map-style dataset turning review rows into (token ids, label) tensors.

  Items are produced lazily: the review text is tokenized with the
  module-level ``tokenizer``, each token is looked up in ``vocab`` (tokens
  that are missing fall back to the ``'<unk>'`` id) and the sentiment string
  is mapped to a class index. Tensors are created on the module-level
  ``device``.

  Args:
    data: frame with a ``'review'`` column (text) and a ``'sentiment'`` column.
    vocab: mapping from token string to integer id; must contain ``'<unk>'``.

  Raises:
    KeyError: from ``__init__`` if ``vocab`` has no ``'<unk>'`` entry.
  """

  # Class index for every sentiment string the dataset accepts.
  LABEL_TO_ID = {'negative': 0, 'positive': 1}

  def __init__(self, data: pd.DataFrame, vocab):
    self.data = data
    self.vocab = vocab
    # Id substituted for any token that is not in the vocabulary.
    self.default = self.vocab['<unk>']

  def tokenize(self, text: str):
    """Split ``text`` into a list of token strings using the global tokenizer."""
    return [i.text for i in tokenizer(text)]

  def encode_tokens(self, tokens):
    """Convert token strings to a 1-D integer tensor of vocabulary ids."""
    encoded = [self.vocab.get(token, self.default) for token in tokens]
    # dtype is pinned: for an empty token list torch.tensor([]) would default
    # to float32, which nn.Embedding does not accept as indices.
    return torch.tensor(encoded, dtype=torch.long, device=device)

  def encode_label(self, label: str):
    """Map a sentiment string to a scalar class-index tensor.

    Raises:
      ValueError: if ``label`` is not one of the known sentiment strings.
        Previously every unexpected value was silently treated as class 1.
    """
    try:
      class_id = self.LABEL_TO_ID[label]
    except KeyError:
      raise ValueError(
          f'unknown sentiment label {label!r}; expected one of {sorted(self.LABEL_TO_ID)}'
      ) from None
    return torch.tensor(class_id, device=device)

  def __getitem__(self, n: int):
    """Return ``(token_ids, label)`` for the row at position ``n``."""
    # .iloc: rows are addressed by position, not by the frame's index labels.
    review = self.data['review'].iloc[n]
    sentiment = self.data['sentiment'].iloc[n]
    return self.encode_tokens(self.tokenize(review)), self.encode_label(sentiment)

  def __len__(self):
    """Number of rows in the underlying frame."""
    return len(self.data)

In [None]:
# Wrap both splits in datasets that share the vocabulary built from the training data.
train_ds, val_ds = (IMDBDataset(split, vocab) for split in (train_data, val_data))

In [None]:
# batch_size=1: each item is a token-id tensor whose length depends on the
# review, and no collate_fn pads them to a common length.
train_loader = DataLoader(train_ds, batch_size=1, shuffle=True)
# Validation only measures accuracy, so shuffling adds nothing; a fixed order
# keeps evaluation deterministic.
val_loader = DataLoader(val_ds, batch_size=1, shuffle=False)

In [None]:
class MLP(nn.Module):
  """Bag-of-embeddings classifier: summed token embeddings fed to a 2-layer MLP.

  Args:
    n_tokens: number of rows in the embedding table (vocabulary size).
    emb_dim: size of each token embedding.
    hidden_dim: width of the hidden layer.
    output_dim: number of output logits (classes).
  """

  def __init__(self, n_tokens, emb_dim, hidden_dim, output_dim):
    super().__init__()
    self.embedding = nn.Embedding(n_tokens, emb_dim)
    self.fc1 = nn.Linear(emb_dim, hidden_dim)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """Compute class logits for a batch of token-id sequences.

    Args:
      x: integer tensor of token ids, shape [B, seq_len].

    Returns:
      Tensor of logits, shape [B, output_dim].
    """
    # x: Tensor([[0, 1, 2, 5, 100, 3, 6]]), shape [B, seq_len]
    # Look up one vector per token, [B, seq_len, emb_dim], then pool over the
    # sequence axis:
    #   embedded = embedding(0) + embedding(1) + ... + embedding(6)
    # Pooling happens here, before the non-linearity. The earlier version ran
    # the MLP on every token and summed the per-token logits instead, which
    # did not match this description. Output shape and parameter names are
    # unchanged.
    embedded = self.embedding(x).sum(dim=1)
    # embedded: shape [B, emb_dim]
    hidden = self.relu(self.fc1(embedded))
    return self.fc2(hidden)



In [None]:
# Embedding table has one row per kept token plus one for '<unk>'.
model = MLP(
    n_tokens=vocab_size + 1,
    emb_dim=100,
    hidden_dim=200,
    output_dim=2,
)
model = model.to(device)

In [None]:
def train(loader, model, optimizer, loss_fn):
  """Run one optimisation pass over `loader` and return the mean batch loss.

  Args:
    loader: iterable yielding (inputs, targets) pairs.
    model: module called on the inputs; switched to training mode first.
    optimizer: optimizer whose gradients are reset and stepped once per batch.
    loss_fn: callable taking (model output, targets) and returning a scalar tensor.

  Returns:
    Arithmetic mean of the per-batch loss values, as a Python float.

  Raises:
    ZeroDivisionError: if `loader` yields no batches.
  """
  model.train()
  progress = tqdm(loader)
  batch_losses = []
  for inputs, targets in progress:
    # Gradients accumulate across backward() calls, so clear them per batch.
    optimizer.zero_grad()

    batch_loss = loss_fn(model(inputs), targets)
    loss_value = batch_loss.item()
    progress.set_postfix({'loss': loss_value})
    batch_losses.append(loss_value)

    batch_loss.backward()  # populate .grad on the parameters
    optimizer.step()  # apply the update rule
  return sum(batch_losses) / len(batch_losses)


def evaluate(loader, model, loss_fn, score_fn):
  """Score `model` on every batch of `loader`.

  Args:
    loader: iterable yielding (inputs, labels) pairs.
    model: module called on the inputs; switched to eval mode first.
    loss_fn: unused; kept so existing callers keep working.
    score_fn: callable taking (labels, predictions), two flat lists of ints
      with one entry per example, and returning the score.

  Returns:
    Whatever `score_fn` returns for the collected labels and predictions.
  """
  model.eval()
  predictions = []
  labels = []
  # Inference only: disable autograd so no graph is built per batch.
  with torch.no_grad():
    for x, y in tqdm(loader):
      logits = model(x)
      pred = torch.argmax(logits, dim=-1)
      # .cpu() is needed because tensors living on a GPU cannot be converted
      # to host values directly; extend() flattens each batch so the result
      # holds one entry per example regardless of batch size.
      predictions.extend(pred.reshape(-1).cpu().tolist())
      labels.extend(y.reshape(-1).cpu().tolist())
  return score_fn(labels, predictions)

In [None]:
# Training driver: plain SGD on cross-entropy, scored by accuracy on the
# validation loader after every epoch.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()
score_fn = accuracy_score
n_epochs = 3
# Best validation accuracy seen so far; gates checkpointing below.
best_acc = 0
for epoch in range(n_epochs):
  avg_loss = train(train_loader, model, optimizer, loss_fn)
  print('train loss: ', avg_loss)
  accuracy = evaluate(val_loader, model, loss_fn, score_fn)
  print('val accuracy: ', accuracy)
  # Checkpoint only on a new best that also clears the 0.7 floor.
  if accuracy > best_acc and accuracy > 0.7:
    # Record the new best; without this every later epoch above 0.7 would
    # overwrite the file even when it scored worse than an earlier epoch.
    best_acc = accuracy
    torch.save(model.state_dict(), 'best-model.pt')

In [None]:
# Padding experiment: a length-6 sequence is extended to length 10 with a
# filler value of -1 so it lines up with a length-10 sequence.
s1 = torch.randint(low=0, high=10, size=(1, 6))
pad = torch.full((1, 4), -1.0)
s3 = torch.cat((s1, pad), dim=1)
s2 = torch.randint(low=0, high=10, size=(1, 10))

In [None]:
# Concatenating a single tensor along dim 0; the cell displays the result.
torch.cat((s1,), dim=0)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Scratch experiment: create an unfitted multi-label binarizer.
mlb = MultiLabelBinarizer()

In [None]:
# Fit on a single sample that carries three labels.
mlb.fit([['apple', 'banana', 'orange']])

In [None]:
# Encode one sample holding two of the fitted labels; the cell displays the result.
mlb.transform([['apple', 'banana']])

In [None]:
# Scratch experiment: create an unfitted one-hot encoder (not used further in the visible cells).
ohe = OneHotEncoder()