In [None]:
!gdown 1d7JABk4jViI-USjLsWmhGkvzi8uQIL5C

Downloading...
From: https://drive.google.com/uc?id=1d7JABk4jViI-USjLsWmhGkvzi8uQIL5C
To: /content/data.zip
  0% 0.00/151k [00:00<?, ?B/s]100% 151k/151k [00:00<00:00, 17.8MB/s]


In [None]:
!unzip ./data.zip

Archive:  ./data.zip
   creating: data/
  inflating: __MACOSX/._data         
  inflating: data/restaurants_train.csv  
  inflating: __MACOSX/data/._restaurants_train.csv  
  inflating: data/restaurants_test.csv  
  inflating: __MACOSX/data/._restaurants_test.csv  


## **Dataset**

In [None]:
import ast

import torch
from torch.utils.data import Dataset

class ABSADataset(Dataset):
    """Token-level dataset for aspect-based sentiment analysis.

    The first three columns of ``df`` are read as the word tokens, the
    aspect tags and the polarities of one sentence. Each cell is expected
    to be the string form of a Python list (for example ``"[0, 1, 0]"``).
    Every word is split into word pieces by ``tokenizer``; the word's tag
    and polarity are repeated once per resulting piece so that labels stay
    aligned with the model input.

    Args:
        df: DataFrame whose first three columns are tokens, tags, polarities.
        tokenizer: Object exposing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
    """

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    @staticmethod
    def _parse_list(cell):
        """Turn one serialised list cell into a real Python list."""
        if isinstance(cell, str):
            # literal_eval understands the quoting rules of list reprs, so
            # tokens containing apostrophes or commas survive intact
            # (manual replace/strip/split mangles them).
            return ast.literal_eval(cell)
        return list(cell)

    def __getitem__(self, idx):
        """Return ``(word_pieces, ids_tensor, tags_tensor, pols_tensor)`` for row ``idx``."""
        tokens, tags, pols = (
            self._parse_list(cell) for cell in self.df.iloc[idx, :3].values
        )

        bert_tokens = []
        bert_tags = []
        bert_pols = []
        for i, token in enumerate(tokens):
            pieces = self.tokenizer.tokenize(token)
            bert_tokens += pieces
            # Propagate the word-level labels to every word piece.
            bert_tags += [int(tags[i])] * len(pieces)
            bert_pols += [int(pols[i])] * len(pieces)

        bert_ids = self.tokenizer.convert_tokens_to_ids(bert_tokens)

        # Explicit dtype keeps integer tensors even for an empty sentence.
        ids_tensor = torch.tensor(bert_ids, dtype=torch.long)
        tags_tensor = torch.tensor(bert_tags, dtype=torch.long)
        pols_tensor = torch.tensor(bert_pols, dtype=torch.long)
        return bert_tokens, ids_tensor, tags_tensor, pols_tensor

    def __len__(self):
        """Number of sentences (rows of ``df``)."""
        return len(self.df)

In [None]:
import pandas as pd

train_df = pd.read_csv('./data/restaurants_train.csv')
test_df = pd.read_csv('./data/restaurants_test.csv')

In [None]:
train_df.iloc[0]

Tokens        ['But', 'the', 'staff', 'was', 'so', 'horrible...
Tags                                [0, 0, 1, 0, 0, 0, 0, 0, 0]
Polarities                  [-1, -1, 0, -1, -1, -1, -1, -1, -1]
Name: 0, dtype: object

In [None]:
from transformers import BertTokenizer

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
tokenizer.all_special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [None]:
tokenizer.cls_token_id

101

In [None]:
tokenizer.sep_token_id

102

In [None]:
train_ds = ABSADataset(train_df, tokenizer)
test_ds = ABSADataset(test_df, tokenizer)

In [None]:
len(train_ds)

3602

In [None]:
next(iter(train_ds))

(['but', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us'],
 tensor([2021, 1996, 3095, 2001, 2061, 9202, 2000, 2149]),
 tensor([0, 0, 1, 0, 0, 0, 0, 0]),
 tensor([-1, -1,  0, -1, -1, -1, -1, -1]))

In [None]:
from torch.nn.utils.rnn import pad_sequence

def padding(samples):
    """Collate dataset samples into right-padded batch tensors.

    Args:
        samples: Sequence of ``(word_pieces, ids, tags, pols)`` tuples; the
            word pieces (index 0) are not used here.

    Returns:
        ``(ids, tags, pols, masks)`` where the first three are the per-sample
        tensors stacked batch-first and padded to the longest sequence with
        ``pad_sequence``'s default fill value (0). ``masks`` is a long tensor
        holding 1 wherever the padded id is non-zero and 0 elsewhere.
    """
    # Fields 1..3 of every sample are padded the same way.
    ids_tensors, tags_tensors, pols_tensors = (
        pad_sequence([sample[field] for sample in samples], batch_first=True)
        for field in (1, 2, 3)
    )

    # Attention mask: any non-zero id counts as a real token.
    masks_tensors = (ids_tensors != 0).long()

    return ids_tensors, tags_tensors, pols_tensors, masks_tensors

In [None]:
from torch.utils.data import DataLoader

batch_size = 32
# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(
    train_ds, batch_size=batch_size, shuffle=True, collate_fn=padding
)
# Evaluation gains nothing from shuffling; a fixed order keeps the batch
# composition (and therefore the amount of padding per batch) identical
# between runs, so reported metrics are comparable.
test_loader = DataLoader(
    test_ds, batch_size=batch_size, shuffle=False, collate_fn=padding
)

In [None]:
next(iter(train_loader))

(tensor([[ 1997,  2009,  2003,  2009,  2005,  1996,  2833,  1048, 15185,  1037,
          25269,  2497,  2009,  2442,  2022,  1996,  2326,  2030,  1996,  6438],
         [ 1996,  7224,  2003,  2009,  1996,  4602,  2021,  1045,  6814,  2008,
           1055,  2129,  2027,  2562,  1996,  7597,  2091,     0,     0,     0]]),
 tensor([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]]),
 tensor([[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
          -1, -1],
         [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  2,  0,
           0,  0]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]))

## **Model**

In [None]:
from transformers import BertModel

class ABTEBert(torch.nn.Module):
    """BERT encoder followed by a per-token linear classification head.

    Args:
        model_name: Name or path passed to ``BertModel.from_pretrained``.
        num_labels: Number of tag classes predicted for every token
            (defaults to 3, the value this notebook was written with).
    """

    def __init__(self, model_name, num_labels=3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, num_labels)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, ids_tensors, masks_tensors, tags_tensors):
        """Score every token and, when labels are given, compute the loss.

        Args:
            ids_tensors: Token ids, shaped (batch, seq_len).
            masks_tensors: Attention mask with the same shape (1 = real
                token, 0 = padding), or ``None``.
            tags_tensors: Gold tag per token with the same shape, or ``None``
                for inference.

        Returns:
            ``logits`` of shape (batch, seq_len, num_labels) when
            ``tags_tensors`` is ``None``; otherwise ``(loss, logits)``.
        """
        bert_outputs = self.bert(
            input_ids=ids_tensors, attention_mask=masks_tensors, return_dict=False
        )
        # With return_dict=False the first element is the per-token hidden state.
        hidden_states = bert_outputs[0]

        logits = self.linear(hidden_states)
        if tags_tensors is None:
            return logits

        labels = tags_tensors.reshape(-1)
        if masks_tensors is not None:
            # Padded positions carry a filler label; replace it (out of place,
            # the caller's tensor is untouched) with the loss's ignore_index
            # so padding does not contribute to the averaged loss.
            labels = labels.masked_fill(
                masks_tensors.reshape(-1) == 0, self.loss_fn.ignore_index
            )
        loss = self.loss_fn(logits.reshape(-1, logits.size(-1)), labels)
        return loss, logits

In [None]:
model = ABTEBert(model_name)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
model.to(device)

ABTEBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [None]:
import time
import numpy as np
from sklearn.metrics import classification_report

def train_epoch(model, optimizer, train_loader, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called as ``model(ids_tensors=, masks_tensors=,
            tags_tensors=)`` and returning ``(loss, outputs)``.
        optimizer: Optimizer updating ``model``'s parameters.
        train_loader: Iterable of ``(ids, tags, pols, masks)`` batches; the
            polarity tensor is ignored.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses (a Python float).

    Raises:
        ZeroDivisionError: If ``train_loader`` yields no batches.
    """
    # Make sure training-only layers (e.g. dropout) are active even if an
    # earlier evaluation left the model in eval mode.
    model.train()

    losses = []
    for batch in train_loader:
        ids_tensors, tags_tensors, _, masks_tensors = batch
        ids_tensors = ids_tensors.to(device)
        tags_tensors = tags_tensors.to(device)
        masks_tensors = masks_tensors.to(device)

        # Clear before the backward pass so gradients left over from any
        # earlier, unrelated backward call cannot leak into this step.
        optimizer.zero_grad()
        loss, _ = model(
            ids_tensors=ids_tensors,
            masks_tensors=masks_tensors,
            tags_tensors=tags_tensors
        )
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return sum(losses)/len(losses)

def evaluate_epoch(model, valid_loader, device):
    """Compute mean loss and token accuracy over ``valid_loader``.

    The model is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards. Accuracy is measured
    only on positions whose attention mask is non-zero, so padding added by
    the collate function does not inflate the score.

    Args:
        model: Module called as ``model(ids_tensors=, masks_tensors=,
            tags_tensors=)`` and returning ``(loss, outputs)`` with
            ``outputs`` shaped (batch, seq_len, num_classes).
        valid_loader: Iterable of ``(ids, tags, pols, masks)`` batches; the
            polarity tensor is ignored.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy)``.

    Raises:
        ZeroDivisionError: If ``valid_loader`` yields no batches.
    """
    losses = []
    preds, labels = [], []

    was_training = model.training
    model.eval()  # disable dropout so metrics are deterministic
    try:
        with torch.no_grad():
            for batch in valid_loader:
                ids_tensors, tags_tensors, _, masks_tensors = batch
                ids_tensors = ids_tensors.to(device)
                tags_tensors = tags_tensors.to(device)
                masks_tensors = masks_tensors.to(device)

                loss, outputs = model(
                    ids_tensors=ids_tensors,
                    masks_tensors=masks_tensors,
                    tags_tensors=tags_tensors
                )
                losses.append(loss.item())

                _, predicted = torch.max(outputs, dim=2)
                # Keep only real tokens; padded slots have no gold label.
                real_tokens = masks_tensors.bool()
                preds += predicted[real_tokens].tolist()
                labels += tags_tensors[real_tokens].tolist()
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    acc = np.mean(np.array(preds) == np.array(labels))
    return sum(losses)/len(losses), acc

def train(model, model_name, save_model, optimizer, train_loader, valid_loader, num_epochs, device):
    """Train for ``num_epochs`` epochs and return the best checkpoint.

    After every epoch the model is evaluated on ``valid_loader``; whenever
    the validation loss improves on the best value seen so far, the weights
    are written to ``<save_model>/<model_name>.pt``. When training ends those
    weights are loaded back and the model is put in eval mode.

    Args:
        model: Model to optimise (updated in place).
        model_name: Used as the checkpoint file stem.
        save_model: Existing directory the checkpoint is written to.
        optimizer: Optimizer bound to ``model``'s parameters.
        train_loader: Batches used for optimisation.
        valid_loader: Batches used for checkpoint selection.
        num_epochs: Number of passes over ``train_loader``.
        device: Device used for training, evaluation and checkpoint loading.

    Returns:
        ``(model, metrics)`` where ``metrics`` maps ``'train_loss'``,
        ``'valid_accuracy'``, ``'valid_loss'`` and ``'time'`` to per-epoch lists.
    """
    checkpoint_path = save_model + f'/{model_name}.pt'

    train_losses = []
    eval_accs, eval_losses = [], []
    best_loss_eval = float("inf")
    times = []
    for epoch in range(1, num_epochs+1):
        epoch_start_time = time.time()
        # Training
        train_loss = train_epoch(model, optimizer, train_loader, device)
        train_losses.append(train_loss)

        # Evaluation
        eval_loss, eval_acc = evaluate_epoch(model, valid_loader, device)
        eval_accs.append(eval_acc)
        eval_losses.append(eval_loss)

        # Save best model: remember the new best loss, otherwise every later
        # (worse) epoch would overwrite the checkpoint.
        if eval_loss < best_loss_eval:
            best_loss_eval = eval_loss
            torch.save(model.state_dict(), checkpoint_path)

        epoch_time = time.time() - epoch_start_time
        times.append(epoch_time)
        # Print loss, acc end epoch
        print("-" * 59)
        print(
            "| End of epoch {:3d} | Time: {:5.2f}s | Train Loss {:8.3f} "
            "| Valid Accuracy {:8.3f} | Valid Loss {:8.3f} ".format(
                epoch, epoch_time, train_loss, eval_acc, eval_loss
            )
        )
        print("-" * 59)

    # Load best model (map_location lets a checkpoint written on one device
    # be restored on whatever ``device`` is now).
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()
    metrics = {
        'train_loss': train_losses,
        'valid_accuracy': eval_accs,
        'valid_loss': eval_losses,
        'time': times
    }
    return model, metrics

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [None]:
loss = train_epoch(model, optimizer, train_loader, device)
loss

0.2498261686579316

In [None]:
loss, acc = evaluate_epoch(model, test_loader, device)
loss, acc

(0.22030128070286342, 0.9139599831669153)

## **Training**

In [None]:
# -p makes the cell safe to re-run: no error if the directory already exists.
!mkdir -p "./model"

In [None]:
# Seed PyTorch's RNG so the randomly initialised classifier head and the
# shuffled batch order are repeatable across "Restart & Run All".
# (Some GPU kernels may still introduce small run-to-run differences.)
seed = 42
torch.manual_seed(seed)

save_model = "./model"
# Start from fresh pretrained weights rather than the model fine-tuned in
# the smoke-test cells above.
model = ABTEBert(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
num_epochs = 5
# NOTE(review): the test split is passed as the validation loader, so the
# checkpoint is selected on the same data the final numbers are reported on.
# Consider holding out part of the training set for model selection.
best_model, metrics = train(
    model, model_name, save_model, optimizer, train_loader, test_loader, num_epochs, device
)

-----------------------------------------------------------
| End of epoch   1 | Time: 41.50s | Train Loss    0.284 | Valid Accuracy    0.908 | Valid Loss    0.233 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   2 | Time: 36.46s | Train Loss    0.161 | Valid Accuracy    0.916 | Valid Loss    0.212 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   3 | Time: 37.13s | Train Loss    0.100 | Valid Accuracy    0.919 | Valid Loss    0.249 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   4 | Time: 37.27s | Train Loss    0.049 | Valid Accuracy    0.914 | Valid Loss    0.305 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   5 | Time: 37.50s | Trai

## **Prediction**

In [None]:
def predict(best_model, sentence, device):
    """Tag every word piece of ``sentence`` with ``best_model``.

    The sentence is tokenised with the notebook-level ``tokenizer``. The
    model is called without an attention mask and without labels, so it
    returns only the per-token scores. The model's train/eval mode is not
    changed here; callers should pass a model already in eval mode.

    Args:
        best_model: Model called as ``best_model(ids, None, None)``.
        sentence: Raw text to analyse.
        device: Device the input tensor is moved to.

    Returns:
        ``(word_pieces, predictions, outputs)``: the word pieces, the arg-max
        class index for each of them, and the raw score tensor.

    Raises:
        ValueError: If ``sentence`` produces no word pieces.
    """
    word_pieces = list(tokenizer.tokenize(sentence))
    if not word_pieces:
        raise ValueError("sentence produced no tokens to classify")
    input_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([input_ids]).to(device)

    with torch.no_grad():
        # Use the model that was passed in, not the notebook-global `model`.
        outputs = best_model(input_tensor, None, None)
        _, predictions = torch.max(outputs, dim=2)

    predictions = predictions[0].tolist()
    return word_pieces, predictions, outputs

In [None]:
# The "Tokens" cell holds the string form of a Python list; literal_eval
# recovers the words exactly, including ones containing apostrophes or commas.
sentence = " ".join(ast.literal_eval(test_df.iloc[0]["Tokens"]))
predict(best_model, sentence, device)

(['the', 'bread', 'is', 'top', 'notch', 'as', 'well'],
 [0, 1, 0, 0, 0, 0, 0],
 tensor([[[ 5.8475, -1.7779, -4.0687],
          [-3.8340,  4.0682, -0.0569],
          [ 5.0618, -3.3294, -2.3222],
          [ 5.0944, -2.8798, -2.5601],
          [ 5.4222, -3.2680, -2.4064],
          [ 5.1982, -2.9147, -2.8567],
          [ 5.3094, -2.9266, -2.6666]]], device='cuda:0'))