In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 16.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 47.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 38.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
img_dir = '/content/drive/MyDrive/ml_a4/col774-2022/images/images/'#'../input/col774-2022/images/images/'
train_path_x ='/content/drive/MyDrive/ml_a4/col774-2022/train_x.csv'#'../input/col774-2022/train_x.csv'
train_path_y ='/content/drive/MyDrive/ml_a4/col774-2022/train_y.csv'#'../input/col774-2022/train_y.csv'
test_path_x = '/content/drive/MyDrive/ml_a4/col774-2022/non_comp_test_x.csv'#'../input/col774-2022/non_comp_test_x.csv'
test_path_y = '/content/drive/MyDrive/ml_a4/col774-2022/non_comp_test_y.csv'#'../input/col774-2022/non_comp_test_y.csv'

In [4]:
import torch
from torch import nn
import pandas as pd
import numpy as np
from transformers import BertModel, BertConfig, BertTokenizer
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AdamW 
import copy

In [5]:
!CUBLAS_WORKSPACE_CONFIG=:4096:2
torch.backends.cudnn.deterministic = True
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [6]:
device

device(type='cuda')

In [7]:
pretrained_model_name = 'bert-large-uncased'

In [8]:
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

In [9]:
def get_data(path_x, path_y):
    title_info = pd.read_csv(path_x).set_index('Id')
    img_labels = pd.read_csv(path_y).set_index('Id')
    x, y = [], []
    for i in range(len(img_labels['Genre'])):
        title = title_info['Title'][i]
        img_label = img_labels['Genre'][i]
        x.append(title)
        y.append(int(img_label))
    return x, y

In [10]:
train_x, train_y = get_data(train_path_x, train_path_y)
test_x, test_y   = get_data(test_path_x, test_path_y)

In [11]:
train_encodings = tokenizer(train_x, truncation=True, padding=True, max_length=50)
test_encodings  = tokenizer(test_x, truncation=True, padding=True, max_length=50)
num_labels = 30

In [12]:
# create the dataloaders
class TitleHeadingDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = TitleHeadingDataset(train_encodings, train_y)
test_dataset = TitleHeadingDataset(test_encodings, test_y)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

In [13]:
# load the model
load_path = '/content/drive/MyDrive/ml_a4/BertModel_fine_tuned_v4.pt'
model = BertForSequenceClassification.from_pretrained(pretrained_model_name, num_labels=num_labels)
model.load_state_dict(torch.load(load_path))
model.to(device)
model.eval()

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

In [14]:
def accuracy(dataloader, model):     
    softmax = nn.Softmax(dim=1)        
    correct = 0.0
    model.eval()
    with torch.no_grad():    
        for batch in dataloader:        
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            output = model(input_ids, attention_mask=attention_mask, labels=labels)            
            # apply softmax to output of model
            preds = softmax(output['logits'])                        
            
            # move logits to cpu
            preds = preds.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            
            # get correct classifications
            correct += np.sum(label_ids == preds.argmax(1))            

    return correct/len(dataloader.dataset)

In [15]:
def train(train_dataloader, test_dataloader, optimizer, given_model, epochs=100, model_name=pretrained_model_name, num_classes=30, verbose=True): # returns the trained model
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = given_model
    model.to(device)
    model.train()

    optim =optimizer #(model.parameters(), lr=1e-5)
    best_score, best_model = 0.0, None
    
    for epoch in range(epochs):        
        batch_cnt = 0
        ep_l=0
        model.train()
        for batch in train_dataloader:
            batch_cnt += 1

            optim.zero_grad()            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            loss.backward()
            optim.step()
            ep_l+=loss.item()
            # if verbose and batch_cnt%100:
            #     print('loss = {}'.format(loss.item()))        

        val_accuracy = accuracy(test_dataloader, model)
        print('loss = {} val_acc = {}'.format(ep_l/batch_cnt,val_accuracy)) 
        if best_score < val_accuracy:
            best_score = val_accuracy
            best_model = copy.deepcopy(model)
    
    return best_model

In [16]:
lr = 1e-5
wd=0.01
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
model = train(train_dataloader, test_dataloader, optimizer, given_model=model, epochs=2)

loss = 0.008050263731111555 val_acc = 0.6082456140350877
loss = 0.006850107814262569 val_acc = 0.6126315789473684


In [17]:
# save the model
save_path = '/content/drive/MyDrive/ml_a4/BertModel_fine_tuned_v5.pt'
torch.save(model.state_dict(), save_path)

In [18]:
# train set accuracy
print(accuracy(train_dataloader, model))
# print(accuracy(test_dataloader, model))

0.9996491228070176


In [19]:
# test set accuracy
print(accuracy(test_dataloader, model))

0.6126315789473684
