In [None]:
# from google.colab import drive
# drive.mount("/content/drive")

In [None]:
# %%capture
# !pip install wandb -qqq
# !pip install transformers -qqq

In [None]:
# !wandb login

In [None]:
# !cp '/content/drive/My Drive/hw/all_train.csv' 'all_train.csv'

In [None]:
import pandas as pd
import numpy as np
import os
import json
import re
import csv
import string
import nltk
import ast
import re
import wandb
import warnings
from tqdm.notebook import tqdm as tqdm
from torch import nn
import transformers
import torch

In [None]:
# Silence noisy library loggers (e.g. warnings about inputs exceeding the maximum token length).
import logging
import re
def set_global_logging_level(level=logging.ERROR, prefices=[""]):
    """Set ``level`` on every already-registered logger whose name starts with a given prefix.

    Args:
        level: logging level applied to the matching loggers.
        prefices: name prefixes, joined into one regex alternation anchored at
            the start of the logger name. The default ``[""]`` matches every
            registered logger.
    """
    pattern = re.compile('^(?:' + '|'.join(prefices) + ')')
    # Collect the matching names first, then update each logger.
    selected = [
        logger_name
        for logger_name in logging.root.manager.loggerDict
        if pattern.match(logger_name)
    ]
    for logger_name in selected:
        logging.getLogger(logger_name).setLevel(level)


set_global_logging_level(
    logging.ERROR,
    ["transformers", "nlp", "torch", "tensorflow", "tensorboard", "wandb"],
)

In [None]:
# Where input data is read from (current directory) and where checkpoints are written.
RESOURCES_PATH = os.curdir
DRIVE_PATH = '/content/drive/My Drive/hw/'

# Mini-batch size and number of passes over the training split.
BATCH_SIZE = 8
N_EPOCH = 3

# Use the GPU when one is visible, otherwise run on the CPU.
# (Replace with torch.device('cpu') to force CPU execution.)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Read the labelled dataset from the resources directory.
df = pd.read_csv(os.path.join(RESOURCES_PATH, 'all_train.csv'))

# Positional split: the first 24000 rows are used for training, the rest for testing.
# The file order is kept as-is; to shuffle first, use
#   df = df.sample(frac=1.).reset_index(drop=True)
# (a smaller experiment used df[:9000] / df[9000:10000]).
df_train = df.iloc[:24000]
df_test = df.iloc[24000:]

In [None]:
class ClassificationBERTModel(nn.Module):
    """Pretrained multilingual BERT encoder with a feed-forward head producing 3 logits.

    Args:
        big: when true (and ``dropout`` is false) use a three-layer head
            768 -> 256 -> 64 -> 3 with ReLU activations.
        dropout: when true use the same three-layer head with ``Dropout(0.4)``
            after each ReLU. Takes precedence over ``big``.

    With both flags false the head is a single ``Linear(768, 3)``.
    """

    def __init__(self, big=False, dropout=False):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-multilingual-cased')
        # NOTE: the layer order inside each Sequential determines the state-dict
        # keys, so it must stay stable for saved checkpoints to remain loadable.
        if dropout:
            self.linear = nn.Sequential(nn.Linear(768, 256),
                                        nn.ReLU(),
                                        nn.Dropout(0.4),
                                        nn.Linear(256, 64),
                                        nn.ReLU(),
                                        nn.Dropout(0.4),
                                        nn.Linear(64, 3))
        elif big:
            self.linear = nn.Sequential(nn.Linear(768, 256),
                                        nn.ReLU(),
                                        nn.Linear(256, 64),
                                        nn.ReLU(),
                                        nn.Linear(64, 3))
        else:
            self.linear = nn.Sequential(nn.Linear(768, 3))

    def forward(self, input_ids, attention_mask):
        """Return the head's output for the encoder's second tuple element.

        With ``return_dict=False`` the encoder returns a tuple; only its second
        element (presumably the pooled sentence representation of
        Hugging Face's ``BertModel`` -- confirm against the installed version)
        is fed to the head.
        """
        _, pooled = self.bert(input_ids, attention_mask=attention_mask, return_dict=False)
        return self.linear(pooled)

    def freeze_bert(self, freeze):
        """Disable (``freeze=True``) or enable gradient updates for all encoder parameters."""
        for param in self.bert.parameters():
            param.requires_grad = not freeze

    def save(self, name, all=True):
        """Write a checkpoint under ``DRIVE_PATH``.

        Args:
            name: suffix of the checkpoint file name.
            all: when true save the whole model to ``bert_<name>``; otherwise
                save only the head to ``bert_linear_<name>``.
        """
        if all:
            torch.save(self.state_dict(), DRIVE_PATH + 'bert_' + name)
        else:
            torch.save(self.linear.state_dict(), DRIVE_PATH + 'bert_linear_' + name)

    @staticmethod
    def load(name, all=True, big=False, dropout=False):
        """Build a model with the given head configuration and restore a checkpoint.

        Args:
            name: suffix used when the checkpoint was saved.
            all: restore the whole model (``bert_<name>``) or only the head
                (``bert_linear_<name>``).
            big, dropout: head configuration; must match the saved checkpoint.

        Returns:
            The restored model, still on the CPU; move it with ``.to(device)``.
        """
        model = ClassificationBERTModel(big, dropout)
        # map_location='cpu' allows checkpoints written during a CUDA run to be
        # restored on a machine without a GPU. The freshly built model lives on
        # the CPU anyway, so the resulting weights are unchanged.
        if all:
            state = torch.load(DRIVE_PATH + 'bert_' + name, map_location='cpu')
            model.load_state_dict(state)
        else:
            state = torch.load(DRIVE_PATH + 'bert_linear_' + name, map_location='cpu')
            model.linear.load_state_dict(state)
        return model

In [None]:
def to_one_hot(i, n):
    """Return a length-``n`` integer array that is 1 at index ``i`` and 0 elsewhere."""
    encoded = [0 for _ in range(n)]
    encoded[i] = 1
    return np.asarray(encoded)

def transform(df, max_tokens=512, min_part_len=100):
    """Tokenize the texts of ``df`` into padded model inputs, chunking over-long texts.

    Uses the module-level ``tokenizer``. A text whose encoding (special tokens
    included) exceeds ``max_tokens`` is stripped of its first and last token,
    cut into consecutive parts of ``max_tokens - 2`` tokens, and each part is
    re-wrapped with the tokenizer's CLS/SEP ids. Cutting stops at the first
    part shorter than ``min_part_len`` (in practice a short trailing remainder),
    which is dropped.

    Args:
        df: frame with a ``Text`` column and a ``FinalScore1`` column.
        max_tokens: maximum sequence length, special tokens included.
        min_part_len: minimum number of content tokens a chunk must have.

    Returns:
        ``(input_ids, attention_mask, y, ids)`` where the first three are
        tensors with one row per produced sequence, ``y`` holds
        ``FinalScore1 + 1`` (presumably mapping scores -1/0/1 to class indices
        0/1/2 -- confirm against the data), and ``ids`` is a list giving, for
        each sequence, the positional row of ``df`` it came from.
    """
    texts = df.Text.values
    scores = df.FinalScore1.values
    part_len = max_tokens - 2  # leave room for the CLS and SEP tokens

    tokenized, labels, ids = [], [], []
    for row, text in enumerate(texts):
        tokens = tokenizer.encode(text, add_special_tokens=True)
        if len(tokens) > max_tokens:
            content = tokens[1:-1]
            parts = []
            for start in range(0, len(content), part_len):
                part = content[start:start + part_len]
                if len(part) < min_part_len:
                    break
                parts.append([tokenizer.cls_token_id] + part + [tokenizer.sep_token_id])
        else:
            parts = [tokens]
        for part in parts:
            tokenized.append(part)
            labels.append(scores[row] + 1)
            ids.append(row)

    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    # Build the mask from the true sequence lengths rather than from
    # `token id != 0`, so a genuine token that happens to have the pad id is
    # still attended to.
    width = max((len(seq) for seq in tokenized), default=0)
    padded = np.full((len(tokenized), width), pad_id, dtype=np.int64)
    mask = np.zeros((len(tokenized), width), dtype=np.int64)
    for r, seq in enumerate(tokenized):
        padded[r, :len(seq)] = seq
        mask[r, :len(seq)] = 1

    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(mask)
    y = torch.tensor(np.array(labels))
    assert len(input_ids) == len(y) and len(attention_mask) == len(y) and len(ids) == len(y)
    return input_ids, attention_mask, y, ids

In [None]:
from sklearn.metrics import f1_score, accuracy_score
def evaluate(model, df, criterion, use_tqdm=True):
    """Score ``model`` on ``df`` and return ``(macro F1, accuracy, mean loss)``.

    Rows are processed in groups of ``BATCH_SIZE``. ``transform`` may turn one
    row into several sequences (chunks of a long text); the model outputs of
    all chunks of a row are summed and the argmax of the sum is that row's
    prediction. The returned loss is the plain mean of the per-mini-batch
    losses.

    The forward pass runs under ``torch.no_grad()`` so no autograd graph is
    built even when the caller forgets to disable gradients. The model's
    train/eval mode is left untouched; callers switch it themselves.

    Args:
        model: callable as ``model(input_ids, attention_mask=...)``.
        df: frame accepted by ``transform``.
        criterion: loss called as ``criterion(predict, y)``.
        use_tqdm: wrap the batch iterator in a progress bar.
    """
    predicts = []
    targets = []
    losses = []
    itt = df.groupby(np.arange(len(df)) // BATCH_SIZE)
    if use_tqdm:
        itt = tqdm(itt)
    for _, batch in itt:
        input_ids1, attention_mask1, y1, ids1 = transform(batch)
        # Per source row: model outputs of every chunk and the row's target.
        res = {}
        # Chunking can yield more than BATCH_SIZE sequences, so feed the
        # model in slices of at most BATCH_SIZE.
        for start in range(0, len(input_ids1), BATCH_SIZE):
            stop = start + BATCH_SIZE
            input_ids = input_ids1[start:stop].to(device)
            attention_mask = attention_mask1[start:stop].to(device)
            y = y1[start:stop].to(device)
            ids = ids1[start:stop]
            with torch.no_grad():
                predict = model(input_ids, attention_mask=attention_mask)
                loss = criterion(predict, y)
            predict = predict.detach().tolist()
            y = y.detach().tolist()
            for i, p, t in zip(ids, predict, y):
                entry = res.setdefault(i, {'p': [], 't': []})
                entry['p'].append(p)
                entry['t'].append(t)
            losses.append(loss.detach().item())

        for r in res.values():
            # All chunks of a row share one target; sum the outputs over chunks.
            predicts.append(np.argmax(np.array(r['p']).sum(axis=0)))
            targets.append(r['t'][0])
    return f1_score(targets, predicts, average='macro'), accuracy_score(targets, predicts), sum(losses) / len(losses)

In [None]:
# tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# model = ClassificationBERTModel(big=True).to(device)
# model.freeze_bert(False)
# model.eval()
# criterion = nn.CrossEntropyLoss()
# with torch.no_grad():
#     f1_test, _, loss_test = evaluate(model, df_test.sample(n=1000).reset_index(drop=True), criterion, use_tqdm=True)

In [None]:
# Load pretrained model/tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Three-layer head without dropout; the encoder stays trainable (not frozen).
model = ClassificationBERTModel(big=True)
model = model.to(device)
model.freeze_bert(False)

# Start a fresh Weights & Biases run for this training session.
wandb.init(reinit=True, name='no-cut', project="bert-fine-tune")

In [None]:
# Fine-tune every trainable parameter of the model with Adam and cross-entropy loss.
optimizer = torch.optim.Adam(model.parameters(), lr=0.00002)
criterion = nn.CrossEntropyLoss()

for epoch in range(1, N_EPOCH + 1):
    # Reshuffle the training rows at the start of each epoch.
    df_train = df_train.sample(frac=1.).reset_index(drop=True)
    # The initial bar label carries the real epoch number (it used to say
    # "EPOCH 1" for every epoch until the first refresh).
    batches = tqdm(df_train.groupby(np.arange(len(df_train)) // BATCH_SIZE),
                   desc=f'EPOCH {epoch} Mean loss: NaN')
    losses = []
    for i, (_, batch) in enumerate(batches):
        # The periodic evaluation below switches to eval mode, so training
        # mode is re-enabled for every batch.
        model.train()

        input_ids1, attention_mask1, y1, _ = transform(batch)

        # Long texts are split into several sequences, so one row batch may
        # need more than one optimizer step of at most BATCH_SIZE sequences.
        for start in range(0, len(input_ids1), BATCH_SIZE):
            stop = start + BATCH_SIZE
            optimizer.zero_grad()
            input_ids = input_ids1[start:stop].to(device)
            attention_mask = attention_mask1[start:stop].to(device)
            y = y1[start:stop].to(device)
            predict = model(input_ids, attention_mask=attention_mask)
            loss = criterion(predict, y)

            loss.backward()
            losses.append(loss.detach().item())
            optimizer.step()

        # Every 75 row batches: report and reset the running mean training loss.
        if (i + 1) % 75 == 0:
            batches.set_description(f'EPOCH {epoch} loss: {round(sum(losses) / len(losses), 4)}')
            wandb.log({"current_loss": sum(losses) / len(losses)})
            losses = []

        # Every 750 row batches: evaluate on random subsamples and checkpoint.
        if (i + 1) % 750 == 0:
            model.eval()
            with torch.no_grad():
                # Cap the subsample at the split size so small splits do not raise.
                train_sample = df_train.sample(n=min(1000, len(df_train))).reset_index(drop=True)
                test_sample = df_test.sample(n=min(1000, len(df_test))).reset_index(drop=True)
                f1_train, _, loss_train = evaluate(model, train_sample, criterion, use_tqdm=False)
                f1_test, _, loss_test = evaluate(model, test_sample, criterion, use_tqdm=False)
                print(f'EPOCH {epoch} {i + 1} train loss: {round(loss_train, 4)} test loss: {round(loss_test, 4)}'
                      f' train f1: {round(f1_train, 4)} test f1: {round(f1_test, 4)}')
                wandb.log(
                    {
                        'train_loss': loss_train,
                        'test_loss': loss_test,
                        'train_f1': f1_train,
                        'test_f1': f1_test
                    }
                )
                model.save('nocut_' + str(epoch) + '_' + str(i + 1))

    # End of epoch: evaluate on the full test split and save an epoch checkpoint.
    model.eval()
    with torch.no_grad():
        f1, accuracy, loss = evaluate(model, df_test, criterion)
        print(f'FULL Test # {epoch}: F1 macro = {f1} Accuracy = {accuracy}')
        wandb.log(
                    {
                        'full_test_f1': f1
                    }
                )
    model.save('nocut_' + str(epoch))