In [1]:
from transformers import HerbertTokenizer, RobertaModel, get_linear_schedule_with_warmup
import torch.nn as nn
import torch
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import AdamW
from torch.nn.utils.clip_grad import clip_grad_norm_

In [2]:
torch.cuda.is_available()

True

In [3]:
tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMTokenizer'. 
The class this function is called from is 'HerbertTokenizer'.


In [4]:
df = pd.read_csv('../dataset/dataset.csv', sep='$', encoding='utf-8')

In [5]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

# Split 

In [6]:
X = df['user_comment']
y = df['user_rate']
seed = 32
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test, random_state=seed)

In [7]:
y_train = y_train.to_numpy().reshape(-1, 1)
y_valid = y_valid.to_numpy().reshape(-1, 1)

Save the test part for later

In [8]:
X_test.to_csv('test_input.csv', sep='$', encoding='utf-8')
y_test.to_csv('test_target.csv', sep='$', encoding='utf-8')

# Tokenization

In [9]:
def pre_parse_dataset(input):
    encoded_corpus = tokenizer(text=input, add_special_tokens=True, padding='max_length', truncation='longest_first', max_length=256, return_attention_mask=True)
    return np.array(encoded_corpus['input_ids']), np.array(encoded_corpus['attention_mask'])

In [10]:
training_input_id, training_attention_mask = pre_parse_dataset(X_train.tolist())
valid_input_id, valid_attention_mask = pre_parse_dataset(X_valid.tolist())

Consider rescaling the target values

# Dataset preparation for pytorch

In [11]:
def prepare_dataloader(inputs, masks, target, batch_size):
    input_tensor = torch.tensor(inputs).to(device=device)
    mask_tensor = torch.tensor(masks).to(device=device)
    labels_tensor = torch.tensor(target).to(device=device)
    dataset = TensorDataset(input_tensor, mask_tensor, labels_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

train_dataloader = prepare_dataloader(training_input_id, training_attention_mask, y_train, 16)
validation_dataloader = prepare_dataloader(valid_input_id, valid_attention_mask, y_valid, 16)

# Model

In [12]:
class HerbertRegressionModel(nn.Module):
    def __init__(self, drop_rate=0.2):
        super(HerbertRegressionModel, self).__init__()
        D_in, D_out = 768, 1
        self.model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
        self.regressor = nn.Sequential(nn.Dropout(drop_rate), nn.Linear(D_in, D_out))

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids, attention_mask)
        outputs = outputs[1]
        return self.regressor(outputs)

model = HerbertRegressionModel(drop_rate=0.2)
model.to(device)

HerbertRegressionModel(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50560, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm)

# optimizer, loss and scheduler

In [13]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
loss_function = nn.MSELoss()

# Training (new one)

In [14]:
from tqdm import tqdm
from torch.nn.utils.clip_grad import clip_grad_norm

def train_one_epoch():
    avg_loss = 0.0
    i = 0
    for data in tqdm(train_dataloader):
        inputs, mask, target = data

        optimizer.zero_grad()
        outputs = model(inputs, mask)

        loss = loss_function(outputs, target.float())
        loss.backward()
        clip_grad_norm_(model.parameters(), 2)
        optimizer.step()

        avg_loss += loss.item()
        i+=1
    return avg_loss / (i+1)

In [15]:
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter

timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
writer = SummaryWriter('../runs/filmweb_model0_{}'.format(timestamp))

EPOCHS = 5
best_epoch = 0
best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print(f'EPOCH {epoch+1}:')

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    training_loss = train_one_epoch()
    writer.add_scalar('Train/Loss', training_loss, epoch + 1)

    running_vloss = 0.0
    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for i, vdata in enumerate(validation_dataloader):
            vinputs, vmask, vtarget = vdata
            voutputs = model(vinputs, vmask)
            vloss = loss_function(voutputs, vtarget)
            running_vloss += vloss.item()

    avg_vloss = running_vloss / (i+1)
    print(f'LOSS train {training_loss} valid {avg_vloss}')

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalar('Valid/Loss', avg_vloss, epoch + 1)
    
    writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_epoch = epoch
        best_vloss = avg_vloss
        model_path = f'../models/model_{timestamp}'
        torch.save(model.state_dict(), model_path)
print(f'Best model came from  epoch no {best_epoch}')


EPOCH 1:


100%|██████████| 214/214 [01:37<00:00,  2.19it/s]


LOSS train 4.889314004986785 valid 2.9441011437663325
EPOCH 2:


100%|██████████| 214/214 [01:36<00:00,  2.22it/s]


LOSS train 2.7666440769683485 valid 3.4213406112458973
EPOCH 3:


100%|██████████| 214/214 [01:39<00:00,  2.15it/s]


LOSS train 1.7555619324362555 valid 3.5104249274289168
EPOCH 4:


100%|██████████| 214/214 [01:39<00:00,  2.16it/s]


LOSS train 1.096490182017171 valid 3.20210059704604
EPOCH 5:


100%|██████████| 214/214 [01:40<00:00,  2.14it/s]


LOSS train 0.7665732414223427 valid 3.2497354260197393
Best model came from  epoch no 0


# Memory cleanup

In [16]:
import gc

gc.collect()
optimizer.zero_grad()
torch.cuda.empty_cache()

# Model performance evaluation

In [17]:
test_input = pd.read_csv('./test_input.csv', sep='$', encoding='utf-8', header=0)
test_target = pd.read_csv('./test_target.csv', sep='$', encoding='utf-8', header=0)
X_test = test_input['user_comment']
y_test = test_target['user_rate'].to_numpy().reshape(-1, 1)
test_input_id, test_attention_mask = pre_parse_dataset(X_test.tolist())
test_dataloader = prepare_dataloader(test_input_id, test_attention_mask, y_test, 1)

In [43]:

def predict_rating(model, dataloader):
    model.eval()
    output = []
    for batch in dataloader:
        batch_inputs, batch_masks, _ = batch
        with torch.no_grad():
            output += model(batch_inputs, batch_masks).view(1,-1).tolist()[0]
    return [round(o) for o in output]

In [29]:
model = HerbertRegressionModel()
model.load_state_dict(torch.load(model_path))
model.to(device=device)

HerbertRegressionModel(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50560, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm)

In [44]:
y_predicted = predict_rating(model, dataloader=test_dataloader)

In [31]:
y_test = list(map(lambda x: float(x[0]), y_test.tolist()))

In [45]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

mae = mean_absolute_error(y_test, y_predicted)
mdae = median_absolute_error(y_test, y_predicted)
mse = mean_squared_error(y_test, y_predicted)
mape = mean_absolute_percentage_error(y_test, y_predicted)
mdape = ((pd.Series(y_test) - pd.Series(y_predicted)) / pd.Series(y_test)).abs().median()
r_squared = r2_score(y_test, y_predicted)
result = {'mae': mae, 'mdae': mdae, 'mse': mse, 'mape': mape, 'mdape': mdape, 'r_squared': r_squared}
result

{'mae': 1.6107226107226107,
 'mdae': 1.0,
 'mse': 4.538461538461538,
 'mape': 419916049172074.5,
 'mdape': 0.16666666666666666,
 'r_squared': -0.18200382084483113}

In [46]:
results_summary = pd.DataFrame({'target': y_test, 'predict': y_predicted})
results_summary['diff'] = results_summary['target'] - results_summary['predict']
results_summary

Unnamed: 0,target,predict,diff
0,8.0,7,1.0
1,7.0,6,1.0
2,6.0,8,-2.0
3,5.0,8,-3.0
4,8.0,5,3.0
...,...,...,...
424,6.0,8,-2.0
425,7.0,6,1.0
426,8.0,6,2.0
427,6.0,6,0.0


In [49]:
results_summary[abs(results_summary['diff']) == 0]

Unnamed: 0,target,predict,diff
7,7.0,7,0.0
16,8.0,8,0.0
22,7.0,7,0.0
25,8.0,8,0.0
27,7.0,7,0.0
...,...,...,...
418,7.0,7,0.0
421,8.0,8,0.0
423,6.0,6,0.0
427,6.0,6,0.0


# Final cleanup

In [None]:
writer.close()

# To Do

[ ] add hyperparameters tracking (preferably with wandb.ai)