In [1]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [2]:
from transformers import HerbertTokenizer, RobertaModel
import torch.nn as nn
import torch
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

In [3]:
torch.cuda.is_available()

True

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

# Tokenization

In [6]:
tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/591k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMTokenizer'. 
The class this function is called from is 'HerbertTokenizer'.


In [7]:
def pre_parse_dataset(input):
    encoded_corpus = tokenizer(text=input, add_special_tokens=True, padding='max_length', truncation='longest_first', max_length=256, return_attention_mask=True)
    return np.array(encoded_corpus['input_ids']), np.array(encoded_corpus['attention_mask'])

# Dataset preparation for pytorch

In [8]:
def prepare_dataloader(inputs, masks, target, batch_size):
    input_tensor = torch.tensor(inputs).to(device=device)
    mask_tensor = torch.tensor(masks).to(device=device)
    labels_tensor = torch.tensor(target).to(device=device)
    dataset = TensorDataset(input_tensor, mask_tensor, labels_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Load model

In [9]:
class HerbertRegressionModel(nn.Module):
    def __init__(self, drop_rate=0.2):
        super(HerbertRegressionModel, self).__init__()
        D_in, D_out = 768, 1
        self.model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
        self.regressor = nn.Sequential(nn.Dropout(drop_rate), nn.Linear(D_in, D_out))

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids, attention_mask)
        outputs = outputs[1]
        return self.regressor(outputs)

model = HerbertRegressionModel()
model_path = "/content/drive/MyDrive/University/S9/mm-review-based-rate-ai/model_2024_01_25_08_34_29"
model.load_state_dict(torch.load(model_path))
model.to(device=device)

config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/500M [00:00<?, ?B/s]

HerbertRegressionModel(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50560, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm)

# Model performance evaluation

In [10]:
test_input = pd.read_csv('/content/drive/MyDrive/University/S9/mm-review-based-rate-ai/test_input.csv', sep='$', encoding='utf-8', header=0)
test_target = pd.read_csv('/content/drive/MyDrive/University/S9/mm-review-based-rate-ai/test_target.csv', sep='$', encoding='utf-8', header=0)
X_test = test_input['user_comment']
y_test = test_target['user_rate'].to_numpy().reshape(-1, 1)
test_input_id, test_attention_mask = pre_parse_dataset(X_test.tolist())
test_dataloader = prepare_dataloader(test_input_id, test_attention_mask, y_test, 1)

In [11]:
def predict_rating(model, dataloader):
    model.eval()
    output = []
    for batch in dataloader:
        batch_inputs, batch_masks, _ = batch
        with torch.no_grad():
            output += model(batch_inputs, batch_masks).view(1,-1).tolist()[0]
    return [round(o) for o in output]

In [12]:
y_predicted = predict_rating(model, dataloader=test_dataloader)

In [13]:
y_test = list(map(lambda x: float(x[0]), y_test.tolist()))

In [14]:
results_summary = pd.DataFrame({'movie': test_input['title'], 'target': y_test, 'predict': y_predicted})
results_summary['diff'] = results_summary['target'] - results_summary['predict']
results_summary

Unnamed: 0,movie,target,predict,diff
0,Game of Thrones,10.0,8,2.0
1,Misconduct,7.0,8,-1.0
2,The Mandalorian,7.0,9,-2.0
3,The X Files,8.0,9,-1.0
4,Shrek,7.0,9,-2.0
...,...,...,...,...
5665,Taken,10.0,10,0.0
5666,Ostatnia wieczerza,7.0,7,0.0
5667,High School Musical,6.0,5,1.0
5668,How I Met Your Mother,8.0,5,3.0


# Predict show rating

In [15]:
movie_ratings = results_summary.groupby('movie')['predict'].apply(list).reset_index(name="ratings")
movie_ratings

Unnamed: 0,movie,ratings
0,Pupille,[9]
1,The Devil All the Time,"[9, 9, 8]"
2,#BringBackAlice,[8]
3,1899,"[8, 9, 9, 9, 4, 2]"
4,1923,[9]
...,...,...
1036,Ślub doskonały,"[6, 8]"
1037,Święto ognia,"[9, 3]"
1038,Żeby nie było śladów,"[9, 9, 5, 9, 9, 7, 9]"
1039,Żmijowisko,"[4, 3, 8]"


In [16]:
from statistics import mean

movie_ratings['movie_rating'] = movie_ratings['ratings'].apply(mean)
movie_ratings

Unnamed: 0,movie,ratings,movie_rating
0,Pupille,[9],9.000000
1,The Devil All the Time,"[9, 9, 8]",8.666667
2,#BringBackAlice,[8],8.000000
3,1899,"[8, 9, 9, 9, 4, 2]",6.833333
4,1923,[9],9.000000
...,...,...,...
1036,Ślub doskonały,"[6, 8]",7.000000
1037,Święto ognia,"[9, 3]",6.000000
1038,Żeby nie było śladów,"[9, 9, 5, 9, 9, 7, 9]",8.142857
1039,Żmijowisko,"[4, 3, 8]",5.000000
