In [1]:
import re

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from sklearn.utils import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import imblearn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_metric


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Ask PyTorch to release cached GPU memory it is not currently using, so a
# re-run of the notebook starts with as much free device memory as possible.
torch.cuda.empty_cache()


In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Load the labelled training set from the CSV next to the notebook and preview
# the first rows. Later cells read its `review` and `rating` columns.
train_df = pd.read_csv('./train_data.csv')
train_df.head()


In [None]:
# The test file has no header row, so pandas labels its single column `0`;
# rename it to `review` to match the training frame.
test_df = pd.read_csv("./test_data.csv", header=None)
test_df = test_df.rename(columns={0: "review"})
test_df.head()


In [None]:
# Show the full text of the first training review (still uncleaned at this point).
train_df.iloc[0].review


In [None]:
def clean_data(input_text: str) -> str:
    """Normalize a raw review string before tokenization.

    Steps:
      1. Replace the literal two-character sequence backslash + ``n`` with a
         space.
      2. Replace the punctuation characters ``, „ • ’ " -`` with spaces.
      3. Collapse every run of whitespace (including real newlines and tabs)
         into a single space and trim both ends.

    Args:
        input_text: Raw review text.

    Returns:
        The cleaned text, single-space separated, with no leading or trailing
        whitespace.
    """
    # Handle escaped newlines *before* whitespace normalization; doing it
    # afterwards left trailing or repeated spaces in the result.
    text = input_text.replace(r'\n', ' ')
    text = re.sub(r'[,„•’\"-]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()


In [None]:
def create_tokens(data: list[str], tokenizer: AutoTokenizer) -> list[str]:
    """Tokenize a batch of texts with padding and truncation enabled.

    Args:
        data: Texts to encode.
        tokenizer: Callable tokenizer; invoked as
            ``tokenizer(data, padding=True, truncation=True)``.

    Returns:
        Whatever the tokenizer call returns, unchanged.
        NOTE(review): the ``list[str]`` return annotation looks inaccurate --
        ``ReviewDataset`` iterates this value with ``.items()``; confirm and
        tighten the annotation.
    """
    encode_options = {"padding": True, "truncation": True}
    encoded = tokenizer(data, **encode_options)
    return encoded


In [None]:
# Run the same text normalization over both frames so training and test
# reviews are pre-processed identically.
# NOTE: this overwrites the `review` column in place.
for frame in (train_df, test_df):
    frame["review"] = frame["review"].apply(clean_data)


In [None]:
# Checkpoint identifier shared by the tokenizer and the classifier loaded
# later; the trailing comment keeps the larger variant handy as an alternative.
model_name = "allegro/herbert-base-cased"  # "allegro/herbert-large-cased"
# Load the tokenizer that matches the checkpoint, then display it.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer


In [None]:
# Pull features (cleaned review text) and labels (ratings) out of the frame as
# arrays for the resampling / splitting step that follows.
input_data = train_df["review"].values
input_targets = train_df["rating"].values


In [None]:
# Split BEFORE oversampling. Oversampling first duplicates minority-class
# reviews, and a random split afterwards puts copies of the same review in
# both the training and the validation set (data leakage), which inflates
# validation scores.
train_data_raw, val_data, train_targets_raw, val_targets = train_test_split(
    input_data.reshape(-1, 1),  # the resampler is fed a 2-D (n_samples, 1) array
    input_targets,
    test_size=0.20
)

# Balance the classes of the training portion only; the validation set keeps
# the original class distribution.
oversampler = imblearn.over_sampling.RandomOverSampler()

train_data, train_targets = oversampler.fit_resample(
    train_data_raw,
    train_targets_raw
)

# Kept so that any cell referring to the oversampled arrays by their original
# names still runs; they now hold the oversampled *training* data only.
input_data_over, input_targets_over = train_data, train_targets


In [None]:
# Flatten the (n, 1) text arrays into plain lists, then tokenize each split.
train_texts = list(train_data.squeeze())
val_texts = list(val_data.squeeze())

train_tokenized = create_tokens(train_texts, tokenizer)
val_tokenized = create_tokens(val_texts, tokenizer)


In [None]:
class ReviewDataset(Dataset):
    """Map-style dataset pairing tokenized reviews with their targets.

    Args:
        tokenized_data: Mapping from field name to an indexable collection of
            per-sample values (anything exposing ``.items()``).
        targets: Indexable collection of labels, aligned with the samples.
    """

    def __init__(self, tokenized_data, targets):
        self.tokenized_data = tokenized_data
        self.targets = targets

    def __len__(self):
        """Number of samples, taken from the length of ``targets``."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return a dict of tensors for ``idx``: every tokenized field plus ``labels``."""
        sample = {}
        for field, values in self.tokenized_data.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.targets[idx])
        return sample


In [None]:
# Wrap the tokenized splits so DataLoader can serve them as dicts of tensors.
train_dataset = ReviewDataset(tokenized_data=train_tokenized, targets=train_targets)
val_dataset = ReviewDataset(tokenized_data=val_tokenized, targets=val_targets)

# Training samples are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
# Validation gains nothing from shuffling; a fixed order keeps per-sample
# outputs aligned with `val_targets` and makes runs comparable.
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)



In [None]:
# Load the pretrained checkpoint with a 5-way classification head, then move
# it to the selected device (`Module.to` returns the same module).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
model = model.to(device)


In [None]:
# Per-class loss weights computed from the training labels.
# Read the label array directly: slicing `train_dataset[:]` would convert every
# tokenized field of the whole training set to tensors just to get the labels.
y = np.asarray(train_targets)
class_weights = compute_class_weight(
    "balanced",
    classes=np.unique(y),
    y=y
)

# NOTE(review): the weights are ordered by sorted unique label value, while the
# loss indexes them by class id. These agree only if the labels are exactly
# 0..num_labels-1 with every class present -- confirm against the `rating` column.
class_weights = torch.tensor(class_weights, dtype=torch.float)


In [None]:
# Display the per-class weight tensor as a sanity check.
class_weights

In [None]:
# Fine-tuning a pretrained transformer needs a small step size: the usual
# range is about 2e-5 to 5e-5. The previous 4e-4 is roughly ten times larger
# and tends to destroy the pretrained weights, especially with batch size 1.
# AdamW (decoupled weight decay) is the standard optimizer for this setup.
optim = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5
)

# Class-weighted cross-entropy; the weights must live on the same device as
# the logits.
loss_fun = nn.CrossEntropyLoss(
    weight=class_weights.to(device))

# Multiply the learning rate by 0.96 on every `scheduler.step()` call.
scheduler = torch.optim.lr_scheduler.ExponentialLR(
    optimizer=optim,
    gamma=0.96
)


In [None]:
# Number of full passes over the training loader in the training loop.
N_EPOCH = 10

In [None]:
# Put the model in training mode before optimizing.
model.train()

for epoch in range(N_EPOCH):
    epoch_loss = []
    for batch in train_loader:
        optim.zero_grad()

        inputs = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        # Do not pass `labels=` to the model: that makes it compute its own
        # unweighted loss, which was never used. The class-weighted loss
        # below is the one that is optimized.
        outputs = model(inputs, attention_mask=attention_mask)
        loss = loss_fun(outputs.logits, targets)

        loss.backward()
        optim.step()
        epoch_loss.append(loss.item())

    # Report the mean batch loss for the epoch, then decay the learning rate
    # once per epoch.
    loss_mean = np.mean(epoch_loss)
    print("Loss:", loss_mean)
    scheduler.step()
