In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import AutoTokenizer

class BankiDataset(Dataset):
    """Map-style dataset that serves tokenized bank reviews read from a CSV file.

    Each item is built lazily: the review's title, grade and text are joined
    into one prompt string, which is then passed to the tokenizer.
    """

    def __init__(self, tokenizer: "AutoTokenizer", dataset_path: str = "../data/processed/promsvyazbank_reviews.csv"):
        """Load the reviews table and remember the tokenizer.

        Args:
            tokenizer: Callable taking a string (plus keyword options such as
                ``truncation``) and returning the encoded example.
            dataset_path: Path of the CSV file; it must provide the ``title``,
                ``grade`` and ``text`` columns.
        """
        self.tokenizer = tokenizer
        # A fresh 0..n-1 index keeps positional access in __getitem__ well defined.
        self.df = pd.read_csv(dataset_path).reset_index(drop=True)

    def __len__(self):
        """Return the number of reviews in the table."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the tokenizer output for the review at position ``index``.

        Raises:
            IndexError: If ``index`` is outside the table.
        """
        row = self.df.iloc[index]
        input_text = "Название: {}\nОценка:{}\n\n{}".format(row["title"], row["grade"], row["text"])
        return self.tokenizer(input_text, truncation=True)


In [2]:
# Run configuration: checkpoint name plus the optimisation settings that the
# tokenizer, model and TrainingArguments cells read.
config = dict(
    model="ai-forever/ru-en-RoSBERTa",
    batch_size=16,
    lr=1e-5,
    eval_batch_size=4,
    epochs=10,
)

In [3]:
def test(row):
    print(row)

In [4]:
# Widths of the three consecutive segments of the label vector built by
# label_encode: review category, financial flag, reason category.
NUM_REVIEW_CLS, NUM_FINANCIAL_CLS, NUM_REASON_CLS = 3, 2, 4
# Total length of the concatenated label vector.
TOTAL_CLS = sum((NUM_REVIEW_CLS, NUM_FINANCIAL_CLS, NUM_REASON_CLS))

In [5]:
def label_encode(row):
    """Build the concatenated one-hot label vector for one review row.

    The vector has ``TOTAL_CLS`` entries laid out as three consecutive
    segments (review category, financial, reason category); exactly one
    position per segment is set to 1.

    Args:
        row: Object exposing ``review_category``, ``financial`` and
            ``reason_category`` as integer class ids.

    Returns:
        A list of ``TOTAL_CLS`` zeros and ones.
    """
    encoded = [0 for _ in range(TOTAL_CLS)]
    hot_positions = (
        row.review_category,
        row.financial + NUM_REVIEW_CLS,
        row.reason_category + NUM_FINANCIAL_CLS + NUM_REVIEW_CLS,
    )
    for position in hot_positions:
        encoded[position] = 1
    return encoded

In [14]:
from datasets import Dataset
import pandas as pd

# Load the processed reviews and key the table by the review id.
df = pd.read_csv("../data/processed/promsvyazbank_reviews.csv").set_index("id")

# Model input: task prefix, then title, grade and the review body.
df["bert_text"] = df.apply(
    lambda row: "classification: Название: {}\nОценка: {}\n\n{}".format(
        row["title"], row["grade"], row["text"]
    ),
    axis=1,
)

# Training target: concatenated one-hot vector, one per row.
df["label"] = df.apply(label_encode, axis=1)

In [17]:
from datasets import Dataset
# Wrap only the model input text and the label vector in a Hugging Face
# Dataset. The dataframe index ("id") comes along as an extra column, as the
# split preview in the next cell's output shows.
dataset = Dataset.from_pandas(df[["bert_text", "label"]])

In [18]:
# Preview only: the result is displayed but not assigned, so `dataset` is
# unchanged. The split used for training is created after tokenization.
dataset.train_test_split(test_size=0.1)

DatasetDict({
    train: Dataset({
        features: ['bert_text', 'label', 'id'],
        num_rows: 7093
    })
    test: Dataset({
        features: ['bert_text', 'label', 'id'],
        num_rows: 789
    })
})

In [19]:
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint named in the run configuration.
tokenizer = AutoTokenizer.from_pretrained(config["model"])


def _tokenize_batch(examples):
    """Tokenize the ``bert_text`` entries of a batch, truncating and padding with ``max_length``."""
    return tokenizer(examples["bert_text"], truncation=True, padding="max_length")


# Add the tokenizer outputs as new columns, processing rows in batches.
dataset = dataset.map(_tokenize_batch, batched=True)

Map:   0%|          | 0/7882 [00:00<?, ? examples/s]

In [20]:
# Hold out 10% of the rows for evaluation. The result is a DatasetDict with
# "train" and "test" splits. A fixed seed keeps split membership identical
# across kernel restarts, so evaluation scores stay comparable between runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
import numpy as np

# Column ranges of the three label groups inside the TOTAL_CLS-wide vector
# written by label_encode. They are derived from the NUM_* constants so the
# ranges always match the encoding: review -> first NUM_REVIEW_CLS columns,
# financial -> next NUM_FINANCIAL_CLS columns, reason -> remaining columns.
REVIEW_CLS_IND = range(0, NUM_REVIEW_CLS)
FINANCIAL_CLS_IND = range(NUM_REVIEW_CLS, NUM_REVIEW_CLS + NUM_FINANCIAL_CLS)
REASON_CLS_IND = range(NUM_REVIEW_CLS + NUM_FINANCIAL_CLS, TOTAL_CLS)

In [None]:
def get_preds_from_logits(logits):
    """Turn raw logits into a 0/1 prediction matrix of the same shape.

    Args:
        logits: 2-D NumPy array with one row per example and one column per
            label position.

    Returns:
        A float array shaped like ``logits``. Within the review columns
        exactly one entry per row is 1 (the arg-max). Every financial and
        reason column is 1 where its logit is >= 0 and 0 otherwise.
    """
    preds = np.zeros(logits.shape)

    review_cols = np.asarray(REVIEW_CLS_IND)
    financial_cols = np.asarray(FINANCIAL_CLS_IND)
    reason_cols = np.asarray(REASON_CLS_IND)

    # Review category is single-choice: mark only the highest-scoring column.
    # argmax is relative to the review slice, so map it back to the absolute
    # column index before writing.
    best_in_slice = np.argmax(logits[:, review_cols], axis=-1)
    preds[np.arange(len(preds)), review_cols[best_in_slice]] = 1

    # Financial and reason columns are thresholded independently. A logit of 0
    # corresponds to a sigmoid probability of 0.5.
    preds[:, financial_cols] = (logits[:, financial_cols] >= 0).astype(int)
    preds[:, reason_cols] = (logits[:, reason_cols] >= 0).astype(int)

    return preds

In [21]:
from sklearn.metrics import f1_score, classification_report
def compute_metrics(eval_pred):
    """Compute micro/macro F1 per label group and over all columns.

    Args:
        eval_pred: Pair ``(logits, labels)`` of 2-D arrays with matching shape.

    Returns:
        Dict with the keys ``f1_{micro,macro}_for_{review_score,financial,reason}``
        plus the overall ``f1_micro`` and ``f1_macro``.

    Side effects:
        Prints one classification report per label group.
    """
    logits, labels = eval_pred

    # Deduce 0/1 predictions from the logits.
    predictions = get_preds_from_logits(logits)

    # (metric key suffix, report heading, column indices) for each label group.
    groups = (
        ("review_score", "review category", REVIEW_CLS_IND),
        ("financial", "financial", FINANCIAL_CLS_IND),
        ("reason", "reason category", REASON_CLS_IND),
    )

    final_metrics = {}
    reports = []
    for key_suffix, heading, columns in groups:
        columns = list(columns)
        group_labels = labels[:, columns]
        group_preds = predictions[:, columns]
        for average in ("micro", "macro"):
            # zero_division=0 matches the reports below and keeps classes with
            # no positive prediction/label from emitting warnings each epoch.
            final_metrics[f"f1_{average}_for_{key_suffix}"] = f1_score(
                group_labels, group_preds, average=average, zero_division=0
            )
        reports.append((heading, group_labels, group_preds))

    # F1 over all label columns together.
    final_metrics["f1_micro"] = f1_score(labels, predictions, average="micro", zero_division=0)
    final_metrics["f1_macro"] = f1_score(labels, predictions, average="macro", zero_division=0)

    # Per-group classification reports, printed after all metrics are computed.
    for heading, group_labels, group_preds in reports:
        print(f"Classification report for {heading}: ")
        print(classification_report(group_labels, group_preds, zero_division=0))
    return final_metrics

In [23]:
from transformers import Trainer, TrainerCallback, TrainingArguments

class MultiTaskClassificationTrainer(Trainer):
    """Trainer whose loss is a weighted sum of three per-group losses.

    The label vector is split into the review, financial and reason column
    groups. The review group uses cross-entropy; the financial and reason
    groups use binary cross-entropy on logits.
    """

    def __init__(self, group_weights=None, **kwargs):
        """Create the trainer.

        Args:
            group_weights: Three loss weights ordered like the column groups:
                ``(review, financial, reason)``. ``None`` weights all groups
                equally.
            **kwargs: Forwarded unchanged to ``Trainer.__init__``.

        Raises:
            ValueError: If ``group_weights`` does not contain three values.
        """
        super().__init__(**kwargs)
        # Fall back to equal weights instead of failing on the first loss call.
        self.group_weights = (1.0, 1.0, 1.0) if group_weights is None else tuple(group_weights)
        if len(self.group_weights) != 3:
            raise ValueError("group_weights must contain exactly 3 values: (review, financial, reason)")

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted multi-task loss for one batch.

        Args:
            model: Model called as ``model(**inputs)``; its first output is
                used as the logits.
            inputs: Batch dict; the ``"labels"`` entry is removed before the
                forward pass.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted so the override matches Trainer
                versions that pass this argument; unused here.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0]
        # Both loss functions below need floating-point targets.
        labels = labels.to(logits.dtype)

        review_cols = list(REVIEW_CLS_IND)
        financial_cols = list(FINANCIAL_CLS_IND)
        reason_cols = list(REASON_CLS_IND)

        # The one-hot review targets are passed to cross_entropy as class probabilities.
        review_loss = torch.nn.functional.cross_entropy(logits[:, review_cols], labels[:, review_cols])
        financial_loss = torch.nn.functional.binary_cross_entropy_with_logits(
            logits[:, financial_cols], labels[:, financial_cols]
        )
        reason_loss = torch.nn.functional.binary_cross_entropy_with_logits(
            logits[:, reason_cols], labels[:, reason_cols]
        )

        review_weight, financial_weight, reason_weight = self.group_weights
        loss = review_weight * review_loss + financial_weight * financial_loss + reason_weight * reason_loss
        return (loss, outputs) if return_outputs else loss

In [24]:
class PrinterCallback(TrainerCallback):
    """Trainer callback that prints the epoch counter each time an epoch ends."""

    def on_epoch_end(self, args, state, control, logs=None, **kwargs):
        """Print ``state.epoch``; all other arguments are ignored."""
        epoch_banner = "Epoch {}: ".format(state.epoch)
        print(epoch_banner)

In [None]:
from transformers import AutoModelForSequenceClassification

# Auto classes are factories and cannot be constructed directly; load the
# checkpoint with from_pretrained. num_labels sizes the classification head to
# the full concatenated label vector (review + financial + reason columns).
model = AutoModelForSequenceClassification.from_pretrained(config["model"], num_labels=TOTAL_CLS)

In [None]:
# Evaluate and checkpoint once per epoch, keep at most two checkpoints, and
# reload the checkpoint with the best overall macro-F1 when training ends.
training_args = TrainingArguments(
    output_dir="../models/psh",
    learning_rate=config['lr'],
    per_device_train_batch_size=config['batch_size'],
    per_device_eval_batch_size=config['eval_batch_size'],
    num_train_epochs=config['epochs'],
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="f1_macro",
    load_best_model_at_end=True,
    weight_decay=0.01,
)

trainer = MultiTaskClassificationTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    # train_test_split produces "train" and "test" splits; there is no
    # "validation" key, so the held-out "test" split is used for evaluation.
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
    callbacks=[PrinterCallback],
    group_weights=(0.7, 4, 4)
)

In [None]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()