In [None]:
%pip install -q 'transformers[torch]' datasets pyarrow==15.0.2

In [24]:
import pandas as pd

In [None]:
# Load the four source corpora. df3 and df4 use different column names, so they
# are renamed to the shared `text` / `label` schema right after reading.
df1 = pd.read_csv("../../data/chat_gpt_dataset1.csv")
df2 = pd.read_csv("../../data/chat_gpt_dataset2.csv")
df3 = pd.read_csv("../../data/labeled.csv").rename(columns={"comment": "text", "toxic": "label"})
df4 = pd.read_csv("../../data/synt_dataset/synt_dataset.csv").rename(columns={"content": "text", "type_toxic": "label"})

# Keep only the rows whose queue type is 'output'. The explicit .copy() makes the
# filtered frame independent, so the .loc assignment below does not raise
# SettingWithCopyWarning.
df4 = df4[df4.type_queue == 'output'].copy()
# Rows with target == 0 are relabelled as "neutral", whatever their type_toxic value was.
df4.loc[df4.target == 0, "label"] = "neutral"
df4 = df4[["text", "label"]].reset_index(drop=True)

In [85]:
# Translate df3's numeric toxicity flag into the string label names used in this notebook.
# NOTE: .map() yields NaN for any value that is not a key of the dict, so this cell
# must run exactly once per load of df3.
toxic_flag_to_name = {0.0: "neutral", 1.0: "insult"}
df3['label'] = df3['label'].map(toxic_flag_to_name)

In [86]:
# Stack all sources into a single frame with a fresh 0..n-1 index.
source_frames = [df1, df2, df3, df4]
df = pd.concat(source_frames, ignore_index=True)

# Give every distinct label an integer id, numbered in order of first appearance.
distinct_labels = df['label'].unique()
label_mapping = dict(zip(distinct_labels, range(len(distinct_labels))))
label_mapping

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
from sklearn.model_selection import train_test_split
import evaluate

In [None]:
tokenizer = AutoTokenizer.from_pretrained('apanc/russian-inappropriate-messages')

# Prepare the dataset.
# The integer-encoded labels are written to a new frame rather than back into
# df['label']: mapping in place would turn every label into NaN if this cell were
# run a second time, because the encoded ids are not keys of label_mapping.
# Only the two columns used by the cells below are kept.
encoded_df = df[['text', 'label']].assign(label=df['label'].map(label_mapping))
assert encoded_df['label'].notna().all(), "some labels are missing from label_mapping"

# A fixed seed keeps the train/eval partition identical across kernel restarts,
# so metrics from different runs are comparable.
train_df, eval_df = train_test_split(encoded_df, test_size=0.2, random_state=42)

# Convert pandas DataFrame to Hugging Face Dataset.
# preserve_index=False: do not carry the shuffled pandas index into the datasets
# as an extra column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level ``tokenizer``.

    Sequences longer than 512 tokens are truncated. No padding argument is
    passed to the tokenizer here.

    Args:
        examples: mapping that holds the raw strings under the ``'text'`` key.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    texts = examples['text']
    return tokenizer(texts, max_length=512, truncation=True)

def _to_model_inputs(split):
    """Tokenize one split, drop its raw text column and switch it to torch format."""
    # Tokenize the dataset
    encoded = split.map(tokenize_function, batched=True)
    # Remove the text column as it is now tokenized
    encoded = encoded.remove_columns(['text'])
    # Set the format of the dataset to PyTorch tensors
    encoded.set_format('torch')
    return encoded


train_dataset = _to_model_inputs(train_dataset)
eval_dataset = _to_model_inputs(eval_dataset)

In [None]:
# Store the class names in the model config so that predictions from the saved
# model can be read as label names instead of generic LABEL_<i> placeholders.
# str() keeps the config JSON-serialisable even if a label key is not a string.
id2label = {idx: str(name) for name, idx in label_mapping.items()}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    'apanc/russian-inappropriate-messages',
    num_labels=len(label_mapping),
    id2label=id2label,
    label2id=label2id,
    # The checkpoint's classification head may have a different number of outputs
    # than label_mapping; allow the mismatching weights to be re-initialised
    # instead of raising.
    ignore_mismatched_sizes=True
)

In [None]:
# Metrics that are loaded with their default configuration.
accuracy_metric, f1_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name)
    for metric_name in ('accuracy', 'f1', 'precision', 'recall')
)
# ROC AUC is loaded with its 'multiclass' configuration.
roc_auc_metric = evaluate.load('roc_auc', 'multiclass')


def compute_metrics(eval_pred):
    """Compute the evaluation metrics for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; ``logits`` must support
            ``.argmax(-1)`` and be convertible with ``torch.tensor``.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``roc_auc``, ``precision`` and
        ``recall``. F1, precision and recall use ``average='weighted'``; ROC AUC
        is computed one-vs-rest on the softmax probabilities.
    """
    logits, labels = eval_pred
    predicted = logits.argmax(-1)
    # ROC AUC is scored on class probabilities rather than hard predictions.
    class_probs = torch.softmax(torch.tensor(logits), dim=-1)

    def weighted(metric, key):
        # Shared call shape of the three weighted-average metrics.
        scores = metric.compute(predictions=predicted, references=labels, average='weighted')
        return scores[key]

    return {
        'accuracy': accuracy_metric.compute(predictions=predicted, references=labels)['accuracy'],
        'f1': weighted(f1_metric, 'f1'),
        'roc_auc': roc_auc_metric.compute(
            prediction_scores=class_probs, references=labels, multi_class='ovr'
        )['roc_auc'],
        'precision': weighted(precision_metric, 'precision'),
        'recall': weighted(recall_metric, 'recall'),
    }

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Batch sizes and training length.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    # Optimisation schedule: 500 warm-up steps, then cosine decay.
    lr_scheduler_type='cosine',
    warmup_steps=500,
    weight_decay=0.01,
    # Step-based logging, evaluation and checkpointing, all on a 250-step cadence
    # (eval_steps is not set explicitly here).
    logging_steps=250,
    eval_strategy='steps',
    save_steps=250,
    # After training, reload the checkpoint with the best weighted F1.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Padding is left to the collator, since tokenize_function only truncates.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    # What to train and how.
    model=model,
    args=training_args,
    # Data: tokenised splits plus the batch-level collator.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    # Metrics reported at every evaluation.
    compute_metrics=compute_metrics
)

# Run fine-tuning; the return value is displayed as this cell's output.
trainer.train()

In [None]:
# Evaluate the trainer's current model on eval_dataset; the returned metrics are
# displayed as this cell's output.
trainer.evaluate()

In [None]:
# Save the fine-tuned weights together with the tokenizer files, so that
# './saved_model' can be loaded on its own (model and tokenizer from one path).
model.save_pretrained('./saved_model')
# The trailing ';' suppresses the list of written file paths in the cell output.
tokenizer.save_pretrained('./saved_model');