In [None]:
%pip install -q 'transformers[torch]' datasets pyarrow==15.0.2

In [None]:
import pandas as pd

In [None]:
# Read the three source CSVs (paths are relative to this notebook).
df1 = pd.read_csv("../../data/chat_gpt_dataset1.csv")
df2 = pd.read_csv("../../data/chat_gpt_dataset2.csv")

# labeled.csv names its columns "comment"/"toxic"; rename them to the
# "text"/"label" names used by the rest of the notebook.
df3 = pd.read_csv("../../data/labeled.csv")
df3 = df3.rename(columns={"comment": "text", "toxic": "label"})

In [None]:
# Translate the numeric flag of labeled.csv into class names.
TOXIC_FLAG_TO_LABEL = {0.0: "neutral", 1.0: "insult"}


def _toxic_flag_to_label(value):
    """Return the class name for a numeric toxic flag.

    Values that already are one of the class names pass through unchanged, so
    re-running this cell does not wipe the column. Anything else (including
    missing values) becomes NaN, exactly as a plain dict ``map`` would do.
    """
    if value in TOXIC_FLAG_TO_LABEL:
        return TOXIC_FLAG_TO_LABEL[value]
    # isinstance guard avoids comparing numeric scalars against strings.
    if isinstance(value, str) and value in TOXIC_FLAG_TO_LABEL.values():
        return value
    return float("nan")


df3['label'] = df3['label'].map(_toxic_flag_to_label)

In [None]:
# Stack the three frames row-wise and renumber the rows from zero.
df = pd.concat((df1, df2, df3), axis=0, ignore_index=True)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Tokenizer loaded from the same checkpoint name that the model is loaded from
# further down in this cell.
tokenizer = AutoTokenizer.from_pretrained('apanc/russian-inappropriate-messages')

# Prepare the dataset.
# Rows without a text or a label cannot be tokenized or used as a target
# (for example, flags in labeled.csv other than 0/1 become NaN when they are
# mapped to class names), so drop them before encoding.
df_complete = df.dropna(subset=['text', 'label'])

# Encode labels as contiguous integer ids. Sorting the label values makes the
# assignment independent of row order, and re-running this cell on ids that
# are already 0..n-1 yields the identity mapping instead of reshuffling them.
label2id = {name: idx for idx, name in enumerate(sorted(df_complete['label'].unique()))}
df = df_complete.assign(label=df_complete['label'].map(label2id).astype('int64'))

# Fixed random_state so the train/eval split is the same on every run.
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert pandas DataFrame to Hugging Face Dataset.
# preserve_index=False keeps the shuffled pandas index from being stored as
# an extra dataset column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, truncating to at most 512 tokens."""
    texts = examples['text']
    return tokenizer(texts, truncation=True, max_length=512)


# Tokenize both splits in batches, then drop the raw text column, which is
# no longer needed once the token ids exist.
train_dataset = train_dataset.map(tokenize_function, batched=True).remove_columns(['text'])
eval_dataset = eval_dataset.map(tokenize_function, batched=True).remove_columns(['text'])

# Have both datasets return PyTorch tensors.
train_dataset.set_format('torch')
eval_dataset.set_format('torch')

# Load the pretrained checkpoint for sequence classification.
# The size of the classification head is derived from the encoded label column
# instead of a hard-coded constant, so it always matches the data.
# ignore_mismatched_sizes=True lets the head be re-created when the
# checkpoint's own head has a different number of outputs.
num_labels = int(df['label'].nunique())
model = AutoModelForSequenceClassification.from_pretrained(
    'apanc/russian-inappropriate-messages',
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output and logging locations / frequency.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=20,
    # Number of epochs and per-device batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Optimizer-related settings.
    warmup_steps=500,
    weight_decay=0.01,
)

# Collator handed to the Trainer for assembling batches. tokenize_function
# does not pad the sequences, so padding is left to this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire the model, the training configuration and both splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning.
trainer.train()

In [None]:
# Save the fine-tuned weights together with the tokenizer files, so that
# './saved_model' can be loaded on its own later without the original
# checkpoint name.
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')