In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Build the working dataset: competition essays + the external essay set,
# randomly thinned per label, then split into train / held-out frames.
competition_dataset_path = 'dataset/train_essays.csv'
extra_dataset_path = 'dataset/train_v2_drcat_02.csv'

# One seed for every random step in this cell, so that Restart & Run All
# always keeps the same rows and produces the same split.
SAMPLING_SEED = 42

# How many randomly chosen rows to discard for each value of 'label'.
# Insertion order matters: label 0 is thinned first, then label 1.
ROWS_TO_DROP_PER_LABEL = {0: 28000, 1: 16800}


def _drop_random_rows(frame, label, n_rows, seed):
    """Return `frame` without `n_rows` randomly chosen rows of one label.

    Args:
        frame: DataFrame with a 'label' column and a unique index.
        label: Value of 'label' whose rows are eligible for removal.
        n_rows: Number of eligible rows to remove.
        seed: Seed passed to `DataFrame.sample` so the choice is repeatable.

    Returns:
        A new DataFrame; `frame` itself is not modified.
    """
    rows_to_remove = frame[frame['label'] == label].sample(n=n_rows, random_state=seed)
    return frame.drop(rows_to_remove.index)


# Competition data: expose the target under the common 'label' name and keep
# only the columns shared with the external dataset.
competition_dataset = (
    pd.read_csv(competition_dataset_path)
    .rename(columns={'generated': 'label'})
    .drop(columns=['prompt_id', 'id'])
)

# External data: discard the columns that the competition data does not have.
extra_dataset = pd.read_csv(extra_dataset_path).drop(
    columns=['prompt_name', 'source', 'RDizzl3_seven']
)

# ignore_index=True gives the combined frame a fresh unique index, which the
# index-based drop in _drop_random_rows relies on.
dataset = pd.concat([competition_dataset, extra_dataset], ignore_index=True)

for label_value, n_rows in ROWS_TO_DROP_PER_LABEL.items():
    dataset = _drop_random_rows(dataset, label_value, n_rows, SAMPLING_SEED)

# Stratified 70/30 split so both frames keep the same label proportions.
df_train, df_test = train_test_split(
    dataset,
    test_size=0.30,
    random_state=SAMPLING_SEED,
    shuffle=True,
    stratify=dataset["label"],
)

In [18]:
from torch.nn import CrossEntropyLoss
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Location the tokenizer (and, later in this cell, the model) is loaded from.
# The commented-out alternative is kept for reference.
model_location = 'results/checkpoint-10'
# model_location = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_location)


def preprocess_function(examples):
    """Tokenize the "text" field of a batch, truncating over-long inputs.

    Args:
        examples: A batch as passed by `Dataset.map(..., batched=True)`;
            only its "text" entry is read.

    Returns:
        The tokenizer's output for that batch of texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Wrap the train / held-out frames as `datasets.Dataset` objects and tokenize
# them batch-wise (train first, then eval).
train_dataset = Dataset.from_pandas(df_train).map(preprocess_function, batched=True)
eval_dataset = Dataset.from_pandas(df_test).map(preprocess_function, batched=True)

# Class names; a name's position in the list is its integer label id.
LABELS = ['human', 'generated']
id2label = dict(enumerate(LABELS))
label2id = {name: idx for idx, name in id2label.items()}

# Load the sequence-classification model from the same location as the
# tokenizer, passing the label mappings so they are stored in its config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_location,
    num_labels=len(LABELS),
    id2label=id2label,
    label2id=label2id,
)

# Hyper-parameters and output locations handed to the Trainer.
# NOTE(review): no eval_steps is given — presumably evaluation then follows
# logging_steps; confirm against the TrainingArguments documentation.
training_args = TrainingArguments(
    # --- optimisation ---
    num_train_epochs=3,  # number of epochs
    learning_rate=5e-5,  # learning rate
    per_device_train_batch_size=8,  # mini-batch size for training
    per_device_eval_batch_size=8,  # mini-batch size for evaluation
    # --- evaluation / checkpoint cadence ---
    evaluation_strategy="steps",  # how often to evaluate
    save_strategy="steps",  # how often to save the model
    save_steps=10,
    # --- outputs ---
    output_dir="./results",
    logging_dir="./logs",  # directory for logs
    logging_steps=10,  # logging interval, in steps
    # label_names = ["generated"]
)

# Bundle the model, its arguments, both datasets and the tokenizer into a
# Trainer. Training and evaluation are left commented out below, so running
# this cell only constructs the object.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # the training dataset
    eval_dataset=eval_dataset,
)
# trainer.train()
# trainer.evaluate()

#https://web3day.ru/how-to-train-model-for-text-classification/
#GPT

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/434 [00:00<?, ? examples/s]

In [29]:
# Run the model over the competition test essays and assemble the
# submission table (one row per essay id).
test_dataset_path = 'dataset/test_essays.csv'

# 'prompt_id' is dropped; 'id' is kept because the submission frame needs it.
test_df = pd.read_csv(test_dataset_path).drop(columns=['prompt_id'])

# Tokenize with the same preprocessing used for the train / eval data.
test_dataset = Dataset.from_pandas(test_df).map(preprocess_function, batched=True)

# Only `predictions` is used below; the other two values are unpacked
# but not read in this cell.
predictions, label_ids, metrics = trainer.predict(test_dataset)

# Column index of the largest score per row becomes the predicted class id.
# NOTE(review): this yields hard 0/1 labels; if the submission is scored with
# a ranking metric, per-class scores may be preferable — confirm.
predicted_labels = np.argmax(predictions, axis=1)

submission_columns = {'id': test_df['id'], 'generated': predicted_labels}
out = pd.DataFrame(data=submission_columns)
out
# out.to_csv('submission.csv', index=False)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Unnamed: 0,id,generated
0,0000aaaa,1
1,1111bbbb,0
2,2222cccc,0
