In [None]:
import torch

from datasets import load_dataset, Dataset
import transformers
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

In [None]:
from torch import cuda
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Checkpoint identifier passed to from_pretrained for both the model and the tokenizer.
MODEL_NAME = "distilbert/distilbert-base-uncased"

In [None]:
# LoRA adapter configuration for a sequence-classification task.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    # Only modules named "q_lin" are targeted for adapters.
    target_modules=["q_lin"],
    # Adapter hyper-parameters.
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

In [None]:
# Hyper-parameters for the Trainer run; checkpoints and logs go to ./output.
training_arguments = TrainingArguments(
    output_dir="output",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # The paged optimizers are provided by bitsandbytes and need CUDA. On a
    # CPU-only machine fall back to plain PyTorch AdamW so training can still start.
    optim="paged_adamw_32bit" if torch.cuda.is_available() else "adamw_torch",
    learning_rate=2e-4,
    weight_decay=0.01,
    # Mixed precision is disabled explicitly.
    fp16=False,
    bf16=False,
    # Evaluate on the eval dataset at the end of every epoch.
    eval_strategy="epoch",
)

In [None]:
# Class-name <-> class-index lookup tables; id2label is the exact inverse of label2id.
label2id = {"male": 0, "female": 1}
id2label = {index: name for name, index in label2id.items()}

In [None]:
# Load the pretrained checkpoint for two-label classification and attach the label maps.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    label2id=label2id,
    id2label=id2label,
)

# NOTE(review): no quantization config is passed to from_pretrained above, so this
# k-bit preparation step looks unnecessary here — confirm before removing.
model = prepare_model_for_kbit_training(model)


In [None]:
# Wrap the model with the LoRA adapters described by peft_config, then move it to the chosen device.
model = get_peft_model(model, peft_config)
model = model.to(device)

In [None]:
# NOTE(review): trust_remote_code=True presumably lets code shipped with the dataset
# repository run locally — confirm the source is trusted.
posts = load_dataset(
    'barilan/blog_authorship_corpus',
    trust_remote_code=True,
)

In [None]:
# The "validation" split is used as the evaluation set under the name `test`.
train, test = posts['train'], posts['validation']

In [None]:
#Change labels from str to int
def set_labels(example):
    """Replace the string stored under `example['label']` with its integer id.

    The id is looked up in the module-level `label2id` mapping; a label that is
    not a key of that mapping raises KeyError. The record is modified in place
    and the same object is returned.
    """
    label_name = example['label']
    example['label'] = label2id[label_name]
    return example

#Delete unused information
def preprocess_dataset(dataset):
    """Rename the `gender` column to `label` and drop the columns not used for training.

    Args:
        dataset: Object exposing `rename_column` and `remove_columns`
            (e.g. a `datasets.Dataset`).

    Returns:
        The dataset produced by those calls, without the `age`, `horoscope`,
        `job` and `date` columns.
    """
    # One remove_columns call with a list replaces four chained single-column
    # calls, which avoids building three intermediate datasets.
    return dataset.rename_column('gender', 'label').remove_columns(
        ['age', 'horoscope', 'job', 'date']
    )



In [None]:
# Apply the column clean-up to both splits.
train, test = preprocess_dataset(train), preprocess_dataset(test)


In [None]:
# Convert the string labels of both splits to integer ids.
train, test = train.map(set_labels), test.map(set_labels)

In [None]:
# Tokenizer loaded from the same checkpoint identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): presumably '[PAD]' is already this checkpoint's pad token, making the
# call below a no-op — confirm; if it did add a token, the model's embedding matrix
# would likely need resizing to match.
tokenizer.add_special_tokens(dict(pad_token='[PAD]'))


In [None]:
def tokenize_dataset(example):
    """Tokenize `example['text']` with the module-level `tokenizer`.

    Sets `tokenizer.truncation_side` to "left" on every call, then calls the
    tokenizer with truncation enabled, a maximum length of 512 and padding to
    that maximum length. Returns whatever the tokenizer call returns.
    """
    tokenizer.truncation_side = "left"
    encoded = tokenizer(
        example['text'],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    return encoded

In [None]:
# Tokenize both splits, passing examples to tokenize_dataset in batches.
tokenized_train, tokenized_test = (
    split.map(tokenize_dataset, batched=True) for split in (train, test)
)

In [None]:
# Assemble the Trainer from the model, the run arguments and both tokenized splits.
trainer = transformers.Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)


In [None]:
# Set the `use_cache` flag on the model config to False before training.
# NOTE(review): presumably carried over from a generative-model recipe — confirm this model's config reads the flag.
model.config.use_cache = False


In [None]:
# Run training with the Trainer built earlier in the notebook; as the cell's last expression, the return value of train() is displayed.
trainer.train()

In [None]:
# save_directory = "#"
# model.save_pretrained(save_directory)
