In [1]:
from huggingface_hub import login
from dotenv import load_dotenv
import os
# Pull variables from a .env file into the process environment, then
# authenticate against the Hugging Face Hub with the value of HF_HUB_TOKEN.
# If the variable is unset, None is passed through to login().
load_dotenv()
login(token=os.environ.get("HF_HUB_TOKEN"))

In [None]:
from datasets import load_dataset

# Load the "train" split of the minhaozhang/mbti dataset.
mbti_data = load_dataset("minhaozhang/mbti", split='train')
# Bare expression: display the dataset's column schema as the cell output.
mbti_data.features

In [None]:
# Encode the full MBTI type and each of its four axes as class-label columns.
# The "mbti" column is the one used for stratification below.
for _axis in ("mbti", "E-I", "N-S", "F-T", "J-P"):
    mbti_data = mbti_data.class_encode_column(_axis)

# Work on a small subsample: run a seeded, stratified 99/1 split and keep only
# the 1% "test" side as the dataset used from here on.
mbti_data = mbti_data.train_test_split(
    test_size=0.01,
    stratify_by_column="mbti",
    seed=0,
)["test"]

In [None]:
# Display the schema again to inspect the result of the class encoding above.
mbti_data.features

In [None]:
# Split the subsample 80/20 into "train" and "test", stratified on the full
# MBTI type. The seed is fixed (matching the earlier subsampling split) so the
# partition is identical after every Restart & Run All; without it the
# train/test membership, and therefore every reported metric, changes per run.
mbti_data = mbti_data.train_test_split(test_size=0.2, stratify_by_column="mbti", seed=0)
# Bare expression: display the resulting splits as the cell output.
mbti_data

In [None]:
from transformers import AutoTokenizer 

# Load the tokenizer that belongs to the Phi-3 mini checkpoint; the same
# checkpoint name is used when the classification model is loaded.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

In [None]:
def preprocess_function(data):
    """Tokenize the ``body`` text of a batch of examples.

    Args:
        data: A mapping of column name to values, as passed by
            ``Dataset.map(..., batched=True)``; only ``data["body"]`` is read.

    Returns:
        The output of the module-level ``tokenizer`` for those texts, called
        with ``truncation=True``.
    """
    texts = data["body"]
    return tokenizer(texts, truncation=True)

In [None]:
# Tokenize every split in batches, drop the columns that are not used for this
# task, and expose the J-P axis as the classification target ("label").
# The raw "body" text column is kept alongside the tokenizer output.
tokenized_mbti_data = (
    mbti_data.map(preprocess_function, batched=True)
    .remove_columns(["author", "mbti", "N-S", "F-T", "E-I"])
    .rename_column("J-P", "label")
)
# Release the untokenized splits. Re-running this cell on its own will fail
# with NameError until the cells that build mbti_data are run again.
del mbti_data

In [None]:
# Sanity check: show the training split's schema followed by its first example.
print(
    tokenized_mbti_data["train"].features,
    tokenized_mbti_data["train"][0],
    sep="\n",
)

In [None]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer; it uses the tokenizer to pad the examples of
# each batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate

# Accuracy metric object; compute_metrics calls its .compute() method.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is an
            array of per-class scores with the classes on axis 1, or a tuple
            whose first element is such an array. ``labels`` holds the
            reference class ids.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row
        against ``labels``.
    """
    predictions, labels = eval_pred
    # When a model returns more than just logits, the predictions arrive as a
    # tuple with the logits first; argmax over the whole tuple would be wrong.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_classes = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=labels)

In [None]:
# Class id -> name for the J/P axis, plus the inverse lookup derived from it so
# the two mappings cannot drift apart.
# NOTE(review): assumes class_encode_column assigned id 0 to "J" and id 1 to
# "P" -- confirm against the "label" feature's class names.
id2label = {0: "J", 1: "P"}
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the Phi-3 mini checkpoint as a two-class sequence classifier and attach
# the J/P label mappings to it.
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
import torch
# torch.backends.cuda.matmul.allow_tf32 = True
# torch.backends.cudnn.allow_tf32 = True

# Fine-tuning configuration. Logging, evaluation and checkpointing all run on
# the same 500-step cadence. Gradient checkpointing, bf16 and DeepSpeed are not
# enabled in this run.
training_args = TrainingArguments(
    output_dir="Phi-3-mini-4k-instruct-mbti",
    push_to_hub=True,
    # Optimisation
    optim="adamw_bnb_8bit",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    # Batching: 4 examples per device, accumulated over 4 steps
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    eval_accumulation_steps=4,
    # Step-based logging / evaluation / checkpoint schedule
    logging_steps=500,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
)

# Wire the model, configuration, tokenized splits, batching and metric
# callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_mbti_data["train"],
    eval_dataset=tokenized_mbti_data["test"],
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop configured above.
trainer.train()
# Final evaluation on the "test" split; as the last bare expression in the
# cell, its return value is what the notebook displays.
trainer.evaluate()

In [None]:
# Write the trained model to the local output directory, then upload it to
# the Hugging Face Hub.
# NOTE(review): the training arguments already set push_to_hub=True, so the
# save step may itself trigger an upload -- confirm whether both calls are needed.
trainer.save_model(output_dir="Phi-3-mini-4k-instruct-mbti")
trainer.push_to_hub()