In [1]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

# used for runpod instances
# os.environ['HF_HOME'] = '/workspace/hfcache/'

# Read key/value pairs from a .env file into the process environment, then
# authenticate against the Hugging Face Hub with the HF_HUB_TOKEN found there.
load_dotenv()
login(token=os.environ.get("HF_HUB_TOKEN"))

In [2]:
from datasets import load_dataset

# Download the MBTI dataset from the Hub. No split is requested here, so the
# result is indexed by split name further down (mbti_data['train']).
# Variant that requests only the training split:
# mbti_data = load_dataset("minhaozhang/mbti", split='train')
mbti_data = load_dataset(path="minhaozhang/mbti")

In [3]:
# Turn the "mbti" string column into class labels; the stratified splits
# below stratify on this column.
mbti_data = mbti_data.class_encode_column("mbti")


# The two splits below only carve out a small sample for testing behaviour;
# none of this is needed when training from scratch on the entire dataset.
#   1. keep just the stratified 0.05% "test" slice of the original train split
#   2. divide that slice 90/10 into the train/test sets used by this notebook
mbti_data = (
    mbti_data["train"]
    .train_test_split(test_size=0.0005, stratify_by_column="mbti", seed=0)["test"]
    .train_test_split(test_size=0.1, stratify_by_column="mbti", seed=1)
)


# Encode each of the four per-dimension columns as class labels as well.
mbti_data = (
    mbti_data
    .class_encode_column("E-I")
    .class_encode_column("N-S")
    .class_encode_column("F-T")
    .class_encode_column("J-P")
)

Flattening the indices:   0%|          | 0/1138 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1138 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/127 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/127 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1138 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/127 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1138 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/127 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1138 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/127 [00:00<?, ? examples/s]

In [4]:
# Checkpoint selection.
#   MODEL         - checkpoint name passed to the tokenizer and model `from_pretrained` calls
#   TRAINED_MODEL - name used as the Trainer output directory and for saving the fine-tuned model
# Alternative settings previously used with Phi-3:
# MODEL = "microsoft/Phi-3-mini-4k-instruct" # used for training from scratch
# MODEL = "minhaozhang/Phi-3-mini-4k-instruct-mbti-2" # trained already with 10% data
# TRAINED_MODEL = "Phi-3-mini-4k-instruct-mbti-JP" # newly trained model
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
TRAINED_MODEL = "Llama-3.2-1B-Instruct-MBTI-JP"

In [5]:
from transformers import AutoTokenizer 

# Load the tokenizer that belongs to the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


def preprocess_function(data):
    """Tokenize the ``body`` text of ``data`` with truncation enabled.

    Args:
        data: A mapping with a ``"body"`` entry. This function is applied
            through ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the tokenizer returns for that text.

    NOTE(review): no ``max_length`` is passed, so the truncation limit is
    whatever the tokenizer itself is configured with - confirm that this is
    short enough for the available memory.
    """
    body_text = data["body"]
    return tokenizer(body_text, truncation=True)

In [6]:
# Tokenize every split in batches; the untokenized dataset name is dropped afterwards.
tokenized_mbti_data = mbti_data.map(preprocess_function, batched=True)
del mbti_data

# Discard the columns that are not needed and expose the J-P column as "label".
# To train a different dimension, keep that dimension's column instead and
# rename it to "label".
tokenized_mbti_data = (
    tokenized_mbti_data
    .remove_columns(["author", "mbti", "F-T", "E-I", "N-S"])
    .rename_column("J-P", "label")
)

Map:   0%|          | 0/1138 [00:00<?, ? examples/s]

Map:   0%|          | 0/127 [00:00<?, ? examples/s]

In [7]:
from transformers import DataCollatorWithPadding

# Padding collator built around the tokenizer; it is handed to the Trainer as
# `data_collator` so the examples of each batch can be padded together.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
import evaluate

# Load the four metrics used to evaluate the model: accuracy, F1,
# balanced accuracy and Matthews correlation.
accuracy, f1, balanced_accuracy, matthews_correlation = (
    evaluate.load(metric_id)
    for metric_id in (
        "accuracy",
        "f1",
        "hyperml/balanced_accuracy",
        "matthews_correlation",
    )
)

In [9]:
import numpy as np

def compute_metrics(eval_pred):
    """Score one evaluation pass with every loaded metric.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The predictions are
            per-class scores; the class with the highest score along axis 1
            is taken as the predicted label.

    Returns:
        A single dict holding the merged results of the accuracy, F1,
        balanced-accuracy and Matthews-correlation metrics. If two metrics
        report the same key, the later one wins.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    merged = {}
    for metric in (accuracy, f1, balanced_accuracy, matthews_correlation):
        merged.update(metric.compute(predictions=predicted_ids, references=references))
    return merged

In [10]:
# VERY IMPORTANT
# These ids have to correspond to the integer labels that the label encoder
# (class_encode_column on "J-P") produced for the dataset.
# Author's observation: using 1: J and 0: P causes NaN during training, which
# keeps the model from training.
id2label = {0: "J", 1: "P"}
# Derived by inverting id2label so the two mappings cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a sequence-classification head sized to the label
# mapping, and record the human-readable label names in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Tell the model which token id is used for padding (the tokenizer's EOS id).
model.config.pad_token_id = tokenizer.eos_token_id

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import torch
# torch.backends.cuda.matmul.allow_tf32 = True
# torch.backends.cudnn.allow_tf32 = True

# Logging, evaluation and checkpointing all share this step interval;
# remember to adjust it to the size of the run.
steps = 100

training_args = TrainingArguments(
    output_dir=TRAINED_MODEL,
    # --- optimisation ---
    optim="adamw_bnb_8bit",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    bf16=True,
    # --- batching ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=2,
    # --- logging / evaluation / checkpoint cadence ---
    logging_steps=steps,
    eval_strategy="steps",
    eval_steps=steps,
    save_strategy="steps",
    save_steps=steps,
    load_best_model_at_end=True,
    # push_to_hub=True,
)

from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer that replaces the default loss with a class-weighted cross-entropy.

    The per-class weights live in ``class_weights`` (indexed by label id) so a
    subclass or caller can override them without rewriting ``compute_loss``.
    """

    # Loss weight for label id 0 and label id 1, respectively.
    class_weights = (0.6, 0.4)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch dict passed to the model as keyword arguments; its
                ``"labels"`` entry supplies the target class ids.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Accepted and ignored, so extra arguments supplied by the
                Trainer do not break the override.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Build the weights on the same device as the logits instead of a
        # hard-coded 'cuda', so the loss also works on CPU/MPS and when the
        # logits live on a GPU other than the default one.
        weight = torch.tensor(self.class_weights, dtype=torch.float32, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        # Moving the labels is a no-op when they already share the logits' device.
        targets = labels.to(logits.device).view(-1)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), targets)
        return (loss, outputs) if return_outputs else loss

# Assemble the trainer from the model, training arguments, tokenized splits,
# padding collator and metric function, then start fine-tuning.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_mbti_data["train"],
    eval_dataset=tokenized_mbti_data["test"],
    # `tokenizer=` is deprecated on Trainer in current transformers releases
    # (it emits a FutureWarning); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

  trainer = CustomTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mzhangminhao[0m ([33mzmhzmh[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113163833336633, max=1.0…

Step,Training Loss,Validation Loss,Accuracy,F1,Balanced Accuracy,Matthews Correlation
100,1.0765,0.753354,0.417323,0.026316,0.506667,0.07418
200,0.771,0.812956,0.409449,0.0,0.5,0.0


KeyboardInterrupt: 

In [None]:
# Run a final evaluation and show its metrics. The returned dict was
# previously discarded because the call was not the cell's last expression.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the model locally under TRAINED_MODEL, then upload it to the Hub.
trainer.save_model(TRAINED_MODEL)
trainer.push_to_hub()