In [1]:
from huggingface_hub import login
from dotenv import load_dotenv
import os
# Pull variables from a local .env file into the process environment, then
# authenticate against the Hugging Face Hub with the HF_HUB_TOKEN value.
load_dotenv()
login(token=os.environ.get("HF_HUB_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/myadmin/.cache/huggingface/token
Login successful


In [2]:
from datasets import load_dataset

# Fetch the "train" split of the MBTI dataset and display its column schema.
mbti_data = load_dataset(path="minhaozhang/mbti", split="train")
mbti_data.features

{'author': Value(dtype='string', id=None),
 'body': Value(dtype='string', id=None),
 'mbti': Value(dtype='string', id=None),
 'E-I': Value(dtype='string', id=None),
 'N-S': Value(dtype='string', id=None),
 'F-T': Value(dtype='string', id=None),
 'J-P': Value(dtype='string', id=None)}

In [3]:
# Convert the full type and each of the four axis columns from strings to
# class-label encoded columns (the stratified splits below rely on "mbti").
mbti_data = (
    mbti_data
    .class_encode_column("mbti")
    .class_encode_column("E-I")
    .class_encode_column("N-S")
    .class_encode_column("F-T")
    .class_encode_column("J-P")
)

# Three stratified splits in a row, all stratified on the full "mbti" type:
#   1. drop a 10% portion and continue with the remaining 90%;
#   2. keep only a 20% sample of that remainder (90% * 20% = 18% of the data);
#   3. split that sample 90/10 into the final train/test DatasetDict.
mbti_data = (
    mbti_data
    .train_test_split(test_size=0.1, stratify_by_column="mbti", seed=0)["train"]
    .train_test_split(test_size=0.2, stratify_by_column="mbti", seed=0)["test"]
    .train_test_split(test_size=0.1, stratify_by_column="mbti", seed=1)
)


In [8]:
# Base checkpoint: used to load both the tokenizer and the classification model.
MODEL = "microsoft/Phi-3-mini-4k-instruct"
# Name of the fine-tuned model: used as the training output directory and as the
# path passed to trainer.save_model.
TRAINED_MODEL = "Phi-3-mini-4k-instruct-mbti-2"

In [10]:
from transformers import AutoTokenizer 

# Load the tokenizer that belongs to the base checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def preprocess_function(data):
    """Tokenize the ``body`` field of ``data`` with the module-level tokenizer.

    ``data`` is indexed with ``"body"`` and the result is handed to the
    tokenizer with truncation enabled (no explicit maximum length is set
    here, so the tokenizer's own limit applies). The tokenizer output is
    returned unchanged.
    """
    texts = data["body"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Tokenize every split in batches, drop the columns that are not needed for the
# J/P task, and expose the encoded "J-P" column under the name "label".
# Note: the raw "body" text column is intentionally left in place here.
tokenized_mbti_data = (
    mbti_data
    .map(preprocess_function, batched=True)
    .remove_columns(["author", "mbti", "F-T", "E-I", "N-S"])
    .rename_column("J-P", "label")
)
# The untokenized dataset is not used again below; release the name.
del mbti_data

Map:   0%|          | 0/50575 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
import evaluate

# Metric objects loaded via `evaluate`; compute_metrics calls .compute() on both.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

In [14]:
import numpy as np

def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into accuracy and F1 scores.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predicted class of
            each row is taken as the argmax of ``predictions`` along axis 1.

    Returns:
        A single dict containing the entries returned by the ``accuracy``
        metric followed by those returned by the ``f1`` metric (on a key
        collision the F1 entry wins).
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    results = {}
    results.update(accuracy.compute(predictions=predicted_ids, references=references))
    results.update(f1.compute(predictions=predicted_ids, references=references))
    return results

In [15]:
# Class id -> human-readable label for the J/P axis, plus the inverse lookup.
id2label = {0: "J", 1: "P"}
label2id = {name: idx for idx, name in id2label.items()}

In [16]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the base checkpoint with a 2-class sequence-classification head and attach
# the J/P label mappings to its config. The load log reports `score.weight` as
# newly initialized, i.e. the classification head starts untrained.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL, num_labels=2, id2label=id2label, label2id=label2id
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Phi3ForSequenceClassification were not initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
import torch
# Allow TF32 kernels for cuDNN ops and CUDA matmuls.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

# Shared cadence (in training steps) for logging, evaluation and checkpointing.
steps = 2000

training_args = TrainingArguments(
    # --- output / Hub ---
    output_dir=TRAINED_MODEL,
    push_to_hub=True,
    # --- schedule ---
    num_train_epochs=1,
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    logging_steps=steps,
    load_best_model_at_end=True,
    # --- batching: 4 samples per device x 4 accumulation steps per update ---
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=4,
    # --- optimizer ---
    optim="adamw_bnb_8bit",
    learning_rate=2e-5,
    weight_decay=0.01,
    # --- precision ---
    tf32=True,
    # Options left disabled in this run: the "adafactor" optimizer,
    # gradient_checkpointing, torch_compile, and bf16 / fp16 mixed precision.
)

from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy.

    The weights in ``class_weights`` are indexed by label id and are applied
    through ``nn.CrossEntropyLoss(weight=...)``. Override the attribute on a
    subclass or instance to use different weights.
    """

    # Per-class loss weights, indexed by label id.
    class_weights = (0.6, 0.4)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Mapping of model inputs; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with transformers
                releases that pass this argument to ``compute_loss``; it is
                not used by this loss.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` if
            ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weight tensor on the same device and dtype as the logits
        # so the loss works wherever the model runs, not only on 'cuda'.
        weight = torch.tensor(
            self.class_weights, dtype=logits.dtype, device=logits.device
        )
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Wire the model, data, tokenization helpers and metrics into the weighted-loss
# trainer, then start fine-tuning. The training result is the cell's output.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_mbti_data["train"],
    eval_dataset=tokenized_mbti_data["test"],
)
trainer.train()

[2024-08-07 06:26:02,200] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/myadmin/anaconda3/envs/zmh-mbti-env/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mzhangminhao[0m ([33mzmhzmh[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112530688889264, max=1.0…

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
You are not running the flash-attention implementation, expect numerical differences.


Step,Training Loss,Validation Loss,Accuracy,F1
2000,0.7097,0.688838,0.607968,0.746305
4000,0.6833,0.667495,0.596915,0.651164
6000,0.6713,0.659581,0.613821,0.67476
8000,0.6592,0.654,0.638853,0.722176
10000,0.6498,0.64557,0.591359,0.588708
12000,0.6362,0.637159,0.64607,0.714986


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


TrainOutput(global_step=12643, training_loss=0.6664236307540178, metrics={'train_runtime': 24655.56, 'train_samples_per_second': 8.205, 'train_steps_per_second': 0.513, 'total_flos': 9.588100274111693e+17, 'train_loss': 0.6664236307540178, 'epoch': 0.999940682155215})

In [18]:
# Evaluate on the trainer's eval_dataset and display the resulting metrics dict.
trainer.evaluate()

{'eval_loss': 0.6371585130691528,
 'eval_accuracy': 0.6460701927829956,
 'eval_f1': 0.7149863066046749,
 'eval_runtime': 1198.6577,
 'eval_samples_per_second': 42.193,
 'eval_steps_per_second': 10.548,
 'epoch': 0.999940682155215}

In [19]:
# Write the final model to the local TRAINED_MODEL directory, then upload it to the Hub.
trainer.save_model(output_dir=TRAINED_MODEL)
trainer.push_to_hub()

model-00003-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/minhaozhang/Phi-3-mini-4k-instruct-mbti-2/commit/8c48d1b2db9e075c416bca1f12a2a7564c4cabdc', commit_message='End of training', commit_description='', oid='8c48d1b2db9e075c416bca1f12a2a7564c4cabdc', pr_url=None, pr_revision=None, pr_num=None)