In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

In [None]:
os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")

In [4]:
!pip install -U \
    transformers \
    peft \
    datasets \
    torch==2.4.0 \
    torchvision==0.19.0 \
    accelerate \
    sentence-transformers==3.0.1 \
    faiss-cpu==1.8.0 \
    pandas==2.2.2 \
    tqdm \
    trl \
    bitsandbytes



## **Dataset Loading**

In [4]:
from datasets import load_dataset
import re

# Load dataset
ds = load_dataset("kl08/myers-briggs-type-indicator")

Repo card metadata block was not found. Setting CardData to empty.


In [5]:
ds.shape

{'train': (8675, 2)}

## **Data Processing as `Zero shot LoRA` Fine-Tuning**

In [6]:
# Clean function
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\s+', ' ', text)     # Normalize spaces
    text = text.strip()
    return text

# Format for Phi-3
def format_mbti(example):
    posts = example['posts']
    mbti = example['type'].upper()

    # Clean
    posts = clean_text(posts)

    # Truncate to ~2000 tokens (safe for 4k context)
    if len(posts) > 3000:
        posts = posts[:3000] + "..."

    formatted = f"""<|system|>
You are an expert in MBTI personality analysis. Analyze the person's writing and behavior to determine their exact 4-letter MBTI type (e.g., INTJ, ESFP). Return ONLY the type.<|end|>
<|user|>
Analyze this person's posts and determine their MBTI type:

"{posts}"<|end|>
<|assistant|>
{mbti}<|end|>"""

    return {"text": formatted}

# Apply
formatted_ds = ds['train'].map(format_mbti, remove_columns=ds['train'].column_names)

## Model Define

In [9]:
from transformers import AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import torch

# Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    load_in_4bit=True,
    trust_remote_code=True
)

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

model = get_peft_model(model, lora_config)

print(f"Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

`torch_dtype` is deprecated! Use `dtype` instead!
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Trainable params: 8,912,896


## **Training Arguments Define**

In [10]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import torch
# Split
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer
# Split
train_texts = [x['text'] for x in formatted_ds]
train_ds, val_ds = train_test_split(train_texts, test_size=0.15, random_state=42)
train_dataset = Dataset.from_dict({"text": train_ds})
val_dataset = Dataset.from_dict({"text": val_ds})
# Tokenizer
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Tokenize with longer context
MAX_LENGTH = 2048 # Phi-3 supports 4k, but 2k is safe
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors=None
    )
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_val = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

training_args = TrainingArguments(
    output_dir="./phi3_mbti_lora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=0.5,  # Updated to less than 1 epoch to reduce risk of overfitting
    logging_steps=50,
    eval_steps=200,
    save_steps=400,  # Updated to 400, which is a multiple of eval_steps=200 (2x)
    eval_strategy="steps",  # Updated to eval_strategy (replaces deprecated evaluation_strategy)
    save_total_limit=2,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    bf16=True,
    report_to="none",
    load_best_model_at_end=True,  # Added to load the best model based on eval loss
    metric_for_best_model="eval_loss"  # Added to use eval loss for best model selection
)
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    peft_config=lora_config
)

Map:   0%|          | 0/7373 [00:00<?, ? examples/s]

Map:   0%|          | 0/1302 [00:00<?, ? examples/s]



Truncating train dataset:   0%|          | 0/7373 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1302 [00:00<?, ? examples/s]

## **Train Now**

In [11]:
trainer.train()

  return fn(*args, **kwargs)
You are not running the flash-attention implementation, expect numerical differences.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
200,0.003,0.002417,0.002022,1638400.0,0.999729
400,0.0059,0.001442,0.001414,3276800.0,0.999825


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=461, training_loss=0.18752985378919743, metrics={'train_runtime': 2541.0566, 'train_samples_per_second': 1.451, 'train_steps_per_second': 0.181, 'total_flos': 8.455214226997248e+16, 'train_loss': 0.18752985378919743, 'epoch': 0.5002034450020344})

In [16]:
from huggingface_hub import login
login()  # This will prompt for your token; paste a write-enabled one

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

## **Push the model into the HuggingFace**

In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Use the last checkpoint (461 has the final state)
model_path = "./phi3_mbti_lora/checkpoint-461"

base_model_name = "microsoft/Phi-3-mini-4k-instruct"

# Your repo ID (ensure "alam1n" is YOUR username; if not, change to e.g., "your_actual_username/phi3-mbti-lora")
hub_model_id = "alam1n/phi3-mbti-lora"  # Fix if namespace is wrong

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    load_in_4bit=True,
    trust_remote_code=True
)

# Load LoRA from checkpoint
model = PeftModel.from_pretrained(model, model_path)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Push with token (use your token var if not logged in)
model.push_to_hub(hub_model_id, commit_message="Upload fine-tuned LoRA adapters for MBTI from checkpoint-461")
tokenizer.push_to_hub(hub_model_id, commit_message="Upload tokenizer")

print(f"Pushed to {hub_model_id}")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

Pushed to alam1n/phi3-mbti-lora


In [19]:
!git config --global credential.helper store

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
