In [None]:
# # Fine-Tuning a Language Model for Big 5 Personality Classification using LoRA
# This notebook demonstrates how to fine-tune a pre-trained transformer model for a text classification task (Big 5 personality traits) using the Parameter-Efficient Fine-Tuning (PEFT) library with LoRA.
# --- 1. Installation of required libraries ---
# Run this cell to install the necessary packages if you don't have them already.
# !pip install transformers datasets peft torch accelerate scikit-learn pandas -q

In [2]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    pipeline,
)
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch

In [None]:
MODEL_CHECKPOINT = "distilbert-base-uncased"
MODEL_CHECKPOINT = "roberta-large"

TARGET_COLUMN = "cEXT" 
TEXT_COLUMN = "STATUS"
DATA_FILE = "/data/jmharja/projects/PersonaClassifier/data/mypersonality.csv"

LORA_R = 16  # The rank of the update matrices. Higher rank means more parameters.
LORA_ALPHA = 32 # The alpha parameter for LoRA scaling.
LORA_DROPOUT = 0.05 # Dropout probability for LoRA layers.

# Training arguments
EPOCHS = 16
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
OUTPUT_DIR = "./output/"
LOGGING_DIR = './logs'

In [12]:
def load_and_prepare_data(file_path, text_col, target_col):
    df = pd.read_csv(file_path, encoding='Windows-1252')
    df = df.dropna(subset=[text_col, target_col])
    df['label'] = df[target_col].apply(lambda x: 1 if str(x).lower() == 'y' else 0)
    df_processed = df[[text_col, 'label']].rename(columns={text_col: 'text'})
    train_df, val_df = train_test_split(df_processed, test_size=0.2, random_state=42, stratify=df_processed['label'])

    dataset_dict = DatasetDict({
        'train': Dataset.from_pandas(train_df, preserve_index=False),
        'validation': Dataset.from_pandas(val_df, preserve_index=False)
    })
    print("Data preparation complete.")
    return dataset_dict

In [14]:
dataset = load_and_prepare_data(DATA_FILE, TEXT_COLUMN, TARGET_COLUMN)
print(dataset)
print("\nExample from training set:", dataset['train'][0])

Data preparation complete.
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 7933
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1984
    })
})

Example from training set: {'text': 'alone in marin...5 more pages, thanks lindow... rudy home in less than 2 days... Oslo 8 days!', 'label': 0}


In [16]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=2,
    id2label={0: f"NOT_{TARGET_COLUMN}", 1: TARGET_COLUMN},
    label2id={f"NOT_{TARGET_COLUMN}": 0, TARGET_COLUMN: 1}
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    # Target modules are different for RoBERTa
    target_modules=['query', 'key', 'value'] 
)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = get_peft_model(model, lora_config)
model.to(device)
print("\nTrainable parameters after applying LoRA:")
model.print_trainable_parameters()


Trainable parameters after applying LoRA:
trainable params: 887,042 || all params: 67,842,052 || trainable%: 1.3075




In [31]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=False, max_length=512)
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"]) # Remove original text column

print("\nTokenized dataset structure:")
print(tokenized_dataset)
print("\nExample of tokenized input:", tokenized_dataset['train'][0])
# Data collator will dynamically pad the inputs to the max length within a batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/7933 [00:00<?, ? examples/s]

Map:   0%|          | 0/1984 [00:00<?, ? examples/s]


Tokenized dataset structure:
DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 7933
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1984
    })
})

Example of tokenized input: {'label': 0, 'input_ids': [101, 2894, 1999, 16400, 1012, 1012, 1012, 1019, 2062, 5530, 1010, 4283, 11409, 3527, 2860, 1012, 1012, 1012, 18254, 2188, 1999, 2625, 2084, 1016, 2420, 1012, 1012, 1012, 9977, 1022, 2420, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [22]:
def compute_metrics(eval_pred):
    """Computes accuracy and F1 score for evaluation."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
    }

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_dir=LOGGING_DIR,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# --- Start Fine-Tuning ---
print("\nStarting fine-tuning...")
trainer.train()
print("Fine-tuning complete.")

  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.



Starting fine-tuning...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.68,0.677652,0.575605,0.420563
2,0.6773,0.675134,0.578629,0.429144
3,0.6743,0.672082,0.579637,0.440696
4,0.6643,0.669755,0.590222,0.489143
5,0.6687,0.66823,0.595262,0.548138
6,0.6672,0.66673,0.59123,0.493476
7,0.6678,0.665124,0.59627,0.514884
8,0.6578,0.663506,0.602823,0.546659
9,0.6667,0.662633,0.60131,0.543518
10,0.6601,0.661879,0.599294,0.545598




In [27]:
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model adapter and tokenizer saved to {OUTPUT_DIR}")

Model adapter and tokenizer saved to ./output/


In [28]:
print("\n--- Running example inference ---")

# Load the fine-tuned model for inference
# Note: We load the base model and then apply the saved adapter
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)
peft_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
peft_model.to(device)

classifier = pipeline(
    "text-classification",
    model=peft_model,
    tokenizer=tokenizer,
)

# Example texts
test_texts = [
    "I love being the center of attention at parties.", # Likely 'cEXT' = y
    "I prefer a quiet evening with a good book.",      # Likely 'cEXT' = n
    "Just finished a long day of work, feeling tired." # Ambiguous
]

results = classifier(test_texts, truncation=True, max_length=512)
print("\nInference results:")
for text, result in zip(test_texts, results):
    print(f"Text: '{text}'")
    print(f" -> Prediction: {result['label']} (Score: {result['score']:.4f})\n")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Running example inference ---


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The model 'PeftModelForSequenceClassification' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassifi


Inference results:
Text: 'I love being the center of attention at parties.'
 -> Prediction: LABEL_0 (Score: 0.5939)

Text: 'I prefer a quiet evening with a good book.'
 -> Prediction: LABEL_0 (Score: 0.5913)

Text: 'Just finished a long day of work, feeling tired.'
 -> Prediction: LABEL_0 (Score: 0.6006)

