In [None]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    pipeline,
)
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import getpass
import openai # For GPT-4o Zero-shot

In [None]:
# Column holding the y/n class label for the trait being predicted.
TARGET_COLUMN = "cEXT"
# Column holding the raw status text fed to every model.
TEXT_COLUMN = "STATUS"
# Location of the myPersonality CSV. Set the PERSONA_DATA_FILE environment
# variable to point at another copy; when it is unset the original path is
# used, so existing runs behave exactly as before.
DATA_FILE = os.environ.get(
    "PERSONA_DATA_FILE",
    "/data/jmharja/projects/PersonaClassifier/data/mypersonality.csv",
)

In [None]:
def load_and_prepare_data(file_path, text_col, target_col):
    """Read the CSV and build the splits shared by every experiment.

    Rows missing either ``text_col`` or ``target_col`` are dropped. The target
    is mapped to a binary ``label`` (1 when the value is 'y', case-insensitive,
    0 otherwise) and the text column is renamed to ``text``.

    The data is split 80/20 into train/test, stratified on the label. A further
    10% of the training portion is then held out as a validation set for the
    Hugging Face datasets.

    Returns:
        A tuple ``(dataset_dict, train_df, test_df)``: a ``DatasetDict`` with
        'train', 'validation' and 'test' splits, plus the pandas train and test
        frames. ``train_df`` is the full 80% portion, i.e. it still contains
        the rows that make up the 'validation' split.
    """
    print("Loading and preparing data...")

    def _to_binary(value):
        # 'y' / 'Y' marks the positive class; anything else is negative.
        if str(value).lower() == 'y':
            return 1
        return 0

    raw_frame = pd.read_csv(file_path, encoding='Windows-1252')
    complete_rows = raw_frame.dropna(subset=[text_col, target_col])
    labelled = complete_rows.assign(label=complete_rows[target_col].apply(_to_binary))
    df_processed = labelled[[text_col, 'label']].rename(columns={text_col: 'text'})

    train_df, test_df = train_test_split(
        df_processed,
        test_size=0.2,
        random_state=42,
        stratify=df_processed['label'],
    )

    # Hugging Face views of the same split; the pandas index is not carried over.
    hf_train_val = Dataset.from_pandas(train_df, preserve_index=False)
    hf_test = Dataset.from_pandas(test_df, preserve_index=False)
    inner_split = hf_train_val.train_test_split(test_size=0.1, seed=42)

    dataset_dict = DatasetDict({
        'train': inner_split['train'],
        'validation': inner_split['test'],
        'test': hf_test,
    })

    print("Data preparation complete.")
    return dataset_dict, train_df, test_df

# Build the splits a single time; every experiment below reuses these objects.
main_dataset_dict, train_df, test_df = load_and_prepare_data(
    file_path=DATA_FILE,
    text_col=TEXT_COLUMN,
    target_col=TARGET_COLUMN,
)
print("\nDataset structure for Transformer models:", main_dataset_dict, sep="\n")

In [None]:
from huggingface_hub import whoami

# Report which Hugging Face account the stored credentials resolve to, so a
# missing or invalid login is noticed before a gated model is requested.
try:
    user_info = whoami()
    print("Authenticated as:", user_info['name'])
except Exception as e:
    # Best-effort diagnostic: report the failure rather than halt the notebook.
    print("Not logged in or token is invalid.")
    print(e)

In [None]:
import os

# Look for a token exported under either environment variable name; per the
# message below, a token found here might override the interactive login.
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN") or os.getenv("HF_TOKEN")
if not hf_token:
    print("No Hugging Face token found in environment variables. That's good.")
else:
    print("An environment variable for the Hugging Face token is set.")
    print("This might be overriding your login.")

In [None]:
## Experiment 1: LoRA Fine-Tuning with Llama-3-8B
# Only a small number of adapter parameters are trained on top of a frozen
# Llama-3-8B model, which keeps memory use and compute low.
# **Note:** The smallest available Llama-3 model is 8B, and downloading it
# requires authentication with Hugging Face.

# --- Llama-3 LoRA Setup ---
# Both the import and the login call stay inside the try block, so an
# ImportError raised by either one produces the hint below.
try:
    from huggingface_hub import notebook_login
    notebook_login()
except ImportError:
    print("Please run `pip install huggingface_hub` and log in to use gated models like Llama-3.")

# --- Config ---
MODEL_CHECKPOINT_LLAMA = "meta-llama/Meta-Llama-3-8B"
OUTPUT_DIR_LLAMA = "./big5_classifier_llama_lora"

# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# --- Model & Tokenizer ---
# MODEL_CHECKPOINT_LLAMA is taken from the config cell above. It must name a
# Llama-style checkpoint: the LoRA target modules below ("q_proj", "v_proj")
# are Llama attention projection names, and an encoder such as RoBERTa names
# its projections differently, so get_peft_model() would find nothing to adapt.
tokenizer_llama = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT_LLAMA)

# Reuse the EOS token for padding when the tokenizer defines no pad token, so
# that variable-length sequences can be batched.
if tokenizer_llama.pad_token is None:
    tokenizer_llama.pad_token = tokenizer_llama.eos_token

model_llama_lora = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT_LLAMA,
    num_labels=2,
    torch_dtype=torch.bfloat16,  # Use bfloat16 for memory efficiency
    device_map="auto",
)
# Keep the model config in sync with the tokenizer's padding token.
model_llama_lora.config.pad_token_id = tokenizer_llama.pad_token_id

# --- LoRA Config ---
lora_config_llama = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # Target modules for Llama
)
# Wrap the base model with the adapters and report how many parameters train.
model_llama_lora = get_peft_model(model_llama_lora, lora_config_llama)
model_llama_lora.print_trainable_parameters()

# --- Tokenization ---
def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with ``tokenizer_llama``.

    Inputs longer than 256 tokens are truncated. No padding argument is passed
    here.
    """
    texts = examples["text"]
    return tokenizer_llama(texts, max_length=256, truncation=True)

# Tokenize every split in batches and drop the raw text column afterwards.
tokenized_dataset_llama = main_dataset_dict.map(tokenize_function, batched=True).remove_columns(["text"])

# --- Trainer ---
trainer_llama = Trainer(
    model=model_llama_lora,
    args=TrainingArguments(
        output_dir=OUTPUT_DIR_LLAMA,
        num_train_epochs=3,
        per_device_train_batch_size=4,  # Lower batch size for large models
        gradient_accumulation_steps=4,  # 4 x 4 = 16 examples per optimizer step per device
        learning_rate=2e-4,
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset_llama["train"],
    eval_dataset=tokenized_dataset_llama["validation"],
    # Pads each batch to the length of its longest sequence.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer_llama),
    # Accuracy of the arg-max class over the logits on the evaluation split.
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1))},
)

# --- Training ---
# This call runs the full fine-tuning job; comment it out to skip training.
print("\nStarting Llama-3 LoRA fine-tuning...")
trainer_llama.train()
print("Llama-3 LoRA fine-tuning complete.")

In [None]:
## Experiment 2: Full Fine-Tuning with RoBERTa-Large
# The traditional recipe: all 355 million parameters of RoBERTa-Large are
# updated during training. It can be very effective, but it needs more
# computational resources than the adapter-based approach.
# --- RoBERTa Full Fine-Tune Setup ---
MODEL_CHECKPOINT_ROBERTA = "roberta-large"
OUTPUT_DIR_ROBERTA = "./big5_classifier_roberta_full"

# --- Model & Tokenizer ---
tokenizer_roberta = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT_ROBERTA)
model_roberta_full = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT_ROBERTA,
    num_labels=2,
)
# Move the freshly loaded model onto the device selected earlier in the notebook.
model_roberta_full = model_roberta_full.to(device)
print(f"\nRoBERTa-Large has {model_roberta_full.num_parameters():,} total parameters (all trainable).")

# --- Tokenization ---
def tokenize_function_roberta(examples):
    """Tokenize the ``text`` field of ``examples`` with ``tokenizer_roberta``.

    Inputs longer than 512 tokens are truncated. No padding argument is passed
    here.
    """
    texts = examples["text"]
    return tokenizer_roberta(texts, max_length=512, truncation=True)
# Tokenize every split in batches and drop the raw text column afterwards.
tokenized_dataset_roberta = main_dataset_dict.map(tokenize_function_roberta, batched=True).remove_columns(["text"])

# --- Trainer ---
trainer_roberta = Trainer(
    model=model_roberta_full,
    args=TrainingArguments(
        output_dir=OUTPUT_DIR_ROBERTA,
        num_train_epochs=16,
        per_device_train_batch_size=16,  # Can be higher than Llama-3
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset_roberta["train"],
    eval_dataset=tokenized_dataset_roberta["validation"],
    # Pads each batch to the length of its longest sequence.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer_roberta),
    # Accuracy of the arg-max class over the logits on the evaluation split.
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1))},
)
# --- Training ---
# This call runs the full fine-tuning job; comment it out to skip training.
print("\nStarting RoBERTa-Large full fine-tuning...")
trainer_roberta.train()
print("RoBERTa-Large full fine-tuning complete.")

In [None]:
# ## Experiment 3: Zero-Shot Classification with GPT-4o
# No training data is used here. A powerful general-purpose model is simply
# asked, through a carefully crafted prompt, to classify each text using the
# knowledge it already has.
# --- GPT-4o Zero-Shot Setup ---
# The key is read interactively without echo, so it is never stored in the notebook.
try:
    openai.api_key = getpass.getpass("Enter your OpenAI API key: ")
except Exception as err:
    print("Could not set OpenAI API key.", err)
    
# Display name and short definition for each Big Five trait, keyed by the
# lower-cased dataset column code. Only "cEXT" is used elsewhere in this
# notebook; the other codes assume the same naming scheme -- verify them
# against the CSV header before relying on them.
_TRAIT_PROMPT_INFO = {
    "cext": ("Extraversion", "being outgoing, talkative, and energetic"),
    "cneu": ("Neuroticism", "being prone to anxiety, worry, and emotional instability"),
    "cagr": ("Agreeableness", "being compassionate, cooperative, and trusting"),
    "ccon": ("Conscientiousness", "being organized, dependable, and self-disciplined"),
    "copn": ("Openness", "being curious, imaginative, and open to new experiences"),
}


def _describe_trait(trait):
    """Return ``(display_name, definition)`` for a trait code or trait name.

    ``definition`` is ``None`` when the trait is not recognised; the caller
    then leaves the definition sentence out of the prompt.
    """
    key = str(trait).strip().lower()
    if key in _TRAIT_PROMPT_INFO:
        return _TRAIT_PROMPT_INFO[key]
    for name, definition in _TRAIT_PROMPT_INFO.values():
        if name.lower() == key:
            return name, definition
    return str(trait), None


def classify_with_gpt4o(text, trait):
    """Classifies a single text using GPT-4o with a zero-shot prompt.

    Args:
        text: The text to classify.
        trait: A trait column code such as ``"cEXT"`` or a trait name such as
            ``"Extraversion"``. Known traits are named and defined in the
            prompt; an unknown trait is passed through verbatim with no
            definition sentence.

    Returns:
        1 if the model's reply contains 'yes', 0 for any other reply, and -1
        when the request fails or the reply has no text.
    """
    trait_name, definition = _describe_trait(trait)

    prompt_lines = [
        "You are a psychology expert. Read the following text and determine if it "
        f"indicates the author has the personality trait of '{trait_name}'.",
    ]
    if definition:
        # Only state a definition that actually belongs to the requested trait.
        prompt_lines.append(f"The trait '{trait_name}' is defined as {definition}.")
    prompt_lines.extend([
        "Respond with only the word 'yes' or 'no'.",
        "",
        f'Text: "{text}"',
    ])
    prompt = "\n".join(prompt_lines)

    try:
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=3,
            temperature=0,
        )
        content = response.choices[0].message.content
        if content is None:
            # The API can return a message with no text; report it as an error.
            print("An error occurred: GPT-4o returned an empty response.")
            return -1
        answer = content.strip().lower()
        return 1 if 'yes' in answer else 0
    except Exception as e:
        # Best-effort: the caller filters out -1 results instead of aborting.
        print(f"An error occurred: {e}")
        return -1 # Return -1 for errors

# --- Evaluation ---
print("\nRunning GPT-4o zero-shot evaluation...")
# Note: This makes one API call per evaluated item, which can be slow and costly,
# so only a small sample is scored. The sample size is capped at the size of
# the test set so a tiny test set does not make `sample` raise.
sample_test_df = test_df.sample(n=min(10, len(test_df)), random_state=42)
predictions_gpt = [classify_with_gpt4o(text, TARGET_COLUMN) for text in sample_test_df['text']]
true_labels_gpt = sample_test_df['label'].tolist()

# Filter out errors (-1) in a single pass so predictions and labels stay aligned.
scored_pairs = [
    (pred, label)
    for pred, label in zip(predictions_gpt, true_labels_gpt)
    if pred != -1
]
valid_preds = [pred for pred, label in scored_pairs]
valid_labels = [label for pred, label in scored_pairs]

if valid_labels:
    accuracy_gpt = accuracy_score(valid_labels, valid_preds)
    # Report the number of items actually scored, not the requested sample size.
    print(f"GPT-4o Zero-Shot Accuracy on {len(valid_labels)} samples: {accuracy_gpt:.4f}")
else:
    print("Could not get any valid predictions from GPT-4o.")

In [None]:
## Experiment 4: SVM with Simulated LIWC Features
# This is a classic NLP approach. Instead of deep learning, we engineer features using a lexicon (like LIWC) and train a simple, powerful classifier like an SVM.
# **Note:** LIWC is proprietary. We simulate its output by creating a simple word-counting function.
# --- SVM + LIWC Setup ---
def simulate_liwc_features(text):
    """
    Rough stand-in for LIWC: count whole-word matches per predefined category.

    The text is lower-cased first, so matching is case-insensitive. Returns a
    dict mapping each category name to its match count. A real analysis would
    use the official LIWC tool/lexicon instead.
    """
    category_patterns = {
        'pronoun_i': r'\b(i|me|my|mine)\b',
        'positive_affect': r'\b(love|nice|sweet|happy|good)\b',
        'negative_affect': r'\b(hate|sad|angry|bad|awful)\b',
        'social': r'\b(friend|party|talk|we|us|our)\b',
        'work': r'\b(work|job|office|company)\b',
    }
    lowered = text.lower()
    return {
        category: len(re.findall(pattern, lowered))
        for category, pattern in category_patterns.items()
    }

# --- Feature Extraction ---
import re # Needed for the LIWC simulation

print("\nExtracting simulated LIWC features...")
# One feature dict per status text, in the same row order as the source frames.
train_features = list(map(simulate_liwc_features, train_df['text']))
test_features = list(map(simulate_liwc_features, test_df['text']))

# Tabular form: one column per simulated category.
train_features_df = pd.DataFrame(train_features)
test_features_df = pd.DataFrame(test_features)

# Targets for fitting and scoring.
y_train, y_test = train_df['label'], test_df['label']

# --- SVM Training ---
print("Training SVM model...")
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(train_features_df, y_train)

# --- Evaluation ---
# Score on the held-out test rows only.
predictions_svm = svm_model.predict(test_features_df)
accuracy_svm = accuracy_score(y_test, predictions_svm)
print(f"SVM + LIWC Accuracy: {accuracy_svm:.4f}")