In [7]:
# @title 1. Install Dependencies & Import Libraries
# !pip install transformers torch scikit-learn pandas numpy

import torch
import numpy as np
import pandas as pd
import re
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings

# Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Random seed for reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
warnings.filterwarnings("ignore")

Using device: cuda


In [8]:
# @title 2. Load Data & Preprocess Labels
from google.colab import drive
drive.mount('/content/drive')

# --- CONFIGURATION ---
# UPDATE THIS PATH to where your dataset is stored on Drive
FILE_PATH = '/content/drive/MyDrive/mbti_1.csv'
# ---------------------

def load_and_encode_data(path):
    df = pd.read_csv(path)
    print(f"Dataset loaded. Shape: {df.shape}")

    # Create 4 binary columns
    # 0 for first letter (I, N, F, P), 1 for second letter (E, S, T, J)
    # Note: You can adjust mapping preference.
    # Standard: I-E, N-S, F-T, P-J

    df['IE'] = df['type'].apply(lambda x: 0 if 'I' in x else 1) # 0=Introvert, 1=Extrovert
    df['NS'] = df['type'].apply(lambda x: 0 if 'N' in x else 1) # 0=Intuition, 1=Sensing
    df['TF'] = df['type'].apply(lambda x: 0 if 'T' in x else 1) # 0=Thinking, 1=Feeling
    df['JP'] = df['type'].apply(lambda x: 0 if 'J' in x else 1) # 0=Judging, 1=Perceiving

    return df

# Load data
try:
    df = load_and_encode_data(FILE_PATH)
    print(df[['type', 'IE', 'NS', 'TF', 'JP']].head())
except FileNotFoundError:
    print("Error: File not found. Please check FILE_PATH.")

Mounted at /content/drive
Dataset loaded. Shape: (8675, 2)
   type  IE  NS  TF  JP
0  INFJ   0   0   1   0
1  ENTP   1   0   0   1
2  INTP   0   0   0   1
3  INTJ   0   0   0   0
4  ENTJ   1   0   0   0


In [16]:
# @title 3. Text Cleaning & Custom Dataset Class
import string
import re

def clean_text(text):
  text = text.lower()
  text = re.sub(r'https?\S+|www\S+', '', text)
  text = re.sub(r'@\w+|#', '', text)

  mbti_types = [
            "infj",
            "entp",
            "intp",
            "intj",
            "entj",
            "enfj",
            "infp",
            "enfp",
            "isfp",
            "istp",
            "isfj",
            "istj",
            "estp",
            "esfp",
            "estj",
          "esfj",
  ]
  pattern = r"\b(" + "|".join(mbti_types) + r")\b"
  text = re.sub(pattern, "", text)
  text = re.sub(r'\bsent (from )?my \w+(\s\w+)? using tapatalk\b', '', text, flags=re.IGNORECASE)


  text = text.replace("|||", " ")

  text = re.sub(r'w w w', '', text)

  text = re.sub(r"\s+", " ", text).strip()

  return text

print("Cleaning texts... (this might take a minute)")
df['cleaned_posts'] = df['posts'].apply(clean_text)

class MBTIDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

Cleaning texts... (this might take a minute)


In [None]:
# @title 4. Generic Training & Evaluation Loop

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=targets
        )

        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets
            )

            loss = outputs.loss
            logits = outputs.logits

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    accuracy = correct_predictions.double() / n_examples
    return accuracy, np.mean(losses), all_preds, all_labels

In [13]:
def eval_model(model, data_loader, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets
            )

            loss = outputs.loss
            logits = outputs.logits

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    accuracy = correct_predictions.double() / n_examples
    return accuracy, np.mean(losses), all_preds, all_labels

In [17]:
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.utils.class_weight import compute_class_weight

# --- HELPER: Layer Freezing Logic ---
def freeze_roberta_layers(model, unfreeze_last_n_layers=2):
    """
    Freezes all layers except the classifier and the last N encoder layers.
    """
    # 1. Freeze everything first
    for param in model.parameters():
        param.requires_grad = False

    # 2. Unfreeze the Classifier Head (The part we initialized randomly)
    for param in model.classifier.parameters():
        param.requires_grad = True

    # 3. Unfreeze the last N layers of the Encoder
    # RoBERTa base has 12 layers (0-11).
    # If unfreeze_last_n_layers = 2, we want to train layer 10 and 11.
    total_layers = model.config.num_hidden_layers

    for i in range(total_layers - unfreeze_last_n_layers, total_layers):
        for param in model.roberta.encoder.layer[i].parameters():
            param.requires_grad = True

    # Verify optimization
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    all_params = sum(p.numel() for p in model.parameters())
    print(f" -> Layer Freezing Applied. Trainable Params: {trainable_params:,} / {all_params:,} ({(trainable_params/all_params):.1%})")

# --- Training Function (Weighted) ---
def train_epoch_weighted(model, data_loader, optimizer, device, scheduler, n_examples, class_weights):
    model = model.train()
    losses = []
    correct_predictions = 0
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        logits = outputs.logits
        loss = loss_fn(logits, targets)

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

# --- CONFIGURATION ---
ROBERTA_MODEL_NAME = 'roberta-base' # Changed from BERT
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 2e-5
UNFREEZE_LAST_N = 2 # We will only train the last 2 layers + Classifier

dimensions = ['IE', 'NS', 'TF', 'JP']
results_store = {}

# Use RoBERTa Tokenizer
print(f"Loading Tokenizer: {ROBERTA_MODEL_NAME}")
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_MODEL_NAME)

for dim in dimensions:
    print(f"\n{'='*40}")
    print(f" TRAINING RoBERTa (Frozen) FOR: {dim}")
    print(f"{'='*40}")

    X = df['cleaned_posts'].values
    y = df[dim].values

    # Stratified Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)

    # Compute Class Weights
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    print(f"Class Weights: {class_weights}")

    # Datasets
    train_dataset = MBTIDataset(X_train, y_train, tokenizer, MAX_LEN)
    test_dataset = MBTIDataset(X_test, y_test, tokenizer, MAX_LEN)

    train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # Initialize RoBERTa
    model = RobertaForSequenceClassification.from_pretrained(ROBERTA_MODEL_NAME, num_labels=2)
    model = model.to(device)

    # --- APPLY FREEZING ---
    freeze_roberta_layers(model, unfreeze_last_n_layers=UNFREEZE_LAST_N)
    # ----------------------

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    best_val_loss = float('inf')

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')

        train_acc, train_loss = train_epoch_weighted(
            model, train_data_loader, optimizer, device, scheduler, len(train_dataset), weights_tensor
        )
        print(f'Train loss {train_loss:.4f} accuracy {train_acc:.4f}')

        val_acc, val_loss, preds, labels = eval_model(
            model, test_data_loader, device, len(test_dataset)
        )
        print(f'Val   loss {val_loss:.4f} accuracy {val_acc:.4f}')

    print(f"\n--- Report for {dim} ---")
    print(classification_report(labels, preds, target_names=['Class 0', 'Class 1']))
    results_store[dim] = val_acc.item()

    del model
    torch.cuda.empty_cache()

Loading Tokenizer: roberta-base

 TRAINING RoBERTa (Frozen) FOR: IE
Class Weights: [0.64969107 2.17010632]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 -> Layer Freezing Applied. Trainable Params: 14,767,874 / 124,647,170 (11.8%)
Epoch 1/5
Train loss 0.6875 accuracy 0.5529
Val   loss 0.5525 accuracy 0.7429
Epoch 2/5
Train loss 0.6477 accuracy 0.6370
Val   loss 0.5154 accuracy 0.7493
Epoch 3/5
Train loss 0.6203 accuracy 0.6764
Val   loss 0.5230 accuracy 0.7493
Epoch 4/5
Train loss 0.6022 accuracy 0.6846
Val   loss 0.5713 accuracy 0.7101
Epoch 5/5
Train loss 0.5929 accuracy 0.7032
Val   loss 0.5625 accuracy 0.7147

--- Report for IE ---
              precision    recall  f1-score   support

     Class 0       0.84      0.77      0.81      1335
     Class 1       0.41      0.53      0.46       400

    accuracy                           0.71      1735
   macro avg       0.63      0.65      0.63      1735
weighted avg       0.74      0.71      0.73      1735


 TRAINING RoBERTa (Frozen) FOR: NS
Class Weights: [0.58007355 3.62212944]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 -> Layer Freezing Applied. Trainable Params: 14,767,874 / 124,647,170 (11.8%)
Epoch 1/5
Train loss 0.6972 accuracy 0.6275
Val   loss 0.5986 accuracy 0.8542
Epoch 2/5
Train loss 0.6633 accuracy 0.6523
Val   loss 0.5162 accuracy 0.7562
Epoch 3/5
Train loss 0.6286 accuracy 0.6821
Val   loss 0.4744 accuracy 0.7890
Epoch 4/5
Train loss 0.6032 accuracy 0.7108
Val   loss 0.5868 accuracy 0.6801
Epoch 5/5
Train loss 0.5880 accuracy 0.7114
Val   loss 0.5486 accuracy 0.7101

--- Report for NS ---
              precision    recall  f1-score   support

     Class 0       0.91      0.74      0.82      1496
     Class 1       0.24      0.51      0.33       239

    accuracy                           0.71      1735
   macro avg       0.57      0.63      0.57      1735
weighted avg       0.81      0.71      0.75      1735


 TRAINING RoBERTa (Frozen) FOR: TF
Class Weights: [1.08948195 0.9241012 ]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 -> Layer Freezing Applied. Trainable Params: 14,767,874 / 124,647,170 (11.8%)
Epoch 1/5
Train loss 0.6401 accuracy 0.6277
Val   loss 0.5928 accuracy 0.6847
Epoch 2/5
Train loss 0.5764 accuracy 0.7033
Val   loss 0.5628 accuracy 0.7055
Epoch 3/5
Train loss 0.5484 accuracy 0.7220
Val   loss 0.5553 accuracy 0.7233
Epoch 4/5
Train loss 0.5335 accuracy 0.7369
Val   loss 0.5539 accuracy 0.7159
Epoch 5/5
Train loss 0.5141 accuracy 0.7497
Val   loss 0.5655 accuracy 0.7205

--- Report for TF ---
              precision    recall  f1-score   support

     Class 0       0.74      0.61      0.67       796
     Class 1       0.71      0.81      0.76       939

    accuracy                           0.72      1735
   macro avg       0.72      0.71      0.71      1735
weighted avg       0.72      0.72      0.72      1735


 TRAINING RoBERTa (Frozen) FOR: JP
Class Weights: [1.26319621 0.82756976]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 -> Layer Freezing Applied. Trainable Params: 14,767,874 / 124,647,170 (11.8%)
Epoch 1/5
Train loss 0.6931 accuracy 0.5294
Val   loss 0.6815 accuracy 0.6115
Epoch 2/5
Train loss 0.6762 accuracy 0.5804
Val   loss 0.6377 accuracy 0.6398
Epoch 3/5
Train loss 0.6541 accuracy 0.6012
Val   loss 0.6389 accuracy 0.6305
Epoch 4/5
Train loss 0.6397 accuracy 0.6386
Val   loss 0.6469 accuracy 0.6357
Epoch 5/5
Train loss 0.6377 accuracy 0.6272
Val   loss 0.6451 accuracy 0.6323

--- Report for JP ---
              precision    recall  f1-score   support

     Class 0       0.53      0.57      0.55       687
     Class 1       0.70      0.67      0.69      1048

    accuracy                           0.63      1735
   macro avg       0.62      0.62      0.62      1735
weighted avg       0.64      0.63      0.63      1735



-------