In [1]:
# @title 1. Install Dependencies & Import Libraries
# !pip install transformers torch scikit-learn pandas numpy

import torch
import numpy as np
import pandas as pd
import re
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings

# Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Random seed for reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
warnings.filterwarnings("ignore")

Using device: cuda


In [2]:
# @title 2. Load Data & Preprocess Labels
from google.colab import drive
drive.mount('/content/drive')

# --- CONFIGURATION ---
# UPDATE THIS PATH to where your dataset is stored on Drive
FILE_PATH = '/content/drive/MyDrive/mbti_1.csv'
# ---------------------

def load_and_encode_data(path):
    df = pd.read_csv(path)
    print(f"Dataset loaded. Shape: {df.shape}")

    # Create 4 binary columns
    # 0 for first letter (I, N, F, P), 1 for second letter (E, S, T, J)
    # Note: You can adjust mapping preference.
    # Standard: I-E, N-S, F-T, P-J

    df['IE'] = df['type'].apply(lambda x: 0 if 'I' in x else 1) # 0=Introvert, 1=Extrovert
    df['NS'] = df['type'].apply(lambda x: 0 if 'N' in x else 1) # 0=Intuition, 1=Sensing
    df['TF'] = df['type'].apply(lambda x: 0 if 'T' in x else 1) # 0=Thinking, 1=Feeling
    df['JP'] = df['type'].apply(lambda x: 0 if 'J' in x else 1) # 0=Judging, 1=Perceiving

    return df

# Load data
try:
    df = load_and_encode_data(FILE_PATH)
    print(df[['type', 'IE', 'NS', 'TF', 'JP']].head())
except FileNotFoundError:
    print("Error: File not found. Please check FILE_PATH.")

Mounted at /content/drive
Dataset loaded. Shape: (8675, 2)
   type  IE  NS  TF  JP
0  INFJ   0   0   1   0
1  ENTP   1   0   0   1
2  INTP   0   0   0   1
3  INTJ   0   0   0   0
4  ENTJ   1   0   0   0


In [3]:
# @title 3. Text Cleaning & Custom Dataset Class

def clean_text(text):
    """
    1. Remove URLs
    2. Remove pipe separators |||
    3. Lowercase
    """
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = text.replace('|||', ' ')     # Replace separators with space
    text = re.sub(r'\s+', ' ', text).strip() # Remove multiple spaces
    return text.lower()

print("Cleaning texts... (this might take a minute)")
df['cleaned_posts'] = df['posts'].apply(clean_text)

class MBTIDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

Cleaning texts... (this might take a minute)


In [None]:
# @title 4. Generic Training & Evaluation Loop

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=targets
        )

        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets
            )

            loss = outputs.loss
            logits = outputs.logits

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    accuracy = correct_predictions.double() / n_examples
    return accuracy, np.mean(losses), all_preds, all_labels

In [7]:
# @title 4 & 5 Updated: Training with Class Weights (Replaces previous logic)

from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn

# --- Modified Training Function to accept Weights ---
def train_epoch_weighted(model, data_loader, optimizer, device, scheduler, n_examples, class_weights):
    model = model.train()
    losses = []
    correct_predictions = 0

    # Define Loss function with weights
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)

        # Note: We do NOT use model(labels=targets) here because we need custom loss
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        logits = outputs.logits

        # Calculate loss manually using our weighted function
        loss = loss_fn(logits, targets)

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

# --- Main Execution Loop ---

BERT_MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 5 # Increased slightly as weighted loss stabilizes slowly
LEARNING_RATE = 2e-5

dimensions = ['IE', 'NS', 'TF', 'JP']
results_store = {}

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

for dim in dimensions:
    print(f"\n{'='*30}")
    print(f" TRAINING CLASSIFIER FOR: {dim} (With Class Weights)")
    print(f"{'='*30}")

    X = df['cleaned_posts'].values
    y = df[dim].values

    # Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)

    # --- CALCULATE CLASS WEIGHTS HERE ---
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train),
        y=y_train
    )
    # Convert to Tensor and push to GPU
    weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    print(f"Class Weights for {dim}: {class_weights}")
    # ------------------------------------

    train_dataset = MBTIDataset(X_train, y_train, tokenizer, MAX_LEN)
    test_dataset = MBTIDataset(X_test, y_test, tokenizer, MAX_LEN)

    train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    model = BertForSequenceClassification.from_pretrained(BERT_MODEL_NAME, num_labels=2)
    model = model.to(device)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    best_val_loss = float('inf')

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')

        # Pass weights_tensor to the training function
        train_acc, train_loss = train_epoch_weighted(
            model, train_data_loader, optimizer, device, scheduler, len(train_dataset), weights_tensor
        )
        print(f'Train loss {train_loss:.4f} accuracy {train_acc:.4f}')

        # Use previous eval function
        val_acc, val_loss, preds, labels = eval_model(
            model, test_data_loader, device, len(test_dataset)
        )
        print(f'Val   loss {val_loss:.4f} accuracy {val_acc:.4f}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss

    print(f"\n--- Report for {dim} ---")
    print(classification_report(labels, preds, target_names=['Class 0', 'Class 1']))
    results_store[dim] = val_acc.item()

    del model
    torch.cuda.empty_cache()


 TRAINING CLASSIFIER FOR: IE (With Class Weights)
Class Weights for IE: [0.64969107 2.17010632]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Train loss 0.6307 accuracy 0.6912
Val   loss 0.6148 accuracy 0.7366
Epoch 2/5
Train loss 0.5369 accuracy 0.7677
Val   loss 0.5298 accuracy 0.7550
Epoch 3/5
Train loss 0.4255 accuracy 0.8346
Val   loss 0.5014 accuracy 0.7862
Epoch 4/5
Train loss 0.2859 accuracy 0.9102
Val   loss 0.5339 accuracy 0.7988
Epoch 5/5
Train loss 0.1928 accuracy 0.9471
Val   loss 0.6091 accuracy 0.8023

--- Report for IE ---
              precision    recall  f1-score   support

     Class 0       0.86      0.89      0.87      1335
     Class 1       0.58      0.53      0.55       400

    accuracy                           0.80      1735
   macro avg       0.72      0.71      0.71      1735
weighted avg       0.80      0.80      0.80      1735


 TRAINING CLASSIFIER FOR: NS (With Class Weights)
Class Weights for NS: [0.58007355 3.62212944]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Train loss 0.6607 accuracy 0.7138
Val   loss 0.4891 accuracy 0.8231
Epoch 2/5
Train loss 0.5449 accuracy 0.8133
Val   loss 0.5990 accuracy 0.7424
Epoch 3/5
Train loss 0.4427 accuracy 0.8736
Val   loss 0.4516 accuracy 0.8144
Epoch 4/5
Train loss 0.3580 accuracy 0.9190
Val   loss 0.5329 accuracy 0.8259
Epoch 5/5
Train loss 0.2771 accuracy 0.9510
Val   loss 0.5795 accuracy 0.8277

--- Report for NS ---
              precision    recall  f1-score   support

     Class 0       0.91      0.89      0.90      1496
     Class 1       0.38      0.42      0.40       239

    accuracy                           0.83      1735
   macro avg       0.65      0.66      0.65      1735
weighted avg       0.83      0.83      0.83      1735


 TRAINING CLASSIFIER FOR: TF (With Class Weights)
Class Weights for TF: [1.08948195 0.9241012 ]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Train loss 0.5990 accuracy 0.6764
Val   loss 0.4996 accuracy 0.7602
Epoch 2/5
Train loss 0.4465 accuracy 0.8030
Val   loss 0.4961 accuracy 0.7712
Epoch 3/5
Train loss 0.3078 accuracy 0.8805
Val   loss 0.5674 accuracy 0.7671
Epoch 4/5
Train loss 0.1881 accuracy 0.9393
Val   loss 0.7530 accuracy 0.7689
Epoch 5/5
Train loss 0.1171 accuracy 0.9679
Val   loss 0.9416 accuracy 0.7735

--- Report for TF ---
              precision    recall  f1-score   support

     Class 0       0.78      0.71      0.74       796
     Class 1       0.77      0.83      0.80       939

    accuracy                           0.77      1735
   macro avg       0.77      0.77      0.77      1735
weighted avg       0.77      0.77      0.77      1735


 TRAINING CLASSIFIER FOR: JP (With Class Weights)
Class Weights for JP: [1.26319621 0.82756976]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Train loss 0.6367 accuracy 0.6236
Val   loss 0.5700 accuracy 0.7170
Epoch 2/5
Train loss 0.5044 accuracy 0.7501
Val   loss 0.5653 accuracy 0.7003
Epoch 3/5
Train loss 0.3661 accuracy 0.8452
Val   loss 0.6698 accuracy 0.6951
Epoch 4/5
Train loss 0.2203 accuracy 0.9216
Val   loss 0.9827 accuracy 0.6818
Epoch 5/5
Train loss 0.1348 accuracy 0.9581
Val   loss 1.0566 accuracy 0.7020

--- Report for JP ---
              precision    recall  f1-score   support

     Class 0       0.63      0.61      0.62       687
     Class 1       0.75      0.76      0.76      1048

    accuracy                           0.70      1735
   macro avg       0.69      0.69      0.69      1735
weighted avg       0.70      0.70      0.70      1735



-------