# 1. Data Loading, Cleaning & Splitting (MBTI Preprocessing)

In [None]:
# ==============================
# src/data.py
# ==============================
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
import re
import matplotlib.pyplot as plt

# ==============================
# 1. Load & Inspect Data
# ==============================
def load_data(path):
    """
    Read the MBTI CSV at ``path`` and print a quick summary of it.

    The file is parsed with the Python engine and lines the parser cannot
    handle are skipped. The summary printed is the row count, the number of
    distinct values in the ``type`` column, and the per-type counts.

    Returns the loaded DataFrame.
    """
    frame = pd.read_csv(path, engine="python", on_bad_lines="skip")
    mbti_types = frame["type"]

    print("Số mẫu:", len(frame))
    print("Số loại MBTI khác nhau:", mbti_types.nunique())
    print(mbti_types.value_counts())
    return frame

# ==============================
# 2. Encode MBTI -> 4 nhãn binary
# ==============================
def mbti_to_binary(mbti):
    """
    Encode a 4-letter MBTI string as four binary labels.

    Each position is compared (case-sensitively) with the letter that maps to
    0 on that axis: "I", "N", "T", "J". Any other character maps to 1.

    Returns a dict with keys "IE", "NS", "TF", "JP" (in that order).
    """
    # (axis name, letter encoded as 0), listed in string-position order.
    zero_letters = (("IE", "I"), ("NS", "N"), ("TF", "T"), ("JP", "J"))
    return {
        axis: (0 if mbti[position] == zero_letter else 1)
        for position, (axis, zero_letter) in enumerate(zero_letters)
    }

def add_binary_columns(df):
    """
    Return a copy of ``df`` with the four binary label columns added.

    The columns ``mbti_IE``, ``mbti_NS``, ``mbti_TF`` and ``mbti_JP`` are
    derived from the ``type`` column via ``mbti_to_binary``. The input
    DataFrame itself is not modified.
    """
    labelled = df.copy()
    for axis in ("IE", "NS", "TF", "JP"):
        # `key=axis` binds the current axis to the lambda at definition time.
        labelled[f"mbti_{axis}"] = labelled["type"].apply(
            lambda mbti, key=axis: mbti_to_binary(mbti)[key]
        )
    return labelled

# ==============================
# 3. Làm sạch văn bản
# ==============================
def clean_text(text: str) -> str:
    """
    Normalise a raw post string.

    Steps, in order:
    - replace links (``http...`` / ``www....``) with a space
    - replace every character other than a-z, A-Z, 0-9, whitespace and the
      punctuation ``. , ! ? '`` with a space
    - collapse runs of whitespace and strip both ends

    Missing values (``None`` or a float NaN, e.g. an empty CSV cell read back
    by pandas) yield an empty string rather than the literal text
    "None"/"nan". Any other non-string input is converted with ``str()``.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text)
    text = re.sub(r"http\S+|www\.\S+", " ", text)             # drop links
    text = re.sub(r"[^a-zA-Z0-9\s.,!?']", " ", text)          # drop unusual characters
    return re.sub(r"\s+", " ", text).strip()                  # collapse whitespace

# ==============================
# 4. Dataset class cho BERT
# ==============================
class MBTIDataset(Dataset):
    """
    PyTorch dataset over an MBTI DataFrame.

    Each item is a dict with:
    - ``input_ids``: token ids produced by the tokenizer
    - ``attention_mask``: the tokenizer's attention mask
    - ``labels``: float tensor with the 4 binary labels (IE, NS, TF, JP)

    Args:
        df: DataFrame providing a ``posts`` column and the label columns
            ``mbti_IE``, ``mbti_NS``, ``mbti_TF``, ``mbti_JP``.
        tokenizer: callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=...,
            return_tensors="pt")`` and returning a mapping with
            ``input_ids`` and ``attention_mask``.
        max_len: length every sample is padded/truncated to.
        augment_fn: optional ``str -> str`` augmentation applied to the
            cleaned text.
    """

    def __init__(self, df, tokenizer, max_len=256, augment_fn=None):
        # Reset the index so positional lookups with iloc line up with 0..n-1.
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.augment_fn = augment_fn

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # The whole concatenated posts field of the user is kept as one text.
        text = clean_text(row.get("posts", ""))

        # Augmentation is best-effort: on failure the un-augmented text is
        # used, but the failure is reported instead of silently discarded.
        # Only Exception is caught so KeyboardInterrupt/SystemExit propagate.
        if self.augment_fn:
            try:
                text = self.augment_fn(text)
            except Exception as exc:
                import warnings

                warnings.warn(
                    f"augment_fn failed for sample {idx}; "
                    f"using un-augmented text ({exc!r})",
                    RuntimeWarning,
                )

        # 4-dimensional label vector
        labels = torch.tensor(
            [row["mbti_IE"], row["mbti_NS"], row["mbti_TF"], row["mbti_JP"]],
            dtype=torch.float
        )

        tokens = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        # return_tensors="pt" adds a leading batch dimension; squeeze it so
        # the DataLoader can stack samples directly.
        return {
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens["attention_mask"].squeeze(0),
            "labels": labels
        }

# ==============================
# 5. Chạy trực tiếp để tiền xử lý & lưu
# ==============================
if __name__ == "__main__":
    # Load the raw CSV, attach the four binary labels and clean every user's
    # posts in place (one row per user is kept; posts are not exploded).
    df = add_binary_columns(load_data("/kaggle/input/mbti-type/mbti_1.csv"))
    df["posts"] = df["posts"].apply(clean_text)

    # Persist the preprocessed table.
    df.to_csv("/kaggle/working/mbti_clean.csv", index=False)
    print("✅ Đã lưu xong mbti_clean.csv (giữ nguyên 1 dòng/user, có 4 nhãn binary)")

    # Bar chart of the 0/1 counts for each label column, one subplot each.
    label_cols = ["mbti_IE", "mbti_NS", "mbti_TF", "mbti_JP"]
    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    axes = axes.flatten()

    for ax, col in zip(axes, label_cols):
        counts = df[col].value_counts().sort_index()  # ordered 0 then 1
        counts.plot(kind="bar", ax=ax)
        ax.set_title(f"Phân phối nhãn {col}")
        ax.set_xticklabels(["0", "1"], rotation=0)
        # Write each count just above its bar.
        for bar_pos, bar_height in enumerate(counts):
            ax.text(bar_pos, bar_height, str(bar_height), ha="center", va="bottom")

    plt.tight_layout()
    plt.show()


# 2. MBTIModel: BERT-based Classifier

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

# ==============================
# MBTI Classification Model
# ==============================
class MBTIModel(nn.Module):
    """
    BERT encoder followed by a pooling step and a 4-logit classifier head.

    Args:
        model_name: name/path passed to ``BertModel.from_pretrained``.
        dropout: dropout probability used before the classifier and inside
            the hidden layer of the head.
        use_hidden_layer: if True the head is
            Linear -> BatchNorm1d -> GELU -> Dropout -> Linear; otherwise a
            single Linear layer.
        pooling: one of ``"cls"``, ``"mean"``, ``"max"``, ``"cls+mean"``.
            An unknown value raises ``ValueError`` on the first forward pass.
    """

    def __init__(self, model_name="bert-base-uncased", dropout=0.4,
                 use_hidden_layer=True, pooling="cls+mean"):
        super().__init__()

        # Pre-trained encoder; nothing is frozen here, so all of its
        # parameters are exposed to the optimizer for fine-tuning.
        self.bert = BertModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size

        self.pooling = pooling
        self.dropout = nn.Dropout(dropout)

        # "cls+mean" concatenates two vectors, doubling the head's input size.
        input_dim = hidden_size * 2 if pooling == "cls+mean" else hidden_size

        if use_hidden_layer:
            self.classifier = nn.Sequential(
                nn.Linear(input_dim, input_dim // 2),
                nn.BatchNorm1d(input_dim // 2),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(input_dim // 2, 4)  # one logit per MBTI axis
            )
        else:
            self.classifier = nn.Linear(input_dim, 4)

    @staticmethod
    def _masked_mean(hidden, attention_mask):
        """Average ``hidden`` over the positions where ``attention_mask`` is 1."""
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(1)
        # Clamp so a row with no attended tokens yields zeros instead of 0/0 = NaN.
        counts = mask.sum(1).clamp(min=1.0)
        return summed / counts

    def forward(self, input_ids, attention_mask):
        """
        Encode the batch, pool the token embeddings and classify.

        Returns the raw logits produced by the classifier head (4 per sample).

        Raises:
            ValueError: if ``self.pooling`` is not a supported mode.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True  # so last_hidden_state is available by name
        )
        hidden = outputs.last_hidden_state

        if self.pooling == "cls":
            # Embedding of the first token only.
            pooled = hidden[:, 0, :]
        elif self.pooling == "mean":
            pooled = self._masked_mean(hidden, attention_mask)
        elif self.pooling == "max":
            # Push padded positions to the lowest value representable in the
            # tensor's dtype so they cannot win the max (a fixed -1e9 is not
            # representable in float16).
            masked = hidden.masked_fill(
                attention_mask.unsqueeze(-1) == 0, torch.finfo(hidden.dtype).min
            )
            pooled = masked.max(1).values
        elif self.pooling == "cls+mean":
            cls_emb = hidden[:, 0, :]
            mean_emb = self._masked_mean(hidden, attention_mask)
            pooled = torch.cat([cls_emb, mean_emb], dim=1)
        else:
            raise ValueError(f"Unknown pooling: {self.pooling}")

        x = self.dropout(pooled)
        logits = self.classifier(x)

        return logits


# 3. MBTI Model Training (BERT + BCEWithLogitsLoss + Mixed Precision)

In [None]:
# ==============================
# src/train.py
# ==============================
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
import random
import numpy as np

# ==============================
# 0. Set seed for reproducibility
# ==============================
def set_seed(seed=42):
    """
    Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    with ``seed``, and switch cuDNN to deterministic mode with benchmarking
    turned off.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(42)

# ==============================
# 1. Train helper
# ==============================
def train_epoch(model, dataloader, optimizer, scheduler, device, loss_fn, epoch, epochs, scaler):
    """
    Train ``model`` for one epoch with mixed precision.

    Args:
        model: called as ``model(input_ids, attention_mask)``.
        dataloader: yields dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        optimizer: optimizer stepped once per batch through ``scaler``.
        scheduler: LR scheduler advanced once per *applied* optimizer step.
        device: device the batch tensors are moved to.
        loss_fn: called as ``loss_fn(outputs, labels)``.
        epoch: zero-based epoch index (only used for the progress-bar label).
        epochs: total number of epochs (only used for the progress-bar label).
        scaler: gradient scaler used to scale the loss and step the optimizer.

    Returns:
        The mean of the per-batch losses over the epoch.
    """
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs} [Train]", leave=False)

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(device, non_blocking=True)
        labels = batch["labels"].to(device, non_blocking=True)

        optimizer.zero_grad()

        with autocast():  # mixed precision forward pass
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)

        scaler.scale(loss).backward()

        # When the scaler finds inf/NaN gradients it skips optimizer.step()
        # and lowers its scale in update(). Advancing the scheduler on such a
        # batch would move the LR schedule ahead of the optimizer, so only
        # step it when the scale did not decrease.
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        if scaler.get_scale() >= scale_before:
            scheduler.step()

        # Read the loss once per batch and reuse the value.
        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix(loss=batch_loss)

    return total_loss / len(dataloader)

# ==============================
# 2. Evaluation helper
# ==============================
def eval_epoch(model, dataloader, device, loss_fn, epoch, epochs, mode="Valid"):
    """
    Run ``model`` over ``dataloader`` without gradients and score it.

    Logits are passed through a sigmoid and thresholded at 0.5 to obtain the
    binary predictions. ``epoch``, ``epochs`` and ``mode`` only affect the
    progress-bar label.

    Returns:
        ``(mean_loss, accuracy, macro_f1)`` where the mean loss is the average
        of the per-batch losses and the two scores come from
        ``accuracy_score`` / ``f1_score(average="macro")`` on the stacked
        label and prediction arrays.
    """
    model.eval()
    running_loss = 0
    label_chunks, pred_chunks = [], []

    batches = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs} [{mode}]", leave=False)
    with torch.no_grad():
        for batch in batches:
            ids, mask, targets = (
                batch[key].to(device, non_blocking=True)
                for key in ("input_ids", "attention_mask", "labels")
            )

            with autocast():
                logits = model(ids, mask)
                batch_loss = loss_fn(logits, targets)

            running_loss += batch_loss.item()

            # logits -> probabilities -> hard 0/1 decisions
            batch_probs = torch.sigmoid(logits).cpu().numpy()
            label_chunks.append(targets.cpu().numpy())
            pred_chunks.append((batch_probs >= 0.5).astype(int))

            batches.set_postfix(loss=batch_loss.item())

    y_true = np.vstack(label_chunks)
    y_pred = np.vstack(pred_chunks)

    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")

    return running_loss / len(dataloader), accuracy, macro_f1

# ==============================
# 3. Main training loop
# ==============================
def main():
    """
    End-to-end training entry point.

    Builds (or reuses) the cleaned CSV, splits it into train/valid/test with
    stratification on ``type``, writes the test split to ``test.csv``,
    fine-tunes ``MBTIModel`` with a ``pos_weight``-ed BCE-with-logits loss
    under mixed precision, saves the weights with the best validation
    macro-F1 and writes a resumable checkpoint after every epoch.
    """
    # ------------------------------
    # Config
    # ------------------------------
    model_name = "bert-base-uncased"
    batch_size = 8
    max_len = 256
    lr = 2e-5
    epochs = 40
    num_workers = 4
    save_dir = "/kaggle/working/"
    os.makedirs(save_dir, exist_ok=True)

    best_model_path = os.path.join(save_dir, "mbti_best.pt")
    ckpt_path = os.path.join(save_dir, "mbti_ckpt.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    # ------------------------------
    # Load & preprocess data
    # ------------------------------
    data_dir = "/kaggle/working/"
    os.makedirs(data_dir, exist_ok=True)

    clean_csv = os.path.join(data_dir, "mbti_clean.csv")
    if not os.path.exists(clean_csv):
        # Prefer a raw copy in the working directory; otherwise fall back to
        # the input dataset location used by the preprocessing cell.
        raw_csv = os.path.join(data_dir, "mbti_1.csv")
        if not os.path.exists(raw_csv):
            raw_csv = "/kaggle/input/mbti-type/mbti_1.csv"
        df = load_data(raw_csv)
        df = add_binary_columns(df)
        df["posts"] = df["posts"].apply(clean_text)
        df.to_csv(clean_csv, index=False)

    df = pd.read_csv(clean_csv)
    # Posts that were cleaned down to "" are read back from the CSV as NaN.
    df["posts"] = df["posts"].fillna("")

    # Split train/valid/test (70/20/10)
    train_df, temp_df = train_test_split(
        df, test_size=0.3, random_state=42, stratify=df["type"]
    )
    valid_df, test_df = train_test_split(
        temp_df, test_size=0.3333, random_state=42, stratify=temp_df["type"]
    )
    test_df.to_csv(os.path.join(data_dir, "test.csv"), index=False)
    print(f"Train: {len(train_df)}, Valid: {len(valid_df)}, Test: {len(test_df)}")

    # ------------------------------
    # Tokenizer + Dataset + DataLoader
    # ------------------------------
    tokenizer = BertTokenizer.from_pretrained(model_name)
    train_dataset = MBTIDataset(train_df, tokenizer, max_len=max_len)
    valid_dataset = MBTIDataset(valid_df, tokenizer, max_len=max_len)

    # drop_last avoids a trailing batch of size 1, which BatchNorm1d in the
    # classifier head cannot process in training mode.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers, pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size,
                              num_workers=num_workers, pin_memory=True)

    # ------------------------------
    # Model + Optimizer + Scheduler + Loss
    # ------------------------------
    model = MBTIModel(model_name=model_name, pooling="cls+mean", dropout=0.4).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps,
    )

    # pos_weight = (#negatives / #positives) per label, to balance the classes
    labels = train_df[["mbti_IE","mbti_NS","mbti_TF","mbti_JP"]].values
    pos_weights = (labels.shape[0] - labels.sum(axis=0)) / labels.sum(axis=0)
    pos_weights = torch.tensor(pos_weights, dtype=torch.float).to(device)
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weights)

    # Gradient scaler for mixed precision
    scaler = GradScaler()

    # ------------------------------
    # Resume from checkpoint if one exists
    # ------------------------------
    start_epoch, best_valid_f1 = 0, 0.0
    if os.path.exists(ckpt_path):
        print(f"🔄 Found checkpoint: {ckpt_path}, loading...")
        ckpt = torch.load(ckpt_path, map_location=device)
        model.load_state_dict(ckpt["model_state"])
        optimizer.load_state_dict(ckpt["optimizer_state"])
        scheduler.load_state_dict(ckpt["scheduler_state"])
        # Checkpoints written before the scaler was saved lack this key, and
        # a disabled scaler saves an empty dict; skip both cases.
        if ckpt.get("scaler_state"):
            scaler.load_state_dict(ckpt["scaler_state"])
        start_epoch = ckpt["epoch"] + 1
        best_valid_f1 = ckpt["best_valid_f1"]
        print(f"👉 Resume training from epoch {start_epoch}")

    # ------------------------------
    # Training loop
    # ------------------------------
    for epoch in range(start_epoch, epochs):
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, device, loss_fn, epoch, epochs, scaler)
        valid_loss, valid_acc, valid_f1 = eval_epoch(model, valid_loader, device, loss_fn, epoch, epochs)

        print(f"Epoch {epoch+1}/{epochs} | "
              f"Train Loss: {train_loss:.4f} | "
              f"Valid Loss: {valid_loss:.4f} | "
              f"Acc: {valid_acc:.4f} | Macro-F1: {valid_f1:.4f}")

        # Keep the weights with the best validation macro-F1
        if valid_f1 > best_valid_f1:
            best_valid_f1 = valid_f1
            torch.save(model.state_dict(), best_model_path)
            print(f"✅ Saved best model to {best_model_path}")

        # Write a resumable checkpoint after every epoch
        torch.save({
            "epoch": epoch,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "scheduler_state": scheduler.state_dict(),
            "scaler_state": scaler.state_dict(),
            "best_valid_f1": best_valid_f1,
        }, ckpt_path)
        print(f"💾 Saved checkpoint (epoch {epoch + 1}) to {ckpt_path}")

# ==============================
# Run
# ==============================
if __name__ == "__main__":
    main()


# 4. 🧪 MBTI Model Evaluation & Analysis

In [None]:
# ==============================
# src/evaluate.py
# ==============================
import os
import json
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from torch.utils.data import DataLoader
from transformers import BertTokenizer

# ==============================
# 1. Core evaluation function
# ==============================
def evaluate(model, dataloader, device):
    """
    Run ``model`` over every batch of ``dataloader`` without gradients.

    Logits are converted to probabilities with a sigmoid and thresholded at
    0.5 to obtain the predicted labels.

    Returns a tuple of three row-stacked numpy arrays:
    - ground-truth labels
    - predicted labels (integers)
    - predicted probabilities
    """
    model.eval()
    collected = {"labels": [], "preds": [], "probs": []}

    with torch.no_grad():
        for batch in dataloader:
            logits = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
            )
            batch_probs = torch.sigmoid(logits).cpu().numpy()

            collected["labels"].append(batch["labels"].cpu().numpy())
            collected["probs"].append(batch_probs)
            collected["preds"].append((batch_probs >= 0.5).astype(int))

    return tuple(np.vstack(collected[key]) for key in ("labels", "preds", "probs"))

# ==============================
# 2. Confusion matrices
# ==============================
def plot_confusion_matrices(y_true, y_pred, axes, save_dir=None):
    """
    Draw one confusion-matrix heatmap per label column on a 2x2 grid.

    Args:
        y_true: 2-D array of ground-truth labels, indexed as ``y_true[:, i]``.
        y_pred: 2-D array of predicted labels, same layout as ``y_true``.
        axes: sequence of axis names used in the subplot titles
            (``axes[i]`` titles column ``i``).
        save_dir: if given, the figure is written to
            ``<save_dir>/confusion_matrices.png`` and closed; otherwise it is
            shown interactively.
    """
    fig, axs = plt.subplots(2, 2, figsize=(10, 8))
    axs = axs.ravel()

    for i, ax in enumerate(axs):
        # Fixing labels to [0, 1] keeps every matrix 2x2 even when one class
        # never appears on this axis; the int cast makes the float ground
        # truth and the int predictions share one label type.
        cm = confusion_matrix(
            np.asarray(y_true[:, i]).astype(int),
            np.asarray(y_pred[:, i]).astype(int),
            labels=[0, 1],
        )
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
        ax.set_title(f"Confusion Matrix - {axes[i]}")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")

    fig.tight_layout()
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        path = os.path.join(save_dir, "confusion_matrices.png")
        # Save and close through the figure handle so another open figure is
        # never affected.
        fig.savefig(path)
        plt.close(fig)
        print(f"✅ Saved confusion matrix to {path}")
    else:
        plt.show()

# ==============================
# 3. Probability distribution
# ==============================
def plot_probability_distribution(probs, axes, save_dir=None):
    """
    Draw a 20-bin histogram (with KDE curve) of the predicted probabilities
    for each label column on a 2x2 grid.

    ``axes[i]`` is used in the title of the subplot for column ``i``. When
    ``save_dir`` is given the figure is written to
    ``<save_dir>/prob_dist.png`` and closed; otherwise it is shown.
    """
    fig, grid = plt.subplots(2, 2, figsize=(10, 8))

    for column, panel in enumerate(grid.ravel()):
        sns.histplot(probs[:, column], bins=20, kde=True, ax=panel)
        panel.set_title(f"Predicted Prob Distribution - {axes[column]}")
        panel.set_xlabel("Probability")
        panel.set_ylabel("Count")

    plt.tight_layout()

    if not save_dir:
        plt.show()
        return

    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, "prob_dist.png")
    plt.savefig(path)
    plt.close()
    print(f"✅ Saved probability distribution to {path}")

# ==============================
# 4. Error analysis
# ==============================
def error_analysis(test_df, y_true, y_pred, y_probs, axes, n_samples=15):
    """
    Collect every (sample, axis) pair whose prediction differs from the
    ground truth and write them to ``reports/error_samples.csv``.

    Each record holds the sample index, the axis name, the true and predicted
    label, the predicted probability and the first 200 characters of the
    sample's ``posts`` text followed by "...". A random sample of up to
    ``n_samples`` records is also printed when there is at least one error.
    """
    mistakes = [
        {
            "index": row,
            "axis": axis,
            "true": int(y_true[row, col]),
            "pred": int(y_pred[row, col]),
            "prob": float(y_probs[row, col]),
            "text": str(test_df.iloc[row]["posts"])[:200] + "...",
        }
        for row in range(len(y_true))
        for col, axis in enumerate(axes)
        if y_true[row, col] != y_pred[row, col]
    ]

    errors_df = pd.DataFrame(mistakes)
    os.makedirs("reports", exist_ok=True)
    errors_df.to_csv("reports/error_samples.csv", index=False)

    n_errors = len(errors_df)
    print(f"\n❌ Saved {n_errors} misclassified samples to reports/error_samples.csv")

    if n_errors > 0:
        print("\n=== Sample Errors ===")
        print(errors_df.sample(min(n_samples, n_errors)))

# ==============================
# 5. Main evaluation pipeline
# ==============================
def main():
    """
    Evaluate the saved best model on the held-out test split.

    Loads ``test.csv`` and the ``mbti_best.pt`` weights, runs ``evaluate``,
    prints and saves per-axis accuracy / macro-F1, saves the multi-label
    classification report, renders the confusion-matrix and probability
    figures under ``reports/figures`` and finally runs the error analysis.
    """
    # ------------------------------
    # Config
    # ------------------------------
    set_seed(42)
    model_name = "bert-base-uncased"
    batch_size, max_len = 8, 256
    model_path = "/kaggle/working/mbti_best.pt"
    test_csv = "/kaggle/working/test.csv"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    # ------------------------------
    # Test data
    # ------------------------------
    test_df = pd.read_csv(test_csv)
    tokenizer = BertTokenizer.from_pretrained(model_name)
    test_loader = DataLoader(
        MBTIDataset(test_df, tokenizer, max_len=max_len),
        batch_size=batch_size,
    )

    # ------------------------------
    # Trained model
    # ------------------------------
    model = MBTIModel(model_name=model_name, pooling="cls+mean", dropout=0.4)
    weights = torch.load(model_path, map_location=device)
    model.load_state_dict(weights)
    model.to(device)

    # ------------------------------
    # Evaluate
    # ------------------------------
    y_true, y_pred, y_probs = evaluate(model, test_loader, device)

    axes = ["IE", "NS", "TF", "JP"]
    metrics = {}

    # One accuracy / macro-F1 pair per MBTI axis (column of the label matrix)
    print("\n=== Metrics per axis ===")
    for column, axis in enumerate(axes):
        truth, guess = y_true[:, column], y_pred[:, column]
        acc = accuracy_score(truth, guess)
        f1 = f1_score(truth, guess, average="macro")
        metrics[axis] = {"Accuracy": acc, "Macro-F1": f1}
        print(f"{axis}: Acc={acc:.4f}, Macro-F1={f1:.4f}")

    # Persist the per-axis metrics (axes as rows)
    os.makedirs("reports", exist_ok=True)
    pd.DataFrame(metrics).T.to_csv("reports/metrics.csv")
    print("\n✅ Metrics saved to reports/metrics.csv")

    # Persist the multi-label classification report
    report = classification_report(
        y_true, y_pred, output_dict=True, zero_division=0
    )
    pd.DataFrame(report).to_csv("reports/classification_report.csv")
    print("✅ Classification report saved to reports/classification_report.csv")

    # ------------------------------
    # Visualization
    # ------------------------------
    figures_dir = "reports/figures"
    plot_confusion_matrices(y_true, y_pred, axes, save_dir=figures_dir)
    plot_probability_distribution(y_probs, axes, save_dir=figures_dir)

    # ------------------------------
    # Error analysis
    # ------------------------------
    error_analysis(test_df, y_true, y_pred, y_probs, axes)

# ==============================
# Run evaluation
# ==============================
if __name__ == "__main__":
    os.makedirs("reports", exist_ok=True)
    main()
