## Data Preparation: Load MBTI dataset, create binary labels, split train/test


In [None]:
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split

# --- Seed the random number generators so results are reproducible ---
def set_seed(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# --- Load the CSV and print basic information ---
def load_data(path):
    """Read the CSV at ``path`` into a DataFrame and print a short summary.

    Malformed lines are skipped (``on_bad_lines="skip"``). The summary shows
    the number of rows, the number of distinct values in the ``type`` column,
    and the per-value counts of that column.

    Returns:
        The loaded DataFrame.
    """
    frame = pd.read_csv(path, engine="python", on_bad_lines="skip")
    mbti_types = frame["type"]
    n_rows = len(frame)
    n_types = mbti_types.nunique()
    print(f"Số mẫu: {n_rows}, Số loại MBTI: {n_types}")
    print(mbti_types.value_counts())
    return frame

# --- Convert an MBTI string into binary 0/1 labels ---
def mbti_to_binary(mbti):
    """Map a 4-letter MBTI code to one 0/1 label per axis.

    For each axis the first letter of the axis name is class 0 and any other
    letter is class 1: ``I``/``N``/``T``/``J`` -> 0, otherwise 1.

    The code is stripped of surrounding whitespace and upper-cased first, so
    ``"intj"`` and ``" INTJ "`` get the same labels as ``"INTJ"`` instead of
    silently falling through to class 1 on every axis.

    Args:
        mbti: MBTI type string, e.g. ``"INTJ"``.

    Returns:
        dict with keys ``"IE"``, ``"NS"``, ``"TF"``, ``"JP"`` and 0/1 values.

    Raises:
        IndexError: if the normalised code has fewer than four characters.
    """
    code = mbti.strip().upper()
    return {
        axis: 0 if code[position] == axis[0] else 1
        for position, axis in enumerate(("IE", "NS", "TF", "JP"))
    }

# --- Add the 4 binary label columns to the dataframe ---
def add_binary_columns(df):
    """Return a copy of ``df`` with ``mbti_IE``/``mbti_NS``/``mbti_TF``/``mbti_JP`` added.

    Each new column holds the 0/1 label that ``mbti_to_binary`` derives from
    the row's ``type`` value. The input frame is not modified.
    """
    df = df.copy()
    # Convert each row once and reuse the resulting dict for all four axes,
    # instead of re-running mbti_to_binary separately for every axis.
    labels = df["type"].apply(mbti_to_binary)
    for axis in ["IE", "NS", "TF", "JP"]:
        df[f"mbti_{axis}"] = labels.apply(lambda row_labels: row_labels[axis])
    return df

# --- Build the train/test split ---
def prepare_data(path, seed=42, test_size=0.2):
    """Load the MBTI CSV, derive labels and text, and return a train/test split.

    Args:
        path: CSV path passed to ``load_data``.
        seed: Used for ``set_seed`` and as ``random_state`` of the split.
        test_size: Forwarded to ``train_test_split``.

    Returns:
        The result of ``train_test_split``: ``X_train, X_test, y_train,
        y_test``. ``X`` holds the cleaned text; ``y`` has four 0/1 columns in
        the order IE, NS, TF, JP.

    Raises:
        KeyError: if the CSV has neither a ``posts`` nor a ``post`` column.
    """
    set_seed(seed)
    df = load_data(path)
    df = add_binary_columns(df)

    # Normalise the text column.
    if "posts" in df.columns:
        # regex=False: the separators are literal text. Interpreted as a
        # regex, "|||" is an alternation of empty patterns that matches at
        # every position, so on pandas versions where str.replace defaults to
        # regex=True it would insert a space between all characters.
        df["text"] = (
            df["posts"]
            .fillna("")
            .astype(str)
            .str.replace("///", " ", regex=False)
            .str.replace("|||", " ", regex=False)
        )
    elif "post" in df.columns:
        df["text"] = df["post"].fillna("").astype(str)
    else:
        raise KeyError("expected a 'posts' or 'post' text column in the CSV")

    X = df["text"].values
    y = df[["mbti_IE", "mbti_NS", "mbti_TF", "mbti_JP"]].values

    # Stratify on the labels so the split keeps their proportions.
    return train_test_split(X, y, test_size=test_size, random_state=seed, stratify=y)


## MBTI Classifier (TF-IDF + Logistic/SVM): training, inference, and persistence


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import joblib

class MBTIClassifierML:
    """TF-IDF features feeding one binary classifier per MBTI axis.

    Labels follow the axis names: for axis ``"IE"`` label 0 means ``"I"`` and
    label 1 means ``"E"``, and likewise for ``"NS"``, ``"TF"`` and ``"JP"``.
    """

    # Axis order; must match the column order of ``y_train`` given to ``fit``.
    AXES = ("IE", "NS", "TF", "JP")

    def __init__(self, use_svm=False, max_features=20000):
        """
        Create an untrained classifier.

        - use_svm: if True use LinearSVC, otherwise LogisticRegression
        - max_features: maximum number of TF-IDF features
        """
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            ngram_range=(1, 2),
            stop_words="english",  # drop English stop words
        )
        self.classifiers = {}  # axis name -> fitted classifier
        self.use_svm = use_svm

    def fit(self, X_train, y_train):
        """
        Fit the vectorizer and one classifier per axis.

        X_train: list/array of texts
        y_train: numpy array (num_samples, 4) with 0/1 labels for the axes
                 IE, NS, TF, JP (in that column order)
        """
        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        for i, axis in enumerate(self.AXES):
            # Pick the estimator according to use_svm.
            if self.use_svm:
                clf = LinearSVC(class_weight="balanced")
            else:
                clf = LogisticRegression(max_iter=1000, class_weight="balanced")
            clf.fit(X_train_tfidf, y_train[:, i])
            self.classifiers[axis] = clf

    def predict_text(self, text):
        """
        Predict the MBTI type of a single text.

        Returns a 4-character string such as 'INTJ'.
        Raises KeyError if the model has not been fitted for an axis.
        """
        X_tfidf = self.vectorizer.transform([text])
        letters = []
        for axis in self.AXES:
            # The classifiers are trained on 0/1 labels, so predict() yields
            # an integer, not a letter. Index into the axis name to get the
            # letter: 0 -> first letter, 1 -> second letter.
            label = int(self.classifiers[axis].predict(X_tfidf)[0])
            letters.append(axis[label])
        return "".join(letters)

    def save(self, path="ml_model.pkl"):
        """
        Dump the whole MBTIClassifierML object to ``path`` with joblib.
        """
        joblib.dump(self, path)

    @staticmethod
    def load(path="ml_model.pkl"):
        """
        Load an object previously written by ``save``.

        NOTE: joblib files are pickle-based and can execute arbitrary code
        when loaded; only load files from a trusted source.
        """
        return joblib.load(path)


## Training Script: prepare data, train the MBTI classifier, and save it under reports/


In [None]:
import os
import sys


def main():
    """Train the MBTI classifier on the training split and save it under reports/."""
    model_file = "reports/mbti_ml.pkl"

    # ------------------------------
    # Prepare the data
    # ------------------------------
    # prepare_data returns X_train, X_test, y_train, y_test; only the
    # training part is used here.
    train_texts, _, train_labels, _ = prepare_data("data/mbti_1.csv")

    # ------------------------------
    # Build and train the model
    # ------------------------------
    classifier = MBTIClassifierML(use_svm=False)  # LogisticRegression, not SVM
    classifier.fit(train_texts, train_labels)

    # ------------------------------
    # Save the model
    # ------------------------------
    os.makedirs("reports", exist_ok=True)  # create the folder if it is missing
    classifier.save(model_file)
    print(f"✅ Saved ML model to {model_file}")


if __name__ == "__main__":
    main()


## Model Evaluation (notebook cell): load the saved MBTI model, compute Accuracy & F1 per axis, save reports, and plot metrics


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report
import os
import sys
import matplotlib.pyplot as plt


def main(
    data_path="data/mbti_1.csv",
    model_path=r"D:\Progamming\Progamming_courses\Quorsk\project\reports\mbti_ml.pkl",
    report_path=r"D:\Progamming\Progamming_courses\Quorsk\project\reports\classification_reports_ml.txt",
    save_path=r"D:\Progamming\Progamming_courses\Quorsk\project\reports\metrics_ml.png",
):
    """Evaluate a saved MBTI model on the test split.

    Prints accuracy, macro-F1 and a classification report per axis, writes
    the same to ``report_path``, and saves a bar chart to ``save_path``.

    Args:
        data_path: CSV passed to ``prepare_data``.
        model_path: File passed to ``MBTIClassifierML.load``.
        report_path: Text file that receives the per-axis reports.
        save_path: Image file that receives the metrics chart.

    All defaults equal the previously hard-coded locations.
    """
    # --- Load the data; only the test split is needed here ---
    _, X_test, _, y_test = prepare_data(data_path)

    # --- Load the previously trained model ---
    model = MBTIClassifierML.load(model_path)

    axes = ["IE", "NS", "TF", "JP"]

    # --- Predict every axis ---
    # Vectorise the test set once (not once per axis) and look classifiers up
    # by axis name, so prediction column i always corresponds to axes[i]
    # regardless of the dict's iteration order.
    X_test_tfidf = model.vectorizer.transform(X_test)
    preds = np.column_stack(
        [model.classifiers[axis].predict(X_test_tfidf) for axis in axes]
    )  # shape = (num_samples, 4)

    accs, f1s = [], []

    # --- Write the classification reports ---
    with open(report_path, "w", encoding="utf-8") as f:
        for i, axis in enumerate(axes):
            y_true, y_pred = y_test[:, i], preds[:, i]

            acc = accuracy_score(y_true, y_pred)
            f1 = f1_score(y_true, y_pred, average="macro")
            report = classification_report(y_true, y_pred)  # computed once, used twice
            accs.append(acc)
            f1s.append(f1)

            # Console output
            print(f"{axis} - Acc: {acc:.4f}, F1: {f1:.4f}")
            print(report)

            # File output
            f.write(f"\n===== {axis} =====\n")
            f.write(f"Acc: {acc:.4f}, F1: {f1:.4f}\n")
            f.write(report)
            f.write("\n\n")

    print(f"✅ Classification reports saved at {report_path}")

    # --- Bar chart of Accuracy & F1 for the 4 axes ---
    x = np.arange(len(axes))
    width = 0.35
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(x - width / 2, accs, width, label="Accuracy")
    ax.bar(x + width / 2, f1s, width, label="F1 Score")

    ax.set_xticks(x)
    ax.set_xticklabels(axes)
    ax.set_ylim(0, 1)
    ax.set_ylabel("Score")
    ax.set_title("MBTI Axis Classification Performance")
    ax.legend()

    # --- Save the chart ---
    plt.tight_layout()
    plt.savefig(save_path)
    print(f"✅ Metrics chart saved at {save_path}")


if __name__ == "__main__":
    main()


## Model Evaluation: compute Accuracy & F1 per MBTI axis, save reports, and plot metrics


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report
import os 
import sys 
import matplotlib.pyplot as plt

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from data import prepare_data
from models import MBTIClassifierML

def main(
    data_path="data/mbti_1.csv",
    model_path=r"D:\Progamming\Progamming_courses\Quorsk\project\reports\mbti_ml.pkl",
    report_path=r"D:\Progamming\Progamming_courses\Quorsk\project\reports\classification_reports.txt",
    save_path=r"D:\Progamming\Progamming_courses\Quorsk\project\reports\metrics.png",
):
    """Evaluate a saved MBTI model on the test split.

    Prints accuracy, macro-F1 and a classification report per axis, writes
    the same to ``report_path``, and saves a bar chart to ``save_path``.

    Args:
        data_path: CSV passed to ``prepare_data``.
        model_path: File passed to ``MBTIClassifierML.load``.
        report_path: Text file that receives the per-axis reports.
        save_path: Image file that receives the metrics chart.

    All defaults equal the previously hard-coded locations.
    """
    # Only the test split is needed for evaluation.
    _, X_test, _, y_test = prepare_data(data_path)

    # Load model
    model = MBTIClassifierML.load(model_path)

    axes = ["IE", "NS", "TF", "JP"]

    # Predict every axis. The test set is vectorised once (not once per axis)
    # and classifiers are looked up by axis name, so prediction column i
    # always corresponds to axes[i] regardless of the dict's iteration order.
    X_test_tfidf = model.vectorizer.transform(X_test)
    preds = np.column_stack(
        [model.classifiers[axis].predict(X_test_tfidf) for axis in axes]
    )  # shape = (num_samples, 4)

    accs, f1s = [], []

    # Write the classification reports
    with open(report_path, "w", encoding="utf-8") as f:
        for i, axis in enumerate(axes):
            y_true, y_pred = y_test[:, i], preds[:, i]

            acc = accuracy_score(y_true, y_pred)
            f1 = f1_score(y_true, y_pred, average="macro")
            report = classification_report(y_true, y_pred)  # computed once, used twice
            accs.append(acc)
            f1s.append(f1)

            # Console output
            print(f"{axis} - Acc: {acc:.4f}, F1: {f1:.4f}")
            print(report)

            # File output
            f.write(f"\n===== {axis} =====\n")
            f.write(f"Acc: {acc:.4f}, F1: {f1:.4f}\n")
            f.write(report)
            f.write("\n\n")

    print(f"✅ Classification reports saved at {report_path}")

    # Bar chart of Accuracy & F1 for the 4 axes
    x = np.arange(len(axes))
    width = 0.35
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(x - width / 2, accs, width, label="Accuracy")
    ax.bar(x + width / 2, f1s, width, label="F1 Score")

    ax.set_xticks(x)
    ax.set_xticklabels(axes)
    ax.set_ylim(0, 1)
    ax.set_ylabel("Score")
    ax.set_title("MBTI Axis Classification Performance")
    ax.legend()

    plt.tight_layout()
    plt.savefig(save_path)
    print(f"✅ Metrics chart saved at {save_path}")


if __name__ == "__main__":
    main()
