In [None]:
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split

def set_seed(seed=42):
    """Seed the global random number generators used in this file.

    Both Python's ``random`` module and NumPy's legacy global generator
    (``np.random``) are seeded with the same value.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def load_data(path):
    """Read the CSV at ``path`` into a DataFrame and print a short summary.

    Lines the parser cannot handle are skipped (``on_bad_lines="skip"``)
    rather than raising. Three things are printed: the number of rows, the
    number of distinct values in the ``type`` column, and the per-value
    counts of that column.

    Args:
        path: Location of the CSV file; it must contain a ``type`` column.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path, engine="python", on_bad_lines="skip")
    print("Số mẫu:", frame.shape[0])
    labels = frame["type"]
    print("Số loại MBTI:", labels.nunique())
    print(labels.value_counts())
    return frame

def mbti_to_binary(mbti):
    """Encode a four-letter MBTI type as one binary label per axis.

    For every axis the first letter of the axis name maps to 0 and the
    second to 1 (``I``/``N``/``T``/``J`` -> 0, ``E``/``S``/``F``/``P`` -> 1).
    Surrounding whitespace and letter case are ignored, so ``" intj "`` is
    encoded the same as ``"INTJ"``.

    Args:
        mbti: MBTI type such as ``"INTJ"``.

    Returns:
        dict with keys ``"IE"``, ``"NS"``, ``"TF"``, ``"JP"`` and 0/1 values.

    Raises:
        ValueError: If ``mbti`` is not exactly four letters drawn from the
            matching axis pairs. Without this check an unexpected character
            would silently be labelled as the second pole of its axis.
    """
    axes = ("IE", "NS", "TF", "JP")
    code = str(mbti).strip().upper()
    if len(code) != len(axes) or any(
        letter not in axis for letter, axis in zip(code, axes)
    ):
        raise ValueError(f"invalid MBTI type: {mbti!r}")
    # Position of the letter inside the axis name is the label (0 or 1).
    return {axis: axis.index(letter) for letter, axis in zip(code, axes)}

def add_binary_columns(df):
    """Return a copy of ``df`` with one 0/1 column per MBTI axis.

    The ``type`` column is encoded with ``mbti_to_binary`` and the result is
    stored in four new columns: ``mbti_IE``, ``mbti_NS``, ``mbti_TF`` and
    ``mbti_JP``. The input frame is not modified.

    Args:
        df: DataFrame with a ``type`` column of MBTI strings.

    Returns:
        A new DataFrame containing the original columns plus the four
        binary label columns.
    """
    df = df.copy()
    # Encode every label once and reuse the result for all four columns,
    # instead of re-parsing the same string once per output column.
    encoded = [mbti_to_binary(label) for label in df["type"]]
    for axis in ("IE", "NS", "TF", "JP"):
        df[f"mbti_{axis}"] = [bits[axis] for bits in encoded]
    return df

def prepare_data(path, seed=42, test_size=0.2):
    """Load the MBTI CSV, build text and label arrays, and split them.

    Steps: seed the global RNGs, load the CSV, add the four binary axis
    columns, build a ``text`` column, then call ``train_test_split``.

    The text comes from the ``posts`` column when present (with the
    ``///`` and ``|||`` separators replaced by a space), otherwise from the
    ``post`` column. Missing values become empty strings.

    Args:
        path: CSV file to load.
        seed: Seed for the global RNGs and for ``train_test_split``.
        test_size: Passed through to ``train_test_split``.

    Returns:
        The result of ``train_test_split``, unpacked by callers in this file
        as ``X_train, X_test, y_train, y_test``; ``y`` has one column per
        axis in the order IE, NS, TF, JP.

    Raises:
        KeyError: If the data has neither a ``posts`` nor a ``post`` column.
    """
    set_seed(seed)
    df = add_binary_columns(load_data(path))

    if "posts" in df.columns:
        text = df["posts"].fillna("").astype(str)
        # regex=False is required: as a regular expression "|||" is an empty
        # alternation that matches between every character, which on pandas
        # versions that default to regex=True puts a space after each one.
        for separator in ("///", "|||"):
            text = text.str.replace(separator, " ", regex=False)
    elif "post" in df.columns:
        text = df["post"].fillna("").astype(str)
    else:
        raise KeyError("expected a 'posts' or 'post' text column in the data")
    df["text"] = text

    X = df["text"].values
    y = df[["mbti_IE","mbti_NS","mbti_TF","mbti_JP"]].values
    # NOTE(review): stratify receives the full 4-column label matrix, so the
    # split is presumably stratified on the joint label combination; confirm
    # against the installed scikit-learn version.
    return train_test_split(X, y, test_size=test_size, random_state=seed, stratify=y)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import joblib

class MBTIClassifierML:
    """TF-IDF features feeding one binary classifier per MBTI axis.

    A single ``TfidfVectorizer`` is shared by four independent classifiers
    (``LogisticRegression`` by default, ``LinearSVC`` when ``use_svm`` is
    true), stored in ``self.classifiers`` under the keys in ``AXES``.

    NOTE(review): a later cell in this notebook defines a class with the
    same name; whichever definition runs last is the one in effect.
    """

    # Each axis name is also its decoding table: label 0 is the first letter
    # and label 1 the second, matching the encoding in mbti_to_binary.
    AXES = ("IE", "NS", "TF", "JP")

    def __init__(self, use_svm=False, max_features=20000):
        """Create an unfitted model.

        Args:
            use_svm: Use ``LinearSVC`` instead of ``LogisticRegression``.
            max_features: Vocabulary size limit for the TF-IDF vectorizer.
        """
        self.vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1,2), stop_words="english")
        self.classifiers = {}
        self.use_svm = use_svm

    def fit(self, X_train, y_train):
        """Fit the vectorizer and one classifier per axis.

        Args:
            X_train: Iterable of raw text documents.
            y_train: 2-D array indexed as ``y_train[:, i]``, with column
                ``i`` holding the labels for ``AXES[i]``.

        Returns:
            ``self``, so calls can be chained.
        """
        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        for i, axis in enumerate(self.AXES):
            if self.use_svm:
                clf = LinearSVC(class_weight="balanced")
            else:
                clf = LogisticRegression(max_iter=1000, class_weight="balanced")
            clf.fit(X_train_tfidf, y_train[:, i])
            self.classifiers[axis] = clf
        return self

    def predict_text(self, text):
        """Predict the four-letter MBTI string for a single piece of text.

        Args:
            text: Raw text to classify.

        Returns:
            A string with one letter per axis, in ``AXES`` order.
        """
        X_tfidf = self.vectorizer.transform([text])
        letters = []
        for axis in self.AXES:
            label = self.classifiers[axis].predict(X_tfidf)[0]
            # The classifiers are trained on 0/1 labels, which cannot be
            # concatenated onto a str; decode them to the axis letter.
            # String labels (a model trained on letters) pass through as-is.
            letters.append(label if isinstance(label, str) else axis[int(label)])
        return "".join(letters)

    def save(self, path="ml_model.pkl"):
        """Serialize the whole object (vectorizer and classifiers) to ``path``."""
        joblib.dump(self, path)

    @staticmethod
    def load(path="ml_model.pkl"):
        """Load an object previously written by ``save``.

        SECURITY: ``joblib.load`` unpickles the file, which can execute
        arbitrary code. Only load files from a trusted source.
        """
        return joblib.load(path)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import joblib

class MBTIClassifierML:
    """One binary classifier per MBTI axis on top of shared TF-IDF features.

    NOTE(review): an earlier cell in this notebook defines a class with the
    same name and the same members. Because this definition runs later it is
    the one in effect; consider removing one of the two cells.
    """

    # Label 0 decodes to the first letter of the axis name and label 1 to
    # the second, mirroring the encoding produced by mbti_to_binary.
    AXES = ("IE", "NS", "TF", "JP")

    def __init__(self, use_svm=False, max_features=20000):
        """Build an unfitted vectorizer and an empty classifier table.

        Args:
            use_svm: When true, ``fit`` trains ``LinearSVC`` models;
                otherwise ``LogisticRegression`` models.
            max_features: Upper bound on the TF-IDF vocabulary size.
        """
        self.vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1,2), stop_words="english")
        self.classifiers = {}
        self.use_svm = use_svm

    def fit(self, X_train, y_train):
        """Fit the TF-IDF vectorizer, then one classifier per axis.

        Args:
            X_train: Iterable of raw text documents.
            y_train: 2-D array whose column ``i`` (``y_train[:, i]``) holds
                the labels for ``AXES[i]``.

        Returns:
            ``self``, to allow ``MBTIClassifierML().fit(X, y)`` chaining.
        """
        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        for i, axis in enumerate(self.AXES):
            if self.use_svm:
                clf = LinearSVC(class_weight="balanced")
            else:
                clf = LogisticRegression(max_iter=1000, class_weight="balanced")
            clf.fit(X_train_tfidf, y_train[:, i])
            self.classifiers[axis] = clf
        return self

    def predict_text(self, text):
        """Return the predicted MBTI string for one piece of text.

        Args:
            text: Raw text to classify.

        Returns:
            Four letters, one per axis in ``AXES`` order.
        """
        X_tfidf = self.vectorizer.transform([text])
        mbti = []
        for axis in self.AXES:
            predicted = self.classifiers[axis].predict(X_tfidf)[0]
            if isinstance(predicted, str):
                # Model trained directly on letters: use the label itself.
                mbti.append(predicted)
            else:
                # Numeric 0/1 label: appending it to a str would raise, so
                # translate it to the corresponding letter of the axis.
                mbti.append(axis[int(predicted)])
        return "".join(mbti)

    def save(self, path="ml_model.pkl"):
        """Dump the whole object, including fitted estimators, to ``path``."""
        joblib.dump(self, path)

    @staticmethod
    def load(path="ml_model.pkl"):
        """Return the object stored at ``path`` by ``save``.

        SECURITY: this unpickles the file via ``joblib.load`` and can run
        arbitrary code; never load a file from an untrusted source.
        """
        return joblib.load(path)


In [None]:
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from data import prepare_data
from models import MBTIClassifierML

def main():
    """Train the MBTI model on ``data/mbti_1.csv`` and save it under ``reports/``."""
    # Only the training half of the split is used here.
    train_texts, _, train_labels, _ = prepare_data("data/mbti_1.csv")

    # Build and fit the model (use_svm=False keeps the default estimator).
    classifier = MBTIClassifierML(use_svm=False)
    classifier.fit(train_texts, train_labels)

    # Persist the fitted model, creating the output folder on first run.
    output_dir = "reports"
    os.makedirs(output_dir, exist_ok=True)
    classifier.save("reports/mbti_ml.pkl")
    print("✅ Saved ML model to reports/mbti_ml.pkl")

# Entry point: run the training routine only when this code is executed as
# the top-level program, not when it is imported.
if __name__ == "__main__":
    main()


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report
import os 
import sys 
import matplotlib.pyplot as plt

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from data import prepare_data
from models import MBTIClassifierML

def main(data_path="data/mbti_1.csv", reports_dir="reports"):
    """Evaluate the saved MBTI model and write a text report and a chart.

    The model is loaded from ``<reports_dir>/mbti_ml.pkl``, the same relative
    location the training script in this file saves to. For each axis the
    accuracy, macro F1 and classification report are printed and appended to
    ``<reports_dir>/classification_reports.txt``; a grouped bar chart of
    accuracy and F1 is saved to ``<reports_dir>/metrics.png``.

    Args:
        data_path: CSV passed to ``prepare_data``.
        reports_dir: Folder holding the saved model and receiving the
            outputs. It is created if missing.
    """
    # prepare_data is called with its default seed, as in the training
    # script, so only the test half of that split is needed here.
    _, X_test, _, y_test = prepare_data(data_path)

    # Load model
    model = MBTIClassifierML.load(os.path.join(reports_dir, "mbti_ml.pkl"))

    axes = ["IE", "NS", "TF", "JP"]

    # Vectorize the test set once; it is the same for every axis.
    X_test_tfidf = model.vectorizer.transform(X_test)
    # Look classifiers up by axis name so the prediction columns line up
    # with y_test regardless of the dict's iteration order.
    preds = np.column_stack(
        [model.classifiers[axis].predict(X_test_tfidf) for axis in axes]
    )  # shape = (num_samples, 4)

    accs, f1s = [], []

    # File that collects the per-axis classification reports
    os.makedirs(reports_dir, exist_ok=True)
    report_path = os.path.join(reports_dir, "classification_reports.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        for i, axis in enumerate(axes):
            y_true, y_pred = y_test[:, i], preds[:, i]
            acc = accuracy_score(y_true, y_pred)
            f1 = f1_score(y_true, y_pred, average="macro")
            report = classification_report(y_true, y_pred)
            accs.append(acc)
            f1s.append(f1)

            # Console output
            print(f"{axis} - Acc: {acc:.4f}, F1: {f1:.4f}")
            print(report)

            # Same content into the text file
            f.write(f"\n===== {axis} =====\n")
            f.write(f"Acc: {acc:.4f}, F1: {f1:.4f}\n")
            f.write(report)
            f.write("\n\n")

    print(f"✅ Classification reports saved at {report_path}")

    # Grouped bar chart: accuracy and F1 side by side for the four axes
    x = np.arange(len(axes))
    width = 0.35
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(x - width/2, accs, width, label="Accuracy")
    ax.bar(x + width/2, f1s, width, label="F1 Score")

    ax.set_xticks(x)
    ax.set_xticklabels(axes)
    ax.set_ylim(0, 1)
    ax.set_ylabel("Score")
    ax.set_title("MBTI Axis Classification Performance")
    ax.legend()

    save_path = os.path.join(reports_dir, "metrics.png")
    fig.tight_layout()
    fig.savefig(save_path)
    print(f"✅ Metrics chart saved at {save_path}")

# Entry point: run the evaluation routine only when this code is executed as
# the top-level program, not when it is imported.
if __name__ == "__main__":
    main()
