# In [None]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# --- Configuration ---------------------------------------------------------
MODEL_NAME = "bert-base-uncased"  # checkpoint name passed to from_pretrained()
BATCH_SIZE = 16                   # texts encoded per forward pass
MAX_LEN = 64                      # tokenizer truncation length
TEST_SIZE = 0.2                   # fraction of samples held out for evaluation
RANDOM_STATE = 42                 # seed for the split and the random forest

# --- Dataset ---------------------------------------------------------------
csv_path = "../../data/metadata.csv"
print(f"üìÇ ƒêang load dataset: {csv_path}")
df = pd.read_csv(csv_path)

# Rows without a label cannot be used for supervised training: as a pandas
# category a missing value is encoded as -1, which has no matching entry in
# `label_names` and would otherwise be learned as a bogus extra class.
df = df.dropna(subset=["finding"]).reset_index(drop=True)

# NOTE(review): a missing note becomes the literal string "nan" through
# astype(str) and is embedded as such; confirm that this is intended.
texts = df["notes"].astype(str).tolist()

# Build the categorical once so codes and category names are guaranteed to
# come from the same encoding.
finding_categories = df["finding"].astype("category")
labels = finding_categories.cat.codes
label_names = list(finding_categories.cat.categories)

print(f"‚úÖ S·ªë m·∫´u: {len(texts)} | S·ªë nh√£n: {len(label_names)} | Nh√£n: {label_names}")

# Use CUDA when PyTorch reports it as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and encoder are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# The encoder is only used for feature extraction: move it to the selected
# device and put it in evaluation mode.
model = model.to(device)
model.eval()
print(f"üß† D√πng thi·∫øt b·ªã: {device}")

def get_bert_embeddings(texts, batch_size=BATCH_SIZE, max_len=MAX_LEN):
    """Create a CLS embedding for every text in a list.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.
            Must be positive.
        max_len: ``max_length`` handed to the tokenizer; longer inputs are
            truncated.

    Returns:
        A NumPy array with one row per input text, taken from position 0 of
        the model's last hidden state. For empty input the array has zero
        rows and ``model.config.hidden_size`` columns.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A zero step makes range() fail and a negative one yields no batches
        # at all; report both cases with one clear message.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    if len(texts) == 0:
        # np.vstack([]) raises, so return an explicit empty matrix instead.
        # float32 assumes the model runs in PyTorch's default dtype.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="üîπƒêang t·∫°o embedding"):
        batch = texts[start:start + batch_size]
        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=max_len,
        ).to(device)

        # No gradients are needed: the model is only used to produce features.
        with torch.no_grad():
            outputs = model(**enc)

        # Keep only the vector at sequence position 0 (the CLS token).
        cls_embeds = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embeds.cpu().numpy())

    return np.vstack(embeddings)

# Encode every text once; the resulting matrix is the feature set for all
# classifiers trained afterwards.
print("üöÄ ƒêang t√≠nh to√°n BERT embeddings...")
X = get_bert_embeddings(texts)
y = labels.to_numpy()
print(f"‚úÖ Embeddings shape: {X.shape}")

# Hold-out split, stratified on the label so both parts keep the class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
)

results = {}
trained_models = {}


def _evaluate_predictions(y_true, y_pred):
    """Return accuracy, weighted F1 and a text report for one set of predictions.

    The report is computed over every known class id, not only the ids that
    happen to occur in ``y_true``/``y_pred``. Without the explicit ``labels``
    argument, ``classification_report`` raises when a class is absent from
    both arrays, because the number of observed classes then no longer
    matches ``target_names``.
    """
    all_label_ids = np.arange(len(label_names))
    # target_names must be strings for the report layout.
    report_names = [str(label) for label in label_names]
    return {
        # Plain floats keep the dict JSON-serializable.
        "accuracy": float(accuracy_score(y_true, y_pred)),
        # zero_division=0 yields the same score as the default but without a
        # warning for classes that were never predicted.
        "f1": float(f1_score(y_true, y_pred, average="weighted", zero_division=0)),
        "report": classification_report(
            y_true,
            y_pred,
            labels=all_label_ids,
            target_names=report_names,
            digits=4,
            zero_division=0,
        ),
    }


# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
results["Logistic Regression"] = _evaluate_predictions(y_test, y_pred_lr)
trained_models["Logistic Regression"] = lr

# Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
results["Random Forest"] = _evaluate_predictions(y_test, y_pred_rf)
trained_models["Random Forest"] = rf

# Print the metrics of every evaluated classifier, one section per model.
print("\n==== üßæ K·∫æT QU·∫¢ (BERT + ML) ====\n")
for model_name in results:
    metrics = results[model_name]
    print(f"--- {model_name} ---")
    print("Accuracy:", metrics["accuracy"])
    print("F1-weighted:", metrics["f1"])
    print(metrics["report"])

# Everything produced by this run is written below a single output directory.
save_dir = "../models"
os.makedirs(save_dir, exist_ok=True)

# One pickle per classifier; the file name is derived from the display name
# ("Random Forest" -> "bert_random_forest.pkl").
for name, model_obj in trained_models.items():
    slug = name.lower().replace(" ", "_")
    file_path = os.path.join(save_dir, "bert_" + slug + ".pkl")
    with open(file_path, "wb") as handle:
        pickle.dump(model_obj, handle)
    print(f"‚úÖ Saved model: {file_path}")

# The full embedding matrix (before the train/test split).
embeddings_path = os.path.join(save_dir, "bert_embeddings.npy")
np.save(embeddings_path, X)
print("‚úÖ Saved embeddings.")

# Evaluation metrics as human-readable JSON.
save_results_path = os.path.join(save_dir, "results_bert.json")
with open(save_results_path, "w", encoding="utf-8") as handle:
    json.dump(results, handle, indent=4, ensure_ascii=False)
print(f"‚úÖ Saved evaluation results to {save_results_path}")

# Sanity check: show which class ids each fitted classifier has learned.
for name, model_obj in trained_models.items():
    print(f"\nüîç Model: {name}")
    learned_classes = model_obj.classes_
    print("Classes learned:", learned_classes)
    print("Number of classes:", len(learned_classes))


