In [2]:
import time
import json
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from xgboost import XGBClassifier
from datetime import datetime

# All inputs and outputs live under one local checkout of the data generator;
# the demo artefacts (model, metadata, results table) share a sub-folder.
PROJECT_DIR = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main"
DEMO_DIR = PROJECT_DIR + "/insider_demo"

# Input: labelled event CSV read by pd.read_csv below.
DATA = PROJECT_DIR + "/labeled_events_full-24h.csv"
# Outputs: serialized model, JSON metadata and the one-row results table.
MODEL_OUT = DEMO_DIR + "/best_model.joblib"
INFO_OUT = DEMO_DIR + "/api_info.json"
RESULTS_CSV = DEMO_DIR + "/model_results.csv"

# Read the whole labelled event file into memory and report its size.
df = pd.read_csv(DATA)
loaded_rows = len(df)
print(f"Indlæst {loaded_rows:,} rækker")

# Upper bound on the number of rows used for the experiment. Larger files are
# cut down with a seeded random sample so repeated runs pick the same rows.
N = 500_000
exceeds_cap = loaded_rows > N
if exceeds_cap:
    df = df.sample(n=N, random_state=42)
    print(f"Reduceret til {N:,} rækker")

# Model input columns, in the order they are handed to the classifier.
# The "has_*" columns are cast to int during cleaning further down.
_FLAG_FEATURES = [
    "has_appointment",
    "has_observation",
    "has_encounter",
    "has_btg_access",
    "has_care_access",
]
# Two event counters followed by the single float-valued timing feature.
_NUMERIC_FEATURES = [
    "num_btg_events",
    "num_care_events",
    "avg_time_between_events",
]
FEATURES = _FLAG_FEATURES + _NUMERIC_FEATURES

# Make sure every model feature exists in the frame. A column that is absent
# from the CSV is created as a constant 0 so the pipeline still runs, but the
# substitution is reported: a constant column carries no signal, and silently
# fabricating it would hide a schema mismatch between the CSV and FEATURES.
missing_features = [col for col in FEATURES if col not in df.columns]
if missing_features:
    print(f"Advarsel: manglende feature-kolonner udfyldt med 0: {missing_features}")
for col in missing_features:
    df[col] = 0

# Flag columns: missing values become 0 and the column is stored as int.
for col in ["has_appointment", "has_observation", "has_encounter", "has_btg_access", "has_care_access"]:
    df[col] = df[col].fillna(0).astype(int)

# Event counters: same treatment as the flags (NaN -> 0, int dtype).
df["num_btg_events"] = df["num_btg_events"].fillna(0).astype(int)
df["num_care_events"] = df["num_care_events"].fillna(0).astype(int)

# Timing feature stays floating point; a missing value is replaced by 0.0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0.0).astype(float)

# Encode the string labels as integers: "Normal" -> 0, "Anomaly" -> 1.
LABEL_MAP = {"Normal": 0, "Anomaly": 1}
encoded_labels = df["label"].map(LABEL_MAP)

# Series.map yields NaN for any value that is not a key of LABEL_MAP (including
# NaN itself). Casting that straight to int fails with an opaque NaN-cast
# error, so the offending values are reported explicitly instead.
unmapped = encoded_labels.isna()
if unmapped.any():
    bad_values = sorted(df.loc[unmapped, "label"].astype(str).unique())
    raise ValueError(
        f"Ukendte værdier i kolonnen 'label': {bad_values}. "
        f"Forventede en af {list(LABEL_MAP)}."
    )

y = encoded_labels.astype(int)
X = df[FEATURES]

# Hold out 30% of the rows for evaluation. Stratifying on y keeps the
# Normal/Anomaly ratio the same in both parts; the seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Hyperparameters for the gradient-boosted tree classifier, kept in one dict
# so the configuration can be read (or logged) separately from construction.
xgb_params = {
    # Ensemble size and per-tree complexity.
    "n_estimators": 200,
    "max_depth": 4,
    "learning_rate": 0.10,
    # Row / column subsampling per tree and L2 regularisation strength.
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    # Reproducibility and execution settings.
    "random_state": 42,
    "n_jobs": -1,
    "tree_method": "hist",
    "eval_metric": "logloss",
}
model = XGBClassifier(**xgb_params)

# Time the fit. perf_counter() is a monotonic, high-resolution clock, so the
# measured duration cannot be distorted by system clock adjustments the way a
# difference of time.time() readings can.
t0 = time.perf_counter()
model.fit(X_train, y_train)
train_time = time.perf_counter() - t0

# Time prediction on the held-out rows with the same clock.
t1 = time.perf_counter()
y_pred = model.predict(X_test)
pred_time = time.perf_counter() - t1

# Scores for the positive class (label 1) plus overall accuracy.
# The fourth return value (support) is not used.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average="binary", pos_label=1
)
acc = accuracy_score(y_test, y_pred)

# NOTE(review): the output recorded with this cell shows 1.000 for every
# metric. Perfect scores on held-out data usually deserve a leakage check,
# e.g. whether one of the features fully determines `label` in the synthetic
# generator. TODO confirm before reporting these numbers.
print("\n=== RESULTATER ===")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1:        {f1:.3f}")
print(f"Accuracy:  {acc:.3f}")
print(f"Train(s):  {train_time:.2f}  Predict(s): {pred_time:.4f}")
print(f"Predicted positives: {(y_pred==1).sum()} / {len(y_pred)}")

# Serialize the fitted classifier to disk.
joblib.dump(model, MODEL_OUT)
print("Model gemt til:", MODEL_OUT)

# Number of held-out rows predicted as each class.
predicted_normal = int((y_pred == 0).sum())
predicted_anomaly = int((y_pred == 1).sum())

# Single-row summary of this run; metrics and timings are rounded to three
# decimals, and the timestamp is the local time at which the row is built.
summary_row = {
    "Model": "XGBoost",
    "Precision": round(precision, 3),
    "Recall": round(recall, 3),
    "F1-score": round(f1, 3),
    "Accuracy": round(acc, 3),
    "Train time (s)": round(train_time, 3),
    "Pred time (s)": round(pred_time, 3),
    "Predicted Normal": predicted_normal,
    "Predicted Anomaly": predicted_anomaly,
    "Timestamp": datetime.now().isoformat(),
}

# Write the summary as a CSV without the DataFrame index column.
results_df = pd.DataFrame([summary_row])
results_df.to_csv(RESULTS_CSV, index=False)
print("Resultater gemt til:", RESULTS_CSV)

# Metadata describing the saved model, written as JSON next to it.
info_data = {
    "model_name": "XGBoost",
    # The classifier is fitted on X_train only, not on the whole frame, so the
    # training-row count comes from the training split rather than len(df).
    "trained_on_rows": int(len(X_train)),
    "features": FEATURES,
    "metrics": {
        "precision": round(precision, 3),
        "recall": round(recall, 3),
        "f1_score": round(f1, 3),
        "accuracy": round(acc, 3)
    },
    "train_time_sec": round(train_time, 3),
    "predict_time_sec": round(pred_time, 3),
    "last_trained": datetime.now().isoformat()
}

# Explicit encoding keeps the file independent of the platform default.
with open(INFO_OUT, "w", encoding="utf-8") as f:
    json.dump(info_data, f, indent=4)
print("api_info.json opdateret:", INFO_OUT)



Indlæst 573,365 rækker
Reduceret til 500,000 rækker

=== RESULTATER ===
Precision: 1.000
Recall:    1.000
F1:        1.000
Accuracy:  1.000
Train(s):  0.97  Predict(s): 0.0418
Predicted positives: 32852 / 150000
Model gemt til: /Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/insider_demo/best_model.joblib
Resultater gemt til: /Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/insider_demo/model_results.csv
api_info.json opdateret: /Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/insider_demo/api_info.json
