In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Benchmark an RBF-kernel SVM, KNN and Gaussian Naive Bayes on a sample of the
# labelled 24h event data and write the per-model metrics to a CSV file.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling is without replacement), so cap the request at len(df).
N_SAMPLES = 100000
n_rows = min(N_SAMPLES, len(df))
if n_rows < N_SAMPLES:
    print(f"Only {len(df)} rows available; using all of them instead of {N_SAMPLES}.")
df = df.sample(n=n_rows, random_state=42)

# Replace missing values in these two feature columns with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features]
y = df["label"]

# 70/30 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# NOTE(review): SVC (RBF) and KNN are distance-based and the features are fed
# in unscaled — confirm this is intentional or consider a scaling step.
models = {
    "SVM":        SVC(kernel='rbf', gamma='scale', random_state=42),
    "KNN":        KNeighborsClassifier(n_neighbors=5),
    "NaiveBayes": GaussianNB()
}

results = []
for name, model in models.items():
    print(f"\n=== {name} ===")

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    t1 = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - t1

    # Number of test rows assigned to each class.
    pred_counts = pd.Series(y_pred).value_counts().to_dict()
    normal_pred = pred_counts.get("Normal", 0)
    anomaly_pred = pred_counts.get("Anomaly", 0)

    # Binary metrics with "Anomaly" as the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label='Anomaly'
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Præcision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1-score:  {f1:.3f}")
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Træningstid (s):    {train_time:.3f}")
    print(f"Prediktionstid (s): {pred_time:.3f}")
    print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

    results.append({
        "Model": name,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "F1-score": round(f1, 3),
        "Accuracy": round(accuracy, 3),
        "Train time (s)": round(train_time, 3),
        "Pred time (s)": round(pred_time, 3),
        "Predicted Normal": normal_pred,
        "Predicted Anomaly": anomaly_pred
    })


# Summary table of all models, printed and persisted as CSV.
print("\n=== Samlet oversigt ===")
results_df = pd.DataFrame(results)
print(results_df)


results_df.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_24h_sample100k.csv", index=False)
print("saved")



In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


# Benchmark a class-balanced LinearSVC, KNN and Gaussian Naive Bayes on a
# sample of the labelled 24h event data and write the metrics to a CSV file.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling is without replacement), so cap the request at len(df).
N_SAMPLES = 250000
n_rows = min(N_SAMPLES, len(df))
if n_rows < N_SAMPLES:
    print(f"Only {len(df)} rows available; using all of them instead of {N_SAMPLES}.")
df = df.sample(n=n_rows, random_state=42)

# Replace missing values in these two feature columns with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)


features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features]
y = df["label"]


# 70/30 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)


# NOTE(review): LinearSVC and KNN receive unscaled features — confirm this is
# intentional or consider a scaling step.
models = {
    "SVM":        LinearSVC(class_weight='balanced', max_iter=10000, random_state=42),
    "KNN":        KNeighborsClassifier(n_neighbors=5),
    "NaiveBayes": GaussianNB()
}


results = []
for name, model in models.items():
    print(f"\n=== {name} ===")

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    t1 = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - t1

    # Number of test rows assigned to each class.
    pred_counts = pd.Series(y_pred).value_counts().to_dict()
    normal_pred = pred_counts.get("Normal", 0)
    anomaly_pred = pred_counts.get("Anomaly", 0)

    # Binary metrics with "Anomaly" as the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label='Anomaly'
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Præcision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1-score:  {f1:.3f}")
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Træningstid (s):    {train_time:.3f}")
    print(f"Prediktionstid (s): {pred_time:.3f}")
    print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

    results.append({
        "Model": name,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "F1-score": round(f1, 3),
        "Accuracy": round(accuracy, 3),
        "Train time (s)": round(train_time, 3),
        "Pred time (s)": round(pred_time, 3),
        "Predicted Normal": normal_pred,
        "Predicted Anomaly": anomaly_pred
    })


# Summary table of all models, printed and persisted as CSV.
print("\n=== Samlet oversigt ===")
results_df = pd.DataFrame(results)
print(results_df)


results_df.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_24h_sample250k.csv", index=False)
print(" saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Benchmark a class-balanced LinearSVC, KNN and Gaussian Naive Bayes on a
# sample of the labelled 24h event data and write the metrics to a CSV file.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling is without replacement), so cap the request at len(df).
N_SAMPLES = 300000
n_rows = min(N_SAMPLES, len(df))
if n_rows < N_SAMPLES:
    print(f"Only {len(df)} rows available; using all of them instead of {N_SAMPLES}.")
df = df.sample(n=n_rows, random_state=42)

# Replace missing values in these two feature columns with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features]
y = df["label"]

# 70/30 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# NOTE(review): LinearSVC and KNN receive unscaled features — confirm this is
# intentional or consider a scaling step.
models = {
    "SVM":        LinearSVC(class_weight='balanced', max_iter=10000, random_state=42),
    "KNN":        KNeighborsClassifier(n_neighbors=5),
    "NaiveBayes": GaussianNB()
}

results = []
for name, model in models.items():
    print(f"\n=== {name} ===")

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    t1 = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - t1

    # Number of test rows assigned to each class.
    pred_counts = pd.Series(y_pred).value_counts().to_dict()
    normal_pred = pred_counts.get("Normal", 0)
    anomaly_pred = pred_counts.get("Anomaly", 0)

    # Binary metrics with "Anomaly" as the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label='Anomaly'
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Præcision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1-score:  {f1:.3f}")
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Træningstid (s):    {train_time:.3f}")
    print(f"Prediktionstid (s): {pred_time:.3f}")
    print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

    results.append({
        "Model": name,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "F1-score": round(f1, 3),
        "Accuracy": round(accuracy, 3),
        "Train time (s)": round(train_time, 3),
        "Pred time (s)": round(pred_time, 3),
        "Predicted Normal": normal_pred,
        "Predicted Anomaly": anomaly_pred
    })

# Summary table of all models, printed and persisted as CSV.
print("\n=== Samlet oversigt ===")
results_df = pd.DataFrame(results)
print(results_df)
results_df.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_24h_sample300k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


# Benchmark a class-balanced LinearSVC, KNN and Gaussian Naive Bayes on a
# sample of the labelled 24h event data and write the metrics to a CSV file.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)


# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling is without replacement), so cap the request at len(df).
N_SAMPLES = 400000
n_rows = min(N_SAMPLES, len(df))
if n_rows < N_SAMPLES:
    print(f"Only {len(df)} rows available; using all of them instead of {N_SAMPLES}.")
df = df.sample(n=n_rows, random_state=42)


# Replace missing values in these two feature columns with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)


features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features]
y = df["label"]


# 70/30 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)


# NOTE(review): LinearSVC and KNN receive unscaled features — confirm this is
# intentional or consider a scaling step.
models = {
    "SVM":        LinearSVC(class_weight='balanced', max_iter=10000, random_state=42),
    "KNN":        KNeighborsClassifier(n_neighbors=5),
    "NaiveBayes": GaussianNB()
}


results = []
for name, model in models.items():
    print(f"\n=== {name} ===")

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    t1 = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - t1

    # Number of test rows assigned to each class.
    pred_counts = pd.Series(y_pred).value_counts().to_dict()
    normal_pred = pred_counts.get("Normal", 0)
    anomaly_pred = pred_counts.get("Anomaly", 0)

    # Binary metrics with "Anomaly" as the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label='Anomaly'
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Præcision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1-score:  {f1:.3f}")
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Træningstid (s):    {train_time:.3f}")
    print(f"Prediktionstid (s): {pred_time:.3f}")
    print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

    results.append({
        "Model": name,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "F1-score": round(f1, 3),
        "Accuracy": round(accuracy, 3),
        "Train time (s)": round(train_time, 3),
        "Pred time (s)": round(pred_time, 3),
        "Predicted Normal": normal_pred,
        "Predicted Anomaly": anomaly_pred
    })


# Summary table of all models, printed and persisted as CSV.
print("\n=== Samlet oversigt ===")
results_df = pd.DataFrame(results)
print(results_df)


results_df.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_24h_sample400k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


# Benchmark XGBoost and an MLP on a sample of the labelled 24h event data and
# write the per-model metrics to a CSV file.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling is without replacement), so cap the request at len(df).
N_SAMPLES = 100000
n_rows = min(N_SAMPLES, len(df))
if n_rows < N_SAMPLES:
    print(f"Only {len(df)} rows available; using all of them instead of {N_SAMPLES}.")
df = df.sample(n=n_rows, random_state=42)

# Replace missing values in these two feature columns with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features]
y = df["label"]

# Encode the string labels as integers (Anomaly is the positive class, 1).
# Any label not present in the mapping becomes NaN.
label_mapping = {"Normal": 0, "Anomaly": 1}
y = y.map(label_mapping)

# 70/30 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# NOTE(review): the MLP receives unscaled features — confirm this is
# intentional or consider a scaling step.
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "MLP":     MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)
}

results = []
# Maps integer predictions back to the original label names for reporting.
reverse_map = {0: "Normal", 1: "Anomaly"}

for name, model in models.items():
    print(f"\n=== {name} ===")

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    t1 = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - t1

    # Number of test rows assigned to each class, keyed by label name.
    pred_labels = pd.Series(y_pred).map(reverse_map)
    pred_counts = pred_labels.value_counts().to_dict()
    normal_pred = pred_counts.get("Normal", 0)
    anomaly_pred = pred_counts.get("Anomaly", 0)

    # Binary metrics with class 1 (Anomaly) as the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label=1
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Præcision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1-score:  {f1:.3f}")
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Træningstid (s):    {train_time:.3f}")
    print(f"Prediktionstid (s): {pred_time:.3f}")
    print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

    results.append({
        "Model": name,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "F1-score": round(f1, 3),
        "Accuracy": round(accuracy, 3),
        "Train time (s)": round(train_time, 3),
        "Pred time (s)": round(pred_time, 3),
        "Predicted Normal": normal_pred,
        "Predicted Anomaly": anomaly_pred
    })

# Summary table of all models, printed and persisted as CSV.
print("\n=== Samlet oversigt ===")
results_df = pd.DataFrame(results)
print(results_df)

results_df.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_xgb_mlp_100k.csv", index=False)
print("saved")


In [None]:
# Install xgboost into the environment of the running kernel. The explicit
# %pip magic does not depend on IPython's automagic setting.
%pip install xgboost


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Benchmark XGBoost and an MLP on a sample of the labelled 24h event data and
# write the per-model metrics to a CSV file.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling is without replacement), so cap the request at len(df).
N_SAMPLES = 250000
n_rows = min(N_SAMPLES, len(df))
if n_rows < N_SAMPLES:
    print(f"Only {len(df)} rows available; using all of them instead of {N_SAMPLES}.")
df = df.sample(n=n_rows, random_state=42)

# Replace missing values in these two feature columns with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features]
y = df["label"]

# Encode the string labels as integers (Anomaly is the positive class, 1).
# Any label not present in the mapping becomes NaN.
label_mapping = {"Normal": 0, "Anomaly": 1}
y = y.map(label_mapping)

# 70/30 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# NOTE(review): the MLP receives unscaled features — confirm this is
# intentional or consider a scaling step.
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "MLP":     MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)
}
results = []
# Maps integer predictions back to the original label names for reporting.
reverse_map = {0: "Normal", 1: "Anomaly"}

for name, model in models.items():
    print(f"\n=== {name} ===")

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    t1 = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - t1

    # Number of test rows assigned to each class, keyed by label name.
    pred_labels = pd.Series(y_pred).map(reverse_map)
    pred_counts = pred_labels.value_counts().to_dict()
    normal_pred = pred_counts.get("Normal", 0)
    anomaly_pred = pred_counts.get("Anomaly", 0)

    # Binary metrics with class 1 (Anomaly) as the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label=1
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Præcision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1-score:  {f1:.3f}")
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Træningstid (s):    {train_time:.3f}")
    print(f"Prediktionstid (s): {pred_time:.3f}")
    print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

    results.append({
        "Model": name,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "F1-score": round(f1, 3),
        "Accuracy": round(accuracy, 3),
        "Train time (s)": round(train_time, 3),
        "Pred time (s)": round(pred_time, 3),
        "Predicted Normal": normal_pred,
        "Predicted Anomaly": anomaly_pred
    })

# Summary table of all models, printed and persisted as CSV.
print("\n=== Samlet oversigt ===")
results_df = pd.DataFrame(results)
print(results_df)

results_df.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_xgb_mlp_250k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Benchmark XGBoost and an MLP on a sample of the labelled 24h event data and
# write the per-model metrics to a CSV file.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling is without replacement), so cap the request at len(df).
N_SAMPLES = 300000
n_rows = min(N_SAMPLES, len(df))
if n_rows < N_SAMPLES:
    print(f"Only {len(df)} rows available; using all of them instead of {N_SAMPLES}.")
df = df.sample(n=n_rows, random_state=42)

# Replace missing values in these two feature columns with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features]
y = df["label"]

# Encode the string labels as integers (Anomaly is the positive class, 1).
# Any label not present in the mapping becomes NaN.
label_mapping = {"Normal": 0, "Anomaly": 1}
y = y.map(label_mapping)

# 70/30 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# NOTE(review): the MLP receives unscaled features — confirm this is
# intentional or consider a scaling step.
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "MLP":     MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)
}

results = []
# Maps integer predictions back to the original label names for reporting.
reverse_map = {0: "Normal", 1: "Anomaly"}

for name, model in models.items():
    print(f"\n=== {name} ===")

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    t1 = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - t1

    # Number of test rows assigned to each class, keyed by label name.
    pred_labels = pd.Series(y_pred).map(reverse_map)
    pred_counts = pred_labels.value_counts().to_dict()
    normal_pred = pred_counts.get("Normal", 0)
    anomaly_pred = pred_counts.get("Anomaly", 0)

    # Binary metrics with class 1 (Anomaly) as the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label=1
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Præcision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1-score:  {f1:.3f}")
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Træningstid (s):    {train_time:.3f}")
    print(f"Prediktionstid (s): {pred_time:.3f}")
    print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

    results.append({
        "Model": name,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "F1-score": round(f1, 3),
        "Accuracy": round(accuracy, 3),
        "Train time (s)": round(train_time, 3),
        "Pred time (s)": round(pred_time, 3),
        "Predicted Normal": normal_pred,
        "Predicted Anomaly": anomaly_pred
    })

# Summary table of all models, printed and persisted as CSV.
print("\n=== Samlet oversigt ===")
results_df = pd.DataFrame(results)
print(results_df)

results_df.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_xgb_mlp_300k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Benchmark XGBoost and an MLP on a sample of the labelled 24h event data and
# write the per-model metrics to a CSV file.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling is without replacement), so cap the request at len(df).
N_SAMPLES = 400000
n_rows = min(N_SAMPLES, len(df))
if n_rows < N_SAMPLES:
    print(f"Only {len(df)} rows available; using all of them instead of {N_SAMPLES}.")
df = df.sample(n=n_rows, random_state=42)

# Replace missing values in these two feature columns with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features]
y = df["label"]

# Encode the string labels as integers (Anomaly is the positive class, 1).
# Any label not present in the mapping becomes NaN.
label_mapping = {"Normal": 0, "Anomaly": 1}
y = y.map(label_mapping)

# 70/30 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# NOTE(review): the MLP receives unscaled features — confirm this is
# intentional or consider a scaling step.
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "MLP":     MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)
}

results = []
# Maps integer predictions back to the original label names for reporting.
reverse_map = {0: "Normal", 1: "Anomaly"}

for name, model in models.items():
    print(f"\n=== {name} ===")

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    t1 = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - t1

    # Number of test rows assigned to each class, keyed by label name.
    pred_labels = pd.Series(y_pred).map(reverse_map)
    pred_counts = pred_labels.value_counts().to_dict()
    normal_pred = pred_counts.get("Normal", 0)
    anomaly_pred = pred_counts.get("Anomaly", 0)

    # Binary metrics with class 1 (Anomaly) as the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label=1
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Præcision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1-score:  {f1:.3f}")
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Træningstid (s):    {train_time:.3f}")
    print(f"Prediktionstid (s): {pred_time:.3f}")
    print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

    results.append({
        "Model": name,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "F1-score": round(f1, 3),
        "Accuracy": round(accuracy, 3),
        "Train time (s)": round(train_time, 3),
        "Pred time (s)": round(pred_time, 3),
        "Predicted Normal": normal_pred,
        "Predicted Anomaly": anomaly_pred
    })

# Summary table of all models, printed and persisted as CSV.
print("\n=== Samlet oversigt ===")
results_df = pd.DataFrame(results)
print(results_df)

results_df.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_xgb_mlp_400k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Benchmark XGBoost and an MLP on a sample of the labelled 24h event data,
# write the per-model metrics to CSV and persist the fitted XGBoost model.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling is without replacement), so cap the request at len(df).
N_SAMPLES = 500000
n_rows = min(N_SAMPLES, len(df))
if n_rows < N_SAMPLES:
    print(f"Only {len(df)} rows available; using all of them instead of {N_SAMPLES}.")
df = df.sample(n=n_rows, random_state=42)

# Replace missing values in these two feature columns with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)

# Integer-encoded label (Anomaly is the positive class, 1). Any label not
# present in the mapping becomes NaN.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features]
y = df["label_num"]

# 70/30 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# random_state is set on XGBoost as well, matching the other XGBoost runs in
# this notebook, so the saved model is reproducible.
# NOTE(review): the MLP receives unscaled features — confirm this is
# intentional or consider a scaling step.
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
    "MLP": MLPClassifier(hidden_layer_sizes=(50,), max_iter=300, random_state=42)
}

results = []
for name, model in models.items():
    print(f"\n=== {name} ===")

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    t1 = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - t1

    # Number of test rows assigned to each class (0 = Normal, 1 = Anomaly).
    pred_counts = pd.Series(y_pred).value_counts().to_dict()
    normal_pred = pred_counts.get(0, 0)
    anomaly_pred = pred_counts.get(1, 0)

    # Binary metrics with class 1 (Anomaly) as the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label=1
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Præcision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1-score:  {f1:.3f}")
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Træningstid (s):    {train_time:.3f}")
    print(f"Prediktionstid (s): {pred_time:.3f}")
    print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

    results.append({
        "Model": name,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "F1-score": round(f1, 3),
        "Accuracy": round(accuracy, 3),
        "Train time (s)": round(train_time, 3),
        "Pred time (s)": round(pred_time, 3),
        "Predicted Normal": normal_pred,
        "Predicted Anomaly": anomaly_pred
    })

results_df = pd.DataFrame(results)

output_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_xgb_mlp_500k.csv"
results_df.to_csv(output_path, index=False)
print("saved")

# Persist the fitted XGBoost model. It is looked up by key: after the loop,
# `name`/`model` refer to the last entry ("MLP"), so a check on `name` here
# would never match and nothing would be saved.
# NOTE(review): this path uses "insider demo" (with a space); confirm the
# directory exists and that the spelling is the intended one.
import joblib
OUT = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/insider demo/best_model.joblib"
joblib.dump(models["XGBoost"], OUT)
print("saved XGBoost to:", OUT)



In [None]:
import time
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from xgboost import XGBClassifier

# Train a single XGBoost classifier on the access-flag / event-count features,
# report its test metrics and persist the fitted model with joblib.
import os

DATA = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
OUT  = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/insider_demo/best_model.joblib"

df = pd.read_csv(DATA)

# Down-sample only when the data set is larger than N rows.
N = 500000
if len(df) > N:
    df = df.sample(n=N, random_state=42)

FEATURES = [
    "has_appointment",
    "has_observation",
    "has_encounter",
    "has_btg_access",
    "has_care_access",
    "num_btg_events",
    "num_care_events",
    "avg_time_between_events",
]

# Feature columns absent from the CSV are created as constant 0 so that
# training can proceed; report them, since a constant column carries no signal.
missing_features = [col for col in FEATURES if col not in df.columns]
if missing_features:
    print("WARNING: feature columns missing from the data, filled with 0:", missing_features)
for col in missing_features:
    df[col] = 0

# Flag columns: missing values become 0, then cast to int.
for col in ["has_appointment","has_observation","has_encounter","has_btg_access","has_care_access"]:
    df[col] = df[col].fillna(0).astype(int)

df["num_btg_events"] = df["num_btg_events"].fillna(0).astype(int)
df["num_care_events"] = df["num_care_events"].fillna(0).astype(int)
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0.0).astype(float)

X = df[FEATURES]
# Integer-encoded label; Anomaly is the positive class (1).
y = df["label"].map({"Normal": 0, "Anomaly": 1}).astype(int)

# 70/30 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

model = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.10,
    subsample=0.9,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    tree_method="hist",
    eval_metric="logloss"
)

# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock changes.
t0 = time.perf_counter()
model.fit(X_train, y_train)
train_time = time.perf_counter() - t0

t1 = time.perf_counter()
y_pred = model.predict(X_test)
pred_time = time.perf_counter() - t1

# Binary metrics with class 1 (Anomaly) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average="binary", pos_label=1
)
acc = accuracy_score(y_test, y_pred)

print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1:        {f1:.3f}")
print(f"Accuracy:  {acc:.3f}")
print(f"Train(s):  {train_time:.2f}  Predict(s): {pred_time:.4f}")
print(f"Predicted positives: {(y_pred==1).sum()} / {len(y_pred)}")

# Make sure the target directory exists so the dump cannot fail after the
# (expensive) training step has already completed.
os.makedirs(os.path.dirname(OUT), exist_ok=True)
joblib.dump(model, OUT)
print("saved :", OUT)


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Benchmark a random forest and a class-balanced logistic regression on a
# sample of the labelled 24h event data and write the metrics to a CSV file.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling is without replacement), so cap the request at len(df).
N_SAMPLES = 100000
n_rows = min(N_SAMPLES, len(df))
if n_rows < N_SAMPLES:
    print(f"Only {len(df)} rows available; using all of them instead of {N_SAMPLES}.")
df = df.sample(n=n_rows, random_state=42)

# Replace missing values in these two feature columns with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)

# Integer-encoded label (Anomaly is the positive class, 1). Any label not
# present in the mapping becomes NaN.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features]
y = df["label_num"]

# 70/30 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# NOTE(review): LogisticRegression receives unscaled features — confirm this
# is intentional or consider a scaling step.
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
}

results = []
for name, model in models.items():
    print(f"\n=== {name} ===")

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    t1 = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - t1

    # Number of test rows assigned to each class (0 = Normal, 1 = Anomaly).
    pred_counts = pd.Series(y_pred).value_counts().to_dict()
    normal_pred = pred_counts.get(0, 0)
    anomaly_pred = pred_counts.get(1, 0)

    # Binary metrics with class 1 (Anomaly) as the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label=1
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Præcision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1-score:  {f1:.3f}")
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Træningstid (s):    {train_time:.3f}")
    print(f"Prediktionstid (s): {pred_time:.3f}")
    print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

    results.append({
        "Model": name,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "F1-score": round(f1, 3),
        "Accuracy": round(accuracy, 3),
        "Train time (s)": round(train_time, 3),
        "Pred time (s)": round(pred_time, 3),
        "Predicted Normal": normal_pred,
        "Predicted Anomaly": anomaly_pred
    })

# Persist the per-model metrics as CSV.
results_df = pd.DataFrame(results)
results_df.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_rf_lr_100k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Benchmark a random forest and a class-balanced logistic regression on a
# sample of the labelled 24h event data and write the metrics to a CSV file.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling is without replacement), so cap the request at len(df).
N_SAMPLES = 250000
n_rows = min(N_SAMPLES, len(df))
if n_rows < N_SAMPLES:
    print(f"Only {len(df)} rows available; using all of them instead of {N_SAMPLES}.")
df = df.sample(n=n_rows, random_state=42)

# Replace missing values in these two feature columns with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)
# Integer-encoded label (Anomaly is the positive class, 1). Any label not
# present in the mapping becomes NaN.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features]
y = df["label_num"]

# 70/30 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# NOTE(review): LogisticRegression receives unscaled features — confirm this
# is intentional or consider a scaling step.
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
}

results = []
for name, model in models.items():
    print(f"\n=== {name} ===")

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    t1 = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - t1

    # Number of test rows assigned to each class (0 = Normal, 1 = Anomaly).
    pred_counts = pd.Series(y_pred).value_counts().to_dict()
    normal_pred = pred_counts.get(0, 0)
    anomaly_pred = pred_counts.get(1, 0)

    # Binary metrics with class 1 (Anomaly) as the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label=1
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Præcision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1-score:  {f1:.3f}")
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Træningstid (s):    {train_time:.3f}")
    print(f"Prediktionstid (s): {pred_time:.3f}")
    print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

    results.append({
        "Model": name,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "F1-score": round(f1, 3),
        "Accuracy": round(accuracy, 3),
        "Train time (s)": round(train_time, 3),
        "Pred time (s)": round(pred_time, 3),
        "Predicted Normal": normal_pred,
        "Predicted Anomaly": anomaly_pred
    })

# Persist the per-model metrics as CSV.
results_df = pd.DataFrame(results)
results_df.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_rf_lr_250k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Benchmark RandomForest and LogisticRegression on a 300,000-row sample of the
# labeled event data and save one row of metrics per model to a CSV file.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# Fixed seed so every run draws the same 300,000 rows.
df = df.sample(n=300000, random_state=42)

# Missing values in these two features are replaced with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)
# Encode the string labels as integers: Normal -> 0, Anomaly -> 1.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features]
y = df["label_num"]
# 70/30 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
}


results = []
for name, model in models.items():
    print(f"\n=== {name} ===")

    # Wall-clock time of fitting only.
    t0 = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - t0

    # Wall-clock time of predicting the whole test set.
    t1 = time.time()
    y_pred = model.predict(X_test)
    pred_time = time.time() - t1

    # How many test rows were assigned to each class (0 = Normal, 1 = Anomaly).
    pred_counts = pd.Series(y_pred).value_counts().to_dict()
    normal_pred = pred_counts.get(0, 0)
    anomaly_pred = pred_counts.get(1, 0)

    # Precision/recall/F1 are reported for the Anomaly class (label 1).
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label=1
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Præcision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1-score:  {f1:.3f}")
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Træningstid (s):    {train_time:.3f}")
    print(f"Prediktionstid (s): {pred_time:.3f}")
    print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

    results.append({
        "Model": name,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "F1-score": round(f1, 3),
        "Accuracy": round(accuracy, 3),
        "Train time (s)": round(train_time, 3),
        "Pred time (s)": round(pred_time, 3),
        "Predicted Normal": normal_pred,
        "Predicted Anomaly": anomaly_pred
    })

results_df = pd.DataFrame(results)
results_df.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_rf_lr_300k.csv", index=False)
print("saved")



In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


# RandomForest vs. LogisticRegression benchmark on a 400,000-row sample.
# Loads the labeled events, trains both classifiers on a stratified 70/30
# split and stores one row of metrics per model in a CSV file.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path).sample(n=400000, random_state=42)

# Replace missing values with 0 in the two features that are filled here.
for column in ("avg_time_between_events", "num_unique_resources_accessed"):
    df[column] = df[column].fillna(0)
# Numeric target: Normal -> 0, Anomaly -> 1.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X, y = df[features], df["label_num"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

models = {
    "RandomForest": RandomForestClassifier(random_state=42, n_estimators=100),
    "LogisticRegression": LogisticRegression(
        class_weight='balanced', max_iter=1000, random_state=42
    ),
}

results = []
for name, model in models.items():
    print(f"\n=== {name} ===")

    # Fitting and prediction are timed separately.
    fit_start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - fit_start

    predict_start = time.time()
    y_pred = model.predict(X_test)
    pred_time = time.time() - predict_start

    # Number of test rows assigned to each class (0 = Normal, 1 = Anomaly).
    pred_counts = pd.Series(y_pred).value_counts().to_dict()
    normal_pred, anomaly_pred = (pred_counts.get(label, 0) for label in (0, 1))

    # Precision/recall/F1 refer to the Anomaly class (label 1).
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, pos_label=1, average='binary'
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(
        f"Præcision: {precision:.3f}",
        f"Recall:    {recall:.3f}",
        f"F1-score:  {f1:.3f}",
        f"Accuracy:  {accuracy:.3f}",
        f"Træningstid (s):    {train_time:.3f}",
        f"Prediktionstid (s): {pred_time:.3f}",
        f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly",
        sep="\n",
    )

    # Scores and timings are stored rounded to three decimals.
    measured = {
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1,
        "Accuracy": accuracy,
        "Train time (s)": train_time,
        "Pred time (s)": pred_time,
    }
    row = {"Model": name}
    row.update({key: round(value, 3) for key, value in measured.items()})
    row["Predicted Normal"] = normal_pred
    row["Predicted Anomaly"] = anomaly_pred
    results.append(row)

results_df = pd.DataFrame(results)
results_df.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_rf_lr_400k.csv", index=False)
print("Saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Train and evaluate RandomForest and LogisticRegression on a 500,000-row
# sample of the labeled events; the per-model metrics are written to CSV.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)
df = df.sample(n=500000, random_state=42)

# Fill missing values with 0 in both columns at once.
zero_filled = ["avg_time_between_events", "num_unique_resources_accessed"]
df[zero_filled] = df[zero_filled].fillna(0)
# Integer target column: Normal -> 0, Anomaly -> 1.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features]
y = df["label_num"]

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(
        max_iter=1000, class_weight='balanced', random_state=42
    ),
}

results = []
for name, model in models.items():
    print(f"\n=== {name} ===")

    # Time the fit.
    started = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - started

    # Time the prediction of the full test set.
    started = time.time()
    y_pred = model.predict(X_test)
    pred_time = time.time() - started

    # Per-class prediction counts (0 = Normal, 1 = Anomaly).
    pred_counts = pd.Series(y_pred).value_counts().to_dict()
    normal_pred = pred_counts.get(0, 0)
    anomaly_pred = pred_counts.get(1, 0)

    accuracy = accuracy_score(y_test, y_pred)
    # Binary metrics with Anomaly (1) as the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label=1
    )

    report_lines = [
        f"Præcision: {precision:.3f}",
        f"Recall:    {recall:.3f}",
        f"F1-score:  {f1:.3f}",
        f"Accuracy:  {accuracy:.3f}",
        f"Træningstid (s):    {train_time:.3f}",
        f"Prediktionstid (s): {pred_time:.3f}",
        f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly",
    ]
    for report_line in report_lines:
        print(report_line)

    results.append(dict([
        ("Model", name),
        ("Precision", round(precision, 3)),
        ("Recall", round(recall, 3)),
        ("F1-score", round(f1, 3)),
        ("Accuracy", round(accuracy, 3)),
        ("Train time (s)", round(train_time, 3)),
        ("Pred time (s)", round(pred_time, 3)),
        ("Predicted Normal", normal_pred),
        ("Predicted Anomaly", anomaly_pred),
    ]))

results_df = pd.DataFrame(results)
results_df.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_rf_lr_500k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from hmmlearn.hmm import GaussianHMM
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# One-class anomaly detection with a Gaussian HMM on a 100,000-row sample.
# The HMM is fitted on the normal training rows only; a test row is flagged as
# an anomaly when its log-likelihood under the model is below a threshold.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# Fixed seed so every run draws the same 100,000 rows.
df = df.sample(n=100000, random_state=42)

# Missing values in these two features are replaced with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)
# Encode the string labels as integers: Normal -> 0, Anomaly -> 1.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features].values
y = df["label_num"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Only rows labeled Normal (0) are used to fit the model.
X_train_normal = X_train[y_train == 0]

print("\n=== HMM ===")
t0 = time.time()
# random_state makes the fit reproducible, in line with the seeds used for
# sampling and splitting above.
hmm = GaussianHMM(n_components=4, covariance_type="diag", n_iter=100, random_state=42)
hmm.fit(X_train_normal)

# Calibrate the decision threshold on the TRAINING normals: the 5th percentile
# of their per-row log-likelihoods. The threshold must not be derived from
# y_test, otherwise test labels leak into the predictions being evaluated.
# Calibration is part of building the detector, so it counts as training time.
normal_scores = pd.Series([hmm.score(x.reshape(1, -1)) for x in X_train_normal])
threshold = normal_scores.quantile(0.05)
train_time = time.time() - t0

t1 = time.time()
# Each test row is scored on its own, as a sequence of length 1.
log_likelihoods = [hmm.score(x.reshape(1, -1)) for x in X_test]
log_likelihoods = pd.Series(log_likelihoods)

# Rows less likely than the threshold are predicted as Anomaly (1).
y_pred = (log_likelihoods < threshold).astype(int)
pred_time = time.time() - t1

# Binary metrics with Anomaly (1) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='binary', pos_label=1
)
accuracy = accuracy_score(y_test, y_pred)
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred = pred_counts.get(0, 0)
anomaly_pred = pred_counts.get(1, 0)

print(f"Præcision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")
print(f"Accuracy:  {accuracy:.3f}")
print(f"Træningstid (s):    {train_time:.3f}")
print(f"Prediktionstid (s): {pred_time:.3f}")
print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

results = pd.DataFrame([{
    "Model": "HMM",
    "Precision": round(precision, 3),
    "Recall": round(recall, 3),
    "F1-score": round(f1, 3),
    "Accuracy": round(accuracy, 3),
    "Train time (s)": round(train_time, 3),
    "Pred time (s)": round(pred_time, 3),
    "Predicted Normal": normal_pred,
    "Predicted Anomaly": anomaly_pred
}])
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_hmm_100k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from hmmlearn.hmm import GaussianHMM
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# One-class anomaly detection with a Gaussian HMM on a 250,000-row sample.
# The HMM is fitted on the normal training rows only; a test row is flagged as
# an anomaly when its log-likelihood under the model is below a threshold.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# Fixed seed so every run draws the same 250,000 rows.
df = df.sample(n=250000, random_state=42)

# Missing values in these two features are replaced with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)
# Encode the string labels as integers: Normal -> 0, Anomaly -> 1.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features].values
y = df["label_num"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Only rows labeled Normal (0) are used to fit the model.
X_train_normal = X_train[y_train == 0]

print("\n=== HMM ===")
t0 = time.time()
# random_state makes the fit reproducible, in line with the seeds used for
# sampling and splitting above.
hmm = GaussianHMM(n_components=4, covariance_type="diag", n_iter=100, random_state=42)
hmm.fit(X_train_normal)

# Calibrate the decision threshold on the TRAINING normals: the 5th percentile
# of their per-row log-likelihoods. The threshold must not be derived from
# y_test, otherwise test labels leak into the predictions being evaluated.
# Calibration is part of building the detector, so it counts as training time.
normal_scores = pd.Series([hmm.score(x.reshape(1, -1)) for x in X_train_normal])
threshold = normal_scores.quantile(0.05)
train_time = time.time() - t0

t1 = time.time()
# Each test row is scored on its own, as a sequence of length 1.
log_likelihoods = [hmm.score(x.reshape(1, -1)) for x in X_test]
log_likelihoods = pd.Series(log_likelihoods)

# Rows less likely than the threshold are predicted as Anomaly (1).
y_pred = (log_likelihoods < threshold).astype(int)
pred_time = time.time() - t1

# Binary metrics with Anomaly (1) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='binary', pos_label=1
)
accuracy = accuracy_score(y_test, y_pred)
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred = pred_counts.get(0, 0)
anomaly_pred = pred_counts.get(1, 0)

print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")
print(f"Accuracy:  {accuracy:.3f}")
print(f"Træningstid (s):    {train_time:.3f}")
print(f"Prediktionstid (s): {pred_time:.3f}")
print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

results = pd.DataFrame([{
    "Model": "HMM",
    "Precision": round(precision, 3),
    "Recall": round(recall, 3),
    "F1-score": round(f1, 3),
    "Accuracy": round(accuracy, 3),
    "Train time (s)": round(train_time, 3),
    "Pred time (s)": round(pred_time, 3),
    "Predicted Normal": normal_pred,
    "Predicted Anomaly": anomaly_pred
}])
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_hmm_250k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from hmmlearn.hmm import GaussianHMM
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# One-class anomaly detection with a Gaussian HMM on a 300,000-row sample.
# The HMM is fitted on the normal training rows only; a test row is flagged as
# an anomaly when its log-likelihood under the model is below a threshold.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# Fixed seed so every run draws the same 300,000 rows.
df = df.sample(n=300000, random_state=42)

# Missing values in these two features are replaced with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)
# Encode the string labels as integers: Normal -> 0, Anomaly -> 1.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features].values
y = df["label_num"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Only rows labeled Normal (0) are used to fit the model.
X_train_normal = X_train[y_train == 0]

print("\n=== HMM ===")
t0 = time.time()
# random_state makes the fit reproducible, in line with the seeds used for
# sampling and splitting above.
hmm = GaussianHMM(n_components=4, covariance_type="diag", n_iter=100, random_state=42)
hmm.fit(X_train_normal)

# Calibrate the decision threshold on the TRAINING normals: the 5th percentile
# of their per-row log-likelihoods. The threshold must not be derived from
# y_test, otherwise test labels leak into the predictions being evaluated.
# Calibration is part of building the detector, so it counts as training time.
normal_scores = pd.Series([hmm.score(x.reshape(1, -1)) for x in X_train_normal])
threshold = normal_scores.quantile(0.05)
train_time = time.time() - t0

t1 = time.time()
# Each test row is scored on its own, as a sequence of length 1.
log_likelihoods = [hmm.score(x.reshape(1, -1)) for x in X_test]
log_likelihoods = pd.Series(log_likelihoods)

# Rows less likely than the threshold are predicted as Anomaly (1).
y_pred = (log_likelihoods < threshold).astype(int)
pred_time = time.time() - t1

# Binary metrics with Anomaly (1) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='binary', pos_label=1
)
accuracy = accuracy_score(y_test, y_pred)
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred = pred_counts.get(0, 0)
anomaly_pred = pred_counts.get(1, 0)

print(f"Præcision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")
print(f"Accuracy:  {accuracy:.3f}")
print(f"Træningstid (s):    {train_time:.3f}")
print(f"Prediktionstid (s): {pred_time:.3f}")
print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

results = pd.DataFrame([{
    "Model": "HMM",
    "Precision": round(precision, 3),
    "Recall": round(recall, 3),
    "F1-score": round(f1, 3),
    "Accuracy": round(accuracy, 3),
    "Train time (s)": round(train_time, 3),
    "Pred time (s)": round(pred_time, 3),
    "Predicted Normal": normal_pred,
    "Predicted Anomaly": anomaly_pred
}])
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_hmm_300k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from hmmlearn.hmm import GaussianHMM
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# One-class anomaly detection with a Gaussian HMM on a 400,000-row sample.
# The HMM is fitted on the normal training rows only; a test row is flagged as
# an anomaly when its log-likelihood under the model is below a threshold.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# Fixed seed so every run draws the same 400,000 rows.
df = df.sample(n=400000, random_state=42)

# Missing values in these two features are replaced with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)
# Encode the string labels as integers: Normal -> 0, Anomaly -> 1.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})


features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features].values
y = df["label_num"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Only rows labeled Normal (0) are used to fit the model.
X_train_normal = X_train[y_train == 0]

print("\n=== HMM ===")
t0 = time.time()
# random_state makes the fit reproducible, in line with the seeds used for
# sampling and splitting above.
hmm = GaussianHMM(n_components=4, covariance_type="diag", n_iter=100, random_state=42)
hmm.fit(X_train_normal)

# Calibrate the decision threshold on the TRAINING normals: the 5th percentile
# of their per-row log-likelihoods. The threshold must not be derived from
# y_test, otherwise test labels leak into the predictions being evaluated.
# Calibration is part of building the detector, so it counts as training time.
normal_scores = pd.Series([hmm.score(x.reshape(1, -1)) for x in X_train_normal])
threshold = normal_scores.quantile(0.05)
train_time = time.time() - t0

t1 = time.time()
# Each test row is scored on its own, as a sequence of length 1.
log_likelihoods = [hmm.score(x.reshape(1, -1)) for x in X_test]
log_likelihoods = pd.Series(log_likelihoods)

# Rows less likely than the threshold are predicted as Anomaly (1).
y_pred = (log_likelihoods < threshold).astype(int)
pred_time = time.time() - t1

# Binary metrics with Anomaly (1) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='binary', pos_label=1
)
accuracy = accuracy_score(y_test, y_pred)
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred = pred_counts.get(0, 0)
anomaly_pred = pred_counts.get(1, 0)

print(f"Præcision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")
print(f"Accuracy:  {accuracy:.3f}")
print(f"Træningstid (s):    {train_time:.3f}")
print(f"Prediktionstid (s): {pred_time:.3f}")
print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

results = pd.DataFrame([{
    "Model": "HMM",
    "Precision": round(precision, 3),
    "Recall": round(recall, 3),
    "F1-score": round(f1, 3),
    "Accuracy": round(accuracy, 3),
    "Train time (s)": round(train_time, 3),
    "Pred time (s)": round(pred_time, 3),
    "Predicted Normal": normal_pred,
    "Predicted Anomaly": anomaly_pred
}])
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_hmm_400k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from hmmlearn.hmm import GaussianHMM
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# One-class anomaly detection with a Gaussian HMM on a 500,000-row sample.
# The HMM is fitted on the normal training rows only; a test row is flagged as
# an anomaly when its log-likelihood under the model is below a threshold.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# Fixed seed so every run draws the same 500,000 rows.
df = df.sample(n=500000, random_state=42)

# Missing values in these two features are replaced with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)
# Encode the string labels as integers: Normal -> 0, Anomaly -> 1.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features].values
y = df["label_num"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Only rows labeled Normal (0) are used to fit the model.
X_train_normal = X_train[y_train == 0]

print("\n=== HMM ===")
t0 = time.time()
# random_state makes the fit reproducible, in line with the seeds used for
# sampling and splitting above.
hmm = GaussianHMM(n_components=4, covariance_type="diag", n_iter=100, random_state=42)
hmm.fit(X_train_normal)

# Calibrate the decision threshold on the TRAINING normals: the 5th percentile
# of their per-row log-likelihoods. The threshold must not be derived from
# y_test, otherwise test labels leak into the predictions being evaluated.
# Calibration is part of building the detector, so it counts as training time.
normal_scores = pd.Series([hmm.score(x.reshape(1, -1)) for x in X_train_normal])
threshold = normal_scores.quantile(0.05)
train_time = time.time() - t0

t1 = time.time()
# Each test row is scored on its own, as a sequence of length 1.
log_likelihoods = [hmm.score(x.reshape(1, -1)) for x in X_test]
log_likelihoods = pd.Series(log_likelihoods)

# Rows less likely than the threshold are predicted as Anomaly (1).
y_pred = (log_likelihoods < threshold).astype(int)
pred_time = time.time() - t1

# Binary metrics with Anomaly (1) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='binary', pos_label=1
)
accuracy = accuracy_score(y_test, y_pred)
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred = pred_counts.get(0, 0)
anomaly_pred = pred_counts.get(1, 0)

print(f"Præcision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")
print(f"Accuracy:  {accuracy:.3f}")
print(f"Træningstid (s):    {train_time:.3f}")
print(f"Prediktionstid (s): {pred_time:.3f}")
print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

results = pd.DataFrame([{
    "Model": "HMM",
    "Precision": round(precision, 3),
    "Recall": round(recall, 3),
    "F1-score": round(f1, 3),
    "Accuracy": round(accuracy, 3),
    "Train time (s)": round(train_time, 3),
    "Pred time (s)": round(pred_time, 3),
    "Predicted Normal": normal_pred,
    "Predicted Anomaly": anomaly_pred
}])
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_hmm_500k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Input
from tensorflow.keras.utils import to_categorical

# SimpleRNN binary classifier on a 100,000-row sample of the labeled events.
# Every row is fed to the network as a sequence of length 1; the metrics for
# the Anomaly class are printed and written to a CSV file.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path).sample(n=100000, random_state=42)

# Replace missing values with 0 in the two features that are filled here.
for column in ("avg_time_between_events", "num_unique_resources_accessed"):
    df[column] = df[column].fillna(0)
# Integer target column: Normal -> 0, Anomaly -> 1.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features].values
y = df["label_num"].values

# Insert a length-1 middle axis so each row has shape (1, n_features),
# matching the Input layer declared below.
n_rows, n_features = X.shape
X_rnn = X.reshape((n_rows, 1, n_features))

X_train, X_test, y_train, y_test = train_test_split(
    X_rnn, y, test_size=0.3, random_state=42, stratify=y
)

model = Sequential([
    Input(shape=(1, n_features)),
    SimpleRNN(32, activation='tanh'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print("\n=== RNN ===")
fit_start = time.time()
history = model.fit(X_train, y_train, epochs=5, batch_size=64, verbose=0)
train_time = time.time() - fit_start

# Prediction time covers the forward pass and the 0.5 cut-off on the
# sigmoid output.
predict_start = time.time()
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)
pred_time = time.time() - predict_start

accuracy = accuracy_score(y_test, y_pred)
# Binary metrics with Anomaly (1) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, pos_label=1, average='binary'
)
# Number of test rows assigned to each class (0 = Normal, 1 = Anomaly).
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred, anomaly_pred = (pred_counts.get(label, 0) for label in (0, 1))

print(
    f"Præcision: {precision:.3f}",
    f"Recall:    {recall:.3f}",
    f"F1-score:  {f1:.3f}",
    f"Accuracy:  {accuracy:.3f}",
    f"Træningstid (s):    {train_time:.3f}",
    f"Prediktionstid (s): {pred_time:.3f}",
    f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly",
    sep="\n",
)

# Scores and timings are stored rounded to three decimals.
measured = {
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
    "Accuracy": accuracy,
    "Train time (s)": train_time,
    "Pred time (s)": pred_time,
}
row = {"Model": "RNN"}
row.update({key: round(value, 3) for key, value in measured.items()})
row["Predicted Normal"] = normal_pred
row["Predicted Anomaly"] = anomaly_pred

results = pd.DataFrame([row])
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_rnn_100k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Input
# The imported name was missing here, which made the whole cell a SyntaxError.
from tensorflow.keras.utils import to_categorical


# SimpleRNN binary classifier on a 250,000-row sample of the labeled events.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# Fixed seed so every run draws the same 250,000 rows.
df = df.sample(n=250000, random_state=42)

# Missing values in these two features are replaced with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)
# Encode the string labels as integers: Normal -> 0, Anomaly -> 1.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features].values
y = df["label_num"].values

# Add a length-1 middle axis: each row becomes a sequence with one time step,
# matching the Input shape declared below.
X_rnn = X.reshape((X.shape[0], 1, X.shape[1]))

X_train, X_test, y_train, y_test = train_test_split(
    X_rnn, y, test_size=0.3, stratify=y, random_state=42
)

# NOTE(review): no TensorFlow seed is set in this cell, so the metrics may
# differ between runs — confirm whether a seed is set elsewhere.
model = Sequential([
    Input(shape=(1, X.shape[1])),
    SimpleRNN(32, activation='tanh'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("\n=== RNN ===")
t0 = time.time()
history = model.fit(X_train, y_train, epochs=5, batch_size=64, verbose=0)
train_time = time.time() - t0

# Prediction time covers the forward pass and the 0.5 cut-off on the
# sigmoid output.
t1 = time.time()
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)
pred_time = time.time() - t1

# Binary metrics with Anomaly (1) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='binary', pos_label=1
)
accuracy = accuracy_score(y_test, y_pred)
# Number of test rows assigned to each class (0 = Normal, 1 = Anomaly).
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred = pred_counts.get(0, 0)
anomaly_pred = pred_counts.get(1, 0)

print(f"Præcision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")
print(f"Accuracy:  {accuracy:.3f}")
print(f"Træningstid (s):    {train_time:.3f}")
print(f"Prediktionstid (s): {pred_time:.3f}")
print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

results = pd.DataFrame([{
    "Model": "RNN",
    "Precision": round(precision, 3),
    "Recall": round(recall, 3),
    "F1-score": round(f1, 3),
    "Accuracy": round(accuracy, 3),
    "Train time (s)": round(train_time, 3),
    "Pred time (s)": round(pred_time, 3),
    "Predicted Normal": normal_pred,
    "Predicted Anomaly": anomaly_pred
}])
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_rnn_250k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Input
# The imported name was missing here, which made the whole cell a SyntaxError.
from tensorflow.keras.utils import to_categorical

# SimpleRNN binary classifier on a 300,000-row sample of the labeled events.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# Fixed seed so every run draws the same 300,000 rows.
df = df.sample(n=300000, random_state=42)

# Missing values in these two features are replaced with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)
# Encode the string labels as integers: Normal -> 0, Anomaly -> 1.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features].values
y = df["label_num"].values

# Add a length-1 middle axis: each row becomes a sequence with one time step,
# matching the Input shape declared below.
X_rnn = X.reshape((X.shape[0], 1, X.shape[1]))

X_train, X_test, y_train, y_test = train_test_split(
    X_rnn, y, test_size=0.3, stratify=y, random_state=42
)

# NOTE(review): no TensorFlow seed is set in this cell, so the metrics may
# differ between runs — confirm whether a seed is set elsewhere.
model = Sequential([
    Input(shape=(1, X.shape[1])),
    SimpleRNN(32, activation='tanh'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("\n=== RNN ===")
t0 = time.time()
history = model.fit(X_train, y_train, epochs=5, batch_size=64, verbose=0)
train_time = time.time() - t0

# Prediction time covers the forward pass and the 0.5 cut-off on the
# sigmoid output.
t1 = time.time()
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)
pred_time = time.time() - t1

# Binary metrics with Anomaly (1) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='binary', pos_label=1
)
accuracy = accuracy_score(y_test, y_pred)
# Number of test rows assigned to each class (0 = Normal, 1 = Anomaly).
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred = pred_counts.get(0, 0)
anomaly_pred = pred_counts.get(1, 0)

print(f"Præcision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")
print(f"Accuracy:  {accuracy:.3f}")
print(f"Træningstid (s):    {train_time:.3f}")
print(f"Prediktionstid (s): {pred_time:.3f}")
print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

results = pd.DataFrame([{
    "Model": "RNN",
    "Precision": round(precision, 3),
    "Recall": round(recall, 3),
    "F1-score": round(f1, 3),
    "Accuracy": round(accuracy, 3),
    "Train time (s)": round(train_time, 3),
    "Pred time (s)": round(pred_time, 3),
    "Predicted Normal": normal_pred,
    "Predicted Anomaly": anomaly_pred
}])
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_rnn_300k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Input
# The imported name was missing here, which made the whole cell a SyntaxError.
from tensorflow.keras.utils import to_categorical

# SimpleRNN binary classifier on a 400,000-row sample of the labeled events.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path)

# Fixed seed so every run draws the same 400,000 rows.
df = df.sample(n=400000, random_state=42)

# Missing values in these two features are replaced with 0.
df["avg_time_between_events"] = df["avg_time_between_events"].fillna(0)
df["num_unique_resources_accessed"] = df["num_unique_resources_accessed"].fillna(0)
# Encode the string labels as integers: Normal -> 0, Anomaly -> 1.
df["label_num"] = df["label"].map({"Normal": 0, "Anomaly": 1})

features = [
    "num_btg_events",
    "num_care_events",
    "num_total_events",
    "avg_time_between_events",
    "num_unique_resources_accessed",
]
X = df[features].values
y = df["label_num"].values

# Add a length-1 middle axis: each row becomes a sequence with one time step,
# matching the Input shape declared below.
X_rnn = X.reshape((X.shape[0], 1, X.shape[1]))

X_train, X_test, y_train, y_test = train_test_split(
    X_rnn, y, test_size=0.3, stratify=y, random_state=42
)

# NOTE(review): no TensorFlow seed is set in this cell, so the metrics may
# differ between runs — confirm whether a seed is set elsewhere.
model = Sequential([
    Input(shape=(1, X.shape[1])),
    SimpleRNN(32, activation='tanh'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("\n=== RNN ===")
t0 = time.time()
history = model.fit(X_train, y_train, epochs=5, batch_size=64, verbose=0)
train_time = time.time() - t0

# Prediction time covers the forward pass and the 0.5 cut-off on the
# sigmoid output.
t1 = time.time()
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)
pred_time = time.time() - t1

# Binary metrics with Anomaly (1) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='binary', pos_label=1
)
accuracy = accuracy_score(y_test, y_pred)
# Number of test rows assigned to each class (0 = Normal, 1 = Anomaly).
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred = pred_counts.get(0, 0)
anomaly_pred = pred_counts.get(1, 0)

print(f"Præcision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")
print(f"Accuracy:  {accuracy:.3f}")
print(f"Træningstid (s):    {train_time:.3f}")
print(f"Prediktionstid (s): {pred_time:.3f}")
print(f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly")

results = pd.DataFrame([{
    "Model": "RNN",
    "Precision": round(precision, 3),
    "Recall": round(recall, 3),
    "F1-score": round(f1, 3),
    "Accuracy": round(accuracy, 3),
    "Train time (s)": round(train_time, 3),
    "Pred time (s)": round(pred_time, 3),
    "Predicted Normal": normal_pred,
    "Predicted Anomaly": anomaly_pred
}])
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_rnn_400k.csv", index=False)
print("saved")


In [None]:
# Imports for the 500k SimpleRNN benchmark cell.
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Input
# The imported name was missing here, which made the whole cell a SyntaxError.
from tensorflow.keras.utils import to_categorical

# Load the labelled event table and keep a seeded random subsample of 500,000 rows.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path).sample(n=500000, random_state=42)

# Zero-fill the two columns that may hold missing values and encode the
# string label as 0 (Normal) / 1 (Anomaly).
df = df.assign(
    avg_time_between_events=df["avg_time_between_events"].fillna(0),
    num_unique_resources_accessed=df["num_unique_resources_accessed"].fillna(0),
    label_num=df["label"].map({"Normal": 0, "Anomaly": 1}),
)

features = [
    "num_btg_events", "num_care_events", "num_total_events",
    "avg_time_between_events", "num_unique_resources_accessed",
]
X = df[features].values
y = df["label_num"].values

# Add a length-1 sequence axis: (rows, features) -> (rows, 1, features).
X_rnn = X[:, np.newaxis, :]

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_rnn,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Assemble the network layer by layer: one-step sequence input, a 32-unit tanh
# SimpleRNN, and a single sigmoid output unit.
model = Sequential()
model.add(Input(shape=(1, X.shape[1])))
model.add(SimpleRNN(32, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print("\n=== RNN ===")

# Wall-clock time of the silent 5-epoch fit.
t0 = time.time()
history = model.fit(x=X_train, y=y_train, batch_size=64, epochs=5, verbose=0)
train_time = time.time() - t0

# Wall-clock time of inference; thresholding at 0.5 is part of the timed span.
t1 = time.time()
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)
pred_time = time.time() - t1

# Binary metrics with class 1 (Anomaly) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_pred, pos_label=1, average='binary'
)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Number of test rows predicted as each class; an absent class counts as 0.
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred, anomaly_pred = (pred_counts.get(cls, 0) for cls in (0, 1))

print(
    f"Præcision: {precision:.3f}",
    f"Recall:    {recall:.3f}",
    f"F1-score:  {f1:.3f}",
    f"Accuracy:  {accuracy:.3f}",
    f"Træningstid (s):    {train_time:.3f}",
    f"Prediktionstid (s): {pred_time:.3f}",
    f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly",
    sep="\n",
)

# One-row summary table; metrics and timings are rounded to 3 decimals.
results = pd.DataFrame({
    "Model": ["RNN"],
    "Precision": [round(precision, 3)],
    "Recall": [round(recall, 3)],
    "F1-score": [round(f1, 3)],
    "Accuracy": [round(accuracy, 3)],
    "Train time (s)": [round(train_time, 3)],
    "Pred time (s)": [round(pred_time, 3)],
    "Predicted Normal": [normal_pred],
    "Predicted Anomaly": [anomaly_pred],
})
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_rnn_500k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout, Add, MultiHeadAttention, Flatten

# Load the labelled event table and keep a seeded random subsample of 100,000 rows.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path).sample(n=100000, random_state=42)

# Zero-fill the two columns that may hold missing values and encode the
# string label as 0 (Normal) / 1 (Anomaly).
df = df.assign(
    avg_time_between_events=df["avg_time_between_events"].fillna(0),
    num_unique_resources_accessed=df["num_unique_resources_accessed"].fillna(0),
    label_num=df["label"].map({"Normal": 0, "Anomaly": 1}),
)

features = [
    "num_btg_events", "num_care_events", "num_total_events",
    "avg_time_between_events", "num_unique_resources_accessed",
]
X = df[features].values.astype(np.float32)
y = df["label_num"].values

# Add a length-1 sequence axis: (rows, features) -> (rows, 1, features).
X_trans = X[:, np.newaxis, :]

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_trans,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

def build_transformer_model(input_shape):
    """Build and compile a one-block attention classifier.

    The network applies self-attention to the input, adds the input back and
    layer-normalises the sum, then runs a 32 -> 16 dense stack whose output is
    added to a 16-unit projection of the attention result and normalised
    again. A single sigmoid unit followed by ``Flatten`` produces the output.

    Args:
        input_shape: Per-sample shape handed to ``Input`` (no batch axis); its
            last entry is also used as the attention ``key_dim``.

    Returns:
        A ``Model`` compiled with the Adam optimizer, binary cross-entropy
        loss and an accuracy metric.
    """
    inputs = Input(shape=input_shape)

    # Self-attention (the input is passed as both arguments) + residual + norm.
    attended = MultiHeadAttention(num_heads=2, key_dim=input_shape[-1])(inputs, inputs)
    residual_sum = Add()([inputs, attended])
    normed_attention = LayerNormalization()(residual_sum)

    # Feed-forward branch; the attention output is projected to 16 units so it
    # can be added to the 16-unit feed-forward output.
    hidden = Dense(32, activation='relu')(normed_attention)
    hidden = Dense(16)(hidden)
    shortcut = Dense(16)(normed_attention)
    merged = Add()([hidden, shortcut])
    normed_ff = LayerNormalization()(merged)

    # Sigmoid head, flattened to one value per sample.
    probability = Dense(1, activation='sigmoid')(normed_ff)
    probability = Flatten()(probability)

    classifier = Model(inputs=inputs, outputs=probability)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier

# Build the model for the per-sample input shape (everything after the batch axis).
model = build_transformer_model(X_train.shape[1:])

print("\n=== Transformer ===")

# Wall-clock time of the silent 5-epoch fit.
t0 = time.time()
model.fit(x=X_train, y=y_train, batch_size=64, epochs=5, verbose=0)
train_time = time.time() - t0

# Wall-clock time of inference; thresholding at 0.5 is part of the timed span.
t1 = time.time()
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)
pred_time = time.time() - t1

# Binary metrics with class 1 (Anomaly) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_pred, pos_label=1, average='binary'
)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Number of test rows predicted as each class; an absent class counts as 0.
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred, anomaly_pred = (pred_counts.get(cls, 0) for cls in (0, 1))

print(
    f"Præcision: {precision:.3f}",
    f"Recall:    {recall:.3f}",
    f"F1-score:  {f1:.3f}",
    f"Accuracy:  {accuracy:.3f}",
    f"Træningstid (s):    {train_time:.3f}",
    f"Prediktionstid (s): {pred_time:.3f}",
    f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly",
    sep="\n",
)

# One-row summary table; metrics and timings are rounded to 3 decimals.
results = pd.DataFrame({
    "Model": ["Transformer"],
    "Precision": [round(precision, 3)],
    "Recall": [round(recall, 3)],
    "F1-score": [round(f1, 3)],
    "Accuracy": [round(accuracy, 3)],
    "Train time (s)": [round(train_time, 3)],
    "Pred time (s)": [round(pred_time, 3)],
    "Predicted Normal": [normal_pred],
    "Predicted Anomaly": [anomaly_pred],
})
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_transformer_100k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout, Add, MultiHeadAttention, Flatten

# Load the labelled event table and keep a seeded random subsample of 250,000 rows.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path).sample(n=250000, random_state=42)

# Zero-fill the two columns that may hold missing values and encode the
# string label as 0 (Normal) / 1 (Anomaly).
df = df.assign(
    avg_time_between_events=df["avg_time_between_events"].fillna(0),
    num_unique_resources_accessed=df["num_unique_resources_accessed"].fillna(0),
    label_num=df["label"].map({"Normal": 0, "Anomaly": 1}),
)

features = [
    "num_btg_events", "num_care_events", "num_total_events",
    "avg_time_between_events", "num_unique_resources_accessed",
]
X = df[features].values.astype(np.float32)
y = df["label_num"].values

# Add a length-1 sequence axis: (rows, features) -> (rows, 1, features).
X_trans = X[:, np.newaxis, :]

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_trans,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

def build_transformer_model(input_shape):
    """Build and compile a one-block attention classifier.

    The network applies self-attention to the input, adds the input back and
    layer-normalises the sum, then runs a 32 -> 16 dense stack whose output is
    added to a 16-unit projection of the attention result and normalised
    again. A single sigmoid unit followed by ``Flatten`` produces the output.

    Args:
        input_shape: Per-sample shape handed to ``Input`` (no batch axis); its
            last entry is also used as the attention ``key_dim``.

    Returns:
        A ``Model`` compiled with the Adam optimizer, binary cross-entropy
        loss and an accuracy metric.
    """
    inputs = Input(shape=input_shape)

    # Self-attention (the input is passed as both arguments) + residual + norm.
    attended = MultiHeadAttention(num_heads=2, key_dim=input_shape[-1])(inputs, inputs)
    residual_sum = Add()([inputs, attended])
    normed_attention = LayerNormalization()(residual_sum)

    # Feed-forward branch; the attention output is projected to 16 units so it
    # can be added to the 16-unit feed-forward output.
    hidden = Dense(32, activation='relu')(normed_attention)
    hidden = Dense(16)(hidden)
    shortcut = Dense(16)(normed_attention)
    merged = Add()([hidden, shortcut])
    normed_ff = LayerNormalization()(merged)

    # Sigmoid head, flattened to one value per sample.
    probability = Dense(1, activation='sigmoid')(normed_ff)
    probability = Flatten()(probability)

    classifier = Model(inputs=inputs, outputs=probability)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier

# Build the model for the per-sample input shape (everything after the batch axis).
model = build_transformer_model(X_train.shape[1:])

print("\n=== Transformer ===")

# Wall-clock time of the silent 5-epoch fit.
t0 = time.time()
model.fit(x=X_train, y=y_train, batch_size=64, epochs=5, verbose=0)
train_time = time.time() - t0

# Wall-clock time of inference; thresholding at 0.5 is part of the timed span.
t1 = time.time()
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)
pred_time = time.time() - t1

# Binary metrics with class 1 (Anomaly) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_pred, pos_label=1, average='binary'
)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Number of test rows predicted as each class; an absent class counts as 0.
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred, anomaly_pred = (pred_counts.get(cls, 0) for cls in (0, 1))

print(
    f"Præcision: {precision:.3f}",
    f"Recall:    {recall:.3f}",
    f"F1-score:  {f1:.3f}",
    f"Accuracy:  {accuracy:.3f}",
    f"Træningstid (s):    {train_time:.3f}",
    f"Prediktionstid (s): {pred_time:.3f}",
    f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly",
    sep="\n",
)

# One-row summary table; metrics and timings are rounded to 3 decimals.
results = pd.DataFrame({
    "Model": ["Transformer"],
    "Precision": [round(precision, 3)],
    "Recall": [round(recall, 3)],
    "F1-score": [round(f1, 3)],
    "Accuracy": [round(accuracy, 3)],
    "Train time (s)": [round(train_time, 3)],
    "Pred time (s)": [round(pred_time, 3)],
    "Predicted Normal": [normal_pred],
    "Predicted Anomaly": [anomaly_pred],
})
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_transformer_250k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout, Add, MultiHeadAttention, Flatten

# Load the labelled event table and keep a seeded random subsample of 300,000 rows.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path).sample(n=300000, random_state=42)

# Zero-fill the two columns that may hold missing values and encode the
# string label as 0 (Normal) / 1 (Anomaly).
df = df.assign(
    avg_time_between_events=df["avg_time_between_events"].fillna(0),
    num_unique_resources_accessed=df["num_unique_resources_accessed"].fillna(0),
    label_num=df["label"].map({"Normal": 0, "Anomaly": 1}),
)

features = [
    "num_btg_events", "num_care_events", "num_total_events",
    "avg_time_between_events", "num_unique_resources_accessed",
]
X = df[features].values.astype(np.float32)
y = df["label_num"].values

# Add a length-1 sequence axis: (rows, features) -> (rows, 1, features).
X_trans = X[:, np.newaxis, :]

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_trans,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

def build_transformer_model(input_shape):
    """Build and compile a one-block attention classifier.

    The network applies self-attention to the input, adds the input back and
    layer-normalises the sum, then runs a 32 -> 16 dense stack whose output is
    added to a 16-unit projection of the attention result and normalised
    again. A single sigmoid unit followed by ``Flatten`` produces the output.

    Args:
        input_shape: Per-sample shape handed to ``Input`` (no batch axis); its
            last entry is also used as the attention ``key_dim``.

    Returns:
        A ``Model`` compiled with the Adam optimizer, binary cross-entropy
        loss and an accuracy metric.
    """
    inputs = Input(shape=input_shape)

    # Self-attention (the input is passed as both arguments) + residual + norm.
    attended = MultiHeadAttention(num_heads=2, key_dim=input_shape[-1])(inputs, inputs)
    residual_sum = Add()([inputs, attended])
    normed_attention = LayerNormalization()(residual_sum)

    # Feed-forward branch; the attention output is projected to 16 units so it
    # can be added to the 16-unit feed-forward output.
    hidden = Dense(32, activation='relu')(normed_attention)
    hidden = Dense(16)(hidden)
    shortcut = Dense(16)(normed_attention)
    merged = Add()([hidden, shortcut])
    normed_ff = LayerNormalization()(merged)

    # Sigmoid head, flattened to one value per sample.
    probability = Dense(1, activation='sigmoid')(normed_ff)
    probability = Flatten()(probability)

    classifier = Model(inputs=inputs, outputs=probability)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier

# Build the model for the per-sample input shape (everything after the batch axis).
model = build_transformer_model(X_train.shape[1:])

print("\n=== Transformer ===")

# Wall-clock time of the silent 5-epoch fit.
t0 = time.time()
model.fit(x=X_train, y=y_train, batch_size=64, epochs=5, verbose=0)
train_time = time.time() - t0

# Wall-clock time of inference; thresholding at 0.5 is part of the timed span.
t1 = time.time()
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)
pred_time = time.time() - t1

# Binary metrics with class 1 (Anomaly) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_pred, pos_label=1, average='binary'
)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Number of test rows predicted as each class; an absent class counts as 0.
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred, anomaly_pred = (pred_counts.get(cls, 0) for cls in (0, 1))

print(
    f"Præcision: {precision:.3f}",
    f"Recall:    {recall:.3f}",
    f"F1-score:  {f1:.3f}",
    f"Accuracy:  {accuracy:.3f}",
    f"Træningstid (s):    {train_time:.3f}",
    f"Prediktionstid (s): {pred_time:.3f}",
    f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly",
    sep="\n",
)

# One-row summary table; metrics and timings are rounded to 3 decimals.
results = pd.DataFrame({
    "Model": ["Transformer"],
    "Precision": [round(precision, 3)],
    "Recall": [round(recall, 3)],
    "F1-score": [round(f1, 3)],
    "Accuracy": [round(accuracy, 3)],
    "Train time (s)": [round(train_time, 3)],
    "Pred time (s)": [round(pred_time, 3)],
    "Predicted Normal": [normal_pred],
    "Predicted Anomaly": [anomaly_pred],
})
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_transformer_300k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout, Add, MultiHeadAttention, Flatten

# Load the labelled event table and keep a seeded random subsample of 400,000 rows.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path).sample(n=400000, random_state=42)

# Zero-fill the two columns that may hold missing values and encode the
# string label as 0 (Normal) / 1 (Anomaly).
df = df.assign(
    avg_time_between_events=df["avg_time_between_events"].fillna(0),
    num_unique_resources_accessed=df["num_unique_resources_accessed"].fillna(0),
    label_num=df["label"].map({"Normal": 0, "Anomaly": 1}),
)

features = [
    "num_btg_events", "num_care_events", "num_total_events",
    "avg_time_between_events", "num_unique_resources_accessed",
]
X = df[features].values.astype(np.float32)
y = df["label_num"].values

# Add a length-1 sequence axis: (rows, features) -> (rows, 1, features).
X_trans = X[:, np.newaxis, :]

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_trans,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

def build_transformer_model(input_shape):
    """Build and compile a one-block attention classifier.

    The network applies self-attention to the input, adds the input back and
    layer-normalises the sum, then runs a 32 -> 16 dense stack whose output is
    added to a 16-unit projection of the attention result and normalised
    again. A single sigmoid unit followed by ``Flatten`` produces the output.

    Args:
        input_shape: Per-sample shape handed to ``Input`` (no batch axis); its
            last entry is also used as the attention ``key_dim``.

    Returns:
        A ``Model`` compiled with the Adam optimizer, binary cross-entropy
        loss and an accuracy metric.
    """
    inputs = Input(shape=input_shape)

    # Self-attention (the input is passed as both arguments) + residual + norm.
    attended = MultiHeadAttention(num_heads=2, key_dim=input_shape[-1])(inputs, inputs)
    residual_sum = Add()([inputs, attended])
    normed_attention = LayerNormalization()(residual_sum)

    # Feed-forward branch; the attention output is projected to 16 units so it
    # can be added to the 16-unit feed-forward output.
    hidden = Dense(32, activation='relu')(normed_attention)
    hidden = Dense(16)(hidden)
    shortcut = Dense(16)(normed_attention)
    merged = Add()([hidden, shortcut])
    normed_ff = LayerNormalization()(merged)

    # Sigmoid head, flattened to one value per sample.
    probability = Dense(1, activation='sigmoid')(normed_ff)
    probability = Flatten()(probability)

    classifier = Model(inputs=inputs, outputs=probability)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier

# Build the model for the per-sample input shape (everything after the batch axis).
model = build_transformer_model(X_train.shape[1:])

print("\n=== Transformer ===")

# Wall-clock time of the silent 5-epoch fit.
t0 = time.time()
model.fit(x=X_train, y=y_train, batch_size=64, epochs=5, verbose=0)
train_time = time.time() - t0

# Wall-clock time of inference; thresholding at 0.5 is part of the timed span.
t1 = time.time()
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)
pred_time = time.time() - t1

# Binary metrics with class 1 (Anomaly) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_pred, pos_label=1, average='binary'
)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Number of test rows predicted as each class; an absent class counts as 0.
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred, anomaly_pred = (pred_counts.get(cls, 0) for cls in (0, 1))

print(
    f"Præcision: {precision:.3f}",
    f"Recall:    {recall:.3f}",
    f"F1-score:  {f1:.3f}",
    f"Accuracy:  {accuracy:.3f}",
    f"Træningstid (s):    {train_time:.3f}",
    f"Prediktionstid (s): {pred_time:.3f}",
    f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly",
    sep="\n",
)

# One-row summary table; metrics and timings are rounded to 3 decimals.
results = pd.DataFrame({
    "Model": ["Transformer"],
    "Precision": [round(precision, 3)],
    "Recall": [round(recall, 3)],
    "F1-score": [round(f1, 3)],
    "Accuracy": [round(accuracy, 3)],
    "Train time (s)": [round(train_time, 3)],
    "Pred time (s)": [round(pred_time, 3)],
    "Predicted Normal": [normal_pred],
    "Predicted Anomaly": [anomaly_pred],
})
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_transformer_400k.csv", index=False)
print("saved")


In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout, Add, MultiHeadAttention, Flatten

# Load the labelled event table and keep a seeded random subsample of 500,000 rows.
data_path = "/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/labeled_events_full-24h.csv"
df = pd.read_csv(data_path).sample(n=500000, random_state=42)

# Zero-fill the two columns that may hold missing values and encode the
# string label as 0 (Normal) / 1 (Anomaly).
df = df.assign(
    avg_time_between_events=df["avg_time_between_events"].fillna(0),
    num_unique_resources_accessed=df["num_unique_resources_accessed"].fillna(0),
    label_num=df["label"].map({"Normal": 0, "Anomaly": 1}),
)

features = [
    "num_btg_events", "num_care_events", "num_total_events",
    "avg_time_between_events", "num_unique_resources_accessed",
]
X = df[features].values.astype(np.float32)
y = df["label_num"].values

# Add a length-1 sequence axis: (rows, features) -> (rows, 1, features).
X_trans = X[:, np.newaxis, :]

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_trans,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

def build_transformer_model(input_shape):
    """Build and compile a one-block attention classifier.

    The network applies self-attention to the input, adds the input back and
    layer-normalises the sum, then runs a 32 -> 16 dense stack whose output is
    added to a 16-unit projection of the attention result and normalised
    again. A single sigmoid unit followed by ``Flatten`` produces the output.

    Args:
        input_shape: Per-sample shape handed to ``Input`` (no batch axis); its
            last entry is also used as the attention ``key_dim``.

    Returns:
        A ``Model`` compiled with the Adam optimizer, binary cross-entropy
        loss and an accuracy metric.
    """
    inputs = Input(shape=input_shape)

    # Self-attention (the input is passed as both arguments) + residual + norm.
    attended = MultiHeadAttention(num_heads=2, key_dim=input_shape[-1])(inputs, inputs)
    residual_sum = Add()([inputs, attended])
    normed_attention = LayerNormalization()(residual_sum)

    # Feed-forward branch; the attention output is projected to 16 units so it
    # can be added to the 16-unit feed-forward output.
    hidden = Dense(32, activation='relu')(normed_attention)
    hidden = Dense(16)(hidden)
    shortcut = Dense(16)(normed_attention)
    merged = Add()([hidden, shortcut])
    normed_ff = LayerNormalization()(merged)

    # Sigmoid head, flattened to one value per sample.
    probability = Dense(1, activation='sigmoid')(normed_ff)
    probability = Flatten()(probability)

    classifier = Model(inputs=inputs, outputs=probability)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier

# Build the model for the per-sample input shape (everything after the batch axis).
model = build_transformer_model(X_train.shape[1:])

print("\n=== Transformer ===")

# Wall-clock time of the silent 5-epoch fit.
t0 = time.time()
model.fit(x=X_train, y=y_train, batch_size=64, epochs=5, verbose=0)
train_time = time.time() - t0

# Wall-clock time of inference; thresholding at 0.5 is part of the timed span.
t1 = time.time()
y_pred_prob = model.predict(X_test).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)
pred_time = time.time() - t1

# Binary metrics with class 1 (Anomaly) as the positive class.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_pred, pos_label=1, average='binary'
)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Number of test rows predicted as each class; an absent class counts as 0.
pred_counts = pd.Series(y_pred).value_counts().to_dict()
normal_pred, anomaly_pred = (pred_counts.get(cls, 0) for cls in (0, 1))

print(
    f"Præcision: {precision:.3f}",
    f"Recall:    {recall:.3f}",
    f"F1-score:  {f1:.3f}",
    f"Accuracy:  {accuracy:.3f}",
    f"Træningstid (s):    {train_time:.3f}",
    f"Prediktionstid (s): {pred_time:.3f}",
    f"Forudsagde: {normal_pred} Normal, {anomaly_pred} Anomaly",
    sep="\n",
)

# One-row summary table; metrics and timings are rounded to 3 decimals.
results = pd.DataFrame({
    "Model": ["Transformer"],
    "Precision": [round(precision, 3)],
    "Recall": [round(recall, 3)],
    "F1-score": [round(f1, 3)],
    "Accuracy": [round(accuracy, 3)],
    "Train time (s)": [round(train_time, 3)],
    "Pred time (s)": [round(pred_time, 3)],
    "Predicted Normal": [normal_pred],
    "Predicted Anomaly": [anomaly_pred],
})
results.to_csv("/Users/melisadzanovic/Documents/ML - Thesis/hcs-synthetic-data-generator-main/model_results_transformer_500k.csv", index=False)
print("saved")
