In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import roc_auc_score, precision_score, confusion_matrix


# Read the audit-log CSV that already contains the engineered feature columns.
# The "timestamp" column is parsed into datetimes at load time.
audit_log_csv = "../data/logs/simulated_audit_logs_with_features.csv"
df = pd.read_csv(audit_log_csv, parse_dates=["timestamp"])

# Preview the first rows as the cell's displayed output.
df.head()

Unnamed: 0,timestamp,user,ip_address,event_type,resource,event_hour,event_type_code,resource_depth,is_privileged_event,is_weekend,failed_logins_last_1h,ip_event_count,user_event_rate
0,2025-06-26 17:16:27.282458,user_3,196.99.48.56,login_success,search,17,0,0,0,False,0,1,18.533333
1,2025-06-26 17:27:27.019647,user_9,205.133.6.139,file_access,posts/search/search,17,3,2,0,False,0,1,14.266667
2,2025-06-26 17:27:49.797973,user_18,198.219.77.22,config_change,categories/list/app,17,2,2,1,False,0,1,17.6
3,2025-06-26 17:39:08.849585,user_18,142.121.233.60,login_success,blog/app/blog,17,0,2,0,False,0,1,17.6
4,2025-06-26 17:42:01.023333,user_7,50.217.213.142,login_failure,main/explore/explore,17,1,2,0,False,0,1,14.4


In [3]:
# Attach a synthetic binary "is_anomaly" label when the data does not already
# carry one. Seeding NumPy's global RNG first makes the row selection below
# repeatable, because DataFrame.sample draws from that global state when no
# random_state is given.
np.random.seed(42)
if "is_anomaly" not in df.columns:
    anomaly_fraction = 0.01  # share of rows flagged as anomalous (1%)

    # Rows are drawn uniformly at random (no weights), i.e. without looking at
    # any feature column.
    # NOTE(review): labels assigned this way presumably carry no signal a model
    # could learn from the features — confirm this is intended for the demo.
    flagged_rows = df.sample(frac=anomaly_fraction).index

    # 1 for the sampled rows, 0 everywhere else (int64, like a scalar-int fill).
    df["is_anomaly"] = df.index.isin(flagged_rows).astype("int64")

In [9]:
# Columns fed to the model as predictors.
features = [
    "event_hour",
    "event_type_code",
    "resource_depth",
    "is_privileged_event",
    "is_weekend",
    "failed_logins_last_1h",
    "ip_event_count",
    "user_event_rate",
]

# Predictor matrix and the binary target column.
X = df.loc[:, features]
y = df["is_anomaly"]

# Hold out 20% of the rows for evaluation. Stratifying on the target keeps the
# anomaly ratio the same in both splits; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [11]:
import mlflow
import mlflow.xgboost

# Train an XGBoost classifier on the training split, score it on the held-out
# split, and record hyperparameters, metrics and the fitted model in a single
# MLflow run named "xgboost_anomaly_detection".
with mlflow.start_run(run_name="xgboost_anomaly_detection"):

    # One dict feeds both the MLflow log and the model constructor, so the
    # logged hyperparameters cannot drift from the ones actually used.
    tuned_params = {
        "n_estimators": 100,
        "max_depth": 6,
        "learning_rate": 0.1,
    }
    mlflow.log_params(tuned_params)

    # Train model.
    # `use_label_encoder` is intentionally not passed: XGBoost reports it as
    # an unused parameter and it has no effect on training.
    model = xgb.XGBClassifier(
        objective="binary:logistic",
        eval_metric="auc",
        random_state=42,
        **tuned_params,
    )
    model.fit(X_train, y_train)

    # Predict and evaluate: positive-class probabilities for the ranking
    # metric, hard class predictions for precision.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    auc = roc_auc_score(y_test, y_pred_proba)
    # With no positive predictions precision is undefined; zero_division=0
    # reports it as 0.0 explicitly instead of emitting a warning.
    precision = precision_score(y_test, y_pred, zero_division=0)

    # Log metrics
    mlflow.log_metric("AUC", auc)
    mlflow.log_metric("Precision", precision)

    # Log the fitted model as a run artifact
    mlflow.xgboost.log_model(model, artifact_path="model")

    print(f"AUC: {auc:.4f}")
    print(f"Precision: {precision:.4f}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  self.get_booster().save_model(fname)


AUC: 0.4791
Precision: 0.0000
