In [1]:
import os
import joblib
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, precision_recall_curve, roc_curve
from xgboost import XGBClassifier

In [2]:
# Load the datasets
attack_label = pd.read_csv('/local/scratch/exported/MP_Defi_txs_TY_23/guanda/attack_label.csv')
unlabeled = pd.read_csv('/local/scratch/exported/MP_Defi_txs_TY_23/guanda/unlabeled.csv')

# Feature columns fed to every model in this notebook
features = ['from_address_profit', 'to_address_profit', 'highest_profit_in_usd',
            'highest_price_change_ratio', 'path_length', 'num_swap_events', 'flashloan_in_usd']

# Preprocessing
# Coerce 'flashloan_in_usd' to numeric; unparseable entries become NaN and are imputed further down
unlabeled['flashloan_in_usd'] = pd.to_numeric(unlabeled['flashloan_in_usd'], errors='coerce')
# Rows with highest_profit_in_usd <= 1000 serve as the "normal" class; the remaining rows stay unlabeled.
# .copy() makes normal_label an independent frame so that adding the 'label' column below
# does not write into a slice of `unlabeled` (avoids pandas' SettingWithCopyWarning).
normal_label = unlabeled[unlabeled['highest_profit_in_usd'] <= 1000].copy()
unlabeled = unlabeled[unlabeled['highest_profit_in_usd'] > 1000].reset_index(drop=True)

# Add a new column for labels: 1 for attack and 0 for normal
attack_label['label'] = 1  # Attack
normal_label['label'] = 0  # Normal

# Combine the datasets (ignore_index=True gives a fresh 0..n-1 index, so index labels equal row positions)
combined_df = pd.concat([attack_label, normal_label], ignore_index=True)

# Convert 'flashloan_in_usd' to numeric, coercing errors to NaN
combined_df['flashloan_in_usd'] = pd.to_numeric(combined_df['flashloan_in_usd'], errors='coerce')

X = combined_df[features]
y = combined_df['label']
X_unlabeled = unlabeled[features]

# Combine X and X_unlabeled temporarily for imputation and scaling
# NOTE(review): the imputer and scaler are therefore fit on every row (future test rows and
# unlabeled rows included) before the train/test split below - confirm this is intended.
X_combined = np.vstack([X, X_unlabeled])

# Handle missing values by imputing with the mean strategy
imputer = SimpleImputer(strategy='mean')
X_combined_imputed = imputer.fit_transform(X_combined)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined_imputed)

# Separate them back into X and X_unlabeled
X_scaled = X_combined_scaled[:len(X), :]
X_unlabeled_scaled = X_combined_scaled[len(X):, :]

# Transaction hashes whose rows are always placed in the training set:
# they are removed before the split and appended to X_train / y_train afterwards.
temp = [
"0x3b19e152943f31fe0830b67315ddc89be9a066dc89174256e17bc8c2d35b5af8",
"0xcb0ad9da33ecabf75df0a24aabf8a4517e4a7c5b1b2f11fee3b6a1ad9299a282",
"0xcb58fb952914896b35d909136b9f719b71fc8bc60b59853459fc2476d4369c3a",
"0xf72f1d10fc6923f87279ce6c0aef46e372c6652a696f280b0465a301a92f2e26",
"0x118b7b7c11f9e9bd630ea84ef267b183b34021b667f4a3061f048207d266437a",
"0x3503253131644dd9f52802d071de74e456570374d586ddd640159cf6fb9b8ad8",
"0x35f8d2f572fceaac9288e5d462117850ef2694786992a8c3f6d02612277b0877",
"0x0fc6d2ca064fc841bc9b1c1fad1fbb97bcea5c9a1b2b66ef837f1227e06519a6",
"0x958236266991bc3fe3b77feaacea120f172c0708ad01c7a715b255f218f9313c",
"0x46a03488247425f845e444b9c10b52ba3c14927c687d38287c0faddc7471150a",
"0x8bb8dc5c7c830bac85fa48acad2505e9300a91c3ff239c9517d0cae33b595090",
"0xf6022012b73770e7e2177129e648980a82aab555f9ac88b8a9cda3ec44b30779",
"0xcd314668aaa9bbfebaf1a0bd2b6553d01dd58899c508d4729fa7311dc5d33ad7"
]

# Row positions of the pinned transactions inside combined_df / X_scaled
indices_temp = combined_df[combined_df['tx_hash'].isin(temp)].index

X_temp_scaled = X_scaled[indices_temp]
y_temp = y.iloc[indices_temp]

# Drop the pinned rows so they cannot land in the test split
X_scaled_removed = np.delete(X_scaled, indices_temp, axis=0)
y_array = np.array(y)
y_removed = np.delete(y_array, indices_temp, axis=0)

# Stratified 80/20 split of the remaining labeled rows
X_train, X_test, y_train, y_test = train_test_split(X_scaled_removed, y_removed, test_size=0.2, random_state=42, stratify=y_removed)

# Append the pinned rows to the training portion only
X_train = np.vstack([X_train, X_temp_scaled])
y_train = np.concatenate([y_train, y_temp])

# Collects one column of unlabeled-set predictions per model
unlabeled_predictions = pd.DataFrame()

  unlabeled = pd.read_csv('/local/scratch/exported/MP_Defi_txs_TY_23/guanda/unlabeled.csv')


# Random Forest Classifier

In [3]:
# Define the objective function for Optuna
def rf_objective(trial):
    """Optuna objective: mean F1 of a random forest under 10-fold stratified CV on the training split."""
    # Hyper-parameters are requested in a fixed order: n_estimators, max_depth, min_samples_split
    candidate = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 10, 200),
        max_depth=trial.suggest_int('max_depth', 1, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        random_state=42,
    )

    # Shuffled, seeded stratified folds so every trial is scored on the same partitions
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=folds, n_jobs=-1, scoring='f1')
    return fold_scores.mean()

# Make sure the output directory exists up front; otherwise joblib.dump / to_csv
# would fail only after the (long) Optuna search has finished.
os.makedirs('Models', exist_ok=True)

# Reuse the cached estimator when one exists, otherwise tune a new one
if os.path.exists('Models/random_forest_model.pkl'):
    # NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
    best_model = joblib.load('Models/random_forest_model.pkl')
    print("Loaded existing Random Forest model.")
    best_params = best_model.get_params()
    for param, value in best_params.items():
        print(f"{param}: {value}")
    # Always refit on the current training split: pickles written by earlier
    # versions of this cell were dumped before fit() and hold an unfitted estimator.
    best_model.fit(X_train, y_train)
else:
    # Seed the sampler so the hyper-parameter search is reproducible
    study = optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(rf_objective, n_trials=50)

    # Best parameters
    best_params = study.best_params
    best_model = RandomForestClassifier(**best_params, random_state=42)
    # Fit BEFORE persisting so the pickle really contains a trained model
    best_model.fit(X_train, y_train)
    joblib.dump(best_model, 'Models/random_forest_model.pkl')
    print("Trained and saved new Random Forest model.")

# Evaluate on the held-out test split
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Confusion matrix, AUC-ROC and per-class report
conf_matrix = confusion_matrix(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_proba)
class_report = classification_report(y_test, y_pred, output_dict=True)

# Precision-recall curve (the thresholds are not exported)
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_data_df = pd.DataFrame({
    'Recall': recall,
    'Precision': precision
})
pr_data_df.to_csv('Models/random_forest_precision_recall_data.csv', index=False)

# For binary labels ravel() yields the cells in the order tn, fp, fn, tp
tn, fp, fn, tp = conf_matrix.ravel()

metrics_list = [
    {'Metric': 'True Negatives', 'Value': tn},
    {'Metric': 'False Positives', 'Value': fp},
    {'Metric': 'False Negatives', 'Value': fn},
    {'Metric': 'True Positives', 'Value': tp},
    {'Metric': 'AUC-ROC', 'Value': auc_roc}
]

# Add per-class precision / recall / F1; the aggregate rows of the report are skipped
for label, metrics in class_report.items():
    if label not in ['accuracy', 'macro avg', 'weighted avg']:
        metrics_list.append({'Metric': f'Precision ({label})', 'Value': metrics['precision']})
        metrics_list.append({'Metric': f'Recall ({label})', 'Value': metrics['recall']})
        metrics_list.append({'Metric': f'F1-Score ({label})', 'Value': metrics['f1-score']})

metrics_df = pd.DataFrame(metrics_list)
metrics_df.to_csv('Models/random_forest_metrics.csv', index=False)

# ROC curve data
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_data_df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr,
    'Thresholds': thresholds
})
roc_data_df.to_csv('Models/random_forest_roc_data.csv', index=False)

# Predict the labels for the unlabeled dataset
rf_y_unlabeled_pred = best_model.predict(X_unlabeled_scaled)

# Count the number of 0s and 1s in the predictions
num_zeros = np.sum(rf_y_unlabeled_pred == 0)
num_ones = np.sum(rf_y_unlabeled_pred == 1)

print(f"Number of 0s (normal): {num_zeros}")
print(f"Number of 1s (attack): {num_ones}")

# Per-platform breakdown of the predictions ('prediction' is overwritten by each model's cell)
unlabeled['prediction'] = rf_y_unlabeled_pred
platform_prediction_counts = unlabeled.groupby(['platform', 'prediction']).size().unstack(fill_value=0)
platform_prediction_counts.to_csv('Models/random_forest_unlabeled_platform_prediction_counts.csv')

# Keep this model's predictions for the cross-model comparison
unlabeled_predictions['Random_Forest'] = rf_y_unlabeled_pred

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loaded existing Random Forest model.
bootstrap: True
ccp_alpha: 0.0
class_weight: None
criterion: gini
max_depth: 18
max_features: sqrt
max_leaf_nodes: None
max_samples: None
min_impurity_decrease: 0.0
min_samples_leaf: 1
min_samples_split: 4
min_weight_fraction_leaf: 0.0
monotonic_cst: None
n_estimators: 117
n_jobs: None
oob_score: False
random_state: 42
verbose: 0
warm_start: False
Number of 0s (normal): 174492
Number of 1s (attack): 58175


# K-Nearest Neighbors

In [4]:
# Define the objective function for Optuna
def knn_objective(trial):
    """Optuna objective: mean F1 of a k-NN classifier under 10-fold stratified CV on the training split."""
    # Hyper-parameters are requested in a fixed order: n_neighbors, then weights
    candidate = KNeighborsClassifier(
        n_neighbors=trial.suggest_int('n_neighbors', 1, 50),
        weights=trial.suggest_categorical('weights', ['uniform', 'distance']),
    )

    # Shuffled, seeded stratified folds so every trial is scored on the same partitions
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=folds, n_jobs=-1, scoring='f1')
    return fold_scores.mean()

# Make sure the output directory exists up front; otherwise joblib.dump / to_csv
# would fail only after the (long) Optuna search has finished.
os.makedirs('Models', exist_ok=True)

# Reuse the cached estimator when one exists, otherwise tune a new one
if os.path.exists('Models/knn_model.pkl'):
    # NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
    best_model = joblib.load('Models/knn_model.pkl')
    print("Loaded existing K-Nearest Neighbors model.")
    best_params = best_model.get_params()
    for param, value in best_params.items():
        print(f"{param}: {value}")
    # Always refit on the current training split: pickles written by earlier
    # versions of this cell were dumped before fit() and hold an unfitted estimator.
    best_model.fit(X_train, y_train)
else:
    # Seed the sampler so the hyper-parameter search is reproducible
    study = optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(knn_objective, n_trials=50)

    # Best parameters
    best_params = study.best_params
    best_model = KNeighborsClassifier(**best_params)
    # Fit BEFORE persisting so the pickle really contains a trained model
    best_model.fit(X_train, y_train)
    joblib.dump(best_model, 'Models/knn_model.pkl')
    print("Trained and saved new K-Nearest Neighbors model.")

# Evaluate on the held-out test split
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Confusion matrix, AUC-ROC and per-class report
conf_matrix = confusion_matrix(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_proba)
class_report = classification_report(y_test, y_pred, output_dict=True)

# Precision-recall curve (the thresholds are not exported)
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_data_df = pd.DataFrame({
    'Recall': recall,
    'Precision': precision
})
pr_data_df.to_csv('Models/knn_precision_recall_data.csv', index=False)

# For binary labels ravel() yields the cells in the order tn, fp, fn, tp
tn, fp, fn, tp = conf_matrix.ravel()

metrics_list = [
    {'Metric': 'True Negatives', 'Value': tn},
    {'Metric': 'False Positives', 'Value': fp},
    {'Metric': 'False Negatives', 'Value': fn},
    {'Metric': 'True Positives', 'Value': tp},
    {'Metric': 'AUC-ROC', 'Value': auc_roc}
]

# Add per-class precision / recall / F1; the aggregate rows of the report are skipped
for label, metrics in class_report.items():
    if label not in ['accuracy', 'macro avg', 'weighted avg']:
        metrics_list.append({'Metric': f'Precision ({label})', 'Value': metrics['precision']})
        metrics_list.append({'Metric': f'Recall ({label})', 'Value': metrics['recall']})
        metrics_list.append({'Metric': f'F1-Score ({label})', 'Value': metrics['f1-score']})

metrics_df = pd.DataFrame(metrics_list)
metrics_df.to_csv('Models/knn_metrics.csv', index=False)

# ROC curve data
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_data_df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr,
    'Thresholds': thresholds
})
roc_data_df.to_csv('Models/knn_roc_data.csv', index=False)

# Predict the labels for the unlabeled dataset
knn_y_unlabeled_pred = best_model.predict(X_unlabeled_scaled)

# Count the number of 0s and 1s in the predictions
num_zeros = np.sum(knn_y_unlabeled_pred == 0)
num_ones = np.sum(knn_y_unlabeled_pred == 1)

print(f"Number of 0s (normal): {num_zeros}")
print(f"Number of 1s (attack): {num_ones}")

# Per-platform breakdown of the predictions ('prediction' is overwritten by each model's cell)
unlabeled['prediction'] = knn_y_unlabeled_pred
platform_prediction_counts = unlabeled.groupby(['platform', 'prediction']).size().unstack(fill_value=0)
platform_prediction_counts.to_csv('Models/knn_unlabeled_platform_prediction_counts.csv')

# Keep this model's predictions for the cross-model comparison
unlabeled_predictions['KNN'] = knn_y_unlabeled_pred

Loaded existing K-Nearest Neighbors model.
algorithm: auto
leaf_size: 30
metric: minkowski
metric_params: None
n_jobs: None
n_neighbors: 5
p: 2
weights: distance


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Number of 0s (normal): 206391
Number of 1s (attack): 26276


# Decision Tree Classifier

In [5]:
# Define the objective function for Optuna
def dt_objective(trial):
    """Optuna objective: mean F1 of a decision tree under 10-fold stratified CV on the training split."""
    # Hyper-parameters are requested in a fixed order: max_depth, then min_samples_split
    candidate = DecisionTreeClassifier(
        max_depth=trial.suggest_int('max_depth', 1, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        random_state=42,
    )

    # Shuffled, seeded stratified folds so every trial is scored on the same partitions
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=folds, n_jobs=-1, scoring='f1')
    return fold_scores.mean()

# Make sure the output directory exists up front; otherwise joblib.dump / to_csv
# would fail only after the (long) Optuna search has finished.
os.makedirs('Models', exist_ok=True)

# Reuse the cached estimator when one exists, otherwise tune a new one
if os.path.exists('Models/decision_tree_model.pkl'):
    # NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
    best_model = joblib.load('Models/decision_tree_model.pkl')
    print("Loaded existing Decision Tree model.")
    best_params = best_model.get_params()
    for param, value in best_params.items():
        print(f"{param}: {value}")
    # Always refit on the current training split: pickles written by earlier
    # versions of this cell were dumped before fit() and hold an unfitted estimator.
    best_model.fit(X_train, y_train)
else:
    # Seed the sampler so the hyper-parameter search is reproducible
    study = optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(dt_objective, n_trials=50)

    # Best parameters
    best_params = study.best_params
    best_model = DecisionTreeClassifier(**best_params, random_state=42)
    # Fit BEFORE persisting so the pickle really contains a trained model
    best_model.fit(X_train, y_train)
    joblib.dump(best_model, 'Models/decision_tree_model.pkl')
    print("Trained and saved new Decision Tree model.")

# Evaluate on the held-out test split
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Confusion matrix, AUC-ROC and per-class report
conf_matrix = confusion_matrix(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_proba)
class_report = classification_report(y_test, y_pred, output_dict=True)

# Precision-recall curve (the thresholds are not exported)
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_data_df = pd.DataFrame({
    'Recall': recall,
    'Precision': precision
})
pr_data_df.to_csv('Models/decision_tree_precision_recall_data.csv', index=False)

# For binary labels ravel() yields the cells in the order tn, fp, fn, tp
tn, fp, fn, tp = conf_matrix.ravel()

metrics_list = [
    {'Metric': 'True Negatives', 'Value': tn},
    {'Metric': 'False Positives', 'Value': fp},
    {'Metric': 'False Negatives', 'Value': fn},
    {'Metric': 'True Positives', 'Value': tp},
    {'Metric': 'AUC-ROC', 'Value': auc_roc}
]

# Add per-class precision / recall / F1; the aggregate rows of the report are skipped
for label, metrics in class_report.items():
    if label not in ['accuracy', 'macro avg', 'weighted avg']:
        metrics_list.append({'Metric': f'Precision ({label})', 'Value': metrics['precision']})
        metrics_list.append({'Metric': f'Recall ({label})', 'Value': metrics['recall']})
        metrics_list.append({'Metric': f'F1-Score ({label})', 'Value': metrics['f1-score']})

metrics_df = pd.DataFrame(metrics_list)
metrics_df.to_csv('Models/decision_tree_metrics.csv', index=False)

# ROC curve data
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_data_df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr,
    'Thresholds': thresholds
})
roc_data_df.to_csv('Models/decision_tree_roc_data.csv', index=False)

# Predict the labels for the unlabeled dataset
dt_y_unlabeled_pred = best_model.predict(X_unlabeled_scaled)

# Count the number of 0s and 1s in the predictions
num_zeros = np.sum(dt_y_unlabeled_pred == 0)
num_ones = np.sum(dt_y_unlabeled_pred == 1)

print(f"Number of 0s (normal): {num_zeros}")
print(f"Number of 1s (attack): {num_ones}")

# Per-platform breakdown of the predictions ('prediction' is overwritten by each model's cell)
unlabeled['prediction'] = dt_y_unlabeled_pred
platform_prediction_counts = unlabeled.groupby(['platform', 'prediction']).size().unstack(fill_value=0)
platform_prediction_counts.to_csv('Models/decision_tree_unlabeled_platform_prediction_counts.csv')

# Keep this model's predictions for the cross-model comparison
unlabeled_predictions['Decision_Tree'] = dt_y_unlabeled_pred

Loaded existing Decision Tree model.
ccp_alpha: 0.0
class_weight: None
criterion: gini
max_depth: 13
max_features: None
max_leaf_nodes: None
min_impurity_decrease: 0.0
min_samples_leaf: 1
min_samples_split: 9
min_weight_fraction_leaf: 0.0
monotonic_cst: None
random_state: 42
splitter: best


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Number of 0s (normal): 177768
Number of 1s (attack): 54899


# Gradient Boosting Classifier

In [6]:
# Define the objective function for Optuna
def gb_objective(trial):
    """Optuna objective: mean F1 of a gradient-boosting classifier under 10-fold stratified CV on the training split."""
    # Hyper-parameters are requested in a fixed order: n_estimators, then max_depth
    candidate = GradientBoostingClassifier(
        n_estimators=trial.suggest_int('n_estimators', 10, 200),
        max_depth=trial.suggest_int('max_depth', 1, 20),
        random_state=42,
    )

    # Shuffled, seeded stratified folds so every trial is scored on the same partitions
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=folds, n_jobs=-1, scoring='f1')
    return fold_scores.mean()

# Make sure the output directory exists up front; otherwise joblib.dump / to_csv
# would fail only after the (long) Optuna search has finished.
os.makedirs('Models', exist_ok=True)

# Reuse the cached estimator when one exists, otherwise tune a new one
if os.path.exists('Models/gradient_boosting_model.pkl'):
    # NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
    best_model = joblib.load('Models/gradient_boosting_model.pkl')
    print("Loaded existing Gradient Boosting model.")
    best_params = best_model.get_params()
    for param, value in best_params.items():
        print(f"{param}: {value}")
    # Always refit on the current training split: pickles written by earlier
    # versions of this cell were dumped before fit() and hold an unfitted estimator.
    best_model.fit(X_train, y_train)
else:
    # Seed the sampler so the hyper-parameter search is reproducible
    study = optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(gb_objective, n_trials=50)

    # Best parameters
    best_params = study.best_params
    best_model = GradientBoostingClassifier(**best_params, random_state=42)
    # Fit BEFORE persisting so the pickle really contains a trained model
    best_model.fit(X_train, y_train)
    joblib.dump(best_model, 'Models/gradient_boosting_model.pkl')
    print("Trained and saved new Gradient Boosting model.")

# Evaluate on the held-out test split
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Confusion matrix, AUC-ROC and per-class report
conf_matrix = confusion_matrix(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_proba)
class_report = classification_report(y_test, y_pred, output_dict=True)

# Precision-recall curve (the thresholds are not exported)
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_data_df = pd.DataFrame({
    'Recall': recall,
    'Precision': precision
})
pr_data_df.to_csv('Models/gradient_boosting_precision_recall_data.csv', index=False)

# For binary labels ravel() yields the cells in the order tn, fp, fn, tp
tn, fp, fn, tp = conf_matrix.ravel()

metrics_list = [
    {'Metric': 'True Negatives', 'Value': tn},
    {'Metric': 'False Positives', 'Value': fp},
    {'Metric': 'False Negatives', 'Value': fn},
    {'Metric': 'True Positives', 'Value': tp},
    {'Metric': 'AUC-ROC', 'Value': auc_roc}
]

# Add per-class precision / recall / F1; the aggregate rows of the report are skipped
for label, metrics in class_report.items():
    if label not in ['accuracy', 'macro avg', 'weighted avg']:
        metrics_list.append({'Metric': f'Precision ({label})', 'Value': metrics['precision']})
        metrics_list.append({'Metric': f'Recall ({label})', 'Value': metrics['recall']})
        metrics_list.append({'Metric': f'F1-Score ({label})', 'Value': metrics['f1-score']})

metrics_df = pd.DataFrame(metrics_list)
metrics_df.to_csv('Models/gradient_boosting_metrics.csv', index=False)

# ROC curve data
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_data_df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr,
    'Thresholds': thresholds
})
roc_data_df.to_csv('Models/gradient_boosting_roc_data.csv', index=False)

# Predict the labels for the unlabeled dataset
gb_y_unlabeled_pred = best_model.predict(X_unlabeled_scaled)

# Count the number of 0s and 1s in the predictions
num_zeros = np.sum(gb_y_unlabeled_pred == 0)
num_ones = np.sum(gb_y_unlabeled_pred == 1)

print(f"Number of 0s (normal): {num_zeros}")
print(f"Number of 1s (attack): {num_ones}")

# Per-platform breakdown of the predictions ('prediction' is overwritten by each model's cell)
unlabeled['prediction'] = gb_y_unlabeled_pred
platform_prediction_counts = unlabeled.groupby(['platform', 'prediction']).size().unstack(fill_value=0)
platform_prediction_counts.to_csv('Models/gradient_boosting_unlabeled_platform_prediction_counts.csv')

# Keep this model's predictions for the cross-model comparison
unlabeled_predictions['Gradient_Boosting'] = gb_y_unlabeled_pred

Loaded existing Gradient Boosting model.
ccp_alpha: 0.0
criterion: friedman_mse
init: None
learning_rate: 0.1
loss: log_loss
max_depth: 8
max_features: None
max_leaf_nodes: None
min_impurity_decrease: 0.0
min_samples_leaf: 1
min_samples_split: 2
min_weight_fraction_leaf: 0.0
n_estimators: 92
n_iter_no_change: None
random_state: 42
subsample: 1.0
tol: 0.0001
validation_fraction: 0.1
verbose: 0
warm_start: False


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Number of 0s (normal): 171611
Number of 1s (attack): 61056


# AdaBoost Classifier

In [7]:
# Define the objective function for Optuna
def ab_objective(trial):
    """Optuna objective: mean F1 of a SAMME AdaBoost classifier under 10-fold stratified CV on the training split."""
    # Only the number of boosting rounds is tuned
    candidate = AdaBoostClassifier(
        n_estimators=trial.suggest_int('n_estimators', 10, 400),
        algorithm='SAMME',
        random_state=42,
    )

    # Shuffled, seeded stratified folds so every trial is scored on the same partitions
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=folds, n_jobs=-1, scoring='f1')
    return fold_scores.mean()

# Make sure the output directory exists up front; otherwise joblib.dump / to_csv
# would fail only after the (long) Optuna search has finished.
os.makedirs('Models', exist_ok=True)

# Reuse the cached estimator when one exists, otherwise tune a new one
if os.path.exists('Models/adaboost_model.pkl'):
    # NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
    best_model = joblib.load('Models/adaboost_model.pkl')
    print("Loaded existing AdaBoost model.")
    best_params = best_model.get_params()
    for param, value in best_params.items():
        print(f"{param}: {value}")
    # Always refit on the current training split: pickles written by earlier
    # versions of this cell were dumped before fit() and hold an unfitted estimator.
    best_model.fit(X_train, y_train)
else:
    # Seed the sampler so the hyper-parameter search is reproducible
    study = optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(ab_objective, n_trials=50)

    # Best parameters
    best_params = study.best_params
    best_model = AdaBoostClassifier(**best_params, algorithm='SAMME', random_state=42)
    # Fit BEFORE persisting so the pickle really contains a trained model
    best_model.fit(X_train, y_train)
    joblib.dump(best_model, 'Models/adaboost_model.pkl')
    print("Trained and saved new AdaBoost model.")

# Evaluate on the held-out test split
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Confusion matrix, AUC-ROC and per-class report
conf_matrix = confusion_matrix(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_proba)
class_report = classification_report(y_test, y_pred, output_dict=True)

# Precision-recall curve (the thresholds are not exported)
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_data_df = pd.DataFrame({
    'Recall': recall,
    'Precision': precision
})
pr_data_df.to_csv('Models/adaboost_precision_recall_data.csv', index=False)

# For binary labels ravel() yields the cells in the order tn, fp, fn, tp
tn, fp, fn, tp = conf_matrix.ravel()

metrics_list = [
    {'Metric': 'True Negatives', 'Value': tn},
    {'Metric': 'False Positives', 'Value': fp},
    {'Metric': 'False Negatives', 'Value': fn},
    {'Metric': 'True Positives', 'Value': tp},
    {'Metric': 'AUC-ROC', 'Value': auc_roc}
]

# Add per-class precision / recall / F1; the aggregate rows of the report are skipped
for label, metrics in class_report.items():
    if label not in ['accuracy', 'macro avg', 'weighted avg']:
        metrics_list.append({'Metric': f'Precision ({label})', 'Value': metrics['precision']})
        metrics_list.append({'Metric': f'Recall ({label})', 'Value': metrics['recall']})
        metrics_list.append({'Metric': f'F1-Score ({label})', 'Value': metrics['f1-score']})

metrics_df = pd.DataFrame(metrics_list)
metrics_df.to_csv('Models/adaboost_metrics.csv', index=False)

# ROC curve data
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_data_df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr,
    'Thresholds': thresholds
})
roc_data_df.to_csv('Models/adaboost_roc_data.csv', index=False)

# Predict the labels for the unlabeled dataset
ada_y_unlabeled_pred = best_model.predict(X_unlabeled_scaled)

# Count the number of 0s and 1s in the predictions
num_zeros = np.sum(ada_y_unlabeled_pred == 0)
num_ones = np.sum(ada_y_unlabeled_pred == 1)

print(f"Number of 0s (normal): {num_zeros}")
print(f"Number of 1s (attack): {num_ones}")

# Per-platform breakdown of the predictions ('prediction' is overwritten by each model's cell)
unlabeled['prediction'] = ada_y_unlabeled_pred
platform_prediction_counts = unlabeled.groupby(['platform', 'prediction']).size().unstack(fill_value=0)
platform_prediction_counts.to_csv('Models/adaboost_unlabeled_platform_prediction_counts.csv')

# Keep this model's predictions for the cross-model comparison
unlabeled_predictions['AdaBoost'] = ada_y_unlabeled_pred

Loaded existing AdaBoost model.
algorithm: SAMME
estimator: None
learning_rate: 1.0
n_estimators: 376
random_state: 42


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Number of 0s (normal): 197214
Number of 1s (attack): 35453


# XGBoost Classifier

In [8]:
# Define the objective function for Optuna
def xgb_objective(trial):
    """Optuna objective: mean F1 of an XGBoost classifier under 10-fold stratified CV on the training split."""
    # Hyper-parameters are requested in a fixed order: n_estimators, then max_depth
    candidate = XGBClassifier(
        n_estimators=trial.suggest_int('n_estimators', 10, 200),
        max_depth=trial.suggest_int('max_depth', 1, 20),
        eval_metric='logloss',
        random_state=42,
    )

    # Shuffled, seeded stratified folds so every trial is scored on the same partitions
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=folds, n_jobs=-1, scoring='f1')
    return fold_scores.mean()

# Make sure the output directory exists up front; otherwise joblib.dump / to_csv
# would fail only after the (long) Optuna search has finished.
os.makedirs('Models', exist_ok=True)

# Reuse the cached estimator when one exists, otherwise tune a new one
if os.path.exists('Models/xgboost_model.pkl'):
    # NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
    best_model = joblib.load('Models/xgboost_model.pkl')
    print("Loaded existing XGBoost model.")
    best_params = best_model.get_params()
    for param, value in best_params.items():
        print(f"{param}: {value}")
    # Always refit on the current training split: pickles written by earlier
    # versions of this cell were dumped before fit() and hold an unfitted estimator.
    best_model.fit(X_train, y_train)
else:
    # Seed the sampler so the hyper-parameter search is reproducible
    study = optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(xgb_objective, n_trials=50)

    # Best parameters
    best_params = study.best_params
    best_model = XGBClassifier(**best_params, eval_metric='logloss', random_state=42)
    # Fit BEFORE persisting so the pickle really contains a trained model
    best_model.fit(X_train, y_train)
    joblib.dump(best_model, 'Models/xgboost_model.pkl')
    print("Trained and saved new XGBoost model.")

# Evaluate on the held-out test split
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Confusion matrix, AUC-ROC and per-class report
conf_matrix = confusion_matrix(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_proba)
class_report = classification_report(y_test, y_pred, output_dict=True)

# Precision-recall curve (the thresholds are not exported)
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_data_df = pd.DataFrame({
    'Recall': recall,
    'Precision': precision
})
pr_data_df.to_csv('Models/xgboost_precision_recall_data.csv', index=False)

# For binary labels ravel() yields the cells in the order tn, fp, fn, tp
tn, fp, fn, tp = conf_matrix.ravel()

metrics_list = [
    {'Metric': 'True Negatives', 'Value': tn},
    {'Metric': 'False Positives', 'Value': fp},
    {'Metric': 'False Negatives', 'Value': fn},
    {'Metric': 'True Positives', 'Value': tp},
    {'Metric': 'AUC-ROC', 'Value': auc_roc}
]

# Add per-class precision / recall / F1; the aggregate rows of the report are skipped
for label, metrics in class_report.items():
    if label not in ['accuracy', 'macro avg', 'weighted avg']:
        metrics_list.append({'Metric': f'Precision ({label})', 'Value': metrics['precision']})
        metrics_list.append({'Metric': f'Recall ({label})', 'Value': metrics['recall']})
        metrics_list.append({'Metric': f'F1-Score ({label})', 'Value': metrics['f1-score']})

metrics_df = pd.DataFrame(metrics_list)
metrics_df.to_csv('Models/xgboost_metrics.csv', index=False)

# ROC curve data
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_data_df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr,
    'Thresholds': thresholds
})
roc_data_df.to_csv('Models/xgboost_roc_data.csv', index=False)

# Predict the labels for the unlabeled dataset
xg_y_unlabeled_pred = best_model.predict(X_unlabeled_scaled)

# Count the number of 0s and 1s in the predictions
num_zeros = np.sum(xg_y_unlabeled_pred == 0)
num_ones = np.sum(xg_y_unlabeled_pred == 1)

print(f"Number of 0s (normal): {num_zeros}")
print(f"Number of 1s (attack): {num_ones}")

# Per-platform breakdown of the predictions ('prediction' is overwritten by each model's cell)
unlabeled['prediction'] = xg_y_unlabeled_pred
platform_prediction_counts = unlabeled.groupby(['platform', 'prediction']).size().unstack(fill_value=0)
platform_prediction_counts.to_csv('Models/xgboost_unlabeled_platform_prediction_counts.csv')

# Keep this model's predictions for the cross-model comparison
unlabeled_predictions['XGBoost'] = xg_y_unlabeled_pred

Loaded existing XGBoost model.
objective: binary:logistic
base_score: None
booster: None
callbacks: None
colsample_bylevel: None
colsample_bynode: None
colsample_bytree: None
device: None
early_stopping_rounds: None
enable_categorical: False
eval_metric: logloss
feature_types: None
gamma: None
grow_policy: None
importance_type: None
interaction_constraints: None
learning_rate: None
max_bin: None
max_cat_threshold: None
max_cat_to_onehot: None
max_delta_step: None
max_depth: 12
max_leaves: None
min_child_weight: None
missing: nan
monotone_constraints: None
multi_strategy: None
n_estimators: 24
n_jobs: None
num_parallel_tree: None
random_state: 42
reg_alpha: None
reg_lambda: None
sampling_method: None
scale_pos_weight: None
subsample: None
tree_method: None
validate_parameters: None
verbosity: None
Number of 0s (normal): 160965
Number of 1s (attack): 71702


In [9]:
# Copy every model's raw 0/1 predictions onto the unlabeled frame, one column per model
for model_column in ('Random_Forest', 'KNN', 'Decision_Tree',
                     'Gradient_Boosting', 'AdaBoost', 'XGBoost'):
    unlabeled[model_column] = unlabeled_predictions[model_column]

# Boolean view of the votes; a row is a consensus attack only if every model flagged it
unlabeled_predictions = unlabeled_predictions.astype(bool)
unlabeled['All_Attack'] = unlabeled_predictions.all(axis=1)
unlabeled.to_csv('Models/unlabeled_all_models.csv', index=False)

# Export only the rows unanimously predicted as attacks
all_attack_indices = unlabeled.index[unlabeled['All_Attack']]
all_attack_data = unlabeled.loc[all_attack_indices]
all_attack_data.to_csv('Models/unlabeled_all_attack_data.csv', index=False)