In [1]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier

In [2]:
# Load the datasets
attack_label = pd.read_csv('/local/scratch/exported/MP_Defi_txs_TY_23/guanda/attack_label.csv')
unlabeled = pd.read_csv('/local/scratch/exported/MP_Defi_txs_TY_23/guanda/unlabeled.csv')
unlabeled['flashloan_in_usd'] = unlabeled['flashloan_in_usd'].astype(float)
normal_label = unlabeled[unlabeled['highest_profit_in_usd'] <= 1000]
test_set = unlabeled[unlabeled['highest_profit_in_usd'] > 1000]

# Add a new column for labels: 1 for attack and 0 for normal
attack_label['label'] = 1  # Attack
normal_label['label'] = 0 

# Combine the datasets
combined_df = pd.concat([attack_label, normal_label], ignore_index=True)

# Convert 'flashloan_in_usd' to numeric, coercing errors to NaN
combined_df['flashloan_in_usd'] = pd.to_numeric(combined_df['flashloan_in_usd'], errors='coerce')

# Select features
features = ['from_address_profit', 'to_address_profit', 'highest_profit_in_usd',
            'highest_price_change_ratio', 'path_length', 'num_swap_events', 'flashloan_in_usd']

X = combined_df[features]
y = combined_df['label']

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

  unlabeled = pd.read_csv('/local/scratch/exported/MP_Defi_txs_TY_23/guanda/unlabeled.csv')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal_label['label'] = 0


# Random Forest Classifier

In [6]:
# Define the objective function for Optuna
def rf_objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, 
                                   min_samples_split=min_samples_split, random_state=42)
    
    score = cross_val_score(model, X_scaled, y, n_jobs=-1, cv=10, scoring='f1').mean()
    return score

# Optimize with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(rf_objective, n_trials=50)

# Best parameters
best_params = study.best_params
best_model = RandomForestClassifier(**best_params, random_state=42)

# Fit and evaluate
best_model.fit(X_scaled, y)
y_pred = best_model.predict(X_scaled)
print("Random Forest Classifier")
print(classification_report(y, y_pred))
print(f"Number of 1s (attacks): {np.sum(y_pred == 1)}")
print(f"Number of 0s (normals): {np.sum(y_pred == 0)}")

[I 2024-10-03 15:39:09,306] A new study created in memory with name: no-name-972114a8-e766-4850-b630-a53f23139d1d
[I 2024-10-03 15:39:50,665] Trial 0 finished with value: 0.6272415955720378 and parameters: {'n_estimators': 185, 'max_depth': 19, 'min_samples_split': 2}. Best is trial 0 with value: 0.6272415955720378.
[I 2024-10-03 15:40:09,367] Trial 1 finished with value: 0.6269255882354642 and parameters: {'n_estimators': 81, 'max_depth': 18, 'min_samples_split': 8}. Best is trial 0 with value: 0.6272415955720378.
[I 2024-10-03 15:40:21,018] Trial 2 finished with value: 0.6358822104532931 and parameters: {'n_estimators': 59, 'max_depth': 12, 'min_samples_split': 2}. Best is trial 2 with value: 0.6358822104532931.
[I 2024-10-03 15:40:35,203] Trial 3 finished with value: 0.6205965017869571 and parameters: {'n_estimators': 89, 'max_depth': 9, 'min_samples_split': 7}. Best is trial 2 with value: 0.6358822104532931.
[I 2024-10-03 15:40:44,389] Trial 4 finished with value: 0.207151079765060

Random Forest Classifier
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    281759
           1       0.97      0.65      0.78      4476

    accuracy                           0.99    286235
   macro avg       0.98      0.83      0.89    286235
weighted avg       0.99      0.99      0.99    286235

Number of 1s (attacks): 2994
Number of 0s (normals): 283241


# Logistic Regression

In [5]:
# Define the objective function for Optuna
def lr_objective(trial):
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
    max_iter = trial.suggest_int("max_iter", 100, 5000)
    model = LogisticRegression(solver=solver, max_iter=max_iter, random_state=42)
    
    score = cross_val_score(model, X_scaled, y, n_jobs=-1, cv=10, scoring='f1').mean()
    return score

# Optimize with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(lr_objective, n_trials=50)

# Best parameters
best_params = study.best_params
best_model = LogisticRegression(**best_params, random_state=42)

# Fit and evaluate
best_model.fit(X_scaled, y)
y_pred = best_model.predict(X_scaled)
print("Logistic Regression")
print(classification_report(y, y_pred))
print(f"Number of 1s (attacks): {np.sum(y_pred == 1)}")
print(f"Number of 0s (normals): {np.sum(y_pred == 0)}")

[I 2024-10-03 20:28:01,878] A new study created in memory with name: no-name-774f7017-3c34-41d6-8b72-1e82f5577d22
[I 2024-10-03 20:31:34,935] Trial 0 finished with value: 0.007007337413077824 and parameters: {'solver': 'saga', 'max_iter': 2798}. Best is trial 0 with value: 0.007007337413077824.
[I 2024-10-03 20:31:36,272] Trial 1 finished with value: 0.011813310059420008 and parameters: {'solver': 'liblinear', 'max_iter': 4316}. Best is trial 1 with value: 0.011813310059420008.
[I 2024-10-03 20:31:37,546] Trial 2 finished with value: 0.011813310059420008 and parameters: {'solver': 'liblinear', 'max_iter': 4110}. Best is trial 1 with value: 0.011813310059420008.
[I 2024-10-03 20:31:38,896] Trial 3 finished with value: 0.011813310059420008 and parameters: {'solver': 'liblinear', 'max_iter': 2173}. Best is trial 1 with value: 0.011813310059420008.
[I 2024-10-03 20:34:33,659] Trial 4 finished with value: 0.007007337413077824 and parameters: {'solver': 'saga', 'max_iter': 2360}. Best is tri

Logistic Regression
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    281759
           1       0.09      0.01      0.01      4476

    accuracy                           0.98    286235
   macro avg       0.54      0.50      0.50    286235
weighted avg       0.97      0.98      0.98    286235

Number of 1s (attacks): 277
Number of 0s (normals): 285958


# Support Vector Classifier

In [10]:
# Define the objective function for Optuna
def svc_objective(trial):
    C = trial.suggest_float("C", 1e-5, 1e2, log=True)  # Tune C, the regularization strength
    model = SVC(C=C, kernel='linear', random_state=42)
    
    score = cross_val_score(model, X_scaled, y, n_jobs=-1, cv=10, scoring='f1').mean()
    return score

# Optimize with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(svc_objective, n_trials=50)

# Best parameters
best_params = study.best_params
best_model = SVC(**best_params, kernel='linear', random_state=42)

# Fit and evaluate
best_model.fit(X_scaled, y)
y_pred = best_model.predict(X_scaled)
print("Support Vector Classifier")
print(classification_report(y, y_pred))
print(f"Number of 1s (attacks): {np.sum(y_pred == 1)}")
print(f"Number of 0s (normals): {np.sum(y_pred == 0)}")

[I 2024-10-03 22:56:58,805] A new study created in memory with name: no-name-f8c26383-0033-4f93-9d63-0af5d99df1c5
[W 2024-10-03 23:13:38,883] Trial 0 failed with parameters: {'C': 71.00269330396786} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/user/gzhao/.local/lib/python3.9/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_2107340/2756606407.py", line 6, in svc_objective
    score = cross_val_score(model, X_scaled, y, n_jobs=-1, cv=10, scoring='f1').mean()
  File "/home/user/gzhao/.local/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/user/gzhao/.local/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 712, in cross_val_score
    cv_results = cross_validate(
  File "/home/user/gzhao/.local/lib/python3.9/site-packages/sklearn/utils/_param_validation.py

KeyboardInterrupt: 

# K-Nearest Neighbors

In [15]:
# Define the objective function for Optuna
def knn_objective(trial):
    n_neighbors = trial.suggest_int('n_neighbors', 1, 50)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    
    model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
    
    score = cross_val_score(model, X_scaled, y, n_jobs=-1, cv=10, scoring='f1').mean()
    return score

# Optimize with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(knn_objective, n_trials=50)

# Best parameters
best_params = study.best_params
best_model = KNeighborsClassifier(**best_params)

# Fit and evaluate
best_model.fit(X_scaled, y)
y_pred = best_model.predict(X_scaled)
print("K-Nearest Neighbors")
print(classification_report(y, y_pred))
print(f"Number of 1s (attacks): {np.sum(y_pred == 1)}")
print(f"Number of 0s (normals): {np.sum(y_pred == 0)}")

[I 2024-10-03 16:26:34,229] A new study created in memory with name: no-name-8d78cb29-517d-4ce8-8247-a5afd72c6cbe
[I 2024-10-03 16:26:48,905] Trial 0 finished with value: 0.5068483972642754 and parameters: {'n_neighbors': 46, 'weights': 'distance'}. Best is trial 0 with value: 0.5068483972642754.
[I 2024-10-03 16:26:53,078] Trial 1 finished with value: 0.5294913336377284 and parameters: {'n_neighbors': 12, 'weights': 'uniform'}. Best is trial 1 with value: 0.5294913336377284.
[I 2024-10-03 16:26:58,161] Trial 2 finished with value: 0.45555887083307967 and parameters: {'n_neighbors': 45, 'weights': 'uniform'}. Best is trial 1 with value: 0.5294913336377284.
[I 2024-10-03 16:27:03,054] Trial 3 finished with value: 0.5096373492662829 and parameters: {'n_neighbors': 42, 'weights': 'distance'}. Best is trial 1 with value: 0.5294913336377284.
[I 2024-10-03 16:27:06,074] Trial 4 finished with value: 0.5371189617285504 and parameters: {'n_neighbors': 1, 'weights': 'uniform'}. Best is trial 4 w

K-Nearest Neighbors
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    281759
           1       0.92      0.76      0.83      4476

    accuracy                           1.00    286235
   macro avg       0.96      0.88      0.91    286235
weighted avg       0.99      1.00      0.99    286235

Number of 1s (attacks): 3702
Number of 0s (normals): 282533


# Decision Tree Classifier

In [16]:
# Define the objective function for Optuna
def dt_objective(trial):
    max_depth = trial.suggest_int('max_depth', 1, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    
    model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)
    
    score = cross_val_score(model, X_scaled, y, n_jobs=-1, cv=10, scoring='f1').mean()
    return score

# Optimize with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(dt_objective, n_trials=50)

# Best parameters
best_params = study.best_params
best_model = DecisionTreeClassifier(**best_params, random_state=42)

# Fit and evaluate
best_model.fit(X_scaled, y)
y_pred = best_model.predict(X_scaled)
print("Decision Tree Classifier")
print(classification_report(y, y_pred))
print(f"Number of 1s (attacks): {np.sum(y_pred == 1)}")
print(f"Number of 0s (normals): {np.sum(y_pred == 0)}")

[I 2024-10-03 16:30:30,883] A new study created in memory with name: no-name-43369257-3eb3-44b2-aeab-71c89b8c4579
[I 2024-10-03 16:30:32,203] Trial 0 finished with value: 0.5975301612219843 and parameters: {'max_depth': 13, 'min_samples_split': 6}. Best is trial 0 with value: 0.5975301612219843.
[I 2024-10-03 16:30:33,125] Trial 1 finished with value: 0.4627810240508392 and parameters: {'max_depth': 4, 'min_samples_split': 10}. Best is trial 0 with value: 0.5975301612219843.
[I 2024-10-03 16:30:34,308] Trial 2 finished with value: 0.5902268524103327 and parameters: {'max_depth': 12, 'min_samples_split': 6}. Best is trial 0 with value: 0.5975301612219843.
[I 2024-10-03 16:30:35,613] Trial 3 finished with value: 0.5654927710720139 and parameters: {'max_depth': 20, 'min_samples_split': 7}. Best is trial 0 with value: 0.5975301612219843.
[I 2024-10-03 16:30:36,799] Trial 4 finished with value: 0.5883440583155423 and parameters: {'max_depth': 15, 'min_samples_split': 4}. Best is trial 0 wit

Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    281759
           1       0.91      0.63      0.74      4476

    accuracy                           0.99    286235
   macro avg       0.95      0.82      0.87    286235
weighted avg       0.99      0.99      0.99    286235

Number of 1s (attacks): 3126
Number of 0s (normals): 283109


# Gradient Boosting Classifier

In [4]:
# Define the objective function for Optuna
def gb_objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    
    model = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    
    score = cross_val_score(model, X_scaled, y, n_jobs=-1, cv=10, scoring='f1').mean()
    return score

# Optimize with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(gb_objective, n_trials=50)

# Best parameters
best_params = study.best_params
best_model = GradientBoostingClassifier(**best_params, random_state=42)

# Fit and evaluate
best_model.fit(X_scaled, y)
y_pred = best_model.predict(X_scaled)
print("Gradient Boosting Classifier")
print(classification_report(y, y_pred))
print(f"Number of 1s (attacks): {np.sum(y_pred == 1)}")
print(f"Number of 0s (normals): {np.sum(y_pred == 0)}")

[I 2024-10-03 17:58:36,144] A new study created in memory with name: no-name-325e1d7c-2cee-4e3e-b824-9a8f60bb55c4
[I 2024-10-03 18:03:34,122] Trial 0 finished with value: 0.5972298483842005 and parameters: {'n_estimators': 158, 'max_depth': 18}. Best is trial 0 with value: 0.5972298483842005.
[I 2024-10-03 18:06:19,835] Trial 1 finished with value: 0.5965094572595899 and parameters: {'n_estimators': 118, 'max_depth': 16}. Best is trial 0 with value: 0.5972298483842005.
[I 2024-10-03 18:07:05,329] Trial 2 finished with value: 0.6071638498166216 and parameters: {'n_estimators': 48, 'max_depth': 11}. Best is trial 2 with value: 0.6071638498166216.
[I 2024-10-03 18:12:02,311] Trial 3 finished with value: 0.593786293583489 and parameters: {'n_estimators': 180, 'max_depth': 17}. Best is trial 2 with value: 0.6071638498166216.
[I 2024-10-03 18:12:25,218] Trial 4 finished with value: 0.6302161645924254 and parameters: {'n_estimators': 92, 'max_depth': 2}. Best is trial 4 with value: 0.63021616

Gradient Boosting Classifier
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    281759
           1       0.94      0.68      0.79      4476

    accuracy                           0.99    286235
   macro avg       0.97      0.84      0.89    286235
weighted avg       0.99      0.99      0.99    286235

Number of 1s (attacks): 3255
Number of 0s (normals): 282980


# AdaBoost Classifier

In [4]:
# Define the objective function for Optuna
def ab_objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 400)
    model = AdaBoostClassifier(n_estimators=n_estimators, algorithm='SAMME', random_state=42)
    
    score = cross_val_score(model, X_scaled, y, n_jobs=-1, cv=10, scoring='f1').mean()
    return score

# Optimize with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(ab_objective, n_trials=50)

# Best parameters
best_params = study.best_params
best_model = AdaBoostClassifier(**best_params, algorithm='SAMME', random_state=42)

# Fit and evaluate
best_model.fit(X_scaled, y)
y_pred = best_model.predict(X_scaled)
print("AdaBoost Classifier")
print(classification_report(y, y_pred))
print(f"Number of 1s (attacks): {np.sum(y_pred == 1)}")
print(f"Number of 0s (normals): {np.sum(y_pred == 0)}")

[I 2024-10-03 19:41:45,376] A new study created in memory with name: no-name-0b603743-e5b4-4213-8f8c-d8a102ba96c4
[I 2024-10-03 19:42:07,700] Trial 0 finished with value: 0.36430585263464604 and parameters: {'n_estimators': 104}. Best is trial 0 with value: 0.36430585263464604.
[I 2024-10-03 19:42:33,854] Trial 1 finished with value: 0.4171036801845848 and parameters: {'n_estimators': 217}. Best is trial 1 with value: 0.4171036801845848.
[I 2024-10-03 19:43:15,083] Trial 2 finished with value: 0.5228784369904179 and parameters: {'n_estimators': 351}. Best is trial 2 with value: 0.5228784369904179.
[I 2024-10-03 19:43:35,157] Trial 3 finished with value: 0.3713510278460245 and parameters: {'n_estimators': 163}. Best is trial 2 with value: 0.5228784369904179.
[I 2024-10-03 19:43:41,531] Trial 4 finished with value: 0.2535512120278086 and parameters: {'n_estimators': 48}. Best is trial 2 with value: 0.5228784369904179.
[I 2024-10-03 19:44:13,315] Trial 5 finished with value: 0.51455971201

AdaBoost Classifier
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    281759
           1       0.75      0.47      0.58      4476

    accuracy                           0.99    286235
   macro avg       0.87      0.73      0.79    286235
weighted avg       0.99      0.99      0.99    286235

Number of 1s (attacks): 2800
Number of 0s (normals): 283435


# XGBoost Classifier

In [9]:
# Define the objective function for Optuna
def xgb_objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    
    model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, eval_metric='logloss', random_state=42)
    
    score = cross_val_score(model, X_scaled, y, n_jobs=-1, cv=10, scoring='f1').mean()
    return score

# Optimize with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(xgb_objective, n_trials=50)

# Best parameters
best_params = study.best_params
best_model = XGBClassifier(**best_params, eval_metric='logloss', random_state=42)

# Fit and evaluate
best_model.fit(X_scaled, y)
y_pred = best_model.predict(X_scaled)
print("XGBoost Classifier")
print(classification_report(y, y_pred))
print(f"Number of 1s (attacks): {np.sum(y_pred == 1)}")
print(f"Number of 0s (normals): {np.sum(y_pred == 0)}")

[I 2024-10-03 19:27:04,153] A new study created in memory with name: no-name-c9bdbe6b-b334-4fed-bcca-a08b3289ec34
[I 2024-10-03 19:27:18,996] Trial 0 finished with value: 0.5912829892948225 and parameters: {'n_estimators': 162, 'max_depth': 16}. Best is trial 0 with value: 0.5912829892948225.
[I 2024-10-03 19:27:22,913] Trial 1 finished with value: 0.5917694729351379 and parameters: {'n_estimators': 151, 'max_depth': 17}. Best is trial 1 with value: 0.5917694729351379.
[I 2024-10-03 19:27:25,335] Trial 2 finished with value: 0.5978019001638423 and parameters: {'n_estimators': 90, 'max_depth': 13}. Best is trial 2 with value: 0.5978019001638423.
[I 2024-10-03 19:27:27,184] Trial 3 finished with value: 0.6083715023357927 and parameters: {'n_estimators': 92, 'max_depth': 6}. Best is trial 3 with value: 0.6083715023357927.
[I 2024-10-03 19:27:28,696] Trial 4 finished with value: 0.6070936195229233 and parameters: {'n_estimators': 31, 'max_depth': 11}. Best is trial 3 with value: 0.60837150

XGBoost Classifier
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    281759
           1       0.92      0.68      0.78      4476

    accuracy                           0.99    286235
   macro avg       0.96      0.84      0.89    286235
weighted avg       0.99      0.99      0.99    286235

Number of 1s (attacks): 3306
Number of 0s (normals): 282929
