In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler
from sklearn.pipeline import Pipeline
from utils import var_setup
import xgboost as xgb
import lightgbm as lgb


In [None]:
# Directory of per-fold training CSVs read by random_forest().
# NOTE(review): the "smoted" directory name suggests SMOTE-resampled data -- confirm.
TRAIN_DATA_PATH = "./dataset/100tx/smoted/"
# Directory of per-fold CSVs used for evaluation.
TEST_DATA_PATH = "./dataset/100tx/fold/"
# Destination CSV for the per-fold performance table.
PERFORMANCE_PATH = "./data/100tx/performance.csv"
# Destination CSV for the feature-importance table.
FEATURE_IMPORTANCE_PATH = "./data/100tx/feature_importance.csv"
# Directory that receives one "<fold>.csv" prediction file per evaluated fold.
PREDICTION_PATH = "./data/100tx/prediction/"

In [None]:
def remove_list_preserve_sequence(list, to_remove):
    """Return the items of ``list`` that are not in ``to_remove``, keeping order.

    Args:
        list: iterable of items to filter. The parameter shadows the built-in
            ``list``; the name is kept so keyword callers keep working.
        to_remove: container or iterable of items to drop.

    Returns:
        A new Python list with the surviving items in their original order.
        Duplicates in ``list`` are kept unless they appear in ``to_remove``.
    """
    from collections.abc import Iterator

    # A one-shot iterator (generator, ``iter(...)``, ``map`` ...) would be
    # drained by the first ``in`` test, after which every later item would be
    # reported as "not in to_remove". Materialise it once up front.
    if isinstance(to_remove, Iterator):
        to_remove = tuple(to_remove)

    return [item for item in list if item not in to_remove]

In [None]:
# One fold file is loaded here so that its header can supply the column names.
DATA_PATH = "./dataset/real-time/fold/1.csv"
df = pd.read_csv(DATA_PATH)
columns = df.columns.tolist()

# Every column except the identifier/timestamp/hash ones; "is_sus" is kept.
feature_with_sus = remove_list_preserve_sequence(
    columns,
    [
        "transaction_hash",
        "from_address",
        "to_address",
        "block_timestamp",
        "add_feat_hash",
    ],
)

# Same list with the "is_sus" column dropped as well.
feature = remove_list_preserve_sequence(feature_with_sus, ["is_sus"])

In [None]:
def var():
    """Build the dictionary of feature-name lists derived from the dataset columns.

    Reads the module-level ``feature`` and ``feature_with_sus`` lists and
    derives the filtered variants described below.

    Returns:
        dict: maps a list name to a list of column names, with these keys:
            "feature_with_sus", "feature": the module-level lists, as-is.
            "target_feature", "target_feature_with_sus": the two lists above
                with the columns in ``excluded`` removed.
            "z_score_feature": members of "target_feature" whose name starts
                with "z_".
            "static_feature": "target_feature" without the z-score members.
            "static_feature_with_sus": "feature_with_sus" without the z-score
                members.
            "best_feature": an empty list.
    """
    # Columns dropped from the target feature lists.
    excluded = [
        "sender_main_active_days",
        "contract_lifetime_block",
        "contract_lifetime_days",
        "sender_lifetime_days",
        "sender_lifetime_block",
    ]

    target = remove_list_preserve_sequence(feature, excluded)
    target_with_sus = remove_list_preserve_sequence(feature_with_sus, excluded)

    z_score = [name for name in target if name.startswith("z_")]
    static = remove_list_preserve_sequence(target, z_score)

    # NOTE(review): this starts from the unfiltered `feature_with_sus`, so the
    # columns in `excluded` are still present here, unlike in "static_feature".
    # Confirm whether `target_with_sus` was intended instead.
    static_with_sus = remove_list_preserve_sequence(feature_with_sus, z_score)

    return {
        "feature_with_sus": feature_with_sus,
        "feature": feature,
        "target_feature": target,
        "target_feature_with_sus": target_with_sus,
        "z_score_feature": z_score,
        "static_feature": static,
        "static_feature_with_sus": static_with_sus,
        "best_feature": [],
    }

def random_forest(fold_index=0):
    """Fit the scaling + classifier pipeline on every training fold except one.

    Note that, despite the function name, the estimator fitted as the final
    pipeline step is an ``xgb.XGBClassifier``.

    The feature lists come from ``var_setup.var()``: the "static_feature"
    columns are min-max scaled, the "z_score_feature" columns are passed
    through unchanged. The target is the ``is_sus`` column cast to ``int``.

    Args:
        fold_index: zero-based position (0-4) of the fold file to leave out of
            the training data. The default of 0 leaves out "1.csv", which is
            what a call without arguments has always done.

    Returns:
        sklearn.pipeline.Pipeline: the fitted pipeline, with steps
        "col_trans" (ColumnTransformer) and "model" (XGBClassifier).

    Raises:
        ValueError: if ``fold_index`` does not address one of the fold files.
    """
    fold_files = ["1.csv", "2.csv", "3.csv", "4.csv", "5.csv"]
    if not 0 <= fold_index < len(fold_files):
        raise ValueError(
            f"fold_index must be in [0, {len(fold_files) - 1}], got {fold_index!r}"
        )

    feature_sets = var_setup.var()
    static_feature = feature_sets["static_feature"]
    z_features = feature_sets["z_score_feature"]
    features = static_feature + z_features

    # Read each training fold once and concatenate in a single call, instead
    # of growing a frame by repeated concat onto an empty DataFrame.
    train_data = pd.concat(
        [
            pd.read_csv(TRAIN_DATA_PATH + file_name)
            for position, file_name in enumerate(fold_files)
            if position != fold_index
        ],
        ignore_index=True,
    )
    X = train_data[features]
    y = train_data.is_sus.astype(int)

    num_pipeline = Pipeline(
        steps=[
            ("scale", MinMaxScaler()),
        ]
    )
    col_trans = ColumnTransformer(
        transformers=[
            ("num_pipeline", num_pipeline, static_feature),
            ("passthrough", "passthrough", z_features),
        ],
        n_jobs=1,
    )

    # `min_samples_leaf` is a scikit-learn tree option and is not an XGBoost
    # parameter, so it is not passed here.
    # NOTE(review): if a minimum leaf size is wanted, XGBoost's
    # `min_child_weight` is the closest knob but is not equivalent -- confirm.
    xgb_model = xgb.XGBClassifier(
        n_estimators=5,
        max_depth=5,
        random_state=42,
        n_jobs=1,
    )

    model_pipeline = Pipeline(
        steps=[("col_trans", col_trans), ("model", xgb_model)]
    )
    model_pipeline.fit(X, y)

    return model_pipeline
    

def test(df, model, fold_index=0):
    """Score a fitted model on one held-out fold and report the results.

    Predicts on the feature columns of ``df`` (the "static_feature" and
    "z_score_feature" lists from ``var_setup.var()``), compares against the
    ``is_sus`` column, writes the rows together with both label columns to
    ``PREDICTION_PATH + "<fold_index + 1>.csv"``, prints a summary, and
    returns the metrics so the caller can collect them across folds.

    Args:
        df: DataFrame holding the fold to evaluate; it must contain the
            feature columns and an ``is_sus`` column.
        model: fitted estimator or pipeline exposing ``predict``.
        fold_index: zero-based fold number, used only for the printed banner,
            the prediction file name and the "Fold" entry of the result.

    Returns:
        dict: one performance row with the keys "Fold", "Total Values",
        "Matching", "Matching Percentage", "Actual Attack", "Predicted",
        "True Positive", "False Positive", "False Negative" and "F1 Score".

    Raises:
        ValueError: if ``df`` has no rows.
    """
    total_samples = len(df)
    if total_samples == 0:
        # Avoids a division by zero in the matching percentage below.
        raise ValueError("cannot evaluate on an empty test fold")

    feature_sets = var_setup.var()
    features = feature_sets["static_feature"] + feature_sets["z_score_feature"]

    test_y = df["is_sus"].astype(int).to_numpy()
    predict_y = np.asarray(model.predict(df[features]))

    # Plain arrays are assigned positionally, so the label columns line up
    # with the rows of `df` whatever its index looks like.
    fold_data_df = df.assign(
        **{"True Labels": test_y, "Predicted Labels": predict_y}
    )
    fold_data_df.to_csv(PREDICTION_PATH + f"{fold_index+1}.csv", index=False)

    matching_values = int(np.sum(test_y == predict_y))
    percentage_matching = (matching_values / total_samples) * 100
    actual_attack = int(np.sum(test_y == 1))
    predicted = int(np.sum(predict_y == 1))
    true_positive = int(np.sum((test_y == 1) & (predict_y == 1)))
    false_positive = int(np.sum((test_y == 0) & (predict_y == 1)))
    false_negative = int(np.sum((test_y == 1) & (predict_y == 0)))
    f1 = f1_score(test_y, predict_y)

    print(f"************** Fold {fold_index+1} **************")
    print("--- Overall Performance ---")
    print("Total Values :", total_samples)
    print("Matching :", matching_values)
    print("Matching percentage :", percentage_matching)
    print("--- Caught Performance ---")
    print("Actual Attack :", actual_attack)
    print("Predicted :", predicted)
    print("True Positive(attack caught) :", true_positive)
    print("False Positive :", false_positive)
    print("False Negative :", false_negative)
    print("F1 score : ", f1)
    print("\n")

    return {
        "Fold": fold_index,
        "Total Values": total_samples,
        "Matching": matching_values,
        "Matching Percentage": percentage_matching,
        "Actual Attack": actual_attack,
        "Predicted": predicted,
        "True Positive": true_positive,
        "False Positive": false_positive,
        "False Negative": false_negative,
        "F1 Score": f1,
    }

# NOTE(review): these statements run at module level, but no module-level
# definition of `performance_df`, `matching_percentage`, `f1_percentage` or
# `feature_importance_df` is visible in this file, so on a fresh kernel the
# first line raises NameError. Confirm where these results are meant to be
# collected before relying on this section.

# Write the per-fold performance table to disk.
performance_df.to_csv(PERFORMANCE_PATH, index=False)
# Mean of the per-fold matching percentages and of the per-fold F1 scores.
average_result = np.mean(matching_percentage)
average_result2 = np.mean(f1_percentage)

# Both lines print the same label: the first value is the mean matching
# percentage, the second the mean F1 score.
print(f"Average Result Across Folds: {average_result}")
print(f"Average Result Across Folds: {average_result2}")
# Write the feature-importance table to disk.
feature_importance_df.to_csv(FEATURE_IMPORTANCE_PATH, index=False)

In [None]:
# Fit the pipeline; the returned object is not bound to a name here.
random_forest()
# # *
# #     # base_estimator = DecisionTreeClassifier(max_depth=5, min_samples_leaf=50, random_state=42)
# #     # adaboost_model = AdaBoostClassifier(estimator=base_estimator,
# #     #                                 n_estimators=250, # Adjust as needed
# #     #                                 random_state=42)
# #     xgb_model = XGBClassifier(
# #         # objective="binary:logistic",  # Adjust for binary or multiclass classification
# #         # n_estimators=250,  # Number of boosting rounds
# #         # max_depth=7,  # Maximum depth of trees
# #         # learning_rate=0.1,  # Step size shrinkage
# #         # scale_pos_weight=len(y[y == 0]) / len(y[y == 1]),  # Handling imbalance
# #         # use_label_encoder=False,  # Avoid warnings with recent xgboost versions
# #         # random_state=42,
# #     )
# #     gbm_model = LGBMClassifier(
# #         # num_leaves=10,
# #         # n_estimators=250,  # Number of boosting rounds
# #         # max_depth=5,  # Maximum depth of each tree
# #         # learning_rate=0.1,  # Learning rate
# #         # class_weight="balanced",  # Handle class imbalance
# #         # random_state=42,
# #     )