In [1]:
import pandas as pd
import numpy as np
import glob
import os
import openpyxl
from pprint import pprint

In [2]:
TRUE_COL = 'is_sus'
PRED_COL = 'predicted'
CSV_FOLDER = './test_attack_result/'
RESULT_CSV_FOLDER = './test_attack_feature_sim/'
IMPORTANCE_PATH = './feature_importance.csv'

In [3]:
def calculate_similarity(base, add_values):
    differences = [abs(base - add_value) for add_value in add_values]
    return np.mean(differences)


def ranking_feature_sim(df):
    similarity_scores = {}
    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    for col in numeric_cols:
        tp_values = df[(df[TRUE_COL]==1) & (df[PRED_COL]==1)][col]
        first_tp_values = tp_values.iloc[0]
        fp_values = df[(df[TRUE_COL] == 0) & (df[PRED_COL] == 1)][col].tolist()
        similarity_scores[col] = calculate_similarity(first_tp_values, fp_values)

    # Sort the sets based on similarity score (ascending, from most to least similar)
    sorted_similarity = sorted(
        similarity_scores.items(),
        key=lambda x: (
            # 'sender' not in x[0],
            x[1])
    )
    sorted_df = pd.DataFrame(sorted_similarity, columns=['Feature', 'Similarity'])
    # for feature in (sorted_similarity):
    #     print(feature[0])
    return sorted_df


In [4]:
all_similar_df = pd.DataFrame()
importance_df = pd.read_csv(IMPORTANCE_PATH)
features = importance_df['Feature'].to_list()
features = features+['is_sus','predicted','transaction_hash','from_address','to_address','block_timestamp']
# pprint(features)
for file_path in glob.glob(os.path.join(CSV_FOLDER, "*.csv")):
    filename = os.path.basename(file_path) 
    if '0x0' in filename: continue
    # print(filename)
    df = pd.read_csv(file_path)
    pprint(features)
    df = df[features]
    false_pos_tx = df[(df[TRUE_COL] == 0) & (df[PRED_COL] == 1)]['transaction_hash'].tolist()
    true_pos_tx = df[(df[TRUE_COL]==1) & (df[PRED_COL]==1)]['transaction_hash'].iloc[0]
    for tx in false_pos_tx:
        temp_df = df[(df['transaction_hash']==true_pos_tx) | (df['transaction_hash']==tx)]
        sorted_df = ranking_feature_sim(temp_df)
        sorted_features = ['transaction_hash','from_address','to_address','block_timestamp']+sorted_df['Feature'].tolist()
        temp_df = temp_df[sorted_features]
        all_similar_df[filename.replace('.csv',f'_{tx}')] = sorted_df['Feature']
    # Save to CSV
    # df.to_excel(RESULT_CSV_FOLDER+filename.replace('.csv','.xlsx'), index=False)
    # sorted_df.to_excel(RESULT_CSV_FOLDER+'feature_order_'+filename.replace('.csv','.xlsx'), index=False)

all_similar_df.to_excel(RESULT_CSV_FOLDER+'feature_ranking_of_all_contracts_indiv_tx.xlsx',index=False)


['sender_call_contract_tx_ratio',
 'z_trace_involved_amt',
 'trace_involved_amt',
 'sender_call_contract_day_ratio',
 'trace_amt',
 'z_sender_lifetime_block',
 'z_contract_block_involved',
 'contract_tx_count',
 'contract_block_involved',
 'distinct_contract_sender_called',
 'z_trace_amt',
 'z_contract_tx_count',
 'z_contract_main_active_days',
 'sender_tx_count',
 'sender_main_active_days',
 'sender_block_involved',
 'sender_days_call_contract',
 'sender_block_ratio',
 'z_sender_lifetime_days',
 'z_contract_lifetime_days',
 'nonce',
 'z_sender_main_active_days',
 'z_contract_lifetime_block',
 'sender_tx_count_per_contract',
 'z_depth',
 'contract_interact',
 'receipt_gas_used',
 'z_receipt_gas_used',
 'sender_active_day_ratio',
 'gas',
 'z_distinct_contract_sender_called',
 'max_breadth',
 'z_sender_days_call_contract',
 'z_nonce',
 'z_gas_price',
 'z_distinct_was_called_in_sample',
 'involved_trace_ratio',
 'tx_count_per_distinct_caller',
 'distinct_sender_in_contract',
 'z_value',
 