In [85]:
import pandas as pd
import glob
import os
from scipy.stats import zscore

In [86]:
# --- Input / output locations -------------------------------------------------
# Single example CSV; no cell shown in this notebook section reads it.
PATH_SAMPLE = '../dataset/real-time/test_attack/realtime_sturdy.csv'
# Folder scanned for raw "*.csv" files.
CSV_FOLDER = '../dataset/real-time/test_attack'
# Folder that receives one preprocessed CSV per input file (same file name).
CSV_PREPROCESSED_FOLDER = '../dataset/real-time/test_attack_preprocessed'

# --- Columns standardized by add_z_score --------------------------------------
# Each entry gets a companion 'z_<name>' column.
# NOTE(review): 'contract_block_per_tx' and 'sender_block_per_tx' are also
# recomputed by add_ratio_features, which runs after add_z_score in the
# preprocessing loop, so their z-scores come from the values stored in the CSV.
Z_SCORE_COLS = [
    'contract_block_involved',
    'contract_tx_count',
    'contract_block_per_tx',
    'contract_main_active_days',
    'sender_block_involved',
    'sender_tx_count',
    'sender_block_per_tx',
    'sender_main_active_days',
    'contract_interact',
    'sender_tx_count_call_contract',
    'sender_days_call_contract',
    'trace_amt',
    'distinct_sender_in_contract',
    'contract_lifetime_days',
    'contract_lifetime_block',
    'distinct_contract_sender_called',
    'sender_lifetime_days',
    'sender_lifetime_block',
    'contract_involved_amt',
    'max_breadth',
    'depth',
    'distinct_was_called_in_sample',
    'distinct_sender_call_in_sample',
    'gas',
    'gas_price',
    'receipt_cumulative_gas_used',
    'receipt_gas_used',
    'nonce',
]

# --- Labels -------------------------------------------------------------------
# CSV whose 'sus_tx' column lists the transaction hashes to flag.
# Replace with the actual file path.
SUS_FILE = '../dataset/sus_tx.csv'

In [87]:
def impute_depth_and_max_breadth(path):
    """Load a CSV and fill the gaps in its 'max_breadth' and 'depth' columns.

    Parameters
    ----------
    path : str
        Location of the CSV file; it must contain 'max_breadth' and 'depth'.

    Returns
    -------
    pandas.DataFrame
        The loaded frame where
        * every present 'max_breadth' value is increased by 1 and every
          missing one becomes 0, and
        * every missing 'depth' value becomes 0 (present values untouched).
    """
    frame = pd.read_csv(path)
    # Shift first, fill second: the +1 leaves NaN as NaN, so only rows that
    # were missing in the file end up at 0.
    frame['max_breadth'] = (frame['max_breadth'] + 1).fillna(0)
    frame['depth'] = frame['depth'].fillna(0)
    return frame

def find_nulls(df):
    """Print the per-column null counts of ``df``, keeping only columns that have any.

    The counts are printed as a pandas Series (empty when nothing is missing),
    followed by a separator line. Nothing is returned.
    """
    columns_with_nulls = df.isnull().sum().loc[lambda counts: counts > 0]
    print(columns_with_nulls)
    print('=================')

def add_z_score(df,cols_to_calculate):
    """Append a 'z_<column>' z-score column for each requested column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    cols_to_calculate : iterable of str
        Names of the columns to standardize with ``scipy.stats.zscore``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the extra columns.

    Any z-score that comes out null is stored as 0.
    """
    for source_name in cols_to_calculate:
        target_name = f'z_{source_name}'
        df[target_name] = zscore(df[source_name])
        # Null scores are replaced by 0 so the new column has no gaps.
        df[target_name] = df[target_name].fillna(0)
    return df

def add_ratio_features(df):
    """Add the derived ratio columns to ``df`` (in place) and return it.

    Every feature is ``numerator / denominator``. For the features marked
    below the denominator is incremented by 1 before dividing; the remaining
    ones divide by the raw column, so a zero there is not guarded against.

    'sender_block_per_tx' and 'contract_block_per_tx' are assigned here as
    well, which overwrites columns of the same name if the frame already has
    them.
    """
    # (new column, numerator column, denominator column, add 1 to denominator?)
    ratio_specs = [
        ('involved_trace_ratio', 'trace_involved_amt', 'trace_amt', False),
        ('contract_active_day_ratio', 'contract_main_active_days', 'contract_lifetime_days', True),
        ('tx_count_per_distinct_caller', 'contract_tx_count', 'distinct_sender_in_contract', False),
        ('contract_block_ratio', 'contract_block_involved', 'contract_lifetime_block', True),
        ('sender_active_day_ratio', 'sender_main_active_days', 'sender_lifetime_days', True),
        ('sender_tx_count_per_contract', 'sender_tx_count', 'distinct_contract_sender_called', False),
        ('sender_block_ratio', 'sender_block_involved', 'sender_lifetime_block', True),
        ('tx_sender_call_contract', 'sender_tx_count_call_contract', 'sender_tx_count', False),
        ('sender_call_contract_tx_ratio', 'sender_tx_count_call_contract', 'contract_tx_count', False),
        ('sender_tx_count_call_contract_per_days', 'sender_tx_count_call_contract', 'sender_days_call_contract', False),
        ('sender_block_per_tx', 'sender_block_involved', 'sender_tx_count', False),
        ('contract_block_per_tx', 'contract_block_involved', 'contract_tx_count', False),
        ('sender_call_contract_day_ratio', 'sender_days_call_contract', 'contract_main_active_days', False),
    ]
    for target, numerator, denominator, shift_denominator in ratio_specs:
        divisor = df[denominator] + 1 if shift_denominator else df[denominator]
        df[target] = df[numerator] / divisor
    return df

def add_sus_col(df,attack_transactions):
    """Flag the rows whose transaction hash is a known attack transaction.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'transaction_hash' column; it is modified in place.
    attack_transactions : set or list-like
        Hashes to flag. Membership is tested against these values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with an integer 'is_sus' column:
        1 where 'transaction_hash' is in ``attack_transactions``, else 0.

    Side effects
    ------------
    Prints the number of flagged rows, then the number of unflagged rows.
    """
    # Vectorized membership test instead of a Python-level call per row.
    is_attack = df['transaction_hash'].isin(attack_transactions)
    df['is_sus'] = is_attack.astype('int64')

    # Count straight from the mask rather than materializing two filtered frames.
    sus_count = int(is_attack.sum())
    print(sus_count)
    print(len(df) - sus_count)

    return df

In [88]:
# Sanity check: report the columns with missing values for every raw CSV.
raw_csv_pattern = os.path.join(CSV_FOLDER, "*.csv")
for file_path in glob.glob(raw_csv_pattern):
    df = pd.read_csv(file_path)
    find_nulls(df)

Series([], dtype: int64)
Series([], dtype: int64)
Series([], dtype: int64)


In [89]:
# Preprocess every raw CSV and write the result, under the same file name,
# into CSV_PREPROCESSED_FOLDER.

# The suspicious-transaction list is the same for every input file, so it is
# loaded and turned into a set once instead of once per CSV.
sus_df = pd.read_csv(SUS_FILE)
sus_transactions = set(sus_df['sus_tx'])

# to_csv does not create missing directories; make sure the target exists.
os.makedirs(CSV_PREPROCESSED_FOLDER, exist_ok=True)

# sorted() gives a stable processing (and log) order across file systems.
for file_path in sorted(glob.glob(os.path.join(CSV_FOLDER, "*.csv"))):
    filename = os.path.basename(file_path)
    preprocessed_file_path = os.path.join(CSV_PREPROCESSED_FOLDER, filename)

    # Pipeline: impute -> z-scores -> ratio features -> 'is_sus' label.
    # NOTE(review): z-scores are computed before add_ratio_features reassigns
    # 'sender_block_per_tx' / 'contract_block_per_tx', so those two z-scores
    # reflect the values stored in the CSV - confirm this is intended.
    imputed_df = impute_depth_and_max_breadth(file_path)
    z_score_df = add_z_score(imputed_df,Z_SCORE_COLS)
    ratio_df = add_ratio_features(z_score_df)
    added_sus_df = add_sus_col(ratio_df,sus_transactions)

    print('Preprocessed file:',filename)
    # Check the frame that is actually written, so the ratio and label
    # columns are covered too.
    find_nulls(added_sus_df)

    added_sus_df.to_csv(preprocessed_file_path,index=False)
    

1
999
Preprocessed file: realtime_curve.csv
Series([], dtype: int64)
1
86
Preprocessed file: realtime_earningfarm.csv
Series([], dtype: int64)
1
672
Preprocessed file: realtime_sturdy.csv
Series([], dtype: int64)
