In [13]:
import pandas as pd
import glob
import os
from scipy.stats import zscore
from utils import find_nulls, add_ratio_features, add_z_score

In [14]:
DATASET_SUFFIX = 's_m_a_d_1'
PATH_SAMPLE = f'../dataset/real-time/test_normal/test_bacon.csv'
CSV_BASE_FOLDER = f'../dataset/real-time/test_normal'
CSV_PREPROCESSED_FOLDER = f'../dataset/real-time-{DATASET_SUFFIX}/test_normal_preprocessed'

Z_SCORE_COLS =  ['trace_involved_amt','contract_block_involved','contract_tx_count','contract_main_active_days',
                'sender_block_involved','sender_tx_count','sender_main_active_days','contract_interact',
                'sender_tx_count_call_contract','sender_days_call_contract','trace_amt','distinct_sender_in_contract',
                'contract_lifetime_days','contract_lifetime_block','distinct_contract_sender_called',
                'sender_lifetime_days','sender_lifetime_block','contract_involved_amt','max_breadth','depth',
                'distinct_was_called_in_sample','distinct_sender_call_in_sample','gas','gas_price',
                'receipt_cumulative_gas_used','receipt_gas_used','value','nonce']

SUS_FILE = '../dataset/sus_tx.csv'  # Replace with the actual file path

In [15]:
# Check if the directory exists
if not os.path.exists(CSV_PREPROCESSED_FOLDER):
    # Create the directory if it doesn't exist
    os.makedirs(CSV_PREPROCESSED_FOLDER)
    print(f"Directory '{CSV_PREPROCESSED_FOLDER}' created successfully.")
else:
    print(f"Directory '{CSV_PREPROCESSED_FOLDER}' already exists.")

Directory '../dataset/real-time-s_m_a_d_1/test_normal_preprocessed' already exists.


In [16]:
def impute_depth_and_max_breadth(path):
    df = pd.read_csv(path)
    df['max_breadth'] = df['max_breadth']+1
    df['max_breadth'] = df['max_breadth'].fillna(0)
    df['depth'] = df['depth'].fillna(0)
    return df

def add_sus_col(df,attack_transactions):

    def set_value_based_on_condition(row):
        if (row['transaction_hash'] in attack_transactions):
            return 1
        else:
            return 0
        
    df['is_sus'] = df.apply(set_value_based_on_condition, axis=1)

    sus_data = df[df['is_sus'] == 1]
    not_sus_data = df[df['is_sus'] == 0]

    print(len(sus_data))
    print(len(not_sus_data))

    return df

In [17]:
sample_df = pd.read_csv(PATH_SAMPLE)
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 37 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   transaction_hash                 1000 non-null   object 
 1   trace_involved_amt               1000 non-null   int64  
 2   from_address                     1000 non-null   object 
 3   to_address                       1000 non-null   object 
 4   block_timestamp                  1000 non-null   object 
 5   gas                              1000 non-null   int64  
 6   gas_price                        1000 non-null   int64  
 7   receipt_cumulative_gas_used      1000 non-null   int64  
 8   receipt_gas_used                 1000 non-null   int64  
 9   value                            1000 non-null   int64  
 10  nonce                            1000 non-null   int64  
 11  transaction_type                 1000 non-null   int64  
 12  rn                   

In [18]:
for file_path in glob.glob(os.path.join(CSV_BASE_FOLDER, "*.csv")):
    df = pd.read_csv(file_path)
    find_nulls(df)

Series([], dtype: int64)
transaction_type    1000
max_breadth            1
depth                  1
dtype: int64
Series([], dtype: int64)
Series([], dtype: int64)
transaction_type     38
max_breadth         816
depth               816
dtype: int64
max_breadth    3
depth          3
dtype: int64
max_breadth    4
depth          4
dtype: int64
Series([], dtype: int64)
Series([], dtype: int64)
transaction_type    1000
max_breadth           18
depth                 18
dtype: int64
transaction_type    47
max_breadth          1
depth                1
dtype: int64
transaction_type    1000
max_breadth            8
depth                  8
dtype: int64
max_breadth    709
depth          709
dtype: int64
transaction_type    342
max_breadth          19
depth                19
dtype: int64
Series([], dtype: int64)
transaction_type    66
max_breadth         11
depth               11
dtype: int64
max_breadth    455
depth          455
dtype: int64


In [19]:
for file_path in glob.glob(os.path.join(CSV_BASE_FOLDER, "*.csv")):
    filename = os.path.basename(file_path)
    print(filename)
    preprocessed_file_path = CSV_PREPROCESSED_FOLDER+'/'+filename
    imputed_df = impute_depth_and_max_breadth(file_path)
    imputed_df['sender_main_active_days'] = 1
    imputed_df['value'] = imputed_df['value'].astype('float32')

    z_score_df = add_z_score(imputed_df,Z_SCORE_COLS)

    ratio_df = add_ratio_features(z_score_df)

    sus_df = pd.read_csv(SUS_FILE)
    sus_transactions = sus_df['sus_tx']
    sus_transactions = set(sus_transactions)
    added_sus_df = add_sus_col(ratio_df,sus_transactions)

    print('Preprocessed file:',filename)
    find_nulls(z_score_df)

    added_sus_df.to_csv(preprocessed_file_path,index=False)
    

test_bacon.csv
0
1000
Preprocessed file: test_bacon.csv
Series([], dtype: int64)
test_cream.csv


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


0
1000
Preprocessed file: test_cream.csv
transaction_type    1000
dtype: int64
test_dfx.csv
0
6
Preprocessed file: test_dfx.csv
Series([], dtype: int64)
test_fei.csv


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


1
368
Preprocessed file: test_fei.csv
Series([], dtype: int64)
test_hypebear.csv
0
997
Preprocessed file: test_hypebear.csv
transaction_type    38
dtype: int64
test_jay.csv
1
14
Preprocessed file: test_jay.csv
Series([], dtype: int64)
test_noodle.csv
0
48
Preprocessed file: test_noodle.csv
Series([], dtype: int64)
test_omni.csv
0
628
Preprocessed file: test_omni.csv
Series([], dtype: int64)
test_orion.csv


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


0
1000
Preprocessed file: test_orion.csv
Series([], dtype: int64)
test_rari.csv


  x = um.multiply(x, x, out=x)


0
1000
Preprocessed file: test_rari.csv
transaction_type    1000
dtype: int64
test_revest.csv
0
496
Preprocessed file: test_revest.csv
transaction_type    47
dtype: int64
test_sanshu.csv
0
1000
Preprocessed file: test_sanshu.csv
transaction_type    1000
dtype: int64
test_tether.csv
0
980
Preprocessed file: test_tether.csv
Series([], dtype: int64)
test_uniswap.csv
0
998
Preprocessed file: test_uniswap.csv
transaction_type    342
dtype: int64
test_usdc.csv
0
998
Preprocessed file: test_usdc.csv
Series([], dtype: int64)


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  x = um.multiply(x, x, out=x)


test_visor.csv
0
676
Preprocessed file: test_visor.csv
transaction_type    66
dtype: int64
test_wyvern.csv
0
999
Preprocessed file: test_wyvern.csv
Series([], dtype: int64)


  x = um.multiply(x, x, out=x)
