In [15]:
import pandas as pd
import glob
import os
from scipy.stats import zscore
from utils import find_nulls, add_ratio_features, add_z_score

In [16]:
# Dataset variant tag; it is interpolated into every real-time dataset path below.
DATASET_SUFFIX = '50-1y'

# Single CSV that is loaded on its own for a quick schema inspection.
PATH_SAMPLE = f'../dataset/real-time-{DATASET_SUFFIX}/test_after_attack_preprocessed/after_attack_nfttrader.csv'

# Folder the per-file CSVs are read from.
CSV_BASE_FOLDER = f'../dataset/real-time-{DATASET_SUFFIX}/test_after_attack_preprocessed'

# Folder the processed CSVs are written to.
# NOTE(review): this resolves to the same path as CSV_BASE_FOLDER, so processed
# files replace their own inputs — confirm this is intended.
CSV_PREPROCESSED_FOLDER = f'../dataset/real-time-{DATASET_SUFFIX}/test_after_attack_preprocessed'

# Columns handed to add_z_score.
Z_SCORE_COLS = [
    'trace_involved_amt',
    'contract_block_involved',
    'contract_tx_count',
    'contract_main_active_days',
    'sender_block_involved',
    'sender_tx_count',
    'sender_main_active_days',
    'contract_interact',
    'sender_tx_count_call_contract',
    'sender_days_call_contract',
    'trace_amt',
    'distinct_sender_in_contract',
    'contract_lifetime_days',
    'contract_lifetime_block',
    'distinct_contract_sender_called',
    'sender_lifetime_days',
    'sender_lifetime_block',
    'contract_involved_amt',
    'max_breadth',
    'depth',
    'distinct_was_called_in_sample',
    'distinct_sender_call_in_sample',
    'gas',
    'gas_price',
    'receipt_cumulative_gas_used',
    'receipt_gas_used',
    'value',
    'nonce',
]

# CSV whose 'sus_tx' column lists the transaction hashes to flag.
SUS_FILE = '../dataset/sus_tx.csv'  # Replace with the actual file path

In [17]:
# Make sure the output folder is present before any CSV is written into it,
# and report which of the two cases applied.
if os.path.exists(CSV_PREPROCESSED_FOLDER):
    print(f"Directory '{CSV_PREPROCESSED_FOLDER}' already exists.")
else:
    # Nothing at that path yet: create it, including missing parent folders.
    os.makedirs(CSV_PREPROCESSED_FOLDER)
    print(f"Directory '{CSV_PREPROCESSED_FOLDER}' created successfully.")

In [18]:
def impute_depth_and_max_breadth(path):
    """Read the CSV at ``path`` and fill the gaps in ``max_breadth`` and ``depth``.

    Every ``max_breadth`` value that is present is incremented by one, and
    only afterwards are the missing entries set to 0, so a filled-in 0 is
    never shifted. Missing ``depth`` entries are set to 0 without any shift.

    Args:
        path: Location of the CSV file; it must contain ``max_breadth`` and
            ``depth`` columns.

    Returns:
        The loaded DataFrame with both columns imputed.
    """
    return pd.read_csv(path).assign(
        # Shift first, then fill: NaN + 1 stays NaN and becomes 0 below.
        max_breadth=lambda frame: (frame['max_breadth'] + 1).fillna(0),
        depth=lambda frame: frame['depth'].fillna(0),
    )

def add_sus_col(df,attack_transactions):
    """Flag each row whose ``transaction_hash`` is a known attack transaction.

    Adds (or overwrites) an integer ``is_sus`` column on ``df`` in place:
    1 when the row's ``transaction_hash`` is in ``attack_transactions``,
    0 otherwise. The number of flagged rows and then the number of unflagged
    rows are printed, one per line.

    Args:
        df: DataFrame with a ``transaction_hash`` column.
        attack_transactions: Collection (set or list-like) of transaction
            hashes to flag.

    Returns:
        The same DataFrame object, now carrying the ``is_sus`` column.
    """
    # Vectorized membership test instead of a row-wise apply: it is much
    # faster and also works on an empty frame, where apply(axis=1) returns a
    # DataFrame that cannot be assigned to a single column.
    flags = df['transaction_hash'].isin(attack_transactions).astype('int64')
    df['is_sus'] = flags

    # Same two lines of output as before, without building filtered copies.
    sus_count = int(flags.sum())
    print(sus_count)
    print(len(df) - sus_count)

    return df

In [19]:
# Load the single sample CSV and print its schema (column names, non-null
# counts and dtypes) as a quick sanity check.
sample_df = pd.read_csv(PATH_SAMPLE)
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 78 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   index                                   35 non-null     int64  
 1   transaction_hash                        35 non-null     object 
 2   trace_involved_amt                      35 non-null     int64  
 3   from_address                            35 non-null     object 
 4   to_address                              35 non-null     object 
 5   block_timestamp                         35 non-null     object 
 6   gas                                     35 non-null     int64  
 7   gas_price                               35 non-null     int64  
 8   receipt_cumulative_gas_used             35 non-null     int64  
 9   receipt_gas_used                        35 non-null     int64  
 10  value                                   35 non-null     float64


In [20]:
# Run the null report on every input CSV.
# The folder is CSV_BASE_FOLDER: the config cell defines no CSV_FOLDER, so
# that name only resolves if it is left over in the kernel and raises
# NameError after a restart.
# sorted() gives a stable file order; glob itself guarantees none.
for file_path in sorted(glob.glob(os.path.join(CSV_BASE_FOLDER, "*.csv"))):
    df = pd.read_csv(file_path)
    find_nulls(df)

Series([], dtype: int64)
Series([], dtype: int64)
Series([], dtype: int64)
Series([], dtype: int64)
Series([], dtype: int64)
Series([], dtype: int64)
Series([], dtype: int64)
Series([], dtype: int64)
Series([], dtype: int64)


In [21]:
# The labelled transaction hashes are the same for every file, so read them
# once up front rather than on each loop iteration.
sus_df = pd.read_csv(SUS_FILE)
sus_transactions = set(sus_df['sus_tx'])

# Preprocess every CSV in the input folder and write the result under the
# same file name in the output folder.
# NOTE(review): CSV_BASE_FOLDER and CSV_PREPROCESSED_FOLDER are the same path,
# so each output replaces its own input; re-running this cell would shift
# max_breadth by one again — confirm this is intended.
# sorted() gives a stable processing order; glob itself guarantees none.
for file_path in sorted(glob.glob(os.path.join(CSV_BASE_FOLDER, "*.csv"))):
    filename = os.path.basename(file_path)
    print(filename)
    preprocessed_file_path = os.path.join(CSV_PREPROCESSED_FOLDER, filename)

    # Step 1: fill the gaps in depth / max_breadth.
    imputed_df = impute_depth_and_max_breadth(file_path)

    imputed_df['value'] = imputed_df['value'].astype('float32')

    # Step 2: z-score features for the configured columns.
    z_score_df = add_z_score(imputed_df, Z_SCORE_COLS)

    # Step 3: ratio features.
    ratio_df = add_ratio_features(z_score_df)

    # Step 4: is_sus label from the known hashes.
    added_sus_df = add_sus_col(ratio_df, sus_transactions)

    print('Preprocessed file:', filename)
    # Check the frame that is actually written, so nulls introduced by any
    # step (not only the z-score step) are reported.
    find_nulls(added_sus_df)

    added_sus_df.to_csv(preprocessed_file_path, index=False)
    

after_attack_barley.csv
0
50
Preprocessed file: after_attack_barley.csv
Series([], dtype: int64)
after_attack_chainpaint.csv
0
50
Preprocessed file: after_attack_chainpaint.csv
Series([], dtype: int64)
after_attack_curve.csv
0
50
Preprocessed file: after_attack_curve.csv
Series([], dtype: int64)
after_attack_earningfarm.csv
0
8
Preprocessed file: after_attack_earningfarm.csv
Series([], dtype: int64)
after_attack_gooddollar.csv
0
17
Preprocessed file: after_attack_gooddollar.csv
Series([], dtype: int64)
after_attack_nfttrader.csv
0
35
Preprocessed file: after_attack_nfttrader.csv
Series([], dtype: int64)
after_attack_peapods.csv
0
50
Preprocessed file: after_attack_peapods.csv
Series([], dtype: int64)
after_attack_rugged.csv
0
50
Preprocessed file: after_attack_rugged.csv
Series([], dtype: int64)
after_attack_sturdy.csv
0
50
Preprocessed file: after_attack_sturdy.csv
Series([], dtype: int64)
