In [17]:
import pandas as pd
import glob
import os
from scipy.stats import zscore
from utils import find_nulls, add_ratio_features, add_z_score

In [18]:
PATH_SAMPLE = '../dataset/real-time/train/realtime_bacon.csv'
CSV_FOLDER = '../dataset/real-time/train'
CSV_PREPROCESSED_FOLDER = '../dataset/real-time/train_preprocessed'

Z_SCORE_COLS =  ['trace_involved_amt','contract_block_involved','contract_tx_count','contract_main_active_days',
                'sender_block_involved','sender_tx_count','sender_main_active_days','contract_interact',
                'sender_tx_count_call_contract','sender_days_call_contract','trace_amt','distinct_sender_in_contract',
                'contract_lifetime_days','contract_lifetime_block','distinct_contract_sender_called',
                'sender_lifetime_days','sender_lifetime_block','contract_involved_amt','max_breadth','depth',
                'distinct_was_called_in_sample','distinct_sender_call_in_sample','gas','gas_price',
                'receipt_cumulative_gas_used','receipt_gas_used','value','nonce']

SUS_FILE = '../dataset/sus_tx.csv'  # Replace with the actual file path

In [19]:
def impute_depth_and_max_breadth(path):
    df = pd.read_csv(path)
    df['max_breadth'] = df['max_breadth']+1
    df['max_breadth'] = df['max_breadth'].fillna(0)
    df['depth'] = df['depth'].fillna(0)
    return df

def add_sus_col(df,attack_transactions):

    def set_value_based_on_condition(row):
        if (row['transaction_hash'] in attack_transactions):
            return 1
        else:
            return 0
        
    df['is_sus'] = df.apply(set_value_based_on_condition, axis=1)

    sus_data = df[df['is_sus'] == 1]
    not_sus_data = df[df['is_sus'] == 0]

    print(len(sus_data))
    print(len(not_sus_data))

    return df

In [20]:
sample_df = pd.read_csv(PATH_SAMPLE)
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 775 entries, 0 to 774
Data columns (total 35 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   transaction_hash                 775 non-null    object 
 1   trace_involved_amt               775 non-null    int64  
 2   from_address                     775 non-null    object 
 3   to_address                       775 non-null    object 
 4   block_timestamp                  775 non-null    object 
 5   contract_block_involved          775 non-null    int64  
 6   contract_tx_count                775 non-null    int64  
 7   contract_block_per_tx            775 non-null    float64
 8   contract_main_active_days        775 non-null    int64  
 9   sender_block_involved            775 non-null    int64  
 10  sender_tx_count                  775 non-null    int64  
 11  sender_block_per_tx              775 non-null    float64
 12  sender_main_active_day

In [21]:
for file_path in glob.glob(os.path.join(CSV_FOLDER, "*.csv")):
    df = pd.read_csv(file_path)
    find_nulls(df)

Series([], dtype: int64)
max_breadth    3
depth          3
dtype: int64
Series([], dtype: int64)
Series([], dtype: int64)
max_breadth    805
depth          805
dtype: int64
max_breadth    27
depth          27
dtype: int64
max_breadth    21
depth          21
dtype: int64
Series([], dtype: int64)
Series([], dtype: int64)
max_breadth    20
depth          20
dtype: int64
Series([], dtype: int64)
max_breadth    6
depth          6
dtype: int64
max_breadth    1
depth          1
dtype: int64


In [22]:
for file_path in glob.glob(os.path.join(CSV_FOLDER, "*.csv")):
    filename = os.path.basename(file_path)
    preprocessed_file_path = CSV_PREPROCESSED_FOLDER+'/'+filename
    imputed_df = impute_depth_and_max_breadth(file_path)

    imputed_df['value'] = imputed_df['value'].astype('float32')

    z_score_df = add_z_score(imputed_df,Z_SCORE_COLS)

    ratio_df = add_ratio_features(z_score_df)

    sus_df = pd.read_csv(SUS_FILE)
    sus_transactions = sus_df['sus_tx']
    sus_transactions = set(sus_transactions)
    added_sus_df = add_sus_col(ratio_df,sus_transactions)

    print('Preprocessed file:',filename)
    find_nulls(z_score_df)

    added_sus_df.to_csv(preprocessed_file_path,index=False)
    

1
774
Preprocessed file: realtime_bacon.csv
Series([], dtype: int64)
11
988
Preprocessed file: realtime_cream.csv
Series([], dtype: int64)
6
49
Preprocessed file: realtime_dfx.csv
Series([], dtype: int64)


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


1
661
Preprocessed file: realtime_fei.csv
Series([], dtype: int64)
1
991
Preprocessed file: realtime_hypebear.csv
Series([], dtype: int64)
1
918
Preprocessed file: realtime_jay.csv
Series([], dtype: int64)


  x = um.multiply(x, x, out=x)


1
112
Preprocessed file: realtime_noodle.csv
Series([], dtype: int64)
3
988
Preprocessed file: realtime_omni.csv
Series([], dtype: int64)
1
999
Preprocessed file: realtime_orion.csv
Series([], dtype: int64)


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


93
907
Preprocessed file: realtime_rari.csv
Series([], dtype: int64)
2
998
Preprocessed file: realtime_revest.csv
Series([], dtype: int64)


  x = um.multiply(x, x, out=x)


3
996
Preprocessed file: realtime_sanshu.csv
Series([], dtype: int64)
1
999
Preprocessed file: realtime_visor.csv
Series([], dtype: int64)
