In [None]:
import os
import pandas as pd
from imblearn.over_sampling import SMOTE
from scipy.stats import zscore
from utils import add_ratio_features, add_z_score

In [None]:
# Feature columns handed to SMOTE by `resample`. The order here is the column
# order of the resampled frame; columns of the input that are not listed are
# not carried over into the resampled output.
TO_SAMPLING_COLS = [
    'trace_involved_amt',
    'contract_block_involved',
    'contract_tx_count',
    'contract_main_active_days',
    'sender_block_involved',
    'sender_tx_count',
    'sender_main_active_days',
    'contract_interact',
    'sender_tx_count_call_contract',
    'sender_days_call_contract',
    'trace_amt',
    'distinct_sender_in_contract',
    'contract_lifetime_days',
    'contract_lifetime_block',
    'distinct_contract_sender_called',
    'sender_lifetime_days',
    'sender_lifetime_block',
    'contract_involved_amt',
    'max_breadth',
    'depth',
    'distinct_was_called_in_sample',
    'distinct_sender_call_in_sample',
    'gas',
    'gas_price',
    'receipt_cumulative_gas_used',
    'receipt_gas_used',
    'value',
    'transaction_type',
    'nonce',
]

# Label column whose classes are balanced by the oversampling step.
TARGET_COL = 'is_sus'

# Directory the per-fold CSV files are read from.
PATH_TO_LOAD = '../dataset/fold'
# Directory the resampled CSV files are written to (same file names).
# NOTE(review): 'upsamped' looks like a typo for 'upsampled'; left unchanged
# because other code or the on-disk layout may rely on this exact name.
PATH_TO_SAVE = '../dataset/fold_upsamped'

In [None]:
def resample(data, k_neighbors=3, random_state=42):
    """Balance the classes of ``data`` by oversampling with SMOTE.

    Only the columns in ``TO_SAMPLING_COLS`` are used as features; any other
    column of ``data`` is absent from the result. The returned frame holds the
    feature columns (in ``TO_SAMPLING_COLS`` order) followed by ``TARGET_COL``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column of ``TO_SAMPLING_COLS`` plus ``TARGET_COL``.
    k_neighbors : int, default 3
        Requested SMOTE neighbourhood size. It is lowered (with a warning)
        when a class that has to be oversampled has ``k_neighbors`` rows or
        fewer, because SMOTE cannot find that many neighbours inside it.
    random_state : int, default 42
        Seed forwarded to SMOTE so the synthetic rows are reproducible.

    Returns
    -------
    pandas.DataFrame
        Original plus synthetic rows.

    Raises
    ------
    KeyError
        If a required column is missing from ``data``.
    ValueError
        If a class that must be oversampled has a single row, or for any
        input SMOTE itself rejects (e.g. only one class present).
    """
    import warnings

    # Report every missing column at once instead of failing on the first.
    missing = [col for col in (*TO_SAMPLING_COLS, TARGET_COL) if col not in data.columns]
    if missing:
        raise KeyError(f'Input data is missing required columns: {missing}')

    y = data[TARGET_COL]
    X = data[TO_SAMPLING_COLS]

    # With sampling_strategy='auto' every class smaller than the majority is
    # oversampled, and SMOTE looks for k neighbours *within* each such class,
    # so each of them needs at least k + 1 rows.
    class_counts = y.value_counts()
    to_oversample = class_counts[class_counts < class_counts.max()]
    effective_k = k_neighbors
    if not to_oversample.empty:
        smallest = int(to_oversample.min())
        if smallest < 2:
            raise ValueError(
                f'SMOTE needs at least 2 rows in every class it oversamples; '
                f'class counts are {class_counts.to_dict()}'
            )
        if effective_k > smallest - 1:
            warnings.warn(
                f'k_neighbors={k_neighbors} is too large for a class with '
                f'{smallest} rows; using k_neighbors={smallest - 1} instead',
                stacklevel=2,
            )
            effective_k = smallest - 1

    smote = SMOTE(sampling_strategy='auto', k_neighbors=effective_k, random_state=random_state)
    X_train_resampled, y_train_resampled = smote.fit_resample(X, y)

    # Wrapping both outputs keeps this working whether fit_resample returns
    # pandas objects or plain arrays.
    resampled_data = pd.concat(
        [
            pd.DataFrame(X_train_resampled, columns=X.columns),
            pd.DataFrame({TARGET_COL: y_train_resampled}),
        ],
        axis=1,
    )

    return resampled_data

In [None]:
# Collect every fold CSV. Sorted so the processing order (and the printed log)
# is identical on every run; os.listdir returns entries in arbitrary order.
csv_files = sorted(file for file in os.listdir(PATH_TO_LOAD) if file.endswith('.csv'))

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs(PATH_TO_SAVE, exist_ok=True)

for file in csv_files:
    file_path = os.path.join(PATH_TO_LOAD, file)

    data = pd.read_csv(file_path)

    # Oversample first, then pass the balanced rows through the project
    # helpers add_z_score and add_ratio_features (imported from utils).
    resamped_data = resample(data)

    add_z_data = add_z_score(resamped_data)
    add_ratio_data = add_ratio_features(add_z_data)

    # Save the resampled data to a new CSV file with the same file name
    output_file = os.path.join(PATH_TO_SAVE, file)
    add_ratio_data.to_csv(output_file, index=False)
    # info() prints its summary itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" after each summary).
    add_ratio_data.info()
    print(f'Resampled data saved to {output_file}')