In [6]:
import os
import pandas as pd
from imblearn.over_sampling import SMOTE,ADASYN
from scipy.stats import zscore
from utils import add_ratio_features, add_z_score, find_nulls

In [7]:
# Base numeric feature columns passed to the over-samplers.
BASE_FEATURE_COLS = [
    'trace_involved_amt',
    'contract_block_involved',
    'contract_tx_count',
    'contract_main_active_days',
    'sender_block_involved',
    'sender_tx_count',
    'sender_main_active_days',
    'contract_interact',
    'sender_tx_count_call_contract',
    'sender_days_call_contract',
    'trace_amt',
    'distinct_sender_in_contract',
    'contract_lifetime_days',
    'contract_lifetime_block',
    'distinct_contract_sender_called',
    'sender_lifetime_days',
    'sender_lifetime_block',
    'contract_involved_amt',
    'max_breadth',
    'depth',
    'distinct_was_called_in_sample',
    'distinct_sender_call_in_sample',
    'gas',
    'gas_price',
    'receipt_cumulative_gas_used',
    'receipt_gas_used',
    'value',
    'nonce',
]

# Full list of columns that are resampled: every base column followed by its
# 'z_'-prefixed companion, in the same order. The companion names are derived
# rather than retyped so the two halves cannot drift apart.
# NOTE(review): the 'z_' columns are presumably z-scores of the base columns
# computed upstream - confirm against the code that builds the fold CSVs.
TO_SAMPLING_COLS = BASE_FEATURE_COLS + [f'z_{col}' for col in BASE_FEATURE_COLS]

# Label column; the resampling functions print its sum as "Total attacks".
TARGET_COL = 'is_sus'

# Directory holding the input fold CSV files.
PATH_TO_LOAD = '../50_dataset/real-time/fold'
# Prefix of the output directories; '_smote' / '_adasyn' is appended when saving.
PATH_TO_SAVE = '../50_dataset/real-time/fold_upsamped'

In [8]:
def _resample_with(sampler, data):
    """Over-sample ``data`` with ``sampler`` and return features plus target.

    Args:
        sampler: An object exposing ``fit_resample(X, y)`` (here an imblearn
            ``SMOTE`` or ``ADASYN`` instance).
        data: DataFrame containing every column in ``TO_SAMPLING_COLS`` and
            the ``TARGET_COL`` column.

    Returns:
        A new DataFrame with the resampled ``TO_SAMPLING_COLS`` columns followed
        by the resampled ``TARGET_COL`` column. Columns of ``data`` outside
        those two groups are not carried over.

    Raises:
        KeyError: If ``data`` lacks any of the required columns.
    """
    y = data[TARGET_COL]
    X = data[TO_SAMPLING_COLS]

    # Project helper; its return value is not used here.
    # NOTE(review): presumably reports columns containing nulls - confirm in utils.
    find_nulls(X)

    X_resampled, y_resampled = sampler.fit_resample(X, y)
    # The sum equals the number of positive rows only if the label is 0/1 encoded.
    print(f'Total attacks: {y_resampled.sum()}')

    # Rebuild a single frame: feature columns first, target column last.
    return pd.concat(
        [
            pd.DataFrame(X_resampled, columns=X.columns),
            pd.DataFrame({TARGET_COL: y_resampled}),
        ],
        axis=1,
    )


def resample_smote(data, k_neighbors=3, random_state=42):
    """Over-sample ``data`` with SMOTE.

    Args:
        data: DataFrame containing ``TO_SAMPLING_COLS`` and ``TARGET_COL``.
        k_neighbors: Value forwarded to ``SMOTE(k_neighbors=...)``.
        random_state: Seed forwarded to ``SMOTE`` for reproducibility.

    Returns:
        DataFrame with the resampled feature columns and target column.
    """
    smote = SMOTE(sampling_strategy='auto', k_neighbors=k_neighbors, random_state=random_state)
    return _resample_with(smote, data)


def resample_adasyn(data, n_neighbors=3, random_state=42):
    """Over-sample ``data`` with ADASYN.

    Args:
        data: DataFrame containing ``TO_SAMPLING_COLS`` and ``TARGET_COL``.
        n_neighbors: Value forwarded to ``ADASYN(n_neighbors=...)``.
        random_state: Seed forwarded to ``ADASYN`` for reproducibility.

    Returns:
        DataFrame with the resampled feature columns and target column.
    """
    adasyn = ADASYN(sampling_strategy='auto', random_state=random_state, n_neighbors=n_neighbors)
    return _resample_with(adasyn, data)

In [13]:
# Every CSV in PATH_TO_LOAD is resampled twice (SMOTE and ADASYN) and written to
# a sibling output directory per method. The list is sorted because os.listdir
# returns entries in arbitrary order, which would make the log order unstable.
csv_files = sorted(file for file in os.listdir(PATH_TO_LOAD) if file.endswith('.csv'))

smote_dir = f'{PATH_TO_SAVE}_smote'
adasyn_dir = f'{PATH_TO_SAVE}_adasyn'
# exist_ok=True keeps the cell re-runnable: without it a second run raises
# FileExistsError because the directories were created by the first run.
os.makedirs(smote_dir, exist_ok=True)
os.makedirs(adasyn_dir, exist_ok=True)

for file in csv_files:
    file_path = os.path.join(PATH_TO_LOAD, file)

    data = pd.read_csv(file_path)

    resamped_smote_data = resample_smote(data)
    resamped_adasyn_data = resample_adasyn(data)

    # Ratio features are added after resampling, on the resampled frames.
    add_ratio_smote_data = add_ratio_features(resamped_smote_data)
    add_ratio_adasyn_data = add_ratio_features(resamped_adasyn_data)

    # Save the resampled data under the same file name; existing files are overwritten.
    output_file_smote = os.path.join(smote_dir, file)
    output_file_adasyn = os.path.join(adasyn_dir, file)

    add_ratio_smote_data.to_csv(output_file_smote, index=False)
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    add_ratio_smote_data.info()
    print(f'Resampled data saved to {output_file_smote}')

    add_ratio_adasyn_data.to_csv(output_file_adasyn, index=False)
    add_ratio_adasyn_data.info()
    print(f'Resampled data saved to {output_file_adasyn}')

Series([], dtype: int64)
Total attacks: 192
Series([], dtype: int64)
Total attacks: 193
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 70 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   trace_involved_amt                      384 non-null    int64  
 1   contract_block_involved                 384 non-null    int64  
 2   contract_tx_count                       384 non-null    int64  
 3   contract_main_active_days               384 non-null    int64  
 4   sender_block_involved                   384 non-null    int64  
 5   sender_tx_count                         384 non-null    int64  
 6   sender_main_active_days                 384 non-null    int64  
 7   contract_interact                       384 non-null    int64  
 8   sender_tx_count_call_contract           384 non-null    int64  
 9   sender_days_call_contract               384