In [7]:
import pandas as pd

# Read the per-day pool snapshots and the rug-pull label table, in that order.
pool_day_data_df, rugpull_labels_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('../data/pool_day_data.csv', '../data/rugpulls_with_token_info.csv')
)

# The raw 'date' values are interpreted as Unix timestamps in seconds.
epoch_seconds = pool_day_data_df['date']
pool_day_data_df['date'] = pd.to_datetime(epoch_seconds, unit='s')

# Preview the first rows to see which columns the daily data provides.
print(pool_day_data_df.head())


          close       date                      feeGrowthGlobal0X128  \
0  1.078152e-08 2023-06-27  1950663683887649585341457959283660012971   
1  9.998356e-05 2021-07-17       11229880233198962195483552111498235   
2  8.339408e-01 2022-03-14                                         0   
3  1.199533e+00 2022-03-15                                         0   
4  1.206535e+00 2022-03-16                                         0   

               feeGrowthGlobal1X128    feesUSD          high  \
0  61306156093378588702209917153492  32.158471  9.275127e+07   
1                                 0   0.001849  1.000164e+04   
2                                 0   0.000000  8.339408e-01   
3      6017672668216301140469596925   0.000000  8.336578e-01   
4    109367563722438852797476668213   0.000000  8.299676e-01   

                                                 id              liquidity  \
0  0x0000d36ab86d213c14d93cd5ae78615a20596505-19535                      0   
1  0x0001fcbba8eb491c3ccfe

In [9]:
# Parse the rug-pull 'date' column into pandas datetimes so it can be
# compared against the daily pool dates.
rugpull_labels_df['date'] = rugpull_labels_df['date'].pipe(pd.to_datetime)


In [13]:
import numpy as np

# Column layout of the resulting feature table. Passing it explicitly keeps
# 'pool_id' present even when every pool ends up being skipped.
FEATURE_COLUMNS = ['pool_id', 'avg_daily_volume_change_per_swap']


def _avg_daily_volume_change_per_swap(daily_numeric):
    """Return the mean day-over-day change in USD volume per transaction.

    Parameters
    ----------
    daily_numeric : pd.DataFrame
        Daily rows of a single pool in chronological order. Must contain the
        lower-cased numeric columns 'volumeusd' and 'txcount'.

    Returns
    -------
    float or int
        Mean of ``volumeusd.diff() / txcount`` over every day after the first.
        Days whose 'txcount' is not strictly positive are ignored. Returns 0
        when no day yields a value (e.g. a single-row frame).
    """
    # The first row has no previous day, so its diff is undefined; drop it.
    volume_change = daily_numeric['volumeusd'].diff().iloc[1:]
    tx_count = daily_numeric['txcount'].iloc[1:]

    # Non-positive counts become NaN, so those days fall out of the mean
    # instead of producing a division by zero.
    per_swap = volume_change / tx_count.where(tx_count > 0)

    mean_change = per_swap.mean()
    return 0 if pd.isna(mean_change) else mean_change


# Number of pools skipped so far, for any reason.
count = 0
# One dict per pool that produced a feature value.
feature_rows = []

# This cell may be re-run after 'pool_id' has been moved into the index of the
# label frame; restore it as a column in that case.
labels_df = (
    rugpull_labels_df
    if 'pool_id' in rugpull_labels_df.columns
    else rugpull_labels_df.reset_index()
)

# Label lookups keyed by pool id. When a pool id appears more than once, its
# first row is used for every occurrence.
first_labels = labels_df.drop_duplicates(subset='pool_id').set_index('pool_id')
rugpull_date_by_pool = first_labels['date']
token0_symbol_by_pool = first_labels['token0.symbol']
token1_symbol_by_pool = first_labels['token1.symbol']

# Row positions of every pool in the daily data, computed once instead of
# re-filtering the whole frame for each pool.
row_positions_by_pool = pool_day_data_df.groupby('pool.id', sort=False).indices

for index, pool_id in enumerate(labels_df['pool_id']):
    if index % 200 == 0:
        print(f"Processing pool {index}: Pool ID {pool_id}")
        print(f"Number of pools skipped: {count}")

    row_positions = row_positions_by_pool.get(pool_id)
    if row_positions is None or len(row_positions) < 3:
        # Fewer than three daily rows: not enough history to be useful.
        count += 1
        continue

    # diff() below compares each day with the previous row, so make sure the
    # rows really are in chronological order.
    pool_data = pool_day_data_df.iloc[row_positions].sort_values('date', kind='stable')

    # If rugpull_date is NaT, use all available data; else keep only the days
    # strictly before the rugpull.
    rugpull_date = rugpull_date_by_pool.loc[pool_id]
    if pd.isna(rugpull_date):
        pre_rugpull_data = pool_data
    else:
        pre_rugpull_data = pool_data[pool_data['date'] < rugpull_date]

    pre_rugpull_data = pre_rugpull_data.select_dtypes(include='number')
    if pre_rugpull_data.empty:
        print(f"Skipping pool {pool_id} because there is no data before the rugpull date")
        count += 1
        continue

    # Only pools in which one side is WETH are kept.
    pool_symbols = (token0_symbol_by_pool.loc[pool_id], token1_symbol_by_pool.loc[pool_id])
    if 'WETH' not in pool_symbols:
        print(f"Skipping pool {pool_id} because neither token is WETH")
        count += 1
        continue

    # Column names are lower-cased so the helper can address them uniformly.
    feature_rows.append({
        'pool_id': pool_id,
        'avg_daily_volume_change_per_swap': _avg_daily_volume_change_per_swap(
            pre_rugpull_data.rename(columns=str.lower)
        ),
    })

# Convert the list of dictionaries to a DataFrame
pool_features = pd.DataFrame(feature_rows, columns=FEATURE_COLUMNS)

print(pool_features.head())


Processing pool 0: Pool ID 0x000c0d31f6b7cecde4645eef0c4ec6a492659d62
Number of pools skipped: 0
Processing pool 200: Pool ID 0x0a1665e3f54eeb364bec6954e4497dd802840a01
Number of pools skipped: 22
Processing pool 400: Pool ID 0x152ac8b0358a215fe1f6dd75c8804e4f6ca046eb
Number of pools skipped: 42
Processing pool 600: Pool ID 0x1f138debc0721b364a967b09402b45069cba7b35
Number of pools skipped: 69
Processing pool 800: Pool ID 0x291dae7ebf6193910bf69cb14f121c07cfea4de0
Number of pools skipped: 89
Processing pool 1000: Pool ID 0x3429e990337542434f6bfac5a4d9f14ed285ac7a
Number of pools skipped: 115
Processing pool 1200: Pool ID 0x3e441eb04d28e1f8cac30f6e32d736a96d8a13b1
Number of pools skipped: 134
Processing pool 1400: Pool ID 0x46ca72148708fbc450e046fffd264a7821f70ba3
Number of pools skipped: 162
Processing pool 1600: Pool ID 0x51195c58837babb4ae172c3ac4e29bdc50db4058
Number of pools skipped: 189
Processing pool 1800: Pool ID 0x5aac14ca709846c709f746350ca1f739462027a3
Number of pools skippe

In [14]:
# Write the per-pool features to disk without the row index, then confirm.
pool_features.to_csv(path_or_buf='pool_features.csv', index=False)
print("Pool features saved to pool_features.csv")

Pool features saved to pool_features.csv


In [15]:
# Index both frames by 'pool_id' so they can be joined on it.
# Assumes 'rugpull_labels_df' has 'pool_id' and 'rugpull' columns with unique
# pool ids, and that 'pool_features' has a 'pool_id' column.
#
# Each set_index is guarded: once 'pool_id' has become the index it is no
# longer a column, so an unguarded call would raise KeyError when this cell is
# run a second time.
if 'pool_id' in rugpull_labels_df.columns:
    rugpull_labels_df = rugpull_labels_df.set_index('pool_id')
if 'pool_id' in pool_features.columns:
    pool_features = pool_features.set_index('pool_id')

# Disabled: attach the 'rugpull' label to the features.
# # Merge the DataFrames on 'pool_id'
# pool_features_with_labels = pool_features.join(rugpull_labels_df['rugpull'])

# # Reset index if you want 'pool_id' back as a column
# pool_features_with_labels.reset_index(inplace=True)

print(pool_features.head())

                                            avg_daily_volume_change_per_swap
pool_id                                                                     
0x000c0d31f6b7cecde4645eef0c4ec6a492659d62                       -238.254685
0x000ea4a83acefdd62b1b43e9ccc281f442651520                       -125.100576
0x0025ade782cc2b2415d1e841a8d52ff5dce33dfe                      -1507.205646
0x0068bb604413dfee5c453907bb150d0312a0f257                       -201.598492
0x0073ce82d9a8ffa9b695cca63cd3993c3eaef4dc                      -1358.865825


In [53]:
# # Print how many pools are labelled as rugpulls (True) versus not (False)
# print(pool_features_with_labels['rugpull'].value_counts())

rugpull
False    3955
True      730
Name: count, dtype: int64


In [54]:
# Save the DataFrame with the rugpull labels to a CSV
# pool_features_with_labels.to_csv('pool_features_with_labels.csv', index=False)


In [17]:
# Load the previously merged feature table, indexed by pool id.
merged_data2_df = pd.read_csv('../data/merged_pool_data2.csv', index_col='pool_id')

# Inner join on the shared pool-id index keeps only pools present in both
# tables; reset_index then turns 'pool_id' back into a regular column.
pool_features_with_merged_data = (
    pool_features
    .join(merged_data2_df, how='inner')
    .reset_index()
)

# Check the first few rows of the joined DataFrame
print(pool_features_with_merged_data.head())

# Save the combined feature table to a CSV
pool_features_with_merged_data.to_csv('../data/merged_pool_data3.csv', index=False)

                                      pool_id  \
0  0x000c0d31f6b7cecde4645eef0c4ec6a492659d62   
1  0x000ea4a83acefdd62b1b43e9ccc281f442651520   
2  0x0025ade782cc2b2415d1e841a8d52ff5dce33dfe   
3  0x0068bb604413dfee5c453907bb150d0312a0f257   
4  0x0073ce82d9a8ffa9b695cca63cd3993c3eaef4dc   

   avg_daily_volume_change_per_swap    mean_close  mean_feesusd     mean_high  \
0                       -238.254685  3.935271e+04    526.090312  2.268379e+37   
1                       -125.100576  3.889389e+01     39.890917  9.373465e+35   
2                      -1507.205646  4.930590e+06     59.646370  3.105088e-07   
3                       -201.598492  4.765163e+03      0.713109  7.901676e+04   
4                      -1358.865825  4.087737e+06    266.206755  3.800094e+07   

       mean_low     mean_open      mean_tick  mean_wethTokenprice  \
0  2.082785e-05  1.134189e+37  185303.566667         2.202031e-05   
1  2.336523e+03  9.373465e+35  -74334.972452         4.906514e-04   
2  2.786997