In [2]:
import pandas as pd
import pyreadr

In [4]:
# Load survival data
indexEvent = "Withdraw/"
outcomeEvent = "Borrow/"
file_path = f"/data/IDEA_DeFi_Research/Data/Survival_Data_F24/{indexEvent}{outcomeEvent}y_train.rds"
result = pyreadr.read_r(file_path)
survivalData = result[None]
print(survivalData.head())

     timeDiff  status                                                 id  \
0  49810455.0     0.0  0x0c66ade8a26d0bedbc967005a75a00028cebf222ec45...   
1  49807265.0     0.0  0x25f636e7b49a599c765bcdac00853265ac76a5921953...   
2  49806318.0     0.0  0x17869b1999e221cb74ab4fc8c9f7087097e6e27b50dd...   
3  49734675.0     0.0  0xae1083a26ffc90a681fc856b9bb087e37a16c037227a...   
4  49730289.0     0.0  0xd12cbbad05aa4b7dbcf62ee862bf07c73ddfca58f035...   

  Index Event Outcome Event  
0    withdraw        borrow  
1    withdraw        borrow  
2    withdraw        borrow  
3    withdraw        borrow  
4    withdraw        borrow  


In [5]:
survivalData = survivalData[['id', 'timeDiff', 'status']]
survivalData_test = survivalData[:10]

In [6]:
# Load raw transaction data
file_path = 'data/transactionsAave.csv'
rawData = pd.read_csv(file_path, low_memory=False)  
sort_columns =  ['user', 'timestamp']
rawData = rawData.sort_values(by=sort_columns)

In [7]:
print(rawData.head())

                                                       id        type  \
482454  0x3fec3516c8085e089d408562dd3f9ca1bbbc5b0eb4ea...  collateral   
658731  0x452f5f9b8d503f395a1db08a2bd8f3af937d401d5358...  collateral   
317991  0x59a33c72e045d762a069ddfe10540ddc1b521cbd27e1...  collateral   
473278  0x3c07de24894604a8e9fd36be5eda98a8f9fdad794536...  collateral   
655032  0x090c9fb05186afb08a0e311160409f24b5aa33de8ea1...  collateral   

           timestamp                                        user  \
482454  1.626954e+09  0x0000000000000000000000000000000000000001   
658731  1.657433e+09  0x0000000000000000000000000000000000000001   
317991  1.613978e+09  0x000000000000000000000000000000000000dead   
473278  1.626148e+09  0x000000000000000000000000000000000000dead   
655032  1.656773e+09  0x000000000000000000000000000000000000dead   

                 userAlias onBehalfOf onBehalfOfAlias  \
482454  Hamaama al-Shaheed        NaN             NaN   
658731  Hamaama al-Shaheed        NaN 

In [17]:
samples = []

# Iterate through survival data and extract static and dynamic features
for _, row in survivalData_test.iterrows():
    index_id = row['id']
    index_status = row['status']
    
    # Find the corresponding user for the index event
    user_selection = rawData[rawData['id'] == index_id]['user']
    index_time_selection = rawData[rawData['id'] == index_id]['timestamp']
    if not user_selection.empty and not index_time_selection.empty:
        user = user_selection.iloc[0]
        index_time = index_time_selection.iloc[0]
        
        # Filter transactions for the user before the index time
        user_transactions = rawData[(rawData['user'] == user) & (rawData['timestamp'] <index_time)]
        user_transactions = user_transactions.sort_values(by='timestamp', ascending=False)
        
        # Get the top 5 recent transactions as dynamic features
        dynamic_transactions = user_transactions.head(5)
        
        # Get the remaining transactions as static features
        oldest_dynamic_time = dynamic_transactions['timestamp'].min() if not dynamic_transactions.empty else index_time
        static_transactions = user_transactions[user_transactions['timestamp'] < oldest_dynamic_time]
        
        # Append the sample to the list
        sample = {
            'indexID': index_id,
            'indexTime': index_time,
            'indexStatus': index_status,
            'static_transactions': static_transactions,  # Static feature: transactions before the sliding window
            'dynamic_transactions': dynamic_transactions  # Dynamic feature: last 5 transactions before index event
        }
        samples.append(sample)
    else:
        print(f"No matching user found for indexID: {index_id}")
    

In [18]:
# Example output of static and dynamic features
for sample in samples:
    print("IndexID:", sample['indexID'])
    print("Static Transactions:")
    print(sample['static_transactions'])
    print("Dynamic Transactions:")
    print(sample['dynamic_transactions'])
    print("----------------------------------------")


IndexID: 0x0c66ade8a26d0bedbc967005a75a00028cebf222ec45e2b90080e222ed489dcd
Static Transactions:
Empty DataFrame
Columns: [id, type, timestamp, user, userAlias, onBehalfOf, onBehalfOfAlias, pool, reserve, amount, amountUSD, amountETH, borrowRate, borrowRateMode, fromState, toState, liquidator, principalAmount, principalReserve, principalAmountUSD, principalAmountETH, collateralAmount, collateralReserve, collateralAmountUSD, collateralAmountETH, liquidatorAlias, priceInUsd, borrowRateModeTo, borrowRateModeFrom, stableBorrowRate, variableBorrowRate, target, totalFee, version, deployment]
Index: []

[0 rows x 35 columns]
Dynamic Transactions:
                                                       id        type  \
259888  0xdd1d259637c499ff61b0d142e7bb6639f3552b2dab37...  collateral   
893992  0xdd1d259637c499ff61b0d142e7bb6639f3552b2dab37...     deposit   
259887  0x7f10004def891a9965e972e57ecb8b93c517b5d4d7e4...  collateral   
893991  0x7f10004def891a9965e972e57ecb8b93c517b5d4d7e4...   

In [19]:
from transformers import BertModel, BertTokenizer
import torch

# Load pre-trained model for embedding
model_path = "./output_aave/final-model"  # Replace with your model directory
model = BertModel.from_pretrained(model_path)  # Use your custom model class if necessary
model.eval()

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of BertModel were not initialized from the model checkpoint at ./output_aave/final-model and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder

In [7]:
from sentence_transformers import SentenceTransformer
import torch

# Load pre-trained Sentence-BERT model for embedding
model = SentenceTransformer('all-MiniLM-L6-v2')

In [20]:
embedding_samples = []

for sample in samples:
    dynamic_transactions = sample['dynamic_transactions']
    if not dynamic_transactions.empty:
        # Use all columns from dynamic_transactions
        all_columns = dynamic_transactions.columns
        
        # Prepare text input for embedding using all columns
        transaction_text = " ".join(dynamic_transactions[all_columns].astype(str).agg(' '.join, axis=1))
        
        # Generate embedding using the Sentence-BERT model
        embedding = model.encode(transaction_text)
        
        # Append the embedding sample to the list
        embedding_sample = {
            'id': sample['indexID'],
            'timeDiff': sample['indexTime'],
            'status': sample['indexStatus'],
            'embedding': embedding
        }
        embedding_samples.append(embedding_sample)
    else:
        print(f"Empty or invalid input for indexID {sample['indexID']}, skipping...")


AttributeError: 'BertModel' object has no attribute 'encode'

In [9]:
# Create final dataset with embeddings
final_dataset = pd.DataFrame(embedding_samples)
print(final_dataset.head())


                                                  id      timeDiff  status  \
0  0x0c66ade8a26d0bedbc967005a75a00028cebf222ec45...  1.606837e+09     0.0   
1  0x25f636e7b49a599c765bcdac00853265ac76a5921953...  1.606841e+09     0.0   
2  0x17869b1999e221cb74ab4fc8c9f7087097e6e27b50dd...  1.606841e+09     0.0   
3  0xae1083a26ffc90a681fc856b9bb087e37a16c037227a...  1.606913e+09     0.0   
4  0xd12cbbad05aa4b7dbcf62ee862bf07c73ddfca58f035...  1.606918e+09     0.0   

                                           embedding  
0  [0.001982082, -0.006319259, -0.05993434, -0.10...  
1  [0.019646313, -0.016742138, -0.05178565, -0.09...  
2  [0.011911144, -0.026023526, -0.057819903, -0.0...  
3  [0.033537854, -0.020683905, -0.057182007, -0.1...  
4  [-0.018207029, -0.027614292, -0.07083733, -0.0...  


In [14]:
import os
save_path = "data"
final_dataset_path = os.path.join(save_path, "final_dataset.csv")
final_dataset.to_csv(final_dataset_path, index=False)
print(f"Final dataset saved at: {final_dataset_path}")

Final dataset saved at: data/final_dataset.csv
