In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime

# Read the raw CSV into the working DataFrame `df`.
# The location is relative to the notebook's working directory; update it as needed.
df = pd.read_csv(filepath_or_buffer='../data/ethereum_tx_data.csv')
print("Original shape:", df.shape)  # (rows, columns) as loaded, before any cleaning

Original shape: (5000, 9)


In [4]:
# Remove 'Value_OUT(ETH)' when it carries no information: the column must exist,
# hold exactly one distinct (non-null) value, and its first entry must be 0.
# The checks short-circuit in this order, so a missing column is never indexed.
if (
    'Value_OUT(ETH)' in df.columns
    and df['Value_OUT(ETH)'].nunique() == 1
    and df['Value_OUT(ETH)'].iloc[0] == 0
):
    df.drop('Value_OUT(ETH)', axis=1, inplace=True)

In [5]:
# Rename confusing columns to identifier-friendly names.
column_renames = {
    'TxnFee(ETH)': 'TxnFee_ETH',
    'TxnFee(USD)': 'TxnFee_USD',
    'Historical $Price/Eth': 'HistoricalPrice_ETH',
}

# The "current value" header embeds a specific ETH price
# ('CurrentValue @ $3083.38454496098/Eth'); presumably that number differs from
# one export to the next. DataFrame.rename silently ignores keys that are not
# present, so matching the exact header would quietly leave the raw name in
# place for any other export. Match on the stable prefix instead.
# NOTE(review): if more than one column starts with this prefix they would all
# map to 'CurrentValue_USD' -- not expected, but worth confirming.
column_renames.update({
    col: 'CurrentValue_USD'
    for col in df.columns
    if isinstance(col, str) and col.startswith('CurrentValue @')
})

df.rename(columns=column_renames, inplace=True)

In [6]:
# Parse the raw 'DateTime (UTC)' column into datetimes, store the result as
# 'DateTime', then remove the raw column in place.
timestamps = pd.to_datetime(df['DateTime (UTC)'])
df['DateTime'] = timestamps
df.drop('DateTime (UTC)', axis=1, inplace=True)

# Calendar features taken from the parsed timestamp via the .dt accessor.
dt_parts = df['DateTime'].dt
df['hour'] = dt_parts.hour
df['dayofweek'] = dt_parts.dayofweek

# Incoming value in USD = ETH amount times the historical ETH price column.
value_in_usd = df['Value_IN(ETH)'].mul(df['HistoricalPrice_ETH'])
df['Value_IN_USD'] = value_in_usd

# Fee relative to transferred value; the 1e-6 offset keeps the denominator non-zero.
# NOTE(review): for rows where Value_IN_USD is 0 this evaluates to
# TxnFee_USD / 1e-6 (the fee scaled by a million), so zero-value transfers get
# very large values -- confirm that is intended before modelling on this column.
df['GasEfficiency'] = df['TxnFee_USD'].div(value_in_usd.add(1e-6))

In [7]:
# Columns (and their order) that make up the processed dataset.
final_features = [
    'Blockno', 'UnixTimestamp', 'Value_IN(ETH)', 'Value_IN_USD',
    'TxnFee_ETH', 'TxnFee_USD', 'HistoricalPrice_ETH',
    'hour', 'dayofweek', 'GasEfficiency'
]

# Drop rows with missing values, judging each row only by the columns that are
# actually kept. A plain df.dropna() would also discard rows whose only gap is
# in a column that is thrown away on the next line.
df.dropna(subset=final_features, inplace=True)
print("After cleaning:", df.shape)

# .copy() gives processed_df its own data, so later edits to it cannot trigger
# SettingWithCopyWarning or be confused with changes to df.
processed_df = df[final_features].copy()

After cleaning: (5000, 12)


In [8]:
# Write the processed feature table to disk.
os.makedirs(name='../data', exist_ok=True)  # no error if the folder is already there
processed_df.to_csv(path_or_buf='../data/processed_tx_data.csv', index=False)  # omit the index column
print("Cleaned data saved to: '../data/processed_tx_data.csv'")

# Keep the preview as the cell's final expression so the notebook renders it.
processed_df.head(n=5)

Cleaned data saved to: '../data/processed_tx_data.csv'


Unnamed: 0,Blockno,UnixTimestamp,Value_IN(ETH),Value_IN_USD,TxnFee_ETH,TxnFee_USD,HistoricalPrice_ETH,hour,dayofweek,GasEfficiency
0,19557289,1711929611,0.0,0.0,0.002915,8.988436,3505.52,0,0,8988436.0
1,19557290,1711929623,0.0,0.0,0.004286,13.216714,3505.52,0,0,13216710.0
2,19557290,1711929623,0.0,0.0,0.000817,2.520163,3505.52,0,0,2520163.0
3,19557291,1711929635,0.0,0.0,0.002683,8.271781,3505.52,0,0,8271781.0
4,19557291,1711929635,0.0,0.0,0.003421,10.549526,3505.52,0,0,10549530.0
