# SMT-WEEX Notebook 1: Data Cleaning & Feature Engineering
**Project:** smt-weex-2025
**Author:** Jannet Ekka

This notebook:
1. Loads data from BigQuery
2. Cleans and validates features
3. Handles missing values and outliers
4. Applies feature transformations and derives new features
5. Saves cleaned dataset to GCS

## 1. Setup & Authentication

In [None]:
# Install required packages
# Notebook shell escape (`!`): one pip call for the Google Cloud clients and the
# data/ML libraries.
# NOTE(review): scikit-learn and catboost are installed here but are not imported
# in any cell of this notebook — presumably needed by the follow-up notebook; confirm.
!pip install -q google-cloud-bigquery google-cloud-storage pandas numpy scikit-learn catboost db-dtypes

In [None]:
# Authenticate with Google Cloud
# NOTE(review): `google.colab` is presumably only available inside a Colab
# runtime, which would make this cell Colab-specific — confirm before running elsewhere.
from google.colab import auth
auth.authenticate_user()

# Set project
# PROJECT_ID is reused by the gcloud command below and by the BigQuery client;
# BUCKET is the GCS bucket the cleaned outputs are copied to at the end of the notebook.
PROJECT_ID = 'smt-weex-2025'
BUCKET = 'smt-weex-2025-models'

# Shell escape: {PROJECT_ID} is interpolated from the Python variable above.
!gcloud config set project {PROJECT_ID}

In [None]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
# NOTE(review): `storage` is imported but not referenced in any visible cell
# (uploads go through `gsutil` instead) — confirm whether it is still needed.
from google.cloud import storage
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including pandas/NumPy deprecation and dtype warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# Initialize BigQuery client
# Bound to PROJECT_ID; used below to run the feature query.
bq_client = bigquery.Client(project=PROJECT_ID)

print(f"Connected to project: {PROJECT_ID}")

## 2. Load Data from BigQuery

In [None]:
# Pull the complete whale feature table out of BigQuery
query = """
SELECT * FROM `smt-weex-2025.ml_data.whale_features`
"""

# Submit the query first, then materialize its result set as a pandas DataFrame
query_job = bq_client.query(query)
df = query_job.to_dataframe()
print(f"Loaded {len(df)} rows, {len(df.columns)} columns from BigQuery")
df.head()

In [None]:
# Report every column's dtype, then the overall (rows, columns) shape
print("=== Column Info ===", df.dtypes, sep="\n")
print(f"\n=== Shape: {df.shape} ===")

In [None]:
# Count rows per category once; the same counts feed the printout and the chart
category_counts = df['category'].value_counts()
print("=== Category Distribution ===")
print(category_counts)

# Bar chart of the per-category counts
plt.figure(figsize=(10, 5))
category_counts.plot(kind='bar', color='steelblue')
plt.title('Whale Category Distribution')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 3. Data Cleaning

In [None]:
# Count nulls per column and list only the columns that actually contain some
print("=== Missing Values ===")
null_counts = df.isnull().sum()
missing = null_counts[null_counts > 0]
if missing.empty:
    print("No missing values!")
else:
    print(missing)

In [None]:
# Number of rows whose address repeats an earlier row
is_repeat_address = df['address'].duplicated()
duplicates = is_repeat_address.sum()
print(f"Duplicate addresses: {duplicates}")

# When repeats exist, keep only the first row seen for each address
has_duplicates = duplicates > 0
if has_duplicates:
    df = df.drop_duplicates(subset=['address'], keep='first')
    print(f"After dedup: {len(df)} rows")

In [None]:
# Identifier/label columns; every other column currently in df is treated as a feature
NON_FEATURES = ['address', 'category', 'sub_label']
is_feature_column = ~df.columns.isin(NON_FEATURES)
FEATURE_COLS = df.columns[is_feature_column].tolist()

print(f"Feature columns ({len(FEATURE_COLS)}):")
print(FEATURE_COLS)

In [None]:
# Check for infinite values and cap them at the column's finite extremes.
# +inf becomes the largest finite value and -inf the smallest finite value, so an
# infinitely negative entry is no longer turned into the column maximum.
print("=== Infinite Values ===")
for col in FEATURE_COLS:
    # Skip non-numeric columns; is_numeric_dtype also covers widths such as
    # int32/float32 that a fixed list of dtype names would miss.
    if not pd.api.types.is_numeric_dtype(df[col]):
        continue

    values = df[col].astype(float)
    inf_mask = np.isinf(values)
    inf_count = inf_mask.sum()
    if inf_count == 0:
        continue

    print(f"{col}: {inf_count} infinite values")
    # Extremes are taken over the finite entries only (NaN is skipped by max/min).
    # If a column has no finite entries at all, both come out as NaN.
    finite_values = values[~inf_mask]
    max_val = finite_values.max()
    min_val = finite_values.min()
    df[col] = df[col].replace(np.inf, max_val).replace(-np.inf, min_val)
    print(f"  -> Replaced +inf with {max_val}, -inf with {min_val}")

In [None]:
# Summary statistics, transposed so that each feature occupies one row
print("=== Feature Statistics ===")
feature_summary = df[FEATURE_COLS].describe()
feature_summary.transpose()

## 4. Outlier Detection & Handling

In [None]:
# Check for extreme outliers using IQR
def detect_outliers_iqr(df, col, multiplier=3):
    """Count the values of ``df[col]`` that fall outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of the column.

    Args:
        df: DataFrame that contains the column.
        col: Name of a numeric column in ``df``.
        multiplier: How many IQRs the fences extend beyond Q1 and Q3.
            Defaults to 3, which flags only extreme outliers.

    Returns:
        A tuple ``(count, lower, upper)``: the number of values strictly
        below ``lower`` or strictly above ``upper``, and the two fences.
    """
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - multiplier * iqr
    upper = q3 + multiplier * iqr
    # Count straight from the boolean mask rather than building a filtered frame.
    is_outlier = (values < lower) | (values > upper)
    return int(is_outlier.sum()), lower, upper

# Report, per numeric feature, how many values fall outside the 3x IQR fences
print("=== Outlier Detection (3x IQR) ===")
for col in FEATURE_COLS:
    if df[col].dtype not in ['float64', 'int64', 'Float64', 'Int64']:
        continue
    count, lower, upper = detect_outliers_iqr(df, col)
    if count <= 0:
        continue
    # Share of all rows flagged for this column, as a percentage
    pct = count / len(df) * 100
    print(f"{col}: {count} outliers ({pct:.1f}%)")

In [None]:
# Outliers are kept, not removed: heavily skewed features get a log-transformed
# companion column instead, which preserves whale behavior patterns.

SKEWED_COLS = [
    'total_txs', 'outgoing_count', 'incoming_count',
    'outgoing_volume_eth', 'incoming_volume_eth',
    'avg_tx_value_eth', 'max_tx_value_eth', 'std_tx_value_eth',
    'avg_gas_used', 'max_gas_used',
    'unique_counterparties', 'unique_tokens', 'balance_eth'
]

# Add a `<col>_log` column for every listed feature that exists in df
for col in SKEWED_COLS:
    if col not in df.columns:
        continue
    # Negative values are clipped to 0 before log1p is applied
    non_negative = df[col].clip(lower=0).astype(float)
    df[f'{col}_log'] = np.log1p(non_negative)
    print(f"Created {col}_log")

In [None]:
# Histograms of three sample features: raw values on the top row,
# log-transformed values on the bottom row
fig, axes = plt.subplots(2, 3, figsize=(15, 8))

sample_cols = ['total_txs', 'balance_eth', 'unique_counterparties']
# axes.T yields one (top, bottom) pair of axes per column of the grid
for col, (raw_ax, log_ax) in zip(sample_cols, axes.T):
    if col not in df.columns:
        continue

    # Original
    raw_ax.hist(df[col].astype(float), bins=50, color='steelblue', alpha=0.7)
    raw_ax.set_title(f'{col} (original)')

    # Log transformed
    log_ax.hist(df[f'{col}_log'], bins=50, color='coral', alpha=0.7)
    log_ax.set_title(f'{col} (log)')

plt.tight_layout()
plt.show()

## 5. Feature Engineering

In [None]:
# Create additional derived features


def _as_float(name):
    """Return column ``name`` of ``df`` cast to float."""
    return df[name].astype(float)


# 1. Activity intensity score: transactions per day of activity span
#    (the +1 keeps the denominator non-zero when the span is 0)
df['activity_intensity'] = _as_float('total_txs') / (_as_float('activity_span_days') + 1)

# 2. DeFi engagement score: DeFi interactions plus distinct protocols, per transaction
df['defi_engagement'] = (
    (_as_float('defi_interactions') + _as_float('unique_defi_protocols'))
    / (_as_float('total_txs') + 1)
)

# 3. Token diversity normalized: distinct tokens per ERC-20 transaction
df['token_diversity_norm'] = _as_float('unique_tokens') / (_as_float('erc20_tx_count') + 1)

# 4. Value concentration (max/avg ratio)
df['value_concentration'] = _as_float('max_tx_value_eth') / (_as_float('avg_tx_value_eth') + 0.001)

# 5. Flow imbalance (absolute): |net flow| relative to total volume moved
df['flow_imbalance'] = abs(_as_float('net_flow_eth')) / (
    _as_float('incoming_volume_eth') + _as_float('outgoing_volume_eth') + 0.001
)

# 6. Gas efficiency (inverse of avg gas)
df['gas_efficiency'] = 1 / (_as_float('avg_gas_used') + 1)

print("Created 6 new derived features")

In [None]:
# Handle net_flow_eth which can be extremely large positive or negative
# Use signed log transformation
def signed_log(x):
    """Return ``log1p(|x|)`` multiplied by the sign of ``x``.

    Compresses large magnitudes while keeping the direction of the value;
    an input of 0 maps to 0.
    """
    magnitude = np.log1p(abs(x))
    return np.sign(x) * magnitude

# signed_log is built from NumPy ufuncs, so it can take the whole float Series
# in one call instead of being applied element by element.
df['net_flow_eth_signed_log'] = signed_log(df['net_flow_eth'].astype(float))
print("Created net_flow_eth_signed_log")

In [None]:
# Final feature list for ML
# NOTE(review): 'std_tx_value_eth' is log-transformed in the skewed-columns cell,
# but 'std_tx_value_eth_log' is not listed here — confirm the omission is deliberate.
ML_FEATURES = [
    # Original features (non-skewed)
    'erc20_ratio', 'nft_ratio', 'internal_ratio',
    'large_tx_ratio', 'avg_time_between_tx_hours', 'std_time_between_tx_hours',
    'tx_per_day', 'business_hour_ratio', 'peak_hour_pct',
    'defi_interactions', 'unique_defi_protocols', 'cex_interactions',
    'stablecoin_ratio', 'tx_ratio_out_in',

    # Log-transformed features
    'total_txs_log', 'outgoing_count_log', 'incoming_count_log',
    'outgoing_volume_eth_log', 'incoming_volume_eth_log',
    'avg_tx_value_eth_log', 'max_tx_value_eth_log',
    'avg_gas_used_log', 'max_gas_used_log',
    'unique_counterparties_log', 'unique_tokens_log', 'balance_eth_log',

    # Derived features
    'activity_intensity', 'defi_engagement', 'token_diversity_norm',
    'value_concentration', 'flow_imbalance', 'gas_efficiency',
    'net_flow_eth_signed_log'
]

# Keep only the features that exist in df, but report the ones that are dropped:
# a silently shrinking list would hide a renamed column or a skipped upstream cell.
_missing_ml_features = [f for f in ML_FEATURES if f not in df.columns]
if _missing_ml_features:
    print(f"WARNING: {len(_missing_ml_features)} expected features missing from df, skipped: {_missing_ml_features}")

ML_FEATURES = [f for f in ML_FEATURES if f in df.columns]
print(f"Final ML features ({len(ML_FEATURES)}):")
print(ML_FEATURES)

## 6. Feature Correlation Analysis

In [None]:
# Pairwise correlations between the ML features, drawn as a heatmap
corr_matrix = df[ML_FEATURES].corr()

plt.figure(figsize=(16, 14))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0, annot=False)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# List every feature pair whose absolute correlation exceeds 0.9.
# Only the upper triangle (j > i) is scanned, so each pair is reported once
# and the diagonal is skipped.
print("=== Highly Correlated Features (>0.9) ===")
corr_cols = corr_matrix.columns
high_corr_pairs = [
    (corr_cols[i], corr_cols[j], corr_matrix.iloc[i, j])
    for i in range(len(corr_cols))
    for j in range(i + 1, len(corr_cols))
    if abs(corr_matrix.iloc[i, j]) > 0.9
]

for f1, f2, corr in high_corr_pairs:
    print(f"{f1} <-> {f2}: {corr:.3f}")

In [None]:
# Drop redundant features: keep one feature from each highly correlated pair.
# The list is filled in by hand from the correlation report; it is empty here,
# so no feature is removed.
FEATURES_TO_DROP = []

# Example: if outgoing_count_log and incoming_count_log turn out to be highly
# correlated with total_txs_log, they could be listed above.

FINAL_FEATURES = [name for name in ML_FEATURES if name not in FEATURES_TO_DROP]
print(f"Final features after removing correlations: {len(FINAL_FEATURES)}")

## 7. Save Cleaned Data

In [None]:
# Prepare final dataset: identifier/label columns followed by the selected features
df_clean = df[['address', 'category', 'sub_label'] + FINAL_FEATURES].copy()

# Impute the feature columns only: remaining NaN -> 0, then remaining +/-inf -> 0.
# The identifier and label columns are left untouched; filling them would write the
# integer 0 into 'category' / 'sub_label' / 'address' and create a bogus "0" label.
# NOTE(review): missing labels therefore stay missing in the saved file — confirm
# the downstream notebook handles or drops such rows.
df_clean[FINAL_FEATURES] = (
    df_clean[FINAL_FEATURES]
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)

print(f"Final dataset shape: {df_clean.shape}")
print(f"Categories: {df_clean['category'].value_counts().to_dict()}")

In [None]:
# Create GCS bucket if not exists
# Shell escape; {BUCKET} is interpolated from the Python variable. `||` runs the
# `gsutil mb` (make bucket, location us-central1) only when `gsutil ls` exits non-zero.
# NOTE(review): `ls` presumably also fails for reasons other than a missing bucket
# (e.g. permissions), in which case `mb` is still attempted — confirm acceptable.
!gsutil ls gs://{BUCKET} || gsutil mb -l us-central1 gs://{BUCKET}

In [None]:
# Save locally
# index=False: the DataFrame index is not written as a column.
df_clean.to_csv('/content/whale_features_cleaned.csv', index=False)
print("Saved to /content/whale_features_cleaned.csv")

# Upload to GCS
# Shell escape; {BUCKET} is interpolated from the Python variable.
# NOTE(review): the "Uploaded" message below is printed unconditionally — nothing
# here checks whether the gsutil copy succeeded; verify against the cell output.
!gsutil cp /content/whale_features_cleaned.csv gs://{BUCKET}/data/whale_features_cleaned.csv
print(f"Uploaded to gs://{BUCKET}/data/whale_features_cleaned.csv")

In [None]:
# Also save feature list for reference
import json

# Metadata describing the cleaned dataset: the feature column names, the target
# column, the distinct category values, and the row/feature counts.
feature_config = {
    'features': FINAL_FEATURES,
    'target': 'category',
    'categories': df_clean['category'].unique().tolist(),
    'n_samples': len(df_clean),
    'n_features': len(FINAL_FEATURES)
}

# Write the config as indented JSON next to the cleaned CSV
with open('/content/feature_config.json', 'w') as f:
    json.dump(feature_config, f, indent=2)

# Shell escape; {BUCKET} is interpolated from the Python variable.
# NOTE(review): the success message is printed whether or not the copy succeeded.
!gsutil cp /content/feature_config.json gs://{BUCKET}/data/feature_config.json
print("Saved feature config")
print(json.dumps(feature_config, indent=2))

## Summary

Data cleaning completed:
1. Loaded 516 whales from BigQuery
2. Removed duplicates and handled missing values
3. Applied log transformations to skewed features
4. Created 7 derived features (6 ratio-based features plus the signed-log net flow)
5. Analyzed correlations
6. Saved cleaned dataset to GCS

**Next:** Run Notebook 2 for train/test split and model training.