In [None]:
!pip install scikit-learn

# EnhancedWhaleMetricsTransformer: Análisis Avanzado de Métricas de Whales

Componentes clave:

1. Transformación de Datos:
- Procesa múltiples CSVs de wallets
- Normaliza métricas usando StandardScaler
- Genera series temporales horarias

2. Métricas Calculadas:
- Balance total y promedio (neto de la ventana de 24h: recibido − enviado por wallet)
- Volumen 24h
- Flujo neto
- Whales activas
- Tamaño promedio de transacciones
- Índice de concentración (Herfindahl-Hirschman)

3. Características Técnicas:
- Ventana móvil de 24h
- Normalización de features
- Forward fill para datos faltantes
- Manejo de timestamps
- Agregación por hora

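Innovación principal: calcula un índice de concentración (Herfindahl-Hirschman) para medir cómo se distribuye el poder económico entre las whales; valores cercanos a 1 indican que una sola whale domina el flujo.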

Uso: Análisis de comportamiento colectivo de whales y su impacto en el mercado.

In [None]:
import pandas as pd
import numpy as np
from typing import List, Dict
from sklearn.preprocessing import StandardScaler
from datetime import datetime, timedelta

class EnhancedWhaleMetricsTransformer:
    """Turn per-wallet transaction CSVs into an hourly, normalized metrics table.

    Each wallet CSV must provide the columns ``fecha`` (transaction
    timestamp), ``monto`` (amount) and ``tipo`` (``'Recibido'`` for incoming
    and ``'Enviado'`` for outgoing transactions). For every hour between
    ``start_date`` and the moment the transformer was created, the metrics
    are aggregated over the trailing 24-hour window.
    """

    # Metric columns, in output order. Every metric defaults to 0 for hours
    # in which no wallet had a transaction inside the window.
    _METRIC_COLUMNS = (
        'whale_total_balance',
        'whale_avg_balance',
        'whale_balance_std',
        'whale_total_volume_24h',
        'whale_net_flow',
        'active_whales',
        'whale_avg_tx_size',
        'whale_concentration',
    )

    def __init__(self, start_date: str = '2023-01-01'):
        """
        Initialize the transformer with a start date for historical data

        Args:
            start_date: Start date in 'YYYY-MM-DD' format
        """
        self.start_date = pd.Timestamp(start_date)
        # The end of the range is fixed at construction time, as a tz-naive
        # timestamp. NOTE(review): CSV timestamps that parse as tz-aware
        # cannot be compared with it -- confirm the export format.
        self.end_date = pd.Timestamp.now()

    def _process_wallet_file(self, file_path: str) -> pd.DataFrame:
        """
        Load one wallet CSV and coerce its date and amount columns.

        Args:
            file_path: Path to the wallet CSV file

        Returns:
            DataFrame with ``fecha`` parsed as datetimes and ``monto`` as
            numbers; amounts that cannot be parsed become NaN.
        """
        df = pd.read_csv(file_path)
        df['fecha'] = pd.to_datetime(df['fecha'])
        df['monto'] = pd.to_numeric(df['monto'], errors='coerce')
        return df

    def _calculate_hourly_metrics(self, wallet_dfs: List[pd.DataFrame]) -> pd.DataFrame:
        """
        Calculate trailing-24h metrics for every hour in the date range.

        Args:
            wallet_dfs: One DataFrame per wallet, as returned by
                ``_process_wallet_file``

        Returns:
            DataFrame with a ``timestamp`` column plus one column per metric,
            one row per hour. The frame keeps its columns even when the hour
            range is empty.
        """
        date_range = pd.date_range(start=self.start_date, end=self.end_date, freq='h')
        metrics_list = []

        for timestamp in date_range:
            # Half-open window (timestamp - 24h, timestamp]
            window_start = timestamp - pd.Timedelta(hours=24)

            wallet_balances = []
            tx_sizes = []

            for wallet_df in wallet_dfs:
                fecha = wallet_df['fecha']
                recent_txs = wallet_df[(fecha > window_start) & (fecha <= timestamp)]
                if recent_txs.empty:
                    continue

                # Net flow of this wallet inside the window
                is_received = recent_txs['tipo'] == 'Recibido'
                is_sent = recent_txs['tipo'] == 'Enviado'
                received = recent_txs.loc[is_received, 'monto'].sum()
                sent = recent_txs.loc[is_sent, 'monto'].sum()

                wallet_balances.append(received - sent)
                tx_sizes.extend(recent_txs['monto'].tolist())

            hour_metrics = {'timestamp': timestamp}
            hour_metrics.update(dict.fromkeys(self._METRIC_COLUMNS, 0))

            if wallet_balances:
                net_flow = sum(wallet_balances)
                hour_metrics.update({
                    # total_balance and net_flow are both the summed net flow
                    # of the window; no absolute balance is available here.
                    'whale_total_balance': net_flow,
                    'whale_avg_balance': np.mean(wallet_balances),
                    'whale_balance_std': np.std(wallet_balances) if len(wallet_balances) > 1 else 0,
                    'whale_total_volume_24h': sum(abs(x) for x in tx_sizes),
                    'whale_net_flow': net_flow,
                    # One balance entry is appended per wallet with activity,
                    # so this counts active wallets directly instead of
                    # relying on the sender of each file's first row (which
                    # may be a counterparty shared by several wallets).
                    'active_whales': len(wallet_balances),
                    'whale_avg_tx_size': np.mean(tx_sizes) if tx_sizes else 0,
                    'whale_concentration': self._calculate_concentration(wallet_balances)
                })

            metrics_list.append(hour_metrics)

        return pd.DataFrame(metrics_list, columns=['timestamp', *self._METRIC_COLUMNS])

    @staticmethod
    def _calculate_concentration(balances: List[float]) -> float:
        """
        Calculate Herfindahl-Hirschman concentration index

        Shares are based on absolute balances, so the index is the sum of
        squared shares ``(|b_i| / sum |b|) ** 2``.

        Args:
            balances: Net balance of each active wallet

        Returns:
            Index in (0, 1]; 0 when there are no balances or all are zero.
        """
        # Guard on the absolute total: balances that merely cancel out
        # (e.g. [5, -5]) still describe a real distribution.
        total = sum(abs(b) for b in balances)
        if not total:
            return 0.0

        market_shares = [abs(b) / total for b in balances]
        return sum(share * share for share in market_shares)

    def transform(self, csv_files: List[str]) -> pd.DataFrame:
        """
        Transform wallet CSVs into consolidated metrics DataFrame

        Args:
            csv_files: List of paths to wallet CSV files

        Returns:
            DataFrame indexed by ``timestamp`` with standardized
            (zero-mean, unit-variance) hourly metrics; missing values are
            forward filled.
        """
        # Process all wallet files
        wallet_dfs = [self._process_wallet_file(f) for f in csv_files]

        # Calculate hourly metrics
        metrics_df = self._calculate_hourly_metrics(wallet_dfs)

        # Normalize features; the scaler cannot be fitted on zero rows
        if not metrics_df.empty:
            scaler = StandardScaler()
            columns_to_normalize = [col for col in metrics_df.columns if col != 'timestamp']
            metrics_df[columns_to_normalize] = scaler.fit_transform(metrics_df[columns_to_normalize])

        # Set timestamp as index
        metrics_df = metrics_df.set_index('timestamp')

        # Forward fill missing values (e.g. hours whose window contained an
        # unparseable amount). ``fillna(method=...)`` is deprecated in pandas.
        return metrics_df.ffill()

# Example usage:
if __name__ == "__main__":
    # Example run: build the hourly whale metrics from three exported wallet
    # reports and store the result as a CSV in the working directory.

    # List of CSV files
    # NOTE(review): absolute paths tied to one developer machine; they must
    # be adapted before running this anywhere else.
    csv_files = [
        'C:/Users/Nabucodonosor/Documents/BITLINK/Repositorios/explorer-wallets/algorand_reports/wallet_transactions_BX7UST4VVWQQPPAVPMPFW76QRKXRLQ3UVYABWBTHOAX2AN5Q5OGW2X55AQ_20241227_185401.csv',
        'C:/Users/Nabucodonosor/Documents/BITLINK/Repositorios/explorer-wallets/algorand_reports/wallet_transactions_R7ALVPEQRGECJK33LANXMPENWYALTAZQTSGNCTEQPKBSBDI5KO252SAK64_20241227_185604.csv',
        'C:/Users/Nabucodonosor/Documents/BITLINK/Repositorios/explorer-wallets/algorand_reports/wallet_transactions_UI6AGNWYGQD6HNQVEZ5ZTAQR27FHZ45VMLTS7LTSF2IZGAJMOIF4V4EEVU_20241227_184610.csv'
    ]
    
    # Create transformer and process data
    # The hourly range starts at start_date and ends at the time the
    # transformer is constructed.
    transformer = EnhancedWhaleMetricsTransformer(start_date='2023-01-01')
    metrics_df = transformer.transform(csv_files)
    
    # Save to CSV (the timestamp index is written as the first column)
    metrics_df.to_csv('whale_metrics_hourly.csv')