In [4]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint, adfuller
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations

In [5]:
# --- CONFIGURATION ---
# Name of the folder that holds the raw CSVs (one sub-folder per token).
DATA_DIR = "data_raw"

# Candle interval to analyse; it is also the suffix of each CSV file name.
TIMEFRAME = "15m"

# Lower-case ticker symbols to scan.
# 'uni' and 'doge' are listed as two separate tokens (the original input read 'unidoge').
TOKENS = [
    "btc",
    "sol",
    "sui",
    "avax",
    "trx",
    "uni",
    "doge",
    "xrp",
]

In [6]:

# --- 1. DATA LOADING ---
def _epoch_unit(timestamps):
    """
    Guess the unit of numeric epoch timestamps from their magnitude.

    Present-day epochs are ~1e9 in seconds, ~1e12 in milliseconds, ~1e15 in
    microseconds and ~1e18 in nanoseconds, so the decades in between are used
    as cut-offs. Returns one of 's', 'ms', 'us', 'ns'.
    """
    magnitude = float(timestamps.abs().max())
    if magnitude < 1e11:
        return "s"
    if magnitude < 1e14:
        return "ms"
    if magnitude < 1e17:
        return "us"
    return "ns"


def load_crypto_data(tokens, timeframe, data_dir=None):
    """
    Reads CSVs and aligns them by timestamp to create a single 'Close Price' matrix.

    Parameters
    ----------
    tokens : iterable of str
        Ticker symbols; each one is looked up at
        ``<data_dir>/<token lower>/<TOKEN UPPER>USDT_<timeframe>.csv``.
    timeframe : str
        Candle interval used in the file name (e.g. '15m').
    data_dir : str, optional
        Folder containing the per-token sub-folders. Defaults to
        ``../../<DATA_DIR>`` relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        One column of close prices per token that was found, indexed by the
        parsed 'open_time'. Only timestamps present (and non-NaN) for every
        loaded token are kept. Empty if no file was found.
    """
    if data_dir is None:
        data_dir = f"../../{DATA_DIR}"

    closes = {}
    for token in tokens:
        # Construct path based on the structure: data_raw/btc/BTCUSDT_1d.csv
        file_path = f"{data_dir}/{token.lower()}/{token.upper()}USDT_{timeframe}.csv"

        if not os.path.exists(file_path):
            print(f"‚ö†Ô∏è Warning: File not found for {token} at {file_path}")
            continue

        df = pd.read_csv(file_path)

        # Numeric 'open_time' values are epoch timestamps. pd.to_datetime would
        # read them as nanoseconds (dates in 1970) unless the unit is given, so
        # infer it. Anything else (e.g. '2024-12-29') is parsed as a date string.
        open_time = df['open_time']
        if pd.api.types.is_numeric_dtype(open_time):
            df['datetime'] = pd.to_datetime(open_time, unit=_epoch_unit(open_time))
        else:
            df['datetime'] = pd.to_datetime(open_time)

        # We only care about Close price for this analysis
        closes[token] = df.set_index('datetime')['close']

    if closes:
        # Keep only timestamps shared by every token, then drop rows where a
        # close price itself is missing, to ensure a fair comparison.
        df_combined = pd.concat(closes, axis=1, join="inner").dropna()
    else:
        df_combined = pd.DataFrame()

    # Report the tokens actually loaded, not the number requested.
    print(f"‚úÖ Loaded {len(closes)} tokens. Shared data points: {len(df_combined)} rows.")
    return df_combined

In [7]:
# --- 2. STATISTICAL TESTS ---

def calculate_hurst(series, max_lag=100):
    """
    Calculates the Hurst Exponent to check for Mean Reversion.
    H < 0.5 = Mean Reverting
    H = 0.5 = Random Walk
    H > 0.5 = Trending

    Parameters
    ----------
    series : array-like
        One-dimensional sequence of values (list, ndarray or pandas Series).
    max_lag : int, optional
        Exclusive upper bound of the lags tested; lags run from 2 to
        ``max_lag - 1``. Automatically reduced for short inputs.

    Returns
    -------
    float
        Twice the slope of log(sqrt(std of lagged differences)) against
        log(lag). NaN when there are fewer than two usable lags (input too
        short, constant, or containing NaN).
    """
    # Work on a plain ndarray: with a pandas Series, subtracting two slices
    # aligns on index labels instead of position and gives a meaningless result.
    values = np.asarray(series, dtype=float)

    # Cap the lags so every lag leaves at least two differences to take a std of.
    lags = np.arange(2, min(max_lag, len(values) - 1))
    if len(lags) < 2:
        return np.nan

    tau = np.array([np.sqrt(np.std(values[lag:] - values[:-lag])) for lag in lags])

    # log(0) and NaN would poison the fit, so keep only finite positive points.
    usable = np.isfinite(tau) & (tau > 0)
    if usable.sum() < 2:
        return np.nan

    poly = np.polyfit(np.log(lags[usable]), np.log(tau[usable]), 1)
    # tau is the square root of the std, so the fitted slope is H / 2.
    return poly[0] * 2.0

def calculate_half_life(spread):
    """
    Calculates how long it takes for the spread to revert halfway to the mean.

    Regresses the one-step change of the spread on its previous value (with an
    intercept) and converts the slope ``lambda`` into ``-ln(2) / lambda``.

    Parameters
    ----------
    spread : pandas.Series
        The spread values in chronological order.

    Returns
    -------
    float
        Half-life measured in rows of ``spread``. ``np.inf`` when the slope is
        non-negative (no reversion); ``np.nan`` when there are fewer than two
        change observations to regress on.
    """
    # Drop the first row rather than back-filling it: the lag and the change are
    # undefined there, and copying row 1 into it would count that observation twice.
    spread_lag = spread.shift(1).iloc[1:]
    spread_ret = spread.diff().iloc[1:]

    if len(spread_ret) < 2:
        return np.nan

    # has_constant='add' forces the intercept column even when spread_lag is
    # itself constant, so the slope is always the second parameter.
    model = sm.OLS(spread_ret, sm.add_constant(spread_lag, has_constant='add'))
    res = model.fit()
    # Access by position using .iloc[1] instead of [1] since params is a Series with named indices
    lambda_param = res.params.iloc[1]

    if lambda_param >= 0:
        return np.inf  # Non-reverting
    return -np.log(2) / lambda_param

In [8]:
def find_cointegrated_pairs(data, p_value_max=0.05, hurst_max=0.5,
                            min_half_life=1, max_half_life=None,
                            coint_kwargs=None):
    """
    Tests every possible pair for cointegration.

    For each unordered pair of columns this runs the Engle-Granger test,
    estimates the hedge ratio by OLS (asset A on asset B plus a constant),
    builds the spread ``A - hedge_ratio * B`` and measures its Hurst exponent
    and half-life.

    Parameters
    ----------
    data : pandas.DataFrame
        Price matrix with one column per asset.
    p_value_max : float, optional
        Pairs must have a cointegration p-value strictly below this.
    hurst_max : float, optional
        Pairs must have a spread Hurst exponent strictly below this.
    min_half_life : float, optional
        Pairs must have a half-life strictly above this.
    max_half_life : float, optional
        If given, pairs must also have a half-life strictly below this
        (e.g. 20). Off by default.
    coint_kwargs : dict, optional
        Extra keyword arguments forwarded to ``coint`` (for example
        ``{'maxlag': 10}`` to bound the lag search on long series).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``results_df`` with one row per pair and ``valid_pairs``, the rows
        passing every filter sorted by p-value. Columns: 'Pair', 'P-Value',
        'Hurst', 'Half_Life', 'Hedge_Ratio'. 'Hedge_Ratio' is kept at full
        precision because it is reused to rebuild the spread; the other
        statistics are rounded for display after filtering.
    """
    columns = ['Pair', 'P-Value', 'Hurst', 'Half_Life', 'Hedge_Ratio']
    coint_kwargs = dict(coint_kwargs or {})
    results = []

    print("üîç Scanning for pairs...")

    # Iterate through every unique combination (e.g., BTC-SOL, BTC-ETH...)
    for asset_a, asset_b in combinations(data.keys(), 2):
        series_a = data[asset_a]
        series_b = data[asset_b]

        # 1. Engle-Granger Test (Cointegration)
        # Null hypothesis: No cointegration. Low p-value means we reject null -> Cointegration exists.
        _, p_value, _ = coint(series_a, series_b, **coint_kwargs)

        # 2. Calculate Spread (Hedge Ratio via OLS)
        # Spread = AssetA - (HedgeRatio * AssetB)
        fit = sm.OLS(series_a, sm.add_constant(series_b)).fit()
        hedge_ratio = fit.params[asset_b]
        spread = series_a - (hedge_ratio * series_b)

        results.append({
            'Pair': f"{asset_a}-{asset_b}",
            'P-Value': p_value,
            # 3. Hurst Exponent of the Spread (positional array, not a Series)
            'Hurst': calculate_hurst(spread.values),
            # 4. Half Life
            'Half_Life': calculate_half_life(spread),
            'Hedge_Ratio': hedge_ratio,
        })

    # Fewer than two columns means no pairs: return empty frames that still
    # carry the expected columns instead of failing on a missing column.
    if not results:
        empty = pd.DataFrame(columns=columns)
        return empty, empty.copy()

    results_df = pd.DataFrame(results, columns=columns)

    # FILTERING LOGIC
    # Applied to the unrounded statistics so that values just under a
    # threshold are not pushed onto it by display rounding.
    passes = (
        (results_df['P-Value'] < p_value_max) &
        (results_df['Hurst'] < hurst_max) &
        (results_df['Half_Life'] > min_half_life)
    )
    if max_half_life is not None:
        passes &= results_df['Half_Life'] < max_half_life

    # Rank on the raw p-values; after rounding many of them would tie.
    ranked = results_df[passes].sort_values(by='P-Value').index

    # Round only the display statistics. Rounding the hedge ratio would distort
    # (or zero out) small ratios when the spread is reconstructed from it.
    results_df = results_df.round({'P-Value': 5, 'Hurst': 3, 'Half_Life': 2})
    valid_pairs = results_df.loc[ranked]

    return results_df, valid_pairs

In [None]:
# --- EXECUTION ---

# 1. Load the aligned close-price matrix for the configured tokens
df_prices = load_crypto_data(TOKENS, TIMEFRAME)

# 2. Score every pair; best_pairs holds the rows that passed the filters
all_pairs, best_pairs = find_cointegrated_pairs(df_prices)

# 3. Report the candidates and, when there are any, 4. chart the top-ranked one
print("\nüèÜ --- TOP CANDIDATE PAIRS ---")
if best_pairs.empty:
    print("No perfect pairs found. Try a different timeframe (e.g., 1h, 15m).")
else:
    print(best_pairs)

    # First row of best_pairs is the pair with the lowest p-value
    top_pair = best_pairs.iloc[0]
    pair_name = top_pair['Pair']
    ratio = top_pair['Hedge_Ratio']
    asset_a, asset_b = pair_name.split('-')

    # Rebuild the spread from the stored hedge ratio and standardise it
    spread = df_prices[asset_a] - (ratio * df_prices[asset_b])
    z_score = (spread - spread.mean()) / spread.std()

    fig = plt.figure(figsize=(12, 6))

    # Upper panel: the z-score of the spread with the +/-2 trading bands
    ax_signal = fig.add_subplot(2, 1, 1)
    ax_signal.plot(z_score, label=f'Z-Score ({pair_name})', color='purple')
    ax_signal.axhline(0, color='black', linestyle='--', alpha=0.5)
    ax_signal.axhline(2, color='red', linestyle='--', alpha=0.5, label='Sell Threshold (+2)')
    ax_signal.axhline(-2, color='green', linestyle='--', alpha=0.5, label='Buy Threshold (-2)')
    ax_signal.legend()
    ax_signal.set_title(f"Spread Z-Score: {pair_name}")

    # Lower panel: asset A against asset B scaled by the hedge ratio (visual check)
    ax_prices = fig.add_subplot(2, 1, 2)
    ax_prices.plot(df_prices[asset_a], label=asset_a, color='orange')
    ax_prices.plot(df_prices[asset_b] * ratio, label=f'{asset_b} x {ratio}', color='blue')
    ax_prices.legend()
    ax_prices.set_title(f"Price Alignment (Scaled): {asset_a} vs {asset_b}")

    plt.tight_layout()
    plt.show()

‚úÖ Loaded 8 tokens. Shared data points: 35040 rows.
üîç Scanning for pairs...


KeyboardInterrupt: 