In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from statsmodels.tsa.api import VAR
from itertools import combinations
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm


def compute_k_matrix(data):
    """Fit a one-lag VAR on ``data`` and eigen-decompose ``K = I - B``.

    ``B`` is the fitted parameter table with its first row removed
    (presumably the intercept row of the statsmodels VAR fit -- TODO confirm).

    NOTE(review): ``B`` is used exactly as laid out in ``results.params``;
    whether it needs transposing depends on the row/column-vector convention
    the caller expects. Eigenvalues are unaffected either way; eigenvectors
    are not.

    Args:
        data: Table of series accepted by ``VAR``; must not contain NaNs.

    Returns:
        Tuple ``(eigenvalues, eigenvectors)`` of ``K``, ordered by decreasing
        absolute eigenvalue. Eigenvectors are the columns of the second item.

    Raises:
        ValueError: If ``data`` has NaNs, if the coefficient block is empty,
            or if ``K`` contains NaN or infinite entries.
    """
    has_missing = data.isnull().values.any()
    if has_missing:
        raise ValueError("Data contains NaNs, which can break eigenvalue computation.")

    from statsmodels.tsa.api import VAR
    fitted = VAR(data).fit(1)

    # Drop the first parameter row and keep the remaining coefficient block.
    coef_block = fitted.params.values[1:, :]
    n_rows, n_cols = coef_block.shape
    if n_rows == 0 or n_cols == 0:
        raise ValueError("Invalid VAR coefficient matrix shape.")

    k_matrix = np.eye(n_rows) - coef_block

    # A single finiteness test covers both the NaN and the +/-inf cases.
    if not np.isfinite(k_matrix).all():
        raise ValueError("K matrix contains NaN or inf values.")

    eig_vals, eig_vecs = np.linalg.eig(k_matrix)
    # Largest magnitude first; reorder eigenvector columns to match.
    order = np.argsort(np.abs(eig_vals))[::-1]
    return eig_vals[order], eig_vecs[:, order]

def score_pair_by_eigenvalue_minute(data, ticker1, ticker2, min_obs=500):
    """Score one ticker pair by the smallest positive eigenvalue of its K matrix.

    The two columns are re-indexed to a regular one-minute grid, gaps are
    forward- then backward-filled, and the result is passed to
    ``compute_k_matrix``.

    Eigenvalues may come back complex; only their real parts are used, so the
    score is always a real float (or NaN).

    NOTE(review): forward-filling onto a full minute grid also fills any
    periods with no observations (e.g. overnight), which adds flat stretches
    to the fitted series -- confirm this is intended.

    Args:
        data: DataFrame with one column per ticker and an index that
            ``pd.to_datetime`` can parse.
        ticker1: Name of the first column.
        ticker2: Name of the second column.
        min_obs: Minimum number of rows required after cleaning; shorter
            series are not scored. Defaults to 500.

    Returns:
        Tuple ``(ticker1, ticker2, score)``. ``score`` is NaN when the series
        is too short, when no eigenvalue has a positive real part, or when any
        step raises.
    """
    try:
        # Subset and clean
        subset = data[[ticker1, ticker2]]
        subset.index = pd.to_datetime(subset.index)
        subset = subset.asfreq('min')   # Minute frequency
        subset = subset.ffill().bfill().dropna()

        if len(subset) < min_obs:
            return (ticker1, ticker2, np.nan)

        eigenvalues, _ = compute_k_matrix(subset)

        # Work on real parts so a complex-conjugate eigenvalue pair cannot
        # produce a complex score in the ranking table.
        real_parts = np.real(np.asarray(eigenvalues))
        positive = real_parts[real_parts > 0]
        score = float(positive.min()) if positive.size else np.nan

        return (ticker1, ticker2, score)

    except Exception as e:
        # Deliberate best-effort: one failing pair must not abort the ranking.
        print(f"Error with pair {ticker1}-{ticker2}: {e}")
        return (ticker1, ticker2, np.nan)

def rank_pairs_by_mean_reversion_minute_parallel(data, tickers, max_workers=4):
    """Score every 2-combination of ``tickers`` in a thread pool and rank them.

    Each pair is scored with ``score_pair_by_eigenvalue_minute``; pairs whose
    score is NaN are left out of the result.

    Args:
        data: Price table passed unchanged to the pair scorer.
        tickers: Iterable of column names; all unordered pairs are evaluated.
        max_workers: Thread-pool size handed to ``ThreadPoolExecutor``.

    Returns:
        DataFrame with columns ``'Pair'`` (``"T1-T2"``) and
        ``'Eigenvalue Score'``, sorted by ascending score with a fresh integer
        index. The frame is empty, but still has both columns, when no pair
        yields a finite score.
    """
    pairs = list(combinations(tickers, 2))
    results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(score_pair_by_eigenvalue_minute, data, t1, t2)
            for t1, t2 in pairs
        ]

        for future in tqdm(as_completed(futures), total=len(futures), desc="Ranking pairs"):
            t1, t2, score = future.result()
            if not np.isnan(score):
                results.append({'Pair': f'{t1}-{t2}', 'Eigenvalue Score': score})

    # Explicit columns keep the sort key present even when `results` is empty;
    # without them sort_values raises KeyError on an empty frame.
    df = pd.DataFrame(results, columns=['Pair', 'Eigenvalue Score'])
    df = df.dropna().sort_values(by='Eigenvalue Score')
    return df.reset_index(drop=True)

In [None]:
# Local data locations. NOTE(review): `path` is not used in this cell --
# confirm whether another cell still needs it.
path = '/Users/jayren/Desktop/stock/Stock_Daily'
dailydata = '/Users/jayren/Desktop/stock/stock_Daily/dailydata'
consolidated = '/stockPrice_consolidated'

# Consolidated price files, by market:
#   us:    us_stock_price.csv
#   ca:    ca_stock_price.csv
#   adhoc: adhoc_stock_price.csv

if __name__ == "__main__":

    # `consolidated` already begins with a separator, so plain concatenation
    # gives the directory path.
    price_data, tickers = get_csv_files(
        path=dailydata + consolidated,
        filename='ca_stock_price.csv',
    )

    # Rank all ticker pairs using one worker thread per reported CPU.
    ranked_pairs = rank_pairs_by_mean_reversion_minute_parallel(
        price_data,
        tickers,
        max_workers=os.cpu_count(),
    )
    print(ranked_pairs)

In [None]:
# Write the ranked pairs table to CSV without the DataFrame index column.
# Relies on `ranked_pairs` having been created by the preceding cell.
ranked_pairs.to_csv('/Users/jayren/Desktop/ca_ranked_pairs.csv', index=False)