In [1]:
# Functions to download stock prices and rank co-integration pairs

import yfinance as yf
import pandas as pd
import numpy as np
from statsmodels.tsa.api import VAR
from itertools import combinations
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def get_stock_yf(tickers, interval='1d', start='2020-01-01'):
    """Download auto-adjusted closing prices through ``yf.download``.

    Parameters
    ----------
    tickers : str or sequence of str
        Symbol(s) handed to ``yf.download`` unchanged.
    interval : str, default '1d'
        Forwarded to ``yf.download`` as ``interval``.
    start : str, default '2020-01-01'
        Forwarded to ``yf.download`` as ``start``.

    Returns
    -------
    pandas.DataFrame
        The 'Close' prices, one column per symbol that returned at least one
        price, restricted to the rows where every remaining column has a
        value. A symbol with a short history therefore shortens the window
        for all columns.
    """
    data = yf.download(tickers, interval=interval, start=start, auto_adjust=True, progress=False)

    # If data.columns is MultiIndex (multiple tickers)
    if isinstance(data.columns, pd.MultiIndex):
        close_prices = data['Close']
    else:
        # Single ticker fallback: label the lone column with the ticker.
        # A bare string must be wrapped, otherwise slicing it yields its first
        # character instead of a one-element list of names.
        single_name = [tickers] if isinstance(tickers, str) else list(tickers)[:1]
        close_prices = pd.DataFrame(data['Close'])
        close_prices.columns = single_name

    # A column that is NaN on every row carries no prices at all. Left in
    # place it would make the row-wise dropna() below discard every row, so
    # one bad symbol would empty the whole result.
    close_prices = close_prices.dropna(axis=1, how='all')

    return close_prices.dropna()

# Step 1: Function to compute K-matrix and eigen decomposition
def compute_k_matrix(data):
    """Fit a one-lag VAR to ``data`` and eigen-decompose ``K = I - B``.

    ``B`` is the fitted parameter table with its first row removed
    (presumably the intercept row under statsmodels' default trend - confirm).

    Parameters
    ----------
    data : pandas.DataFrame
        Series to model, one column per variable; must not contain NaNs.

    Returns
    -------
    tuple of numpy.ndarray
        ``(eigenvalues, eigenvectors)`` of ``K``, ordered from largest to
        smallest eigenvalue magnitude; eigenvector columns follow that order.

    Raises
    ------
    ValueError
        If ``data`` holds NaNs, the coefficient block is empty, or ``K``
        contains NaN/inf entries.
    """
    has_missing = data.isnull().values.any()
    if has_missing:
        raise ValueError("Data contains NaNs, which can break eigenvalue computation.")

    fitted = VAR(data).fit(1)
    # Everything below the first row of the parameter table.
    lag_coefs = fitted.params.values[1:, :]

    n_rows, n_cols = lag_coefs.shape
    if n_rows == 0 or n_cols == 0:
        raise ValueError("Invalid VAR coefficient matrix shape.")

    # NOTE(review): K uses the parameter-table layout as-is (not transposed).
    # Eigenvalues are unaffected by a transpose, eigenvectors are not - confirm
    # the orientation before relying on the returned vectors.
    K = np.eye(n_rows) - lag_coefs

    if not np.isfinite(K).all():
        raise ValueError("K matrix contains NaN or inf values.")

    eig_vals, eig_vecs = np.linalg.eig(K)
    # Ascending argsort on the magnitudes, reversed to get descending order.
    order = np.argsort(np.abs(eig_vals))[::-1]
    return eig_vals[order], eig_vecs[:, order]

# Step 2: Score a single pair
def score_pair_by_eigenvalue(data, ticker1, ticker2, freq='D', min_obs=100):
    """Score one pair by the smallest positive eigenvalue real part of its K-matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Price table containing a column for each of the two tickers; its
        index must be convertible with ``pd.to_datetime``.
    ticker1, ticker2 : str
        Column names of the pair to score.
    freq : str, default 'D'
        Frequency passed to ``DataFrame.asfreq``. Dates missing at that
        frequency are inserted and then forward/backward filled, so the
        default calendar-day grid repeats the previous price on any date
        absent from the index. Pass another pandas frequency alias to use a
        different grid.
    min_obs : int, default 100
        Minimum number of rows required after resampling; shorter pairs are
        scored NaN.

    Returns
    -------
    tuple
        ``(ticker1, ticker2, score)`` where ``score`` is a real ``float``:
        the smallest strictly positive real part among the eigenvalues
        returned by ``compute_k_matrix``, or NaN when there is none, when
        the pair has too few rows, or when any step raises.
    """
    try:
        subset = data[[ticker1, ticker2]].copy()
        subset.index = pd.to_datetime(subset.index)
        subset = subset.asfreq(freq)
        subset = subset.ffill().bfill().dropna()

        if len(subset) < min_obs:
            return (ticker1, ticker2, np.nan)

        eigenvalues, _ = compute_k_matrix(subset)
        # np.linalg.eig yields complex values when eigenvalues come in
        # conjugate pairs. Keeping only the real part guarantees a real,
        # sortable score instead of a complex number (or an ordering error).
        positive_real_parts = [float(val.real) for val in eigenvalues if val.real > 0]
        score = min(positive_real_parts) if positive_real_parts else np.nan
        return (ticker1, ticker2, score)

    except Exception as e:
        # Deliberate best effort: one failing pair must not abort a whole ranking run.
        print(f"Error with pair {ticker1}-{ticker2}: {e}")
        return (ticker1, ticker2, np.nan)

# Step 3: Rank all pairs in parallel
def rank_pairs_by_mean_reversion(data, tickers, max_workers=8):
    """Score every two-ticker combination and rank the pairs by score.

    Each pair is scored with ``score_pair_by_eigenvalue`` on a thread pool;
    pairs whose score is NaN are left out of the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Price table passed unchanged to ``score_pair_by_eigenvalue``.
    tickers : iterable of str
        Symbols to combine into pairs.
    max_workers : int, default 8
        Size of the thread pool.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Pair'`` (``"T1-T2"``) and ``'Eigenvalue Score'``, sorted by
        ascending score with a fresh 0..n-1 index. When no pair receives a
        finite score the frame is empty but still has both columns.
    """
    result_columns = ['Pair', 'Eigenvalue Score']
    results = []
    pairs = list(combinations(tickers, 2))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(score_pair_by_eigenvalue, data, t1, t2) for t1, t2 in pairs]

        for future in tqdm(as_completed(futures), total=len(futures), desc="Ranking pairs"):
            t1, t2, score = future.result()
            if not np.isnan(score):
                results.append({'Pair': f'{t1}-{t2}', 'Eigenvalue Score': score})

    # Explicit columns keep the sort key present when `results` is empty;
    # without them an empty frame has no columns and sort_values raises KeyError.
    df = pd.DataFrame(results, columns=result_columns).dropna().sort_values(by='Eigenvalue Score')
    return df.reset_index(drop=True)



In [None]:

# Rank the pairs of the selected ticker list by eigenvalue score and save the ranking

# Locations on disk.
# NOTE(review): `dailydata` spells the folder 'stock_Daily' while `path` uses
# 'Stock_Daily' - confirm which is intended. Neither `dailydata` nor
# `consolidated` is used in the code visible here.
path = '/Users/jayren/Desktop/stock/Stock_Daily'
dailydata = '/Users/jayren/Desktop/stock/stock_Daily/dailydata'
consolidated = '/stockPrice_consolidated'

# Ticker lists: one worksheet each, all read from the same workbook
list_workbook = path + '/list.xlsx'
us_sp500_list, ca_sptsx_list, adhoc_list = (
    pd.read_excel(list_workbook, sheet_name=sheet)
    for sheet in ('SP500_LIST', 'SPTSX_LIST', 'Adhoc')
)

# The 'Symbol' column of each worksheet as a plain Python list
us_t, ca_t, adhoc_t = (
    sheet_frame['Symbol'].tolist()
    for sheet_frame in (us_sp500_list, ca_sptsx_list, adhoc_list)
)

# Ticker list used for this run; assign ca_t or adhoc_t (or a slice) to change it
tickers = us_t
data = get_stock_yf(tickers)
ranked_pairs = rank_pairs_by_mean_reversion(data, tickers)
print(ranked_pairs)

# Persist the ranking without the positional index column
ranked_pairs.to_csv('/Users/jayren/Desktop/US_DAILY_ranked_pairs.csv', index=False)