In [1]:
import numpy as np
import pandas as pd
import time
import yfinance as yf
from yahooquery import Screener
from yahooquery import Ticker  

import statsmodels
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint, adfuller


import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

In [2]:
def get_companies():
    """Build a table of company profiles for the S&P 500 constituents.

    The constituent list is read from the first table of the Wikipedia
    "List of S&P 500 companies" page, and a summary profile for every symbol
    is requested through ``yahooquery.Ticker``.

    Wikipedia writes share-class tickers with a dot (e.g. ``BRK.B``) whereas
    Yahoo Finance uses a dash (``BRK-B``). Symbols are converted to the Yahoo
    form before the lookup so those companies are not returned without
    profile data, and so the ``symbol`` column can be passed straight to
    Yahoo-based APIs afterwards.

    Returns:
        pandas.DataFrame with one row per symbol: a ``symbol`` column plus the
        profile fields returned by the lookup (this notebook uses ``sector``).
    """
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    constituents = pd.read_html(url)[0]

    # Normalise "BRK.B"-style tickers to Yahoo's "BRK-B" spelling.
    sp500_symbols = (
        constituents['Symbol']
        .str.replace('.', '-', regex=False)
        .tolist()
    )

    profiles = Ticker(sp500_symbols).summary_profile

    # The lookup result is turned into a frame and transposed so that each
    # symbol becomes a row; the former index is exposed as the 'symbol' column.
    profiles_df = (
        pd.DataFrame(profiles)
        .T
        .reset_index()
        .rename(columns={'index': 'symbol'})
    )

    return profiles_df

In [None]:
# Fetch the profile table once and keep it for the cells below.
# get_companies() reads a Wikipedia page and queries Yahoo, so this needs network access.
all_companies = get_companies()

In [None]:
# Display the fetched profile table for a visual sanity check.
all_companies

In [None]:
# Distinct values found in the 'sector' column, as a plain Python list.
unique_sectors_list = list(all_companies['sector'].unique())

In [None]:
# List every sector name, skipping the "no fundamentals data" placeholder
# value that appears in the 'sector' column for some rows.
for sector in unique_sectors_list:
    if sector == 'No fundamentals data found for any of the summaryTypes=summaryProfile':
        continue
    print(sector)

In [None]:
def filter_companies(companies, sector, marketCap_thresh=2_000_000_000,
                     averageVolume_thresh=5_000_000,
                     start_date="2019-11-16", end_date="2023-11-16"):
    """Return daily closing prices for the large, liquid companies of a sector.

    Parameters:
    - companies: DataFrame with at least 'sector' and 'symbol' columns.
    - sector: Sector name; only rows whose 'sector' equals it are considered.
    - marketCap_thresh: A symbol is kept only if its reported "marketCap" is
      strictly greater than this value.
    - averageVolume_thresh: A symbol is kept only if its reported
      "averageVolume" is strictly greater than this value.
    - start_date, end_date: Passed to ``yf.Ticker(...).history`` as the
      ``start`` / ``end`` of the price window.

    Returns:
    - DataFrame with one column of 'Close' prices per retained symbol. The
      index is the union of the individual price indexes, so a date missing
      for one symbol shows up as NaN in that column instead of being dropped
      for everyone. The frame is empty when no symbol qualifies.

    Errors raised while fetching a symbol are printed and that symbol is
    skipped; they do not abort the whole scan.
    """
    sector_symbols = companies.loc[companies['sector'] == sector, 'symbol'].tolist()

    # Pass 1: keep only symbols that clear both the size and liquidity bars.
    filtered_symbols = []
    for symbol in sector_symbols:
        try:
            ticker_info = yf.Ticker(symbol).info

            market_cap = ticker_info.get("marketCap")
            average_volume = ticker_info.get("averageVolume")
            if (market_cap is not None and market_cap > marketCap_thresh
                    and average_volume is not None
                    and average_volume > averageVolume_thresh):
                filtered_symbols.append(symbol)
        except Exception as e:
            print(f"Error fetching data for {symbol}: {e}")
        # Add delay to avoid rate limiting
        time.sleep(0.5)

    # Pass 2: collect each symbol's closing prices. The series are gathered
    # in a dict and combined once at the end: assigning columns one by one to
    # an initially empty frame would pin the index to the first symbol's
    # dates and silently reindex (or wipe out) every later symbol.
    close_by_symbol = {}
    for symbol in filtered_symbols:
        try:
            historical_data = yf.Ticker(symbol).history(start=start_date, end=end_date)
            if historical_data.empty:
                # Nothing to test with; an empty column would only add noise.
                print(f"No historical data returned for {symbol}; skipping.")
            else:
                close_by_symbol[symbol] = historical_data['Close']
        except Exception as e:
            print(f"Error fetching historical data for {symbol}: {e}")
        # Add delay to avoid rate limiting
        time.sleep(0.5)

    # Building from a dict of Series aligns them on the union of their indexes.
    return pd.DataFrame(close_by_symbol)

In [None]:
def find_cointegrated_pairs(combined_df, p_value_threshold=0.02, min_data_points_threshold=100, verbose=True):
    """
    Finds pairs of stock symbols that are cointegrated.

    Every unordered pair of columns is tested once. For each pair, rows where
    either series is NaN are dropped; the pair is tested only if at least
    `min_data_points_threshold` rows remain.

    Parameters:
    - combined_df: DataFrame containing historical daily prices with symbols as columns.
    - p_value_threshold: A pair is kept when the test's p-value is strictly below
      this value (default is 0.02).
    - min_data_points_threshold: The minimum number of data points required to test cointegration.
    - verbose: When True (the default) progress and debug messages are printed
      for every pair; pass False to run silently.

    Returns:
    - pairs_sorted: List of (symbol_1, symbol_2, p_value) tuples sorted by
      ascending p-value. Empty if no pair qualifies.
    """
    symbols = list(combined_df.columns)
    pairs = []

    # Visit each unordered pair exactly once (second symbol always comes
    # after the first in column order).
    for position, symbol_1 in enumerate(symbols):
        for symbol_2 in symbols[position + 1:]:
            # Keep only the dates on which both symbols have a price.
            combined_pair = pd.concat(
                [combined_df[symbol_1], combined_df[symbol_2]], axis=1
            ).dropna()
            data_points = len(combined_pair)

            if verbose:
                print(f"Testing pair {symbol_1} and {symbol_2}, Number of data points: {data_points}")

            # Too little overlapping history: skip without running the test.
            if data_points < min_data_points_threshold:
                continue

            # Only the p-value of the test result is used here.
            _, p_value, _ = coint(combined_pair.iloc[:, 0], combined_pair.iloc[:, 1])

            if p_value < p_value_threshold:
                pairs.append((symbol_1, symbol_2, p_value))
                if verbose:
                    print(f"{symbol_1} and {symbol_2} are likely cointegrated (p-value: {p_value:.4f})")
                    print(f"Current pairs in function after appending: {pairs}")

    if verbose:
        print(f"Pairs before sorting: {pairs}")

    # Sort pairs by p-value (most significant first).
    pairs_sorted = sorted(pairs, key=lambda entry: entry[2])

    if verbose:
        print(f"Final sorted pairs in function: {pairs_sorted}")

    return pairs_sorted


In [None]:
# Collect cointegrated pairs sector by sector into one flat list.
all_pairs_data = []

for sector in unique_sectors_list:
    # Ignore the "no fundamentals data" placeholder that can appear as a sector value.
    if sector == 'No fundamentals data found for any of the summaryTypes=summaryProfile':
        continue

    filtered_symbols_df = filter_companies(all_companies, sector)
    print(f"Processing sector: {sector}, Number of companies: {len(filtered_symbols_df.columns)}")

    if len(filtered_symbols_df.columns) >= 2:
        # At least one pair can be formed: run the pairwise cointegration scan.
        pairs_data = find_cointegrated_pairs(
            filtered_symbols_df,
            p_value_threshold=0.02,
            min_data_points_threshold=100,
        )

        # Debug: what this sector contributed
        print(f"Cointegrated pairs for sector {sector}: {pairs_data}")

        # Append this sector's pairs to the running list
        all_pairs_data += pairs_data

        # Debug: running total after this sector
        print(f"Accumulated pairs so far: {all_pairs_data}")
    else:
        print(f"Not enough companies in sector {sector} for cointegration testing.")

# Final accumulated result, one pair per line
print("\nAll Cointegrated Pairs Across Sectors:")
for pair in all_pairs_data:
    print(pair)


In [None]:
# Rank all accumulated pairs by their third element (the p-value), ascending,
# leaving all_pairs_data itself untouched.
sorted_pairs = list(all_pairs_data)
sorted_pairs.sort(key=lambda entry: entry[2])
sorted_pairs