In [None]:
# Install openpyxl into the kernel's environment.
# NOTE(review): none of the cells shown here import openpyxl directly — presumably
# it backs a pandas Excel read/write elsewhere; confirm it is still needed.
%pip install openpyxl

In [None]:
import numpy as np
import random
import yfinance as yf
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import time  # for rate-limit delays

# Universe of Yahoo-ready tickers that main() samples from.
# These are example tickers only — replace them with your full list.
# Note: Meta Platforms trades as "META" (the ticker changed from "FB" in June 2022),
# so the old "FB" symbol no longer fetches the intended company.
symbols = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "TSLA", "META", "BRK-B", "V", "JNJ", "WMT",
    # Add your tickers here...
]

# Date range for historical data ("YYYY-MM-DD" strings, passed to yf.download as start/end)
start_date = "2025-07-01"
end_date = "2025-10-09"

def fetch_stock_data_batched(symbols, start_date, end_date, batch_size=100, delay=2):
    """Download closing prices for ``symbols`` from Yahoo Finance in batches.

    Each batch is fetched with ``yf.download(..., auto_adjust=True)`` and only the
    "Close" data is kept. A batch that raises is reported and skipped so the
    remaining batches are still attempted (best effort).

    Args:
        symbols: Sequence of ticker strings.
        start_date: Forwarded to ``yf.download(start=...)``.
        end_date: Forwarded to ``yf.download(end=...)``.
        batch_size: Maximum number of tickers per ``yf.download`` call (>= 1).
        delay: Seconds to sleep after each successful batch (rate limiting).

    Returns:
        DataFrame with one column per ticker; duplicated columns and columns
        that are entirely NaN are removed. An empty DataFrame is returned when
        no batch produced data.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    total_batches = -(-len(symbols) // batch_size)  # ceiling division
    all_data = []
    for batch_no, offset in enumerate(range(0, len(symbols), batch_size), start=1):
        batch = symbols[offset:offset + batch_size]
        print(f"Downloading batch {batch_no} / {total_batches}")
        try:
            data = yf.download(batch, start=start_date, end=end_date, auto_adjust=True)["Close"]
            if isinstance(data, pd.Series):
                # Depending on the installed yfinance version, a single-ticker
                # request can yield a Series here. Promote it to a one-column
                # frame named after the ticker so it is not lost to the
                # AttributeError that `.columns` would raise.
                data = data.to_frame(name=batch[0])
            elif isinstance(data.columns, pd.MultiIndex):
                data.columns = data.columns.get_level_values(0)
            all_data.append(data)
            time.sleep(delay)
        except Exception as e:
            # Deliberate best effort: report the failed batch and carry on.
            print(f"Error downloading batch {batch_no}: {e}")

    if not all_data:
        return pd.DataFrame()

    combined = pd.concat(all_data, axis=1)
    combined = combined.loc[:, ~combined.columns.duplicated()]
    # Tickers Yahoo returned no prices for show up as all-NaN columns.
    combined = combined.dropna(axis=1, how='all')
    return combined

def calculate_daily_returns(stock_data):
    """Turn a table of prices into row-over-row fractional returns.

    Missing prices are not filled before differencing (``fill_method=None``).
    Every row that still contains a NaN afterwards is discarded — this includes
    the first row, which has no predecessor.
    """
    pct_changes = stock_data.pct_change(fill_method=None)
    complete_rows = pct_changes.dropna()
    return complete_rows

def find_high_corr_pairs(correlation_matrix, threshold=0.85):
    """List every unordered ticker pair whose correlation is at least ``threshold``.

    Only the strict upper triangle of the matrix is examined, so each pair is
    reported once and the diagonal (self-correlation) is ignored. Pairs whose
    correlation is NaN are excluded because NaN never satisfies ``>=``.

    Args:
        correlation_matrix: Square DataFrame of pairwise correlations.
        threshold: Minimum correlation for a pair to be reported.

    Returns:
        DataFrame with columns ``Ticker1``, ``Ticker2`` and ``Correlation``,
        in row-major order of the upper triangle.
    """
    values = correlation_matrix.to_numpy()
    n_rows, n_cols = values.shape
    # Positional indices of the strict upper triangle. Working positionally
    # avoids stack().reset_index(), which raises "cannot insert ..., already
    # exists" when the index and the columns share a name (as they do for the
    # .corr() of a frame whose column axis is named, e.g. "Ticker").
    row_pos, col_pos = np.triu_indices(n_rows, k=1, m=n_cols)

    pairs = pd.DataFrame({
        'Ticker1': correlation_matrix.index[row_pos],
        'Ticker2': correlation_matrix.columns[col_pos],
        'Correlation': values[row_pos, col_pos],
    })
    return pairs[pairs['Correlation'] >= threshold]

def plot_top_corr_cluster(correlation_matrix, high_corr_pairs, max_tickers=50):
    """Draw a clustered heatmap of the tickers appearing in ``high_corr_pairs``.

    Args:
        correlation_matrix: Square DataFrame of pairwise correlations, labelled
            by ticker on both axes.
        high_corr_pairs: DataFrame with ``Ticker1`` and ``Ticker2`` columns.
        max_tickers: Upper bound on how many distinct tickers are plotted; the
            first ones encountered (in pair order) are kept.

    Returns:
        None. The figure is shown with ``plt.show()``; nothing is drawn when
        fewer than two tickers are available.
    """
    top_tickers = pd.unique(high_corr_pairs[['Ticker1', 'Ticker2']].values.ravel())[:max_tickers]
    if len(top_tickers) < 2:
        # A clustered heatmap needs at least two items to cluster.
        print("Not enough tickers to plot a cluster heatmap.")
        return

    sampled_corr = correlation_matrix.loc[top_tickers, top_tickers]

    grid = sns.clustermap(sampled_corr, cmap="coolwarm", linewidths=0.3, figsize=(12, 12))
    # clustermap builds a multi-axes figure, and plt.title() would label
    # whichever axes happens to be current rather than the figure as a whole.
    # Title the figure that owns the heatmap axes instead.
    grid.ax_heatmap.figure.suptitle("Top Correlated Cluster Heatmap", y=1.02)
    plt.show()

def main(desired_count=50, batch_size=25, threshold=0.85, seed=None):
    """Sample tickers, download their prices and report highly correlated pairs.

    Tickers are drawn in random order from the module-level ``symbols`` list and
    downloaded in small batches until ``desired_count`` of them have returned
    data or the list is exhausted. The prices fetched during that pass are
    reused for the analysis, so each ticker is requested only once.

    Args:
        desired_count: Number of tickers with usable data to collect.
        batch_size: Maximum number of tickers to try per download (>= 1).
        threshold: Minimum correlation for a pair to be reported and plotted.
        seed: Optional seed for the shuffle. ``None`` keeps the previous
            behaviour of shuffling with the global ``random`` state.

    Returns:
        None. Results are printed and, when pairs are found, plotted.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # dict.fromkeys drops repeated tickers (keeping first-seen order) so a
    # duplicate in `symbols` cannot be counted twice toward desired_count.
    available_symbols = list(dict.fromkeys(symbols))
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(available_symbols)

    valid_tickers = []
    price_frames = []  # per-batch price tables for the tickers that succeeded

    while len(valid_tickers) < desired_count and available_symbols:
        # Never request more tickers than are still needed.
        next_batch_size = min(batch_size, desired_count - len(valid_tickers))
        batch = available_symbols[:next_batch_size]
        available_symbols = available_symbols[next_batch_size:]

        print(f"Trying batch of {len(batch)} tickers...")
        batch_data = fetch_stock_data_batched(batch, start_date, end_date)

        # Identify tickers that downloaded successfully
        successful = [sym for sym in batch if sym in batch_data.columns]
        print(f"Batch results: {len(successful)} successful tickers.")

        if successful:
            price_frames.append(batch_data[successful])
            valid_tickers.extend(successful)

        # Sleep between batches to avoid rate limits (not after the last one)
        if available_symbols and len(valid_tickers) < desired_count:
            time.sleep(1)

    if not valid_tickers:
        print("No valid tickers found. Exiting.")
        return

    print(f"Final list contains {len(valid_tickers)} tickers.")

    # Reuse the prices already downloaded above instead of fetching them again.
    stock_data = pd.concat(price_frames, axis=1)

    if stock_data.empty:
        print("No data downloaded for valid tickers. Exiting.")
        return

    # Calculate daily returns
    daily_returns = calculate_daily_returns(stock_data)

    # Compute correlation matrix
    corr_matrix = daily_returns.corr()

    # Find highly correlated pairs
    high_corr_pairs = find_high_corr_pairs(corr_matrix, threshold=threshold)
    print(f"Found {len(high_corr_pairs)} highly correlated pairs (≥{threshold}):")
    print(high_corr_pairs.head(20))

    # Plot heatmap if there are correlated pairs
    if len(high_corr_pairs) > 0:
        print("Plotting heatmap of top correlated cluster...")
        plot_top_corr_cluster(corr_matrix, high_corr_pairs, max_tickers=50)
    else:
        print("No highly correlated pairs to plot.")

# Entry point: run the analysis when this code is executed directly.
# The guard is false when the code is imported as a module, so main() is not
# triggered by an import.
if __name__ == "__main__":
    main()


In [None]:
# Uninstall problematic patsy and reinstall cleanly.
# %pip (rather than !pip) runs pip for the interpreter this kernel is using, so
# the packages land in the environment the notebook actually imports from.
# Restart the kernel afterwards if seaborn was already imported in this session.
%pip uninstall -y patsy seaborn
%pip install --no-cache-dir patsy seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


In [None]:
# Mount Google Drive at /content/drive. This relies on the google.colab package,
# so the cell is presumably only runnable on Google Colab.
# NOTE(review): no cell shown here reads from or writes to the mounted path —
# confirm the mount is still required.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Smoke check: this line is only reached if none of the imports above raised.
print("Imports successful! Ready to go.")

In [None]:
# Install the plotting/analysis stack without using pip's cache.
# %pip (rather than !pip) runs pip for the interpreter this kernel is using.
# Restart the kernel afterwards if any of these packages were already imported.
%pip install --no-cache-dir patsy seaborn matplotlib numpy pandas