In [None]:
import os
import time
import warnings
from multiprocessing import Pool

import numpy as np
import pandas as pd
import yfinance as yf

warnings.filterwarnings("ignore")


class Analyzer:
    """Download closing prices for a set of tickers and rank return correlations.

    The pipeline is: fetch prices in batches (one batch per worker process),
    drop tickers with too little or too flat data, compute the correlation
    matrix of daily percentage returns, then print the most and least
    correlated tickers for a symbol typed by the user.

    Attributes:
        symbols: Ticker symbols to download.
        start_date: Start of the download window, passed to ``yf.download``.
        end_date: End of the download window, passed to ``yf.download``.
        batch_size: Number of tickers requested per ``yf.download`` call.
        delay: Base back-off in seconds between download retries.
        workers: Number of worker processes used for fetching (always >= 1).
        stock_data: Close-price frame set by ``run`` (``None`` before that).
        correlation_matrix: Return correlations set by ``run`` (``None``
            before that).
    """

    def __init__(self, symbols, start_date, end_date, batch_size=25, delay=1.0, n_jobs=-1):
        """Store the configuration; ``n_jobs=-1`` means one worker per CPU."""
        self.symbols = symbols
        self.start_date = start_date
        self.end_date = end_date
        self.batch_size = batch_size
        self.delay = delay
        # os.cpu_count() may return None when the count cannot be determined,
        # so fall back to a single worker in that case.
        self.workers = (os.cpu_count() or 1) if n_jobs == -1 else max(1, n_jobs)
        self.stock_data = None
        self.correlation_matrix = None

    # Fetch a single batch
    def _fetch_batch(self, batch):
        """Download one batch of tickers, retrying up to three times.

        Args:
            batch: A list of ticker symbols (or a single symbol string).

        Returns:
            A DataFrame of close prices with one column per ticker that has
            at least one non-missing price, or an empty DataFrame when every
            attempt failed or returned nothing.
        """
        attempts = 3
        last_error = None
        for retry in range(attempts):
            try:
                data = yf.download(
                    tickers=batch,
                    start=self.start_date,
                    end=self.end_date,
                    auto_adjust=True,
                    progress=False,
                    group_by="ticker",
                )

                if not data.empty:
                    # With group_by="ticker" the columns are (ticker, field);
                    # a flat index is handled as a single-ticker result.
                    if isinstance(data.columns, pd.MultiIndex):
                        close = data.xs("Close", axis=1, level=1)
                    else:
                        close = pd.DataFrame(data["Close"])
                        requested = [batch] if isinstance(batch, str) else list(batch)
                        if len(requested) == 1:
                            # Label the column with the ticker instead of
                            # "Close" so it can be looked up by symbol later.
                            close.columns = [str(requested[0]).upper()]

                    close = close.dropna(axis=1, how="all")
                    if not close.empty:
                        return close

            except Exception as exc:  # best-effort: remember the error and retry
                last_error = exc

            # Back off before the next attempt, for failures and empty
            # results alike; no point sleeping after the final attempt.
            if retry < attempts - 1:
                time.sleep(self.delay * (retry + 1))

        if last_error is not None:
            print(f"  Batch failed after {attempts} attempts ({last_error!r}): {batch}")
        return pd.DataFrame()

    # Parallel data fetching
    def fetch_stock_data_parallel(self):
        """Fetch all symbols in batches using a process pool.

        Returns:
            A DataFrame of close prices with one column per ticker, with
            duplicate and all-missing columns removed. Empty when there are
            no symbols or nothing could be downloaded.
        """
        print(f"{self.workers} CPU cores for parallelization")
        step = max(1, self.batch_size)  # a step of 0 would make range() raise
        batches = [self.symbols[i:i + step] for i in range(0, len(self.symbols), step)]

        batch_results = []
        if batches:
            # Never start more processes than there are batches to fetch.
            with Pool(min(self.workers, len(batches))) as pool:
                batch_results = pool.map(self._fetch_batch, batches)

        # Combine all valid results into one clean DataFrame; pd.concat raises
        # on an empty list, so failed/empty batches are filtered out first.
        frames = [frame for frame in batch_results if not frame.empty]
        combined = pd.concat(frames, axis=1) if frames else pd.DataFrame()
        combined = combined.loc[:, ~combined.columns.duplicated()]
        combined = combined.dropna(axis=1, how="all")

        print(f"✓ Live data combined — {len(combined.columns)} tickers loaded cleanly")
        return combined

    # Quality filter
    def filter_quality_data(self, stock_data, min_days=30, max_missing_pct=0.2, min_volatility=0.001):
        """Keep only tickers with enough, sufficiently complete, non-flat data.

        Args:
            stock_data: Close prices, one column per ticker.
            min_days: Minimum number of non-missing prices required.
            max_missing_pct: Maximum allowed fraction of missing rows.
            min_volatility: Minimum standard deviation of daily returns.

        Returns:
            ``stock_data`` restricted to the columns that passed every check.
        """
        print(f"Filtering {len(stock_data.columns)} tickers for data quality live ")
        valid_tickers = []
        total = len(stock_data.columns)
        n_rows = len(stock_data)
        for i, col in enumerate(stock_data.columns):
            if i % 500 == 0:
                print(f"  Checked {i}/{total}")
            series = stock_data[col].dropna()
            # n_rows == 0 is checked explicitly so the ratio below can never
            # divide by zero, whatever min_days is.
            if n_rows == 0 or len(series) < min_days:
                continue
            missing_ratio = 1 - len(series) / n_rows
            if missing_ratio > max_missing_pct:
                continue
            # The series has no gaps left, so no fill is needed.
            volatility = series.pct_change(fill_method=None).std()
            # A NaN volatility (too few returns) must fail the check rather
            # than slip through a comparison that is always False.
            if pd.isna(volatility) or volatility < min_volatility:
                continue
            valid_tickers.append(col)
        print(f"✓ {len(valid_tickers)}/{total} tickers passed")
        return stock_data[valid_tickers]

    # Correlation matrix
    def calculate_correlation_matrix(self, stock_data):
        """Compute the correlation matrix of daily percentage returns.

        Gaps in a price column are forward-filled before returns are taken,
        and rows where any ticker still has no return are dropped. The fill
        is done explicitly so the result does not depend on the pandas
        version's ``pct_change`` default.

        Args:
            stock_data: Close prices, one column per ticker.

        Returns:
            A ``(corr, returns)`` tuple: the correlation matrix with
            undefined entries replaced by 0, and the returns frame it was
            computed from.
        """
        print("Calculating correlation matrix live")
        returns = stock_data.ffill().pct_change(fill_method=None).dropna()
        corr = returns.corr().fillna(0)
        return corr, returns

    # Find correlations for a single ticker
    def find_correlations_for_ticker(self, ticker, top_n=50):
        """Print the ``top_n`` most and least correlated tickers for ``ticker``.

        Prints a "not found" message and returns when the ticker is not in
        the correlation matrix or the matrix has not been computed yet.
        """
        if self.correlation_matrix is None or ticker not in self.correlation_matrix.columns:
            print(f"Ticker '{ticker}' not found.")
            return
        corr_series = self.correlation_matrix[ticker].drop(ticker)
        print(f"\nTop {top_n} correlated with {ticker}:")
        print(corr_series.nlargest(top_n))
        print(f"\nTop {top_n} inversely correlated with {ticker}:")
        print(corr_series.nsmallest(top_n))

    # Run everything
    def run(self):
        """Run fetch, filter and correlation, then query one ticker interactively."""
        print(f"Starting analysis on {len(self.symbols)} tickers\n{'=' * 60}")
        self.stock_data = self.fetch_stock_data_parallel()
        if self.stock_data.empty:
            print("No data fetched. Exiting.")
            return

        self.stock_data = self.filter_quality_data(self.stock_data)
        if self.stock_data.shape[1] == 0:
            # max()/min() on an empty correlation matrix would raise.
            print("No tickers passed the quality filter. Exiting.")
            return

        self.correlation_matrix, _ = self.calculate_correlation_matrix(self.stock_data)

        print("\n Correlation matrix ready.")
        # The maximum includes the diagonal, so it is 1.000 whenever at least
        # one ticker has a defined self-correlation.
        print(f"Max correlation: {self.correlation_matrix.values.max():.3f}")
        print(f"Min correlation: {self.correlation_matrix.values.min():.3f}")

        ticker = input("Enter a ticker baby: ").strip().upper()
        self.find_correlations_for_ticker(ticker)



if __name__ == "__main__":
    # Ticker universe as Yahoo Finance symbols. Each symbol appears once and
    # in upper case: the original list repeated '000766.SZ' and spelled
    # '1787.HK' in lower case.
    symbols = [
        'AGI', 'RGLD', 'AU', 'KGC', 'EQX', 'IAG', 'HMY', 'BTG',
        'LUG.TO', '1787.HK', 'EGO', 'OGC.TO', 'NGD', '600489.SS',
        'CGG.TO', 'SAND', 'NG', 'GOR.AX', 'WDO.TO', 'CG.TO', 'WGX.AX',
        'SA', '002202.SZ', '600988.SS', 'MDKA.JK', '8111.T', 'PNR.T',
        '3330.HK', 'E5H.SI', 'AYA.TO', '000766.SZ', 'BGL.AX',
        'PGOLD.PS', 'GDEN', '0340.HK', '8871.T', 'SENCO.NS',
        'GOLDIAM.NS',
    ]

    # Download window passed to the analyzer as start/end dates.
    start_date = "2025-01-01"
    end_date = "2025-10-09"

    analyzer = Analyzer(
        symbols,
        start_date,
        end_date,
        batch_size=20,
        delay=0.5,
        n_jobs=-1,  # use ALL CPU cores
    )

    analyzer.run()



Starting analysis on 39 tickers
2 CPU cores for parallelization


ERROR:yfinance:
2 Failed downloads:
ERROR:yfinance:['PNR.T', 'PGOLD.PS']: YFTzMissingError('possibly delisted; no timezone found')


✓ Live data combined — 36 tickers loaded cleanly
Filtering 36 tickers for data quality live 
  Checked 0/36
✓ 36/36 tickers passed
Calculating correlation matrix live

 Correlation matrix ready.
Max correlation: 1.000
Min correlation: -0.183
Enter a ticker baby: HMY

Top 50 correlated with HMY:
Ticker
AU            0.792563
KGC           0.712003
AGI           0.701019
IAG           0.649250
EGO           0.630759
RGLD          0.626152
NGD           0.607476
CG.TO         0.595193
WDO.TO        0.580305
SAND          0.579967
SA            0.560239
EQX           0.537357
BTG           0.530192
LUG.TO        0.514013
OGC.TO        0.431182
AYA.TO        0.422629
3330.HK       0.349588
CGG.TO        0.341408
600988.SS     0.341061
1787.HK       0.299880
600489.SS     0.292825
NG            0.253435
0340.HK       0.214051
8871.T        0.101051
GOLDIAM.NS    0.094834
SENCO.NS      0.080879
MDKA.JK       0.057555
BGL.AX        0.038657
GOR.AX        0.009654
WGX.AX        0.008740
8111.T 

In [None]:
# Scratch cell: show how many CPUs the OS reports; Analyzer uses this value
# as its worker count when n_jobs == -1.
import os
os.cpu_count()


2