In [3]:
import yfinance as yf
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from tqdm import tqdm

def load_tickers(path='/content/tickers.csv'):
    """Load ticker symbols from the first column of a CSV file.

    Args:
        path: Location of the CSV file. The default is the path this
            notebook originally read from.

    Returns:
        A list of ticker strings with surrounding whitespace removed and
        missing/blank cells skipped. If the file cannot be read or parsed,
        the error is printed and an empty list is returned.
    """
    try:
        tickers_df = pd.read_csv(path)
        symbols = tickers_df.iloc[:, 0].dropna().astype(str).str.strip()
        # Whitespace-only cells are not NaN, so dropna() keeps them; filter
        # them out after stripping.
        return symbols[symbols != ''].tolist()
    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty
        # list so the caller can decide what to do.
        print(f"Error loading tickers: {str(e)}")
        return []

def fetch_pair_data(base_ticker="FTI", compare_ticker=None, lookback_days=2 * 365):
    """Fetch aligned closing prices and daily returns for a pair of tickers.

    Args:
        base_ticker: Symbol that the comparison is made against.
        compare_ticker: Symbol to compare with ``base_ticker``. A missing
            value is reported and yields ``None``.
        lookback_days: Length of the history window in days, counted back
            from the current time.

    Returns:
        A DataFrame with columns ``Base_Close``, ``Compare_Close``,
        ``Base_Returns`` and ``Compare_Returns`` containing only rows where
        every column is present and finite, or ``None`` when the download
        fails, either history is empty, or 30 or fewer rows remain.
    """
    if not compare_ticker:
        print(f"Error fetching data for {compare_ticker}: no ticker given")
        return None

    try:
        end_date = datetime.now()
        start_date = end_date - timedelta(days=lookback_days)

        base_data = yf.Ticker(base_ticker).history(start=start_date, end=end_date)
        compare_data = yf.Ticker(compare_ticker).history(start=start_date, end=end_date)

        # Check explicitly for an empty download instead of relying on a
        # KeyError from the 'Close' lookup below.
        for symbol, history in ((base_ticker, base_data), (compare_ticker, compare_data)):
            if history.empty or 'Close' not in history.columns:
                print(f"Error fetching data for {symbol}: no price history returned")
                return None

        # Building the frame from two Series aligns them on their index;
        # dates missing from either side become NaN and are dropped.
        df = pd.DataFrame({
            'Base_Close': base_data['Close'],
            'Compare_Close': compare_data['Close']
        }).dropna()

        df['Base_Returns'] = df['Base_Close'].pct_change()
        df['Compare_Returns'] = df['Compare_Close'].pct_change()

        # pct_change leaves NaN in the first row and can produce +/-inf when
        # the previous close is zero; discard both.
        df = df.replace([np.inf, -np.inf], np.nan).dropna()

        return df if len(df) > 30 else None

    except Exception as e:
        # Network/provider boundary: report and skip this pair.
        print(f"Error fetching data for {compare_ticker}: {str(e)}")
        return None

def calculate_correlation_metrics(df):
    """Calculate correlation metrics between the base and compare series.

    Args:
        df: DataFrame with columns ``Base_Close``, ``Compare_Close``,
            ``Base_Returns`` and ``Compare_Returns``.

    Returns:
        A dict with keys ``pearson_price``, ``pearson_returns``,
        ``spearman_price``, ``spearman_returns``, ``beta`` (covariance of
        base and compare returns divided by the variance of compare
        returns; NaN when that variance is zero) and ``data_points`` (row
        count). Returns ``None`` and prints the error if the calculation
        raises.
    """
    try:
        base_close, compare_close = df['Base_Close'], df['Compare_Close']
        base_returns, compare_returns = df['Base_Returns'], df['Compare_Returns']

        compare_variance = compare_returns.var()
        covariance = base_returns.cov(compare_returns)
        # A flat compare series has zero variance; report NaN explicitly
        # rather than depending on how the division by zero behaves.
        beta = covariance / compare_variance if compare_variance != 0 else float('nan')

        return {
            'pearson_price': base_close.corr(compare_close),
            'pearson_returns': base_returns.corr(compare_returns),
            'spearman_price': stats.spearmanr(base_close, compare_close)[0],
            'spearman_returns': stats.spearmanr(base_returns, compare_returns)[0],
            'beta': beta,
            'data_points': len(df)
        }
    except Exception as e:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate, and report the cause.
        print(f"Error calculating metrics: {str(e)}")
        return None

def analyze_all_correlations():
    """Score every loaded ticker against the base ticker and rank the results.

    For each ticker from ``load_tickers()`` the pair data is fetched, the
    correlation metrics are computed, and a weighted composite score is
    added. The top 20 rows of each ranking are written to
    ``ranking_<name>.csv`` in the working directory and the top 5 are
    printed.

    Returns:
        A dict mapping ranking name to a DataFrame sorted in descending
        order by that ranking's metric. The keys are 'Composite Ranking',
        'Price Correlation', 'Returns Correlation', 'Non-linear Price',
        'Non-linear Returns' and 'Beta Sensitivity'. The frames are empty
        (but carry the expected columns) when no ticker produced metrics.
    """
    tickers = load_tickers()
    print(f"Loaded {len(tickers)} tickers for analysis")

    # Contribution of each metric to the composite score.
    weights = {
        'pearson_price': 0.25,
        'pearson_returns': 0.30,
        'spearman_price': 0.15,
        'spearman_returns': 0.20,
        'beta': 0.10
    }

    # Ranking name -> metric column it is sorted by.
    metric_mapping = {
        'Price Correlation': 'pearson_price',
        'Returns Correlation': 'pearson_returns',
        'Non-linear Price': 'spearman_price',
        'Non-linear Returns': 'spearman_returns',
        'Beta Sensitivity': 'beta'
    }

    results = []
    for ticker in tqdm(tickers, desc="Analyzing tickers"):
        df = fetch_pair_data(compare_ticker=ticker)
        if df is None:
            continue

        metrics = calculate_correlation_metrics(df)
        if not metrics:
            continue

        composite_score = sum(metrics[name] * weight for name, weight in weights.items())
        results.append({
            'ticker': ticker,
            **metrics,
            'composite_score': composite_score
        })

    # Naming the columns keeps the frame sortable even when `results` is
    # empty; a column-less frame would raise KeyError in sort_values.
    result_columns = [
        'ticker',
        'pearson_price',
        'pearson_returns',
        'spearman_price',
        'spearman_returns',
        'beta',
        'data_points',
        'composite_score',
    ]
    results_df = pd.DataFrame(results, columns=result_columns)

    # Build the per-metric rankings from metric_mapping so names and sort
    # columns are defined in one place.
    rankings = {
        'Composite Ranking': results_df.sort_values('composite_score', ascending=False)
    }
    for name, column in metric_mapping.items():
        rankings[name] = results_df.sort_values(column, ascending=False)

    for name, ranking in rankings.items():
        # Save top 20
        filename = f"ranking_{name.lower().replace(' ', '_')}.csv"
        ranking.head(20).to_csv(filename, index=False)

        # Print top 5; the composite ranking is the only name not in the
        # mapping.
        print(f"\nTop 5 {name}:")
        score_col = metric_mapping.get(name, 'composite_score')
        print(ranking[['ticker', score_col]].head())

    return rankings

def create_correlation_heatmap(rankings):
    """Plot a return-correlation heatmap for the top composite-ranked tickers.

    Takes up to 20 tickers from the head of ``rankings['Composite Ranking']``,
    re-fetches their return series, and saves the pairwise correlation
    matrix as ``correlation_heatmap.png``. Nothing is plotted when no
    return data could be fetched.

    Args:
        rankings: Dict containing a 'Composite Ranking' DataFrame with a
            'ticker' column.
    """
    top_tickers = rankings['Composite Ranking']['ticker'].head(20).tolist()

    returns_by_ticker = {}
    for ticker in top_tickers:
        df = fetch_pair_data(compare_ticker=ticker)
        if df is not None:
            returns_by_ticker[ticker] = df['Compare_Returns']

    if not returns_by_ticker:
        # pd.concat rejects an empty mapping and there is nothing to draw.
        print("No return data available; skipping correlation heatmap.")
        return

    # Concatenate on the union of dates. Assigning columns one by one into
    # an empty frame would fix the index to the first ticker's dates.
    price_data = pd.concat(returns_by_ticker, axis=1)

    # Calculate correlation matrix
    corr_matrix = price_data.corr()

    # Create heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Heatmap of Top 20 Related Tickers')
    plt.tight_layout()
    plt.savefig('correlation_heatmap.png')
    plt.close()

# Run the analysis only when executed directly (a notebook cell or script
# runs with __name__ == "__main__"), so importing this module does not
# trigger downloads or file writes.
if __name__ == "__main__":
    print("Starting correlation analysis...")
    rankings = analyze_all_correlations()
    create_correlation_heatmap(rankings)
    print("\nAnalysis complete. Results have been saved to CSV files and visualizations created.")

Starting correlation analysis...
Loaded 418 tickers for analysis


Analyzing tickers:  98%|█████████▊| 408/418 [01:10<00:01,  6.06it/s]ERROR:yfinance:$WRK: possibly delisted; no timezone found
Analyzing tickers: 100%|██████████| 418/418 [01:12<00:00,  5.80it/s]



Top 5 Composite Ranking:
    ticker  composite_score
123   FANG         0.741883
50     BKR         0.730486
299    OKE         0.653919
409    WMB         0.651566
252    MPC         0.633649

Top 5 Price Correlation:
    ticker  pearson_price
136    ETN       0.949685
20    AMZN       0.944881
143    LLY       0.943307
18    GOOG       0.939438
17   GOOGL       0.938693

Top 5 Returns Correlation:
    ticker  pearson_returns
188    HAL         0.696493
50     BKR         0.677373
347    SLB         0.670933
251    MRO         0.605281
123   FANG         0.593053

Top 5 Non-linear Price:
    ticker  spearman_price
406   WELL        0.936595
178     GE        0.936295
136    ETN        0.934166
247      L        0.933003
176   GRMN        0.932218

Top 5 Non-linear Returns:
    ticker  spearman_returns
188    HAL          0.673915
50     BKR          0.642593
347    SLB          0.642580
251    MRO          0.607043
123   FANG          0.600407

Top 5 Beta Sensitivity:
    ticker     