In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import yfinance as yf
import talib
from scipy.stats import pearsonr

# Render every column of a DataFrame instead of eliding the middle ones
pd.options.display.max_columns = None

In [2]:
# Analysis configuration: ticker universe, download window, indicator columns
stocks = [
    'COST', 'UL', 'AMGN', 'UNH', 'WAT', 'UPS', 'LPX', 'WM', 'NVDA',
    'GOOGL', 'MSFT', 'AXP', 'BLK', 'BRK-B', 'NEE', 'XOM', 'CNI',
]
start_date, end_date = '2004-10-01', '2024-10-01'
indicators = [
    'SMA_50', 'SMA_100', 'SMA_200',
    'RSI', 'Volatility_30',
    'MACD', 'MACD_Signal',
    'BB_Upper', 'BB_Middle', 'BB_Lower',
    'Historical_Volatility',
]

# One independent, initially empty list per indicator; the processing loop
# appends one correlation value per stock to each list.
correlation_results = dict((name, []) for name in indicators)

In [3]:
# Function to calculate technical indicators
def calculate_indicators(data, horizon=126):
    """Return a copy of ``data`` with indicator columns and a forward return.

    The input frame is left untouched; all new columns are written to a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing a ``'Close'`` column.
    horizon : int, default 126
        Number of rows to look ahead when computing ``'Cumulative_Return'``.

    Returns
    -------
    pandas.DataFrame
        ``data`` plus the columns ``SMA_50``, ``SMA_100``, ``SMA_200``, ``RSI``,
        ``Volatility_30``, ``MACD``, ``MACD_Signal``, ``BB_Upper``,
        ``BB_Middle``, ``BB_Lower``, ``Historical_Volatility`` and
        ``Cumulative_Return``. The last ``horizon`` rows of
        ``Cumulative_Return`` are NaN because no future close exists for them.
    """
    # Work on a copy so the caller's frame is never modified as a side effect.
    enriched = data.copy()
    close = enriched['Close']

    # Simple moving averages over three window lengths
    for window in (50, 100, 200):
        enriched[f'SMA_{window}'] = talib.SMA(close, timeperiod=window)

    enriched['RSI'] = talib.RSI(close, timeperiod=14)

    # Rolling std of daily percentage changes; computed once and reused below.
    rolling_std = close.pct_change().rolling(30).std()
    enriched['Volatility_30'] = rolling_std

    # talib.MACD returns three series; the third one is not used here.
    macd, macd_signal, _ = talib.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
    enriched['MACD'] = macd
    enriched['MACD_Signal'] = macd_signal

    upper_band, middle_band, lower_band = talib.BBANDS(
        close, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0
    )
    enriched['BB_Upper'] = upper_band
    enriched['BB_Middle'] = middle_band
    enriched['BB_Lower'] = lower_band

    # Same series as Volatility_30 scaled by sqrt(252), so the two columns are
    # perfectly correlated with each other.
    enriched['Historical_Volatility'] = rolling_std * np.sqrt(252)

    # Relative change between the close `horizon` rows ahead and today's close
    enriched['Cumulative_Return'] = close.shift(-horizon) / close - 1
    return enriched

In [4]:
# Loop over stocks
# Start from empty lists so that re-running this cell does not append a second
# set of values on top of results left over from a previous run.
correlation_results = {indicator: [] for indicator in indicators}

for stock in stocks:
    print(f"\nProcessing stock: {stock}...")
    data = yf.download(stock, start=start_date, end=end_date)
    if data.empty or 'Close' not in data:
        print(f"Skipping {stock}: No data available.")
        # Still record one placeholder per indicator: every list must end up
        # with exactly one entry per stock, otherwise the lists no longer line
        # up with `stocks` when the results table is built.
        for indicator in indicators:
            correlation_results[indicator].append(None)
        continue

    # Calculate technical indicators
    print("Calculating technical indicators...")
    data = calculate_indicators(data)

    # Keep only rows where the target and every indicator are available
    # (rolling windows leave NaNs at the start, the forward shift at the end).
    data = data.dropna(subset=['Cumulative_Return'] + indicators)

    # Skip stocks with insufficient data
    if len(data) < 30:  # Arbitrary threshold for minimal rows
        print(f"Skipping {stock}: Insufficient data after preprocessing.")
        for indicator in indicators:
            correlation_results[indicator].append(None)
        continue

    # Calculate correlations. Every indicator column is guaranteed to exist at
    # this point: dropna(subset=...) above raises KeyError for a missing one.
    for indicator in indicators:
        try:
            corr, _ = pearsonr(data['Cumulative_Return'], data[indicator])
            correlation_results[indicator].append(corr)
        except ValueError as e:
            print(f"Error calculating correlation for {stock}, {indicator}: {e}")
            correlation_results[indicator].append(None)



Processing stock: COST...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: UL...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: AMGN...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: UNH...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: WAT...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: UPS...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: LPX...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: WM...
[*********************100%%**********************]  1 of 

In [5]:
# Build a stock-by-indicator table from the collected lists, then flip it so
# that indicators are the rows and stocks are the columns
correlation_df = pd.DataFrame(index=stocks, data=correlation_results).transpose()

# Write the table to disk; the index column gets the header 'Indicator'
correlation_df.to_csv('correlation_results.csv', index_label='Indicator')
print("Correlation results saved to 'correlation_results.csv'.")

# Preview the top rows of the table
correlation_df.head()

Correlation results saved to 'correlation_results.csv'.


Unnamed: 0,COST,UL,AMGN,UNH,WAT,UPS,LPX,WM,NVDA,GOOGL,MSFT,AXP,BLK,BRK-B,NEE,XOM,CNI
SMA_50,0.17658,-0.284316,0.001783,-0.01233,-0.172448,-0.149806,-0.121609,0.097425,0.222107,-0.031509,0.119908,-0.084218,-0.223331,0.037771,-0.111365,-0.351491,-0.212219
SMA_100,0.177371,-0.271424,0.016135,-0.01479,-0.164788,-0.138108,-0.120466,0.103404,0.225688,-0.037241,0.118699,-0.086332,-0.220368,0.042738,-0.107892,-0.369446,-0.201092
SMA_200,0.185232,-0.226902,0.022805,-0.020142,-0.152109,-0.148423,-0.118971,0.110311,0.239892,-0.037706,0.120361,-0.073144,-0.193802,0.044684,-0.098871,-0.408152,-0.187036
RSI,-0.128465,-0.046856,-0.140528,-0.033653,-0.143587,-0.160723,-0.083325,-0.172035,0.06858,-0.09788,-0.040549,-0.022972,-0.025565,-0.092606,-0.054572,-0.036905,-0.135889
Volatility_30,-0.125866,0.108946,-0.066493,0.045042,0.087792,0.27264,0.449098,-0.02046,-0.030474,0.014759,0.144518,0.262028,0.181722,-0.070675,0.112202,0.119231,0.234047


In [6]:
# Minimum absolute correlation for an indicator to count as relevant for a stock
correlation_threshold = 0.2

# Count, per indicator, the stocks whose |correlation| exceeds the threshold.
# A vectorised comparison replaces the deprecated DataFrame.applymap.
# astype(float) turns any None placeholder into NaN, and NaN never passes the
# comparison, so skipped stocks are simply not counted.
selected_indicators = (
    correlation_df.astype(float).abs() > correlation_threshold
).sum(axis=1)
print("Number of stocks each indicator is relevant for:")
print(selected_indicators)

# Keep the indicators that pass the threshold for at least one stock
relevant_indicators = selected_indicators[selected_indicators > 0].index.tolist()
print("\nRelevant indicators based on correlation threshold:")
print(relevant_indicators)

Number of stocks each indicator is relevant for:
SMA_50                   5
SMA_100                  5
SMA_200                  3
RSI                      0
Volatility_30            4
MACD                     2
MACD_Signal              2
BB_Upper                 5
BB_Middle                5
BB_Lower                 5
Historical_Volatility    4
dtype: int64

Relevant indicators based on correlation threshold:
['SMA_50', 'SMA_100', 'SMA_200', 'Volatility_30', 'MACD', 'MACD_Signal', 'BB_Upper', 'BB_Middle', 'BB_Lower', 'Historical_Volatility']


In [7]:
# Analysis configuration: ticker universe, download window, indicator columns
stocks = [
    'COST', 'UL', 'AMGN', 'UNH', 'WAT', 'UPS', 'LPX', 'WM', 'NVDA',
    'GOOGL', 'MSFT', 'AXP', 'BLK', 'BRK-B', 'NEE', 'XOM', 'CNI',
]
start_date, end_date = '2010-01-01', '2024-01-01'
indicators = [
    'SMA_50', 'SMA_100', 'SMA_200',
    'RSI', 'Volatility_30',
    'MACD', 'MACD_Signal',
    'BB_Upper', 'BB_Middle', 'BB_Lower',
    'Historical_Volatility',
]

# One independent, initially empty list per indicator; the processing loop
# appends one correlation value per stock to each list.
correlation_results = dict((name, []) for name in indicators)

# Function to calculate technical indicators
def calculate_indicators(data, horizon=126):
    """Return a copy of ``data`` with indicator columns and a future close price.

    The input frame is left untouched; all new columns are written to a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing a ``'Close'`` column.
    horizon : int, default 126
        Number of rows to look ahead for ``'Price_in_6_Months'``
        (126 trading days, i.e. about six months).

    Returns
    -------
    pandas.DataFrame
        ``data`` plus the columns ``SMA_50``, ``SMA_100``, ``SMA_200``, ``RSI``,
        ``Volatility_30``, ``MACD``, ``MACD_Signal``, ``BB_Upper``,
        ``BB_Middle``, ``BB_Lower``, ``Historical_Volatility`` and
        ``Price_in_6_Months``. The last ``horizon`` rows of
        ``Price_in_6_Months`` are NaN because no future close exists for them.
    """
    # Work on a copy so the caller's frame is never modified as a side effect.
    enriched = data.copy()
    close = enriched['Close']

    # Simple moving averages over three window lengths
    for window in (50, 100, 200):
        enriched[f'SMA_{window}'] = talib.SMA(close, timeperiod=window)

    enriched['RSI'] = talib.RSI(close, timeperiod=14)

    # Rolling std of daily percentage changes; computed once and reused below.
    rolling_std = close.pct_change().rolling(30).std()
    enriched['Volatility_30'] = rolling_std

    # talib.MACD returns three series; the third one is not used here.
    macd, macd_signal, _ = talib.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
    enriched['MACD'] = macd
    enriched['MACD_Signal'] = macd_signal

    upper_band, middle_band, lower_band = talib.BBANDS(
        close, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0
    )
    enriched['BB_Upper'] = upper_band
    enriched['BB_Middle'] = middle_band
    enriched['BB_Lower'] = lower_band

    # Same series as Volatility_30 scaled by sqrt(252), so the two columns are
    # perfectly correlated with each other.
    enriched['Historical_Volatility'] = rolling_std * np.sqrt(252)

    # Target: the raw closing price `horizon` rows ahead.
    # NOTE(review): this is a price level, and the SMA / Bollinger columns are
    # price levels too, so their correlation with it is likely dominated by the
    # shared trend rather than predictive power - consider a return target.
    enriched['Price_in_6_Months'] = close.shift(-horizon)
    return enriched

# Loop over stocks
for stock in stocks:
    print(f"\nProcessing stock: {stock}...")
    data = yf.download(stock, start=start_date, end=end_date)
    if data.empty or 'Close' not in data:
        print(f"Skipping {stock}: No data available.")
        # Still record one placeholder per indicator: every list must end up
        # with exactly one entry per stock, otherwise the lists no longer line
        # up with `stocks` when the results table is built.
        for indicator in indicators:
            correlation_results[indicator].append(None)
        continue

    # Calculate technical indicators
    print("Calculating technical indicators...")
    data = calculate_indicators(data)

    # Keep only rows where the target and every indicator are available
    # (rolling windows leave NaNs at the start, the forward shift at the end).
    data = data.dropna(subset=['Price_in_6_Months'] + indicators)

    # Skip stocks with insufficient data
    if len(data) < 30:  # Arbitrary threshold for minimal rows
        print(f"Skipping {stock}: Insufficient data after preprocessing.")
        for indicator in indicators:
            correlation_results[indicator].append(None)
        continue

    # Calculate correlations. Every indicator column is guaranteed to exist at
    # this point: dropna(subset=...) above raises KeyError for a missing one.
    for indicator in indicators:
        try:
            corr, _ = pearsonr(data['Price_in_6_Months'], data[indicator])
            correlation_results[indicator].append(corr)
        except ValueError as e:
            print(f"Error calculating correlation for {stock}, {indicator}: {e}")
            correlation_results[indicator].append(None)

# Build a stock-by-indicator table from the collected lists, then flip it so
# that indicators are the rows and stocks are the columns
correlation_df = pd.DataFrame(index=stocks, data=correlation_results).transpose()

# Write the table to disk; the index column gets the header 'Indicator'.
# NOTE: this is the same file name the notebook's earlier export uses, so the
# file written there is replaced by this one.
correlation_df.to_csv('correlation_results.csv', index_label='Indicator')
print("Correlation results saved to 'correlation_results.csv'.")

# Preview the top rows of the table
correlation_df.head()


Processing stock: COST...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: UL...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: AMGN...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: UNH...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: WAT...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: UPS...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: LPX...
[*********************100%%**********************]  1 of 1 completed
Calculating technical indicators...

Processing stock: WM...
[*********************100%%**********************]  1 of 

Unnamed: 0,COST,UL,AMGN,UNH,WAT,UPS,LPX,WM,NVDA,GOOGL,MSFT,AXP,BLK,BRK-B,NEE,XOM,CNI
SMA_50,0.973316,0.8412,0.954935,0.984012,0.917082,0.893713,0.918086,0.978693,0.844167,0.927714,0.963702,0.898278,0.898,0.963593,0.966885,0.729766,0.952422
SMA_100,0.972063,0.842208,0.959546,0.981436,0.908898,0.887322,0.904168,0.976823,0.821779,0.914752,0.958401,0.887037,0.88296,0.962635,0.964732,0.676572,0.952555
SMA_200,0.971273,0.850298,0.959467,0.97668,0.896207,0.860908,0.880133,0.973672,0.823311,0.898846,0.953249,0.865628,0.863043,0.958431,0.957915,0.552937,0.952029
RSI,-0.005478,0.017432,-0.090552,-0.09058,-0.010549,-0.020926,0.062899,0.023553,0.116078,0.044829,0.06304,0.043331,0.04385,0.02179,-0.050802,0.153989,-0.053122
Volatility_30,0.30698,0.109073,0.073684,0.056631,0.164374,0.511258,0.079817,0.150299,0.344557,0.265063,0.283973,0.228917,0.056963,0.060664,0.464893,-0.307971,0.217632


In [8]:
# Minimum absolute correlation for an indicator to count as relevant for a stock
correlation_threshold = 0.2

# Count, per indicator, the stocks whose |correlation| exceeds the threshold.
# A vectorised comparison replaces the deprecated DataFrame.applymap.
# astype(float) turns any None placeholder into NaN, and NaN never passes the
# comparison, so skipped stocks are simply not counted.
selected_indicators = (
    correlation_df.astype(float).abs() > correlation_threshold
).sum(axis=1)
print("Number of stocks each indicator is relevant for:")
print(selected_indicators)

# Keep the indicators that pass the threshold for at least one stock
relevant_indicators = selected_indicators[selected_indicators > 0].index.tolist()
print("\nRelevant indicators based on correlation threshold:")
print(relevant_indicators)

Number of stocks each indicator is relevant for:
SMA_50                   17
SMA_100                  17
SMA_200                  17
RSI                       0
Volatility_30             9
MACD                      4
MACD_Signal               4
BB_Upper                 17
BB_Middle                17
BB_Lower                 17
Historical_Volatility     9
dtype: int64

Relevant indicators based on correlation threshold:
['SMA_50', 'SMA_100', 'SMA_200', 'Volatility_30', 'MACD', 'MACD_Signal', 'BB_Upper', 'BB_Middle', 'BB_Lower', 'Historical_Volatility']
