### **LIBRARIES**

In [None]:
import yfinance as yf
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

### **DATA**

In [None]:
# Download daily close prices from the Yahoo Finance API, one column per ticker,
# and merge them into a single price table.
tickers = [
    # Target of the model
    "SPY",

    # Technology
    "AAPL", "MSFT", "NVDA", "GOOGL", "INTC", "CSCO", "TXN",
    "IBM", "ORCL", "QCOM", "AMZN", "TSLA", "META",

    # Financials
    "GS", "BAC", "C", "WFC", "MS", "AXP", "BRK-B", "V", "MA", "JPM",

    # Healthcare
    "UNH", "JNJ", "PFE", "LLY", "ABBV", "MRK", "AMGN", "MDT", "CI", "CVS",

    # Index(s)
    "^OEX", "NDAQ",
]


def _fetch_close(symbol):
    """Return a one-column frame (named after `symbol`) of unadjusted daily closes."""
    history = yf.Ticker(symbol).history(
        start="2009-01-01", end="2026-01-01", interval="1d", auto_adjust=False
    )
    close = history[['Close']].rename(columns={'Close': symbol})
    # Strip the timezone so the date indexes of all tickers can be joined.
    close.index = close.index.tz_localize(None)
    close.index.name = 'Date'
    return close


# Ticker -> one-column close-price frame
data_dict = {symbol: _fetch_close(symbol) for symbol in tickers}

# Expose each ticker's frame as a module-level variable named after the ticker
globals().update({symbol: data_dict[symbol][[symbol]] for symbol in tickers})

# Inner-join every other ticker onto SPY so only dates present for all remain
SPY_combined = SPY.copy()
for symbol in (name for name in tickers if name != "SPY"):
    SPY_combined = SPY_combined.join(globals()[symbol], how="inner")

# SPY becomes the TARGET column; model_df keeps an independent copy of the prices
SPY_combined.rename(columns={'SPY': 'TARGET'}, inplace=True)
df = SPY_combined.dropna()
model_df = df.copy()

In [None]:
# Display the merged price table (TARGET is the renamed SPY column)
model_df

### **DATA ANALYSIS**

In [None]:
# Correlation of every column with the TARGET column, shown as a one-column table
correlation_with_sp = df.corr().loc[:, 'TARGET']
corr = correlation_with_sp.to_frame()
corr

In [None]:
# Full pairwise correlation matrix of the columns in df
corr_matrix = df.corr()

# Annotated heatmap of the matrix, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(20, 18))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Heatmap of Correlation between Columns in the DataFrame')
plt.show()

In [None]:
# One line chart per column of df, each on its own figure
for name, series in df.items():
    fig, ax = plt.subplots()
    ax.plot(series)
    ax.set_xlabel('Index')
    ax.set_ylabel(name)
    ax.set_title(f'Plot of {name}')
    plt.show()

### **WEIGHTED INDUSTRIES**

In [None]:
# Ticker for SPY
# NOTE: `tickers`, `data_dict` and `SPY_combined` are rebound here and no longer
# hold the values created in the DATA section.
tickers = ["SPY"]

# Dictionary to store data
data_dict = {}

# Import the full SPY history (all columns are kept: Open is needed below)
for ticker in tickers:
    stock_data = yf.Ticker(ticker).history(start="2013-01-02", end="2026-01-01", interval="1d", auto_adjust=False)
    stock_data.index = stock_data.index.tz_localize(None)
    stock_data.index.name = 'Date'
    data_dict[ticker] = stock_data

# Combine SPY data
SPY_combined = data_dict["SPY"]

# Remove nulls and create TARGET as the open-to-close return of SPY.
# .assign returns a new frame, so SPY_combined itself is left untouched.
target = SPY_combined.dropna()
target = target.assign(TARGET=(target['Close'] / target['Open']) - 1)

# Drop the raw columns that are no longer needed. errors="ignore" keeps the
# cell working if one of these columns is not present in the downloaded frame.
target = target.drop(
    columns=['Dividends', 'Volume', 'Stock Splits',
             'Capital Gains', 'Low', 'High', 'Open', 'Close'],
    errors="ignore",
)

# pct_change below loses the first price date, so remove that date from target
# as well (ignored if target does not contain it).
target = target.drop(index=model_df.index[0], errors="ignore")

# Convert prices to percentage changes. The returns are derived from model_df
# (the untouched copy of the price table) rather than from df itself, so
# re-running this cell does not apply pct_change to already-converted data.
df = model_df.pct_change().dropna() * 100

# Replace TARGET with the SPY open-to-close return (aligned on the date index)
df['TARGET'] = target['TARGET']

In [None]:
# Display the percentage-change table (TARGET holds the SPY open-to-close return)
df

In [None]:
# Industry name -> tickers that belong to it
industries = dict(
    tech=[
        "AAPL", "MSFT", "NVDA", "GOOGL", "INTC", "CSCO", "TXN",
        "IBM", "ORCL", "QCOM", "AMZN", "TSLA", "META",
    ],
    fin=[
        "GS", "BAC", "C", "WFC", "MS", "AXP", "BRK-B", "V", "MA", "JPM",
    ],
    health=[
        "UNH", "JNJ", "PFE", "LLY", "ABBV", "MRK", "AMGN", "MDT", "CI", "CVS",
    ],
)

# Function to calculate rolling beta for multiple stocks
def calculate_rolling_beta(industry_tickers, df, window=1400):
    """Compute a rolling beta against ``TARGET`` for each ticker.

    For every ticker the beta of a window is ``cov(ticker, TARGET) / var(ticker)``
    (sample covariance and variance), evaluated over the trailing ``window`` rows
    in which both the ticker and ``TARGET`` are non-null.

    Parameters
    ----------
    industry_tickers : iterable of str
        Column names in ``df`` to compute betas for.
    df : pandas.DataFrame
        Must contain a ``TARGET`` column and one column per ticker.
    window : int, default 1400
        Number of rows in each rolling window.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` (index named ``Date``) with one ``<ticker>_BETA``
        column per ticker. Rows without a full window are NaN, which includes
        every row when fewer than ``window`` observations are available.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 2 (a sample variance needs two points).
    """
    if window < 2:
        raise ValueError("window must be at least 2")

    beta_columns = {}
    for ticker in industry_tickers:
        # Only rows where both the stock and the target are present take part.
        pair = df[[ticker, "TARGET"]].dropna()

        # Rolling sample covariance / variance over `window` rows; the first
        # window - 1 rows are NaN because the window is not yet full.
        rolling_stock = pair[ticker].rolling(window)
        beta_columns[f"{ticker}_BETA"] = rolling_stock.cov(pair["TARGET"]) / rolling_stock.var()

    # Reindex every beta series onto df's index; dates dropped above become NaN.
    # rename_axis returns a new object, so df's own index name is not changed.
    return pd.DataFrame(beta_columns, index=df.index).rename_axis("Date")

# Rolling betas per industry; rows that contain any NaN beta are removed
BETA_tech, BETA_fin, BETA_health = (
    calculate_rolling_beta(industries[sector], df).dropna()
    for sector in ("tech", "fin", "health")
)

# Normalize betas by dividing by the sum of all betas for the respective industry
def normalize_betas(beta_df):
    """Return a copy of ``beta_df`` in which every row is divided by its row sum.

    Parameters
    ----------
    beta_df : pandas.DataFrame
        One beta column per ticker.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``beta_df``; each row sums to 1 wherever the
        original row sum is non-zero. The input frame is not modified.
    """
    row_totals = beta_df.sum(axis=1)
    # axis=0 matches row_totals to the rows, dividing every column by it.
    return beta_df.div(row_totals, axis=0)

# Row-normalised betas for each industry
BETA_tech_normalized, BETA_fin_normalized, BETA_health_normalized = map(
    normalize_betas, (BETA_tech, BETA_fin, BETA_health)
)

# Multiply normalized betas by stock prices
def update_betas_with_prices(beta_df, price_data):
    """Weight each ticker's price by its beta and add the row-wise total.

    Parameters
    ----------
    beta_df : pandas.DataFrame
        Columns named ``<ticker>_BETA``.
    price_data : pandas.DataFrame
        Expected to hold one column per ticker; values are matched to
        ``beta_df`` on the index.

    Returns
    -------
    pandas.DataFrame
        Copy of ``beta_df`` in which every ``<ticker>_BETA`` column with a
        matching price column is replaced by ``beta * price``, plus a ``SUM``
        column holding the row-wise sum of all columns.

    Warns
    -----
    UserWarning
        If some tickers have no column in ``price_data``. Their beta columns are
        left unmultiplied (and still enter ``SUM``), which usually means the
        wrong frame was passed as ``price_data``.
    """
    import warnings

    updated_beta_df = beta_df.copy()
    missing = []
    for column in beta_df.columns:
        ticker = column.replace("_BETA", "")
        if ticker in price_data.columns:
            updated_beta_df[column] = beta_df[column] * price_data[ticker]
        else:
            missing.append(ticker)

    if missing:
        warnings.warn(
            "No price column found for: " + ", ".join(missing)
            + "; the corresponding betas were not multiplied by a price.",
            stacklevel=2,
        )

    updated_beta_df["SUM"] = updated_beta_df.sum(axis=1)
    return updated_beta_df

# Apply stock prices to normalized betas.
# The prices come from model_df, the per-ticker close-price table. SPY_combined
# must not be used here: it was rebound to the SPY-only history, which has no
# ticker columns, so no beta would be multiplied and SUM would only be the sum
# of the normalized betas.
weighted_tech = update_betas_with_prices(BETA_tech_normalized, model_df)
weighted_fin = update_betas_with_prices(BETA_fin_normalized, model_df)
weighted_health = update_betas_with_prices(BETA_health_normalized, model_df)

### **MODELING**

#### **1| DATASET SETUP**

In [None]:
# Assemble the modeling table: SPY price (TARGET), the three weighted industry
# series and the two extra price columns, aligned on the date index.
# NOTE(review): the "NDX" column is filled from the "NDAQ" ticker column; confirm
# that this is the intended instrument.
regression = (
    pd.DataFrame({
        "TARGET": model_df["TARGET"],
        "TECH_INDUSTRY": weighted_tech["SUM"],
        "FIN_INDUSTRY": weighted_fin["SUM"],
        "HEALTH_INDUSTRY": weighted_health["SUM"],
        "OEX": model_df["^OEX"],
        "NDX": model_df["NDAQ"]
    })
    # Each row's TARGET becomes the value from the following row
    .assign(TARGET=lambda frame: frame['TARGET'].shift(-1))
    .dropna()
    # Turn levels into row-over-row relative changes; the first row becomes NaN
    .pct_change()
    .dropna()
)

In [None]:
# Display the modeling table (TARGET plus the regressor columns)
regression

#### **2| INCREMENTAL REGRESSION**

In [None]:
def _design_and_response(frame):
    """Split `frame` into (regressors with an added constant column, TARGET)."""
    return sm.add_constant(frame.drop(columns=['TARGET'])), frame['TARGET']


# Full-sample regressors and response
X, Y = _design_and_response(regression)

# Seed model: unweighted OLS on the first 1429 records
initial_data = regression.iloc[:1429]
X_initial, Y_initial = _design_and_response(initial_data)
model_initial = sm.OLS(Y_initial, X_initial).fit()

# Coefficient history, starting with the seed model
betas = [model_initial.params]

# Refit on an expanding window: one new fit for every record from 1429 onwards,
# each using all records up to and including that one
for last_row in range(1429, len(regression)):
    current_data = regression.iloc[:last_row + 1]
    X_current, Y_current = _design_and_response(current_data)

    # Weights log(1 + position), with positions counted from 1, so later
    # records receive larger weights
    weights = np.log1p(np.arange(1, len(current_data) + 1))

    model_current = sm.WLS(Y_current, X_current, weights=weights).fit()
    betas.append(model_current.params)

# One row of coefficients per fitted model
betas_df = pd.DataFrame(betas)

In [None]:
# Display the coefficient history (one row per fitted model)
betas_df

#### **3| PREDICTIONS**

In [None]:
# Keep one feature row per fitted coefficient set: the last len(betas_df) rows
# of the modeling table. Using the length of betas_df instead of a fixed row
# count keeps the two tables the same size whatever date range was downloaded.
# Dropping TARGET here also makes the cell fail loudly if it is re-run after
# `regression` has already been replaced by the predictions.
n_models = len(betas_df)
recent = regression.iloc[-n_models:].drop(columns=['TARGET'])

# Regressors that have both a feature value and a coefficient
feature_cols = [col for col in recent.columns if col in betas_df.columns and col != 'const']

# feature * coefficient, paired row by row by position (betas_df is not indexed
# by date, so the values are combined as plain arrays)
terms = pd.DataFrame(
    recent[feature_cols].to_numpy() * betas_df[feature_cols].to_numpy(),
    index=recent.index,
    columns=feature_cols,
)

# The intercept enters once, as its own value. It must not be multiplied by the
# coefficient column again, which would square it.
if 'const' in betas_df.columns:
    terms['const'] = betas_df['const'].to_numpy()

# Drop incomplete rows, then sum the terms into the prediction
terms = terms.dropna()
pred = terms.sum(axis=1)

# NOTE(review): the coefficients paired with a row presumably come from a fit
# that already included that row's TARGET - confirm against the incremental
# regression cell before treating PRED as out-of-sample.

# Move every prediction one row forward. The last prediction needs a new date
# label: the first date in model_df after the last feature date if there is
# one, otherwise the next business day.
last_date = pred.index[-1]
later_dates = model_df.index[model_df.index > last_date]
next_date = later_dates[0] if len(later_dates) > 0 else last_date + pd.offsets.BDay(1)

regression = pd.DataFrame(
    {'PRED': pred.to_numpy()},
    index=pred.index[1:].append(pd.DatetimeIndex([next_date])),
).rename_axis('Date')

# Drop NA values
regression = regression.dropna()

In [None]:
# Display the final prediction table (single PRED column)
regression

In [None]:
# Write the prediction table to PREDICTIONS.csv (relative path, so it lands in the current working directory)
regression.to_csv("PREDICTIONS.csv")