In [5]:
# 1. Import necessary libraries
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from keras import layers, initializers 
from keras_tuner import HyperModel, BayesianOptimization
from pypfopt import EfficientFrontier, risk_models, expected_returns
import json
import re
import pickle
import  os


In [6]:
# Set seeds for reproducibility
SEED = 42

# NOTE: PYTHONHASHSEED is read when the interpreter starts, so assigning it here
# cannot change str/bytes hashing in the already-running kernel; it is only
# inherited by processes spawned afterwards. Export it before launching Jupyter
# if hash ordering matters.
os.environ['PYTHONHASHSEED'] = str(SEED)
# NOTE(review): set before any TensorFlow op has run in this notebook; confirm
# the installed TensorFlow version still honours this variable.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# One call seeds Python's `random`, NumPy and TensorFlow together, so no
# generator used by the pipeline is left unseeded.
keras.utils.set_random_seed(SEED)

In [7]:
# with open('symbols.json', 'r') as f:
#    tickers = json.load(f)

# pk_filenames.json holds a JSON list of pickle paths, one per ticker, e.g.
# ["data/ibm.pk1", "data/aapl.pk1"]
with open('pk_filenames.json', 'r') as manifest:
    filenames = json.loads(manifest.read())

# for filename in filenames:
#     df = pd.read_pickle(filename)

In [8]:
# 2. Data Collection
# with open('symbols.json', 'r') as f:
#    tickers = json.load(f)

# def fetch_data(ticker, start, end):
#     data = yf.download(ticker, start=start, end=end)
#     return data['Adj Close']

# tickers = ['AAPL', 'MSFT', 'GOOGL', 'TSLA', 'NVDA', 'TLT', 'COST', 'WMT', 'BA', 'DIS', 'JPM', 'AMD']
# Accumulators filled by the per-ticker loop further down:
#   all_expected_returns: ticker -> expected return computed for that ticker
#   all_data:             one column of weekly returns per ticker
all_data = pd.DataFrame()
all_expected_returns = dict()

In [9]:
# Turn a date-indexed series into period-over-period returns
def aggregate_returns(data, freq='W'):
    """Resample `data` to `freq` and return its period-over-period percentage change.

    Args:
        data: pandas Series/DataFrame with a datetime-like index. The caller in
            this notebook passes an 'Adj Close' price column.
        freq: pandas resampling frequency string; defaults to weekly ('W').

    Returns:
        The percentage change between consecutive resampled observations, with
        rows containing NaN (such as the first period) removed.
    """
    # Forward-fill onto the resampled grid so every period carries the latest
    # observation available at that point.
    period_values = data.resample(freq).ffill()
    period_changes = period_values.pct_change()
    return period_changes.dropna()

In [10]:
# Walk-forward validation function
def walk_forward_validation(data, model, time_step, n_test):
    """Produce one-step-ahead forecasts for the last `n_test` observations.

    At step i the history is every observation before test point i (the initial
    training slice plus the first i test observations). The same `model` object
    is fitted again on windows built from that history, and then asked to
    predict the next value from the most recent `time_step` observations.

    Args:
        data: 1-D pandas Series of observations in chronological order.
        model: object exposing `fit(X, y, epochs=..., verbose=...)` and
            `predict(X, verbose=...)`, accepting inputs shaped
            (samples, time_step, 1).
        time_step: number of past observations per input window.
        n_test: number of trailing observations to forecast.

    Returns:
        List of `n_test` scalar predictions, in chronological order.

    Raises:
        ValueError: if the initial training slice is too short to build even
            one (window, target) pair.
    """
    split = len(data) - n_test
    if n_test > 0 and split <= time_step:
        raise ValueError(
            f"Need more than {time_step} training observations before the "
            f"{n_test}-step test window, but only {max(split, 0)} are available."
        )

    predictions = []
    for step in range(n_test):
        # Positional slice of the expanding history; avoids rebuilding the
        # series with pd.concat on every step.
        history = data.iloc[:split + step]
        X_train, y_train = create_dataset(history, time_step)
        # Give the model an explicit (samples, time_step, 1) tensor rather than
        # relying on the framework to add the trailing feature axis.
        X_train = np.asarray(X_train).reshape((-1, time_step, 1))
        model.fit(X_train, y_train, epochs=10, verbose=0)
        input_data = history.iloc[-time_step:].values.reshape((1, time_step, 1))
        yhat = model.predict(input_data, verbose=0)
        predictions.append(yhat[0, 0])
    return predictions

In [11]:
# Create dataset function to prepare the data for LSTM
def create_dataset(data, time_step):
    """Build sliding-window samples and next-step targets from a sequence.

    Args:
        data: array-like sequence (list, NumPy array or pandas Series) in
            chronological order. It is converted to an ndarray first so that
            both the windows and the targets are selected by position; indexing
            a pandas Series with a bare integer is label-based and would pick
            the wrong element (or raise KeyError) when its index does not start
            at 0.
        time_step: number of consecutive observations in each input window.

    Returns:
        Tuple `(X, y)` of NumPy arrays. `X[i]` is `data[i:i + time_step]` and
        `y[i]` is the observation immediately following that window. Both are
        empty when `len(data) <= time_step`.
    """
    values = np.asarray(data)
    n_windows = len(values) - time_step
    X = [values[i:i + time_step] for i in range(n_windows)]
    y = [values[i + time_step] for i in range(n_windows)]
    return np.array(X), np.array(y)

In [12]:
# Per-ticker pipeline: weekly returns -> min-max scaling -> tuned LSTM ->
# walk-forward forecasts -> compounded return over the forecast window.
time_step = 4  # number of past weekly returns in each LSTM input window
n_test = 52    # number of weeks forecast by walk-forward validation


# Defined once, outside the loop: the search space does not depend on the ticker.
class LSTMHyperModel(HyperModel):
    """Search space for a single-layer LSTM regressor with fixed initializers."""

    def build(self, hp):
        """Build and compile a model for the hyperparameters sampled in `hp`."""
        model = keras.Sequential()
        model.add(layers.Input(shape=(time_step, 1)))  # one feature per time step
        model.add(layers.LSTM(
            units=hp.Int('units', min_value=32, max_value=128, step=32),
            activation='relu',
            kernel_initializer=initializers.GlorotUniform(seed=42),  # Fixed seed for weights
            bias_initializer=initializers.Zeros()  # Fixed bias initializer
        ))
        model.add(layers.Dense(1))  # single-value regression output
        model.compile(optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', [1e-2, 1e-3])), loss='mse')
        return model


for filename in filenames:
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by a trusted step of this project.
    df = pd.read_pickle(filename)
    data = df['Adj Close']
    # Ticker is the file name without directory or extension. Using os.path
    # keeps this correct for nested directories and for tickers that contain
    # a dot themselves.
    ticker = os.path.splitext(os.path.basename(filename))[0]

    # Data Preprocessing
    weekly_returns = aggregate_returns(data)  # Aggregate to weekly returns

    # Normalize the weekly returns using Min-Max Scaler
    # NOTE(review): the scaler is fitted on the full series, including the
    # final n_test weeks that are forecast below (look-ahead) - confirm this
    # is intended.
    scaler = MinMaxScaler(feature_range=(0, 1))
    weekly_returns_reshaped = weekly_returns.values.reshape(-1, 1)
    scaler.fit(weekly_returns_reshaped)
    weekly_returns_normalized = scaler.transform(weekly_returns_reshaped)

    # Reshape data for LSTM in a compatible sliding window format
    X, y = create_dataset(weekly_returns_normalized, time_step)

    # Hyperparameter tuning
    # seed makes the Bayesian search itself repeatable; without it the trial
    # sequence changes between runs even though the global seeds are fixed.
    # NOTE(review): results already stored under `directory/project_name` are
    # reloaded rather than recomputed (see the "Reloading Tuner" output);
    # pass overwrite=True to force a fresh search.
    tuner = BayesianOptimization(
        LSTMHyperModel(),
        objective='val_loss',
        max_trials=2,
        executions_per_trial=1,
        seed=42,
        directory='lstm_tuning',
        project_name=f'portfolio_optimization_{ticker}'
    )
    # Search for the best hyperparameters
    tuner.search(X, y, epochs=10, validation_split=0.2)

    # Rebuild an untrained model with the best hyperparameters and produce
    # walk-forward forecasts for the last n_test weeks.
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    model = tuner.hypermodel.build(best_hps)
    predicted_returns_normalized = walk_forward_validation(pd.Series(weekly_returns_normalized.flatten()), model, time_step, n_test)

    # Inverse transform the predicted returns
    predicted_returns = scaler.inverse_transform(np.array(predicted_returns_normalized).reshape(-1, 1)).flatten()

    # Calculate the total return over the 52 weeks using compounded returns
    compounded_return = np.prod(1 + predicted_returns) - 1

    # The forecast window spans 52 weeks, so the compounded return is used
    # directly as the annual expected return.
    annualized_return = compounded_return
    all_expected_returns[ticker] = annualized_return
    all_data[ticker] = weekly_returns  # weekly-return column used later for the covariance matrix

    # Print annualized expected returns for each ticker
    print(f"Annualized Expected Returns for {ticker}: {annualized_return}")

    # filename = "data/" + ticker + '_annualized_return.pkl'
    # with open(filename,'wb') as f:
    #     pickle.dump(annualized_return,f)

Reloading Tuner from lstm_tuning\portfolio_optimization_META\tuner0.json
Annualized Expected Returns for META: 0.448915958404541
Reloading Tuner from lstm_tuning\portfolio_optimization_TSLA\tuner0.json
Annualized Expected Returns for TSLA: 1.464540719985962
Reloading Tuner from lstm_tuning\portfolio_optimization_F\tuner0.json
Annualized Expected Returns for F: 0.1566929817199707


In [18]:
# Persist the per-ticker weekly returns so later runs can reuse them.
with open('data/all_data.pkl', mode='wb') as out_file:
    pickle.dump(all_data, out_file)

In [13]:
# Wrap the ticker -> expected-return mapping in a Series indexed by ticker,
# the form handed to the optimiser below.
expected_returns_series = pd.Series(data=all_expected_returns)

In [14]:
# Ledoit-Wolf shrunk covariance of the weekly returns collected in `all_data`.
# frequency=52 annualises from weekly periods; the library default of 252
# assumes daily data and would overstate the annual covariance.
cov_matrix = risk_models.risk_matrix(all_data, returns_data=True, method='ledoit_wolf', frequency=52)

In [15]:
# Portfolio Optimization
# Find the maximum-Sharpe allocation from the forecast returns and the
# covariance matrix computed in the previous cell.
# Risk-model reference: https://pyportfolioopt.readthedocs.io/en/latest/RiskModels.html
ef = EfficientFrontier(expected_returns_series, cov_matrix)
weights = ef.max_sharpe()
# Post-processed weights as returned by clean_weights(); these are what get
# reported and saved.
cleaned_weights = ef.clean_weights()

# Print the optimized portfolio weights
print("Optimized Portfolio Weights:", cleaned_weights)

Optimized Portfolio Weights: OrderedDict([('META', 0.2893), ('TSLA', 0.7107), ('F', 0.0)])


In [16]:
# Persist the final portfolio weights.
with open('data/cleaned_weights.pkl', mode='wb') as out_file:
    pickle.dump(cleaned_weights, out_file)