In [21]:
import numpy as np
import pandas as pd
from lstm_functions import *
from lost_functions import *
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
import yfinance as yf
from sklearn.decomposition import PCA
import os


In [25]:
directory = 'data/scrapped_data'

# Get all .csv files in the directory
all_files = os.listdir(directory)
csv_files = [f for f in all_files if f.endswith('.csv')]

In [26]:
all_data = {}

for csv_file in csv_files:
    # Get the name of the stock (which is the filename without the .csv extension)
    stock_name = os.path.splitext(csv_file)[0]
    
    # Read the csv file
    data = pd.read_csv(os.path.join(directory, csv_file))
    data['Date'] = pd.to_datetime(data['Date'])
    data = data.set_index('Date')
    # resampled_data = data.resample('Y').last().reset_index()
    original_data = data.reset_index()
    all_data[stock_name] = original_data

print(all_data.keys())

dict_keys(['CSCO', 'UAL', 'TROW', 'ISRG', 'NVR', 'TPR', 'DVN', 'CE', 'MRO', 'BA', 'VRTX', 'GILD', 'EQIX', 'TER', 'MDT', 'V', 'QRVO', 'A', 'FOX', 'FLT', 'MO', 'CTRA', 'SWKS', 'ENPH', 'MCHP', 'CDNS', 'MSCI', 'CHTR', 'EIX', 'BBY', 'WBA', 'LVS', 'HCA', 'AJG', 'DTE', 'C', 'T', 'CF', 'DISH', 'MGM', 'HUM', 'CBOE', 'CFG', 'WU', 'APH', 'SYY', 'MSI', 'FCX', 'ADM', 'LH', 'LNT', 'BAC', 'LNC', 'PSX', 'GPN', 'PPG', 'TECH', 'IRM', 'IQV', 'ESS', 'HAL', 'STZ', 'DXC', 'ADI', 'F', 'ADBE', 'CPRT', 'TDG', 'TFX', 'ULTA', 'ARE', 'SYK', 'CB', 'TSN', 'GNRC', 'PEP', 'PEG', 'NOW', 'LLY', 'COST', 'REG', 'NWS', 'LOW', 'MDLZ', 'BKNG', 'ZBRA', 'FMC', 'XEL', 'AIZ', 'MET', 'FTV', 'DLR', 'XRAY', 'FAST', 'TJX', 'SNA', 'MPC', 'BR', 'D', 'MRK', 'STX', 'NOC', 'BXP', 'KHC', 'IPG', 'UNP', 'ALLE', 'ABBV', 'CDAY', 'ORCL', 'ECL', 'ETR', 'EBAY', 'SBUX', 'PENN', 'IR', 'AMT', 'INTU', 'DPZ', 'PAYC', 'CMA', 'IPGP', 'PG', 'CAT', 'ODFL', 'MCD', 'MNST', 'AMZN', 'INTC', 'PNR', 'GLW', 'BDX', 'KMI', 'PWR', 'APTV', 'BBWI', 'DXCM', 'EXR', '

In [27]:
all_data['CSCO'].head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Log Returns,RSI,ATR
0,2009-01-02,16.41,17.0,16.25,16.959999,11.685604,40980600,,44.728481,0.0
1,2009-01-05,16.85,17.299999,16.75,17.110001,11.78896,45480200,0.008806,44.728481,0.0
2,2009-01-06,17.33,17.98,17.26,17.790001,12.257485,58256600,0.038973,44.728481,0.0
3,2009-01-07,17.370001,17.58,17.110001,17.32,11.933647,50246600,-0.026775,44.728481,0.0
4,2009-01-08,17.23,17.57,17.0,17.540001,12.085232,46484600,0.012622,44.728481,0.0


In [28]:
dfs = []  # List to hold individual DataFrames for each stock's transformed data

# Loop over each stock in the all_data dictionary
for stock, data in all_data.items():
    data = data.dropna()
    # Drop non-numeric columns, like 'Date' and 'Sector'
    numeric_data = data.drop(columns=['Date'])
    # Standardize the data
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(numeric_data)
    
    # Apply PCA
    pca = PCA(n_components=1)
    transformed_data = pca.fit_transform(scaled_data)
    
    # Create a new DataFrame for the transformed data and set its column name to the stock
    df_stock = pd.DataFrame(transformed_data, columns=[stock], index=data['Date'])
    dfs.append(df_stock)

# Concatenate all individual DataFrames to create the all_stock DataFrame
all_stock = pd.concat(dfs, axis=1)

all_stock = all_stock.dropna()
all_stock

Unnamed: 0_level_0,CSCO,UAL,TROW,ISRG,NVR,TPR,DVN,CE,MRO,BA,...,CRM,PGR,WAT,IEX,BWA,LRCX,NWL,UAA,BLK,PPL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-05-28,5.742560,-3.101492,-2.941123,3.977873,4.209739,2.612609,3.672401,2.787215,2.292586,4.585790,...,4.939526,5.968055,3.749546,4.175739,0.474799,3.477087,-2.244693,0.768567,2.432493,-0.551553
2019-05-29,5.556673,-2.961172,-3.016342,3.933250,4.219512,2.536146,4.021113,2.862652,2.579049,4.438496,...,4.809965,5.990602,3.663666,4.159149,0.422434,3.331694,-2.383500,0.671020,2.327289,-0.271027
2019-05-30,5.674830,-3.058444,-3.145513,4.033821,4.291905,2.714001,3.914470,2.914301,2.590401,4.443868,...,4.892576,6.082325,3.672284,4.202720,0.310862,3.435586,-2.402307,0.656765,2.336810,-0.199948
2019-05-31,5.283546,-2.856948,-2.844767,3.813620,4.210843,2.654546,3.928847,2.580194,2.807813,4.259111,...,4.726087,5.921675,3.599236,4.208028,0.553490,3.284177,-2.471258,0.632961,2.101701,-0.114303
2019-06-03,5.218577,-2.884312,-2.471279,3.694799,4.374227,2.586957,3.793778,2.569843,2.714616,4.112392,...,4.449071,5.920239,3.556064,4.269690,0.398818,3.205809,-2.360422,0.675346,2.145099,-0.270231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-24,4.528247,-3.778037,-5.202313,5.668613,5.883938,2.746525,3.127041,4.913191,2.194773,4.164945,...,5.050904,5.051427,4.836394,5.210768,-1.093609,7.708492,-1.234440,0.213665,3.776071,-3.522287
2019-12-26,4.460317,-3.752110,-5.289247,5.644636,5.831584,2.671834,3.185776,4.816622,2.252053,4.031953,...,5.037988,4.936749,4.790517,5.194602,-1.011697,7.661096,-1.277436,0.220990,3.819172,-3.400410
2019-12-27,4.425738,-3.675666,-5.239472,5.629947,5.791137,2.761027,3.379824,4.814851,2.342484,4.006162,...,5.027544,4.973646,4.780315,5.176829,-0.989948,7.613757,-1.304755,0.215615,3.813293,-3.408152
2019-12-30,4.347413,-3.606835,-5.122394,5.562392,5.757856,2.863804,3.414140,4.763848,2.359675,3.920394,...,4.976048,4.974005,4.769360,5.133344,-0.939322,7.502904,-1.329096,0.148008,3.757418,-3.465939


In [29]:
def evaluate_model(y_train, train_predictions, y_test, test_predictions, ticker, feature):
    train_mae = mean_absolute_error(y_train, train_predictions)
    test_mae = mean_absolute_error(y_test, test_predictions)

    train_rmse = mean_squared_error(y_train, train_predictions, squared=False)
    test_rmse = mean_squared_error(y_test, test_predictions, squared=False)

    print(f"\nEvaluation for {ticker} on {feature}:")
    print(f"Training MAE: {train_mae}, Testing MAE: {test_mae}")
    print(f"Training RMSE: {train_rmse}, Testing RMSE: {test_rmse}\n")
    return train_mae, test_mae, train_rmse, test_rmse

In [30]:
def plot_predictions(y_train, train_predictions, y_test, test_predictions, ticker, feature):
    plt.figure(figsize=(14,7))
    plt.plot(y_train, label="Actual Train Values", color='blue')
    plt.plot(train_predictions, label="Predicted Train Values", color='blue', linestyle='dashed')
    plt.plot(np.arange(len(y_train), len(y_train) + len(y_test)), y_test, label="Actual Test Values", color='red')
    plt.plot(np.arange(len(y_train), len(y_train) + len(y_test)), test_predictions, label="Predicted Test Values", color='red', linestyle='dashed')
    plt.title(f"{ticker} {feature} - Actual vs Predicted Values")
    plt.legend()
    plt.show()

In [31]:
final_importance_values = {}
final_predictions = {}
# 30 is not a good number of batches, but it's a start for testing
# 60 is a good number of batches, but it takes a long time to train
time_steps = 60
features = len(all_stock.columns)
features
batch_size_value = 4

In [32]:
data = all_stock.copy().dropna()
lstm_model = LstmBuilder(time_step=time_steps, loss="mean_squared_error", batch_size=batch_size_value)
model = lstm_model.create_stateful_model(features=features)
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(data)
X, y = lstm_model.create_sequences(normalized_data)
X_train, X_test, y_train, y_test = lstm_model.split_stateful_data(X,y, 0.9)



2023-10-26 16:07:56.921777: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2023-10-26 16:07:56.921913: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-10-26 16:07:56.921961: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-10-26 16:07:56.922319: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-10-26 16:07:56.922383: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Remaining:  0


In [33]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((80, 60, 477), (12, 60, 477), (80, 477), (12, 477))

In [34]:
model.fit(X_train, y_train, epochs=200, batch_size=batch_size_value, verbose=0)

# Extracting importance
dense_weights = model.layers[-1].get_weights()[0]

# Think about to use sum or mean and to use abs() or not
feature_weights = dense_weights.sum(axis=0)
print(feature_weights)

2023-10-26 16:08:03.359014: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


KeyboardInterrupt: 

In [None]:
# Predict for both training and testing data
train_predictions = scaler.inverse_transform(model.predict(X_train, batch_size=batch_size_value))
test_predictions = scaler.inverse_transform(model.predict(X_test, batch_size=batch_size_value))
y_train = scaler.inverse_transform(y_train)
y_test = scaler.inverse_transform(y_test)
features_list = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
for feature_index, feature_name in enumerate(data.columns):
    # Extracting data for the specific feature
    y_train_feature = y_train[:, feature_index]
    y_test_feature = y_test[:, feature_index]
    train_predictions_feature = train_predictions[:, feature_index]
    test_predictions_feature = test_predictions[:, feature_index]

    # Evaluating the model for this feature
    evaluate_model(y_train_feature, train_predictions_feature, y_test_feature, test_predictions_feature, feature_name, feature_name)

    # Plotting the results for this feature
    plot_predictions(y_train_feature, train_predictions_feature, y_test_feature, test_predictions_feature, feature_name, feature_name)


In [None]:
final_importance_values = dict(zip(data.columns, feature_weights))
final_importance_values

In [None]:
importance_values = np.array(list(final_importance_values.values()))
importance_values

# Run this if we want a arbitrage strategy
Each weight will be -1 to 1, the sum is 0

In [None]:
# 1. Scale the values to [-1, 1]
arbitrage_scaled_importance = 2 * (importance_values - np.min(importance_values)) / (np.max(importance_values) - np.min(importance_values)) - 1

# 2. Ensure the sum is zero
arbitrage_normalized_importance = arbitrage_scaled_importance - np.mean(arbitrage_scaled_importance)

# Convert back to dictionary
arbitrage_ticker_to_importance = dict(zip(final_importance_values.keys(), arbitrage_normalized_importance))

print(arbitrage_ticker_to_importance)

# Run this instead if we want a normal strategy
Each weight will be 0 to 1, the sum is 1

In [None]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))  # subtract max to avoid potential overflow
    return e_x / e_x.sum(axis=0)

# Convert the importance values to probabilities using softmax
probabilities = softmax(importance_values)

# Convert back to dictionary
normalized_ticker_to_importance = dict(zip(final_importance_values.keys(), probabilities))

print(normalized_ticker_to_importance)

In [None]:
def plot_importance(normalized_ticker_to_importance = normalized_ticker_to_importance, title='Normalized Importance Values'):
# Split the tickers and importance values based on positive and negative values
    long_positions = {k: v for k, v in normalized_ticker_to_importance.items() if v > 0}
    short_positions = {k: v for k, v in normalized_ticker_to_importance.items() if v <= 0}

    # Sort the positions for better visualization
    sorted_long = dict(sorted(long_positions.items(), key=lambda item: item[1], reverse=True))
    sorted_short = dict(sorted(short_positions.items(), key=lambda item: item[1]))

    # Create bar charts
    fig, ax = plt.subplots(figsize=(12, 7))

    # Positive cluster
    bars_long = ax.bar(sorted_long.keys(), sorted_long.values(), color='g', label='Long')

    # Negative cluster
    bars_short = ax.bar(sorted_short.keys(), sorted_short.values(), color='r', label='Short')

    # Rotate x-tick labels for better readability
    plt.xticks(rotation=45, ha='right')

    # Annotate the bars
    for bar in bars_long:
        yval = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2, yval + 0.01, round(yval, 3), ha='center', va='bottom', fontsize=9)

    for bar in bars_short:
        yval = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2, yval - 0.02, round(yval, 3), ha='center', va='top', fontsize=9)

    ax.set_title(title)
    ax.set_ylabel('Importance Value')
    ax.set_xlabel('Ticker')
    ax.legend()

    plt.tight_layout()
    plt.show()


In [None]:
# Plot the arbitrage importance values
plot_importance(arbitrage_ticker_to_importance, title='Arbitrage Importance Values')

In [None]:
# Plot the importance values
plot_importance(normalized_ticker_to_importance, title='Normalized Importance Values')

In [None]:
spy_data = yf.download('SPY')
spy_monthly = spy_data.resample('M').last()
spy_monthly

In [None]:
all_data

In [None]:
# Construct the Portfolio and Backtest
def build_portfolio(normalized_ticker_to_importance=normalized_ticker_to_importance, strategy='Normal'):
    portfolio_returns = pd.DataFrame()
    for ticker, importance in normalized_ticker_to_importance.items():
        data = all_data[ticker].set_index('Date')
        data['Returns'] = data['Adj Close'].pct_change().fillna(0)
        portfolio_returns[ticker] = data['Returns'] * importance
    portfolio_returns['Portfolio'] = portfolio_returns.sum(axis=1)
    spy_monthly['SPY Returns'] = spy_monthly['Adj Close'].pct_change().fillna(0)
    # Cumulative Returns
    portfolio_returns['Cumulative Portfolio'] = (portfolio_returns['Portfolio'] + 1).cumprod() - 1
    spy_monthly['Cumulative SPY'] = (spy_monthly['SPY Returns'] + 1).cumprod() - 1
    combined = pd.concat([portfolio_returns['Cumulative Portfolio'], spy_monthly['Cumulative SPY']], axis=1).dropna()
    print(combined)
    # Plot
    plt.figure(figsize=(14,7))
    combined['Cumulative Portfolio'].plot(label="Portfolio")
    combined['Cumulative SPY'].plot(label="SPY")
    plt.legend()
    plt.title(strategy + " Portfolio vs. SPY Cumulative Returns")
    plt.show()

In [None]:
# Build the portfolio for arbitrage strategy
build_portfolio(arbitrage_ticker_to_importance, strategy='Arbitrage')

In [None]:
# Build the portfolio for the Normal strategy
build_portfolio(normalized_ticker_to_importance, strategy='Normal')