<a href="https://colab.research.google.com/github/LeibGit/-DI_Bootcamp/blob/main/SMP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Extract the zip archive containing the stock data into a working directory.
import zipfile
import os

zip_file_path = "archive.zip"
extracted_dir = "stock_data"

# exist_ok=True creates the directory only when it is missing, without a
# separate "check, then create" step that can race with another process.
os.makedirs(extracted_dir, exist_ok=True)

try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extracted_dir)
except zipfile.BadZipFile as e:
    # The file exists but is not a valid zip archive.
    print(f"An error occured with the zip file: {e}")
except FileNotFoundError as e:
    # The archive was not uploaded / is not in the working directory.
    print(f"File not found: {e}")
else:
    # Only reported when extraction finished without raising.
    print(f"Files extracted in: {extracted_dir}")

In [None]:
# install required packages
!pip install gensim
!pip install spacy
!pip install numpy
!pip install tensorflow

In [None]:

# importing modules
import spacy
import gensim
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, GRU
from keras.optimizers import SGD
import math
from sklearn.metrics import mean_squared_error

In [None]:
# Read a single representative ticker file and display its column names, so the
# metadata columns needed for the master dataframe can be decided.
df = pd.read_csv(filepath_or_buffer="stock_data/stocks/AAPL.csv")
df.columns

In [None]:
# -- EXPECT A 2-Minute COMPUTE TIME --

# Read every per-ticker CSV (ETFs and stocks) into a list of dataframes that is
# combined into one main dataframe afterwards.
from pathlib import Path
import glob

# main paths for both stocks and etfs
main_path_stocks = "stock_data/stocks"
main_path_etfs = "stock_data/etfs"

# Find all files that end in .csv. Sorted so the load order (and therefore the
# row order of the combined dataframe) is the same on every run and platform.
etf_csv_files = sorted(glob.glob(os.path.join(main_path_etfs, "*.csv")))
stocks_csv_files = sorted(glob.glob(os.path.join(main_path_stocks, "*.csv")))


def load_asset_frames(csv_paths, asset_type):
    """Read each CSV in *csv_paths* and tag its rows with ticker and asset type.

    Args:
        csv_paths: Iterable of paths to CSV files, one file per ticker.
        asset_type: Label stored in the "asset_type" column (e.g. "etf", "stock").

    Returns:
        A list with one dataframe per file that could be read. Each dataframe
        gets a "ticker" column (the file name without ".csv") and an
        "asset_type" column.

    Files that cannot be read or parsed are skipped, but a message is printed
    for each one so that missing data does not go unnoticed.
    """
    frames = []
    for csv_path in csv_paths:
        try:
            frame = pd.read_csv(csv_path)
        except (OSError, ValueError) as e:
            # OSError: unreadable/missing file. ValueError covers pandas'
            # EmptyDataError and ParserError as well as decoding errors.
            print(f"Skipping {csv_path}: {e}")
            continue
        frame["ticker"] = Path(csv_path).stem  # ticker symbol = file name minus .csv
        frame["asset_type"] = asset_type
        frames.append(frame)
    return frames


# list of all dataframes: ETFs first, then stocks
all_dfs = load_asset_frames(etf_csv_files, "etf") + load_asset_frames(
    stocks_csv_files, "stock"
)

In [None]:
# Combine the per-file dataframes into one dataframe.
# pd.concat raises a bare "No objects to concatenate" on an empty list, so fail
# early with a message that points at the likely cause instead.
if not all_dfs:
    raise ValueError(
        "No CSV files were loaded; check that the archive was extracted into "
        "'stock_data/etfs' and 'stock_data/stocks'."
    )
combined_df = pd.concat(all_dfs, ignore_index=True)
print("Successfully combined all files into one DataFrame.")
# analyze first 5 rows to confirm
combined_df.head()

In [None]:
# Display the columns of the combined dataframe to confirm its structure.
combined_df.columns

In [None]:
# Add a placeholder column (all NaN) for the next day's predicted close price,
# then display the columns to confirm it was added.
combined_df["Predicted_close"] = np.nan
combined_df.columns

In [None]:
# Print the largest and then the smallest value of the Date column; the range
# is used to decide where to split the data.
print(combined_df["Date"].max())
print(combined_df["Date"].min())

In [None]:
# Ensure Date is in datetime format
combined_df["Date"] = pd.to_datetime(combined_df["Date"])

# Create the target column: the next row's close price *within the same asset*.
# A plain shift(-1) over the whole frame would give the last row of one ticker
# the first close price of whichever ticker was loaded next.
# NOTE(review): assumes each ticker's rows are already in chronological order
# as read from its CSV - confirm, or sort by ["ticker", "Date"] beforehand.
combined_df["Predicted_close"] = (
    combined_df.groupby(["asset_type", "ticker"], sort=False)["Close"].shift(-1)
)

# Drop rows that have no target (this includes the last row of every ticker).
combined_df = combined_df.dropna(subset=["Predicted_close"])

# Split by date: rows up to and including split_date train, later rows test.
split_date = pd.Timestamp('2019-04-02')

train_df = combined_df.loc[combined_df['Date'] <= split_date]
test_df  = combined_df.loc[combined_df['Date'] > split_date]

# Select features and target
features = ["Open", "High", "Low", "Close", "Volume"]

X_train = train_df[features]
y_train = train_df["Predicted_close"]

X_test  = test_df[features]
y_test  = test_df["Predicted_close"]


In [None]:
# Create a scaler that maps each feature to the [0, 1] range.
# It is only instantiated here; it is not fitted or applied in this cell.
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Scale the features to [0, 1] with the MinMaxScaler instantiated earlier; it
# was created but never applied, so raw prices and volumes (which differ by
# many orders of magnitude) were fed straight into the network.
# The scaler is fitted on the training split only, so no test-set statistics
# leak into training.
X_train_scaled = scaler.fit_transform(X_train)

# LSTM layers take 3-D input: (samples, timesteps, values per timestep).
# Each feature column is treated as one timestep holding a single value, which
# matches input_shape=(X_train.shape[1], 1) on the first layer. Reshaping
# explicitly avoids relying on any implicit rank adjustment by Keras.
X_train_seq = X_train_scaled.reshape(-1, X_train.shape[1], 1)
# NOTE(review): before predicting/evaluating, apply scaler.transform(X_test) and
# the same reshape. y_train is left in raw price units; consider scaling it too.

# initialize the LSTM model with Sequential
regressor = Sequential()

# First LSTM layer
regressor.add(
    LSTM(
        units=50,  # neurons => memory cells
        return_sequences=True,  # pass the full sequence on to the next LSTM layer
        input_shape=(X_train.shape[1], 1),  # (timesteps, values per timestep)
    )
)
regressor.add(Dropout(0.2))

# Second LSTM layer
regressor.add(LSTM(units=50, return_sequences=True))
regressor.add(Dropout(0.2))

# Third LSTM layer
regressor.add(LSTM(units=50, return_sequences=True))
regressor.add(Dropout(0.2))

# Fourth LSTM layer: returns only its final output, which feeds the Dense layer
regressor.add(LSTM(units=50))
regressor.add(Dropout(0.2))

# Output layer: a single value, the predicted next close price
regressor.add(Dense(units=1))

# Compiling the model
regressor.compile(optimizer="rmsprop", loss="mean_squared_error")

# Training the model on the scaled, 3-D training features
regressor.fit(X_train_seq, y_train, epochs=50, batch_size=32)