## 1. Data Processing for Time-series with Lag Feature

In [None]:
import pandas as pd

# Two inputs are used throughout: the cleaned price table, and the same
# table extended with technical-indicator columns.
cleaned_path = './stock_data/Cleaned_Stock_Data.parquet'
indicators_path = './stock_data/Stock_Data_with_Indicators.parquet'

data = pd.read_parquet(cleaned_path)
data_with_indicators = pd.read_parquet(indicators_path)

In [None]:
# Display the dtype of every column in the cleaned price table.
data.dtypes

In [None]:
# Add the previous one and two rows' Close values as lag features to both
# tables, then discard incomplete rows.
# NOTE(review): shift() works on row order, so this assumes each frame is a
# single series already sorted chronologically -- TODO confirm.
for frame in (data, data_with_indicators):
    for lag in (1, 2):
        frame[f'Close_Lag_{lag}'] = frame['Close'].shift(lag)
    # dropna() removes rows with a NaN in *any* column, not only the new lag
    # columns (the first `lag` rows are always affected by the shift).
    frame.dropna(inplace=True)

# Persist both processed tables for the modelling sections below.
data.to_parquet("./stock_data/Processed_Stock_Data.parquet")
data_with_indicators.to_parquet("./stock_data/Processed_Stock_Data_with_Indicators.parquet")

## 3. Random Forest

#### 3.1 Without Indicators

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

# Baseline Random Forest: predict Close from its two lags plus Volume.
data_no_ind = pd.read_parquet("./stock_data/Processed_Stock_Data.parquet")

features = ['Close_Lag_1', 'Close_Lag_2', 'Volume']
target = 'Close'

# shuffle=False keeps row order, so the first 80% of rows train the model
# and the last 20% form the hold-out set.
train_X, test_X, train_y, test_y = train_test_split(
    data_no_ind[features],
    data_no_ind[target],
    test_size=0.2,
    shuffle=False,
)

# Fit with a fixed random_state so the forest is reproducible.
rf_no_ind = RandomForestRegressor(n_estimators=100, random_state=42)
rf_no_ind.fit(train_X, train_y)
rf_no_ind_preds = rf_no_ind.predict(test_X)

# Hold-out error metrics.
mae_no_ind = mean_absolute_error(test_y, rf_no_ind_preds)
mse_no_ind = mean_squared_error(test_y, rf_no_ind_preds)
rmse_no_ind = np.sqrt(mse_no_ind)

print(f"Random Forest (No Indicators) - MAE: {mae_no_ind:.4f}, RMSE: {rmse_no_ind:.4f}")

# Store the hold-out predictions; the frame's index is dropped on write, so
# the index values are copied into an explicit "Date" column first.
rf_no_ind_df = pd.DataFrame(
    {
        "Date": test_X.index,
        "Actual": test_y,
        "Predicted": rf_no_ind_preds,
    }
)
rf_no_ind_df.to_parquet("./models/RandomForest_No_Indicators.parquet", index=False)

#### 3.2 With Indicators

In [None]:
# Random Forest on the indicator-enriched table; `target` is the name set in
# the "Without Indicators" cell.
data_with_ind = pd.read_parquet("./stock_data/Processed_Stock_Data_with_Indicators.parquet")

# NOTE(review): if RSI/EMA_10/SMA_10/MACD are computed from the same row's
# Close, they leak the target into the features -- verify how the indicator
# file was built.
features_with_ind = ['Close_Lag_1', 'Close_Lag_2', 'Volume', 'RSI', 'EMA_10', 'SMA_10', 'MACD']

# Ordered 80/20 split (no shuffling).
train_X, test_X, train_y, test_y = train_test_split(
    data_with_ind[features_with_ind],
    data_with_ind[target],
    test_size=0.2,
    shuffle=False,
)

# Same hyper-parameters and seed as the no-indicator model so the two runs
# differ only in their feature set.
rf_with_ind = RandomForestRegressor(n_estimators=100, random_state=42)
rf_with_ind.fit(train_X, train_y)
rf_with_ind_preds = rf_with_ind.predict(test_X)

# Hold-out error metrics.
mae_with_ind = mean_absolute_error(test_y, rf_with_ind_preds)
mse_with_ind = mean_squared_error(test_y, rf_with_ind_preds)
rmse_with_ind = np.sqrt(mse_with_ind)

print(f"Random Forest (With Indicators) - MAE: {mae_with_ind:.4f}, RMSE: {rmse_with_ind:.4f}")

# Store the hold-out predictions with the index values as a "Date" column.
rf_with_ind_df = pd.DataFrame(
    {
        "Date": test_X.index,
        "Actual": test_y,
        "Predicted": rf_with_ind_preds,
    }
)
rf_with_ind_df.to_parquet("./models/RandomForest_With_Indicators.parquet", index=False)

#### 3.3 Results

In [None]:
# Side-by-side summary of the two Random Forest runs.
print("\nPerformance Comparison:")
print(f"Random Forest (No Indicators) - MAE: {mae_no_ind:.4f}, RMSE: {rmse_no_ind:.4f}")
print(f"Random Forest (With Indicators) - MAE: {mae_with_ind:.4f}, RMSE: {rmse_with_ind:.4f}")

# The verdict is based on MAE only; a tie counts as "no improvement".
verdict = (
    "Technical Indicators improved model accuracy!"
    if mae_with_ind < mae_no_ind
    else "Technical Indicators did not improve model accuracy."
)
print(verdict)

## 4. LSTM

#### 4.1 LSTM without Indicators

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Prepare the no-indicator dataset for the LSTM: scale, reshape to 3-D, save.
data_no_ind = pd.read_parquet("./stock_data/Processed_Stock_Data.parquet")

# Define features and target
features = ['Close_Lag_1', 'Close_Lag_2', 'Volume']
target = 'Close'

# Row boundary of the chronological 80/20 split. It must stay in sync with
# `split = int(0.8 * len(X))` in the training cell.
n_train_rows = int(0.8 * len(data_no_ind))

# Separate scalers for features and target. Both are fitted on the training
# rows only, so the min/max of the test period never leaks into training.
# Test-period values can therefore fall slightly outside [0, 1].
feature_scaler = MinMaxScaler()
# The target scaler keeps the name `scaler` because the evaluation cell calls
# `scaler.inverse_transform` to map predictions back to price units.
scaler = MinMaxScaler()

feature_scaler.fit(data_no_ind[features].iloc[:n_train_rows])
scaler.fit(data_no_ind[[target]].iloc[:n_train_rows])

data_no_ind[features] = feature_scaler.transform(data_no_ind[features])
data_no_ind[target] = scaler.transform(data_no_ind[[target]]).ravel()

# Convert to numpy arrays for LSTM
X, y = data_no_ind[features].values, data_no_ind[target].values

# Reshape data into 3D (samples, time steps, features); each sample is a
# single time step whose features already contain the lagged closes.
X = X.reshape((X.shape[0], 1, X.shape[1]))

# Save preprocessed data
np.save("./models/LSTM_X_No_Indicators.npy", X)
np.save("./models/LSTM_y_No_Indicators.npy", y)

print(f"Data Shape for LSTM: {X.shape}, {y.shape}")


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Train the no-indicator LSTM on the first 80% of the saved arrays.
X = np.load("./models/LSTM_X_No_Indicators.npy")
y = np.load("./models/LSTM_y_No_Indicators.npy")

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling are repeatable between runs, mirroring random_state=42 on
# the Random Forest models. Some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Split data (80% train, 20% test) in row order -- no shuffling across the
# train/test boundary.
split = int(0.8 * len(X))
train_X, test_X = X[:split], X[split:]
train_y, test_y = y[:split], y[split:]

# Two stacked LSTM layers followed by a single-unit regression head.
# The first layer returns the full sequence so the second LSTM receives 3-D input.
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(1, train_X.shape[2])),
    LSTM(50),
    Dense(1)
])

# Compile model
model.compile(optimizer='adam', loss='mse')

# Train model
model.fit(train_X, train_y, epochs=50, batch_size=16, verbose=1)

# Save trained model
model.save("./models/LSTM_No_Indicators.keras")

In [None]:
# Evaluate the no-indicator LSTM on the hold-out rows and save its forecast.
model = tf.keras.models.load_model("./models/LSTM_No_Indicators.keras")

# Make predictions on the hold-out slice produced by the training cell.
predictions = model.predict(test_X)

# Map scaled values back to price units. `scaler` is the object most
# recently fitted on the 'Close' column in the preprocessing cell.
predictions = scaler.inverse_transform(predictions)
actuals = scaler.inverse_transform(test_y.reshape(-1, 1))

# Compute MAE and RMSE
errors = actuals - predictions
mae_lstm = np.mean(np.abs(errors))
rmse_lstm = np.sqrt(np.mean(errors ** 2))

print(f"LSTM (No Indicators) - MAE: {mae_lstm:.4f}, RMSE: {rmse_lstm:.4f}")

# Take the dates from the LSTM dataset's own index instead of re-reading the
# Random Forest output file, so the labels are guaranteed to belong to these
# rows and this section does not depend on another model's artefact.
test_dates = data_no_ind.index[split:]
assert len(test_dates) == len(actuals), "date labels and predictions are misaligned"

# Create DataFrame for LSTM predictions
lstm_forecast_df = pd.DataFrame({'Date': test_dates, 'Actual': actuals.flatten(), 'Predicted': predictions.flatten()})

# Save to Parquet for visualization
lstm_forecast_df.to_parquet("./models/LSTM_No_Indicators_Predictions.parquet", index=False)

print("\n", lstm_forecast_df.head())  # Confirm data structure

#### 4.2 LSTM with Indicators

In [None]:
# Prepare the indicator-enriched dataset for the LSTM: scale, reshape, save.
data_with_ind = pd.read_parquet("./stock_data/Processed_Stock_Data_with_Indicators.parquet")

# Define features & target
features_with_ind = ['Close_Lag_1', 'Close_Lag_2', 'Volume', 'RSI', 'EMA_10', 'SMA_10', 'MACD']
target = 'Close'

# Row boundary of the chronological 80/20 split. It must stay in sync with
# `split = int(0.8 * len(X))` in the training cell.
n_train_rows = int(0.8 * len(data_with_ind))

# Separate scalers for features and target. Both are fitted on the training
# rows only, so the min/max of the test period never leaks into training.
# Test-period values can therefore fall slightly outside [0, 1].
feature_scaler = MinMaxScaler()
# The target scaler keeps the name `scaler` because the evaluation cell calls
# `scaler.inverse_transform` to map predictions back to price units.
scaler = MinMaxScaler()

feature_scaler.fit(data_with_ind[features_with_ind].iloc[:n_train_rows])
scaler.fit(data_with_ind[[target]].iloc[:n_train_rows])

data_with_ind[features_with_ind] = feature_scaler.transform(data_with_ind[features_with_ind])
data_with_ind[target] = scaler.transform(data_with_ind[[target]]).ravel()

# Convert to numpy arrays
X, y = data_with_ind[features_with_ind].values, data_with_ind[target].values

# Reshape for LSTM: (samples, 1 time step, features)
X = X.reshape((X.shape[0], 1, X.shape[1]))

# Save processed data
np.save("./models/LSTM_X_With_Indicators.npy", X)
np.save("./models/LSTM_y_With_Indicators.npy", y)

print(f"LSTM Data Shape (With Indicators): {X.shape}, {y.shape}")


In [None]:
# Train the with-indicators LSTM on the first 80% of the saved arrays.
X = np.load("./models/LSTM_X_With_Indicators.npy")
y = np.load("./models/LSTM_y_With_Indicators.npy")

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling are repeatable between runs, mirroring random_state=42 on
# the Random Forest models. Some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Split data (80% train, 20% test) in row order -- no shuffling across the
# train/test boundary.
split = int(0.8 * len(X))
train_X, test_X = X[:split], X[split:]
train_y, test_y = y[:split], y[split:]

# Same architecture as the no-indicator model; only the feature count
# (train_X.shape[2]) differs, so the two runs are directly comparable.
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(1, train_X.shape[2])),
    LSTM(50),
    Dense(1)
])

# Compile model
model.compile(optimizer='adam', loss='mse')

# Train model
model.fit(train_X, train_y, epochs=50, batch_size=16, verbose=1)

# Save trained model
model.save("./models/LSTM_With_Indicators.keras")

In [None]:
# Evaluate the with-indicators LSTM on the hold-out rows and save its forecast.
model = tf.keras.models.load_model("./models/LSTM_With_Indicators.keras")

# Make predictions on the hold-out slice produced by the training cell.
predictions = model.predict(test_X)

# Map scaled values back to price units. `scaler` is the object most
# recently fitted on the 'Close' column in the preprocessing cell.
predictions = scaler.inverse_transform(predictions)
actuals = scaler.inverse_transform(test_y.reshape(-1, 1))

# Compute MAE and RMSE
errors = actuals - predictions
mae_lstm_ind = np.mean(np.abs(errors))
rmse_lstm_ind = np.sqrt(np.mean(errors ** 2))

print(f"LSTM (With Indicators) - MAE: {mae_lstm_ind:.4f}, RMSE: {rmse_lstm_ind:.4f}")

# Use the dates of *this* dataset's hold-out rows. The with-indicators table
# goes through its own dropna(), so its rows need not match the no-indicator
# table. Reusing that section's `test_dates` could mislabel the forecast or
# fail with a length mismatch.
test_dates_with_ind = data_with_ind.index[split:]
assert len(test_dates_with_ind) == len(actuals), "date labels and predictions are misaligned"

# Create DataFrame for LSTM predictions
lstm_forecast_df = pd.DataFrame({'Date': test_dates_with_ind, 'Actual': actuals.flatten(), 'Predicted': predictions.flatten()})

# Save to Parquet for visualization
lstm_forecast_df.to_parquet("./models/LSTM_with_Indicators_Predictions.parquet", index=False)

print("\n", lstm_forecast_df.head())  # Confirm data structure