## 1. Data Processing for Time-series with Lag deatures.

In [1]:
import pandas as pd

data = pd.read_parquet('./stock_data/Cleaned_Stock_Data.parquet')
data_with_indicators = pd.read_parquet('./stock_data/Stock_Data_with_Indicators.parquet')

In [2]:
data.dtypes

Date      datetime64[ns]
Open             float64
High             float64
Low              float64
Close            float64
Volume             int64
dtype: object

In [3]:
# Create lag features (previous day's Close price)
data['Close_Lag_1'] = data['Close'].shift(1)
data['Close_Lag_2'] = data['Close'].shift(2)

data_with_indicators['Close_Lag_1'] = data_with_indicators['Close'].shift(1)
data_with_indicators['Close_Lag_2'] = data_with_indicators['Close'].shift(2)

# Drop NaN values after shifting
data.dropna(inplace=True)
data_with_indicators.dropna(inplace=True)

# Save processed data
data.to_parquet("./stock_data/Processed_Stock_Data.parquet")
data_with_indicators.to_parquet("./stock_data/Processed_Stock_Data_with_Indicators.parquet")

## 2. ARIMA

In [4]:
from statsmodels.tsa.stattools import adfuller

# Perform Augmented Dickey-Fuller Test
result = adfuller(data['Close'])
print(f"ADF Statistic: {result[0]}")
print(f"p-value: {result[1]}")

# If p-value > 0.05, data is not stationary
if result[1] > 0.05:
    print("Data is NOT stationary. Differencing is needed.")
else:
    print("Data is stationary. Ready for ARIMA.")

ADF Statistic: 1.7031519414486895
p-value: 0.9981362823257245
Data is NOT stationary. Differencing is needed.


In [5]:
data['Close_Diff'] = data['Close'].diff().dropna()

In [6]:
# Set Date as Index for ARIMA and LSTM
data.set_index('Date', inplace=True)

In [7]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Close_Lag_1,Close_Lag_2,Close_Diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-06,6.451466,6.477045,6.342226,6.348846,552160000,6.451467,6.440331,
2010-01-07,6.372319,6.379843,6.291067,6.33711,477131200,6.348846,6.451467,-0.011737
2010-01-08,6.328681,6.37984,6.291366,6.379238,447610800,6.33711,6.348846,0.042129
2010-01-11,6.403918,6.409937,6.273011,6.322967,462229600,6.379238,6.33711,-0.056272
2010-01-12,6.295279,6.312734,6.211921,6.251042,594459600,6.322967,6.379238,-0.071925


In [8]:
from statsmodels.tsa.arima.model import ARIMA

# Define ARIMA model (p=1, d=1, q=1 as initial guess)
model = ARIMA(data['Close_Diff'].dropna(), order=(1, 1, 1))

# Fit the model
model_fit = model.fit()

# Print summary
print(model_fit.summary())

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                               SARIMAX Results                                
Dep. Variable:             Close_Diff   No. Observations:                 3771
Model:                 ARIMA(1, 1, 1)   Log Likelihood               -7078.643
Date:                Thu, 27 Feb 2025   AIC                          14163.286
Time:                        16:41:08   BIC                          14181.991
Sample:                             0   HQIC                         14169.936
                               - 3771                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.0130      0.008     -1.531      0.126      -0.030       0.004
ma.L1         -0.9989      0.001  -1374.090      0.000      -1.000      -0.998
sigma2         2.4986      0.023    106.377      0.0

In [9]:
# Forecast next 10 days
forecast = model_fit.forecast(steps=10)

# Convert differenced forecast back to normal scale
last_close = data['Close'].iloc[-1]
forecast = last_close + forecast.cumsum()

# Create a date index for forecast
forecast_dates = pd.date_range(start=data.index[-1], periods=10, freq='B')
forecast_df = pd.DataFrame({'Date': forecast_dates, 'Forecast': forecast.values})

# Display corrected forecast
print(forecast_df)

# Save ARIMA predictions to Parquet
forecast_df.to_parquet("./models/ARIMA_Predictions.parquet", index=False)

        Date    Forecast
0 2024-12-31  250.296953
1 2025-01-01  250.423887
2 2025-01-02  250.551145
3 2025-01-03  250.678399
4 2025-01-06  250.805653
5 2025-01-07  250.932907
6 2025-01-08  251.060162
7 2025-01-09  251.187416
8 2025-01-10  251.314670
9 2025-01-13  251.441924


  return get_prediction_index(
  return get_prediction_index(


## 3. Random Forest

#### 3.1 Without Indicators

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

# Load dataset without indicators
data_no_ind = pd.read_parquet("./stock_data/Processed_Stock_Data.parquet")

# Split into features & target
features = ['Close_Lag_1', 'Close_Lag_2', 'Volume']
target = 'Close'

# Train-test split (80% train, 20% test)
train_X, test_X, train_y, test_y = train_test_split(data_no_ind[features], data_no_ind[target], test_size=0.2, shuffle=False)

# Train Random Forest
rf_no_ind = RandomForestRegressor(n_estimators=100, random_state=42)
rf_no_ind.fit(train_X, train_y)

# Predictions
rf_no_ind_preds = rf_no_ind.predict(test_X)

# Evaluate performance
mae_no_ind = mean_absolute_error(test_y, rf_no_ind_preds)
rmse_no_ind = np.sqrt(mean_squared_error(test_y, rf_no_ind_preds))

print(f"Random Forest (No Indicators) - MAE: {mae_no_ind:.4f}, RMSE: {rmse_no_ind:.4f}")

# Save predictions
rf_no_ind_df = pd.DataFrame({"Date": test_X.index, "Actual": test_y, "Predicted": rf_no_ind_preds})
rf_no_ind_df.to_parquet("./models/RandomForest_No_Indicators.parquet", index=False)

Random Forest (No Indicators) - MAE: 13.6947, RMSE: 23.5741


#### 3.2 With Indicators

In [11]:
# Load dataset with indicators
data_with_ind = pd.read_parquet("./stock_data/Processed_Stock_Data_with_Indicators.parquet")

# Define features and target
features_with_ind = ['Close_Lag_1', 'Close_Lag_2', 'Volume', 'RSI', 'EMA_10', 'SMA_10', 'MACD']

# Train-test split
train_X, test_X, train_y, test_y = train_test_split(data_with_ind[features_with_ind], data_with_ind[target], test_size=0.2, shuffle=False)

# Train Random Forest
rf_with_ind = RandomForestRegressor(n_estimators=100, random_state=42)
rf_with_ind.fit(train_X, train_y)

# Predictions
rf_with_ind_preds = rf_with_ind.predict(test_X)

# Evaluate performance
mae_with_ind = mean_absolute_error(test_y, rf_with_ind_preds)
rmse_with_ind = np.sqrt(mean_squared_error(test_y, rf_with_ind_preds))

print(f"Random Forest (With Indicators) - MAE: {mae_with_ind:.4f}, RMSE: {rmse_with_ind:.4f}")

# Save predictions
rf_with_ind_df = pd.DataFrame({"Date": test_X.index, "Actual": test_y, "Predicted": rf_with_ind_preds})
rf_with_ind_df.to_parquet("./models/RandomForest_With_Indicators.parquet", index=False)

Random Forest (With Indicators) - MAE: 15.2692, RMSE: 25.5188


#### 3.3 Results

In [12]:
# Compare MAE & RMSE
print("\nPerformance Comparison:")
print(f"Random Forest (No Indicators) - MAE: {mae_no_ind:.4f}, RMSE: {rmse_no_ind:.4f}")
print(f"Random Forest (With Indicators) - MAE: {mae_with_ind:.4f}, RMSE: {rmse_with_ind:.4f}")

# Which model performs better?
if mae_with_ind < mae_no_ind:
    print("Technical Indicators improved model accuracy!")
else:
    print("Technical Indicators did not improve model accuracy.")


Performance Comparison:
Random Forest (No Indicators) - MAE: 13.6947, RMSE: 23.5741
Random Forest (With Indicators) - MAE: 15.2692, RMSE: 25.5188
Technical Indicators did not improve model accuracy.


## 4. LSTM

#### 1.1 LSTM without Indicators

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load dataset WITHOUT indicators
data_no_ind = pd.read_parquet("./stock_data/Processed_Stock_Data.parquet")

# Define features and target
features = ['Close_Lag_1', 'Close_Lag_2', 'Volume']
target = 'Close'

# Scale data between 0 and 1
scaler = MinMaxScaler()
data_no_ind[features] = scaler.fit_transform(data_no_ind[features])
data_no_ind[target] = scaler.fit_transform(data_no_ind[[target]])

# Convert to numpy arrays for LSTM
X, y = data_no_ind[features].values, data_no_ind[target].values

# Reshape data into 3D (samples, time steps, features)
X = X.reshape((X.shape[0], 1, X.shape[1]))

# Save preprocessed data
np.save("./models/LSTM_X_No_Indicators.npy", X)
np.save("./models/LSTM_y_No_Indicators.npy", y)

print(f"Data Shape for LSTM: {X.shape}, {y.shape}")


Data Shape for LSTM: (3772, 1, 3), (3772,)


In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Load preprocessed LSTM data
X = np.load("./models/LSTM_X_No_Indicators.npy")
y = np.load("./models/LSTM_y_No_Indicators.npy")

# Split data (80% train, 20% test)
split = int(0.8 * len(X))
train_X, test_X = X[:split], X[split:]
train_y, test_y = y[:split], y[split:]

# Define LSTM model
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(1, train_X.shape[2])),
    LSTM(50),
    Dense(1)
])

# Compile model
model.compile(optimizer='adam', loss='mse')

# Train model
model.fit(train_X, train_y, epochs=50, batch_size=16, verbose=1)

# Save trained model
model.save("./models/LSTM_No_Indicators.keras")

  super().__init__(**kwargs)


Epoch 1/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.0171
Epoch 2/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 9.2084e-05
Epoch 3/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 5.5416e-05
Epoch 4/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4.2729e-05
Epoch 5/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.7448e-05
Epoch 6/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.3522e-05
Epoch 7/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.6712e-05
Epoch 8/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.1747e-05
Epoch 9/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.0994e-05
Epoch 10/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [15]:
# Load model
model = tf.keras.models.load_model("./models/LSTM_No_Indicators.keras")

# Make predictions
predictions = model.predict(test_X)

# Inverse transform predictions
predictions = scaler.inverse_transform(predictions)
actuals = scaler.inverse_transform(test_y.reshape(-1, 1))

# Compute MAE and RMSE
mae_lstm = np.mean(np.abs(actuals - predictions))
rmse_lstm = np.sqrt(np.mean((actuals - predictions) ** 2))

print(f"LSTM (No Indicators) - MAE: {mae_lstm:.4f}, RMSE: {rmse_lstm:.4f}")

test_dates = pd.read_parquet("./models/RandomForest_No_Indicators.parquet")["Date"]  # Ensure the dates align

# Create DataFrame for LSTM predictions
lstm_forecast_df = pd.DataFrame({'Date': test_dates, 'Actual': actuals.flatten(), 'Predicted': predictions.flatten()})

# Save to Parquet for visualization
lstm_forecast_df.to_parquet("./models/LSTM_No_Indicators_Predictions.parquet", index=False)

print("\n", lstm_forecast_df.head())  # Confirm data structure

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
LSTM (No Indicators) - MAE: 2.8098, RMSE: 3.5819

    Date      Actual   Predicted
0  3019  175.135437  178.465942
1  3020  174.516296  177.702591
2  3021  178.879898  176.702637
3  3022  176.609650  179.421127
4  3023  171.911819  179.334885


#### 1.2 LSTM with Indicators

In [16]:
# Load dataset with indicators
data_with_ind = pd.read_parquet("./stock_data/Processed_Stock_Data_with_Indicators.parquet")

# Define features & target
features_with_ind = ['Close_Lag_1', 'Close_Lag_2', 'Volume', 'RSI', 'EMA_10', 'SMA_10', 'MACD']
target = 'Close'

# Scale data between 0 and 1
scaler = MinMaxScaler()
data_with_ind[features_with_ind] = scaler.fit_transform(data_with_ind[features_with_ind])
data_with_ind[target] = scaler.fit_transform(data_with_ind[[target]])

# Convert to numpy arrays
X, y = data_with_ind[features_with_ind].values, data_with_ind[target].values

# Reshape for LSTM
X = X.reshape((X.shape[0], 1, X.shape[1]))

# Save processed data
np.save("./models/LSTM_X_With_Indicators.npy", X)
np.save("./models/LSTM_y_With_Indicators.npy", y)

print(f"LSTM Data Shape (With Indicators): {X.shape}, {y.shape}")


LSTM Data Shape (With Indicators): (3772, 1, 7), (3772,)


In [17]:
# Load preprocessed data
X = np.load("./models/LSTM_X_With_Indicators.npy")
y = np.load("./models/LSTM_y_With_Indicators.npy")

# Split data (80% train, 20% test)
split = int(0.8 * len(X))
train_X, test_X = X[:split], X[split:]
train_y, test_y = y[:split], y[split:]

# Define LSTM model
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(1, train_X.shape[2])),
    LSTM(50),
    Dense(1)
])

# Compile model
model.compile(optimizer='adam', loss='mse')

# Train model
model.fit(train_X, train_y, epochs=50, batch_size=16, verbose=1)

# Save trained model
model.save("./models/LSTM_With_Indicators.keras")

Epoch 1/50


  super().__init__(**kwargs)


[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.0157
Epoch 2/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 8.6565e-05
Epoch 3/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4.2085e-05
Epoch 4/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.8413e-05
Epoch 5/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.4837e-05
Epoch 6/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.2661e-05
Epoch 7/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.1737e-05
Epoch 8/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.0073e-05
Epoch 9/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.1213e-05
Epoch 10/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [18]:
# Load model
model = tf.keras.models.load_model("./models/LSTM_With_Indicators.keras")

# Make predictions
predictions = model.predict(test_X)

# Inverse transform predictions
predictions = scaler.inverse_transform(predictions)
actuals = scaler.inverse_transform(test_y.reshape(-1, 1))

# Compute MAE and RMSE
mae_lstm_ind = np.mean(np.abs(actuals - predictions))
rmse_lstm_ind = np.sqrt(np.mean((actuals - predictions) ** 2))

print(f"LSTM (With Indicators) - MAE: {mae_lstm_ind:.4f}, RMSE: {rmse_lstm_ind:.4f}")

# Create DataFrame for LSTM predictions
lstm_forecast_df = pd.DataFrame({'Date': test_dates, 'Actual': actuals.flatten(), 'Predicted': predictions.flatten()})

# Save to Parquet for visualization
lstm_forecast_df.to_parquet("./models/LSTM_with_Indicators_Predictions.parquet", index=False)

print("\n", lstm_forecast_df.head())  # Confirm data structure

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
LSTM (With Indicators) - MAE: 3.1193, RMSE: 3.7604

    Date      Actual   Predicted
0  3019  175.135437  172.804764
1  3020  174.516296  172.332108
2  3021  178.879898  174.396530
3  3022  176.609650  174.171143
4  3023  171.911819  171.336182
