In [63]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import pacf

def calculate_stock_features(df, window=30, max_lag=3):
    """Add moving-average, lag and rolling auto-correlation features to a price frame.

    The input is copied, sorted by ``DateTime`` and re-indexed from 0; the
    caller's frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``DateTime`` column (used only for sorting) and a
        numeric ``Close`` column.
    window : int, default 30
        Length of the rolling window used for the ACVF, ACF, PACF,
        correlation and covariance features.
    max_lag : int, default 3
        Highest lag for ACVF/ACF (lags ``0..max_lag``) and PACF
        (lags ``1..max_lag``).

    Returns
    -------
    pandas.DataFrame
        The sorted copy with these columns appended, in this order:
        ``EMA_5``, ``EMA_20``, ``WMA_5``, ``WMA_20``, ``Close_Lag_{3,9,27}`` /
        ``EMA_5_Lag_{3,9,27}``, ``ACVF_Lag_k``, ``ACF_Lag_k``, ``PACF_Lag_k``,
        ``Corr_Close_EMA5``, ``Cov_Close_EMA5``. Rows that do not yet have a
        full window hold NaN in the rolling columns.

    Raises
    ------
    ValueError
        If ``max_lag`` is negative or not smaller than ``window``.
    """
    if not 0 <= max_lag < window:
        raise ValueError(
            f"max_lag must satisfy 0 <= max_lag < window, got max_lag={max_lag}, window={window}"
        )

    df = df.copy()

    # Ensure the DataFrame is sorted by date
    df = df.sort_values(by='DateTime').reset_index(drop=True)
    close = df['Close']

    def _weighted_ma(span):
        """Rolling mean with linearly increasing weights 1..span (newest bar weighs most)."""
        weights = np.arange(1, span + 1)
        return close.rolling(span).apply(
            lambda x: np.dot(x, weights) / weights.sum(), raw=True
        )

    def _rolling_lagged(stat, lag):
        """Rolling `stat` (np.cov / np.corrcoef) between each window and its lag-shifted self."""
        def _one_window(x):
            # For lag == 0 both slices are the full window.
            return stat(x[:len(x) - lag], x[lag:])[0, 1]
        return close.rolling(window).apply(_one_window, raw=True)

    # 1. Exponential Moving Average (EMA)
    df['EMA_5'] = close.ewm(span=5, adjust=False).mean()
    df['EMA_20'] = close.ewm(span=20, adjust=False).mean()

    # 2. Weighted Moving Average (WMA)
    df['WMA_5'] = _weighted_ma(5)
    df['WMA_20'] = _weighted_ma(20)

    # 3. Lagged features
    for lag in [3, 9, 27]:
        df[f'Close_Lag_{lag}'] = close.shift(lag)
        df[f'EMA_5_Lag_{lag}'] = df['EMA_5'].shift(lag)

    # 4. Auto-Covariance Function (ACVF)
    for lag in range(max_lag + 1):
        df[f'ACVF_Lag_{lag}'] = _rolling_lagged(np.cov, lag)

    # 5. Auto-Correlation Function (ACF)
    for lag in range(max_lag + 1):
        df[f'ACF_Lag_{lag}'] = _rolling_lagged(np.corrcoef, lag)

    # 6. Partial Auto-Correlation Function (PACF)
    # One pacf() call per window fills every lag column at once, instead of
    # recomputing the same PACF vector separately for each lag.
    close_values = close.to_numpy(dtype=float)
    pacf_table = np.full((len(close_values), max_lag), np.nan)
    if max_lag >= 1:
        for end in range(window, len(close_values) + 1):
            chunk = close_values[end - window:end]
            # Like rolling().apply, leave NaN where the window has missing/inf values.
            if np.isfinite(chunk).all():
                # Drop lag 0 (always 1); copy so pacf never sees a view of our data.
                pacf_table[end - 1] = pacf(chunk.copy(), nlags=max_lag, method='yw')[1:]
    for lag in range(1, max_lag + 1):
        df[f'PACF_Lag_{lag}'] = pacf_table[:, lag - 1]

    # 7. Correlation and Covariance between Close and EMA
    df['Corr_Close_EMA5'] = close.rolling(window).corr(df['EMA_5'])
    df['Cov_Close_EMA5'] = close.rolling(window).cov(df['EMA_5'])

    return df


In [86]:
# Load the raw AAPL 15-minute candle data from CSV and preview the first rows.
# NOTE(review): the file's unnamed first column is read in as a regular
# 'Unnamed: 0' column (visible in the preview) — confirm whether it should be
# consumed as the index instead, since every non-target column later becomes a
# model feature.
df = pd.read_csv('/content/AAPL_15min_candles_2022_2024.csv')
df.head()

Unnamed: 0,DateTime,Open,High,Low,Close,Volume
0,2022-01-03 04:00:00,175.3921,175.4708,174.9001,175.215,9876
1,2022-01-03 04:15:00,175.215,175.2543,174.9493,174.9592,9992
2,2022-01-03 04:30:00,175.0084,175.028,174.9001,175.028,9560
3,2022-01-03 04:45:00,175.0182,175.2347,174.9493,175.215,10339
4,2022-01-03 05:00:00,175.215,175.2937,175.1264,175.2248,10041


In [87]:
# Rebind df to the feature-enriched frame returned by calculate_stock_features()
# and preview the first 30 rows (the rolling-window columns are NaN this early).
df = calculate_stock_features(df)
df.head(30)

Unnamed: 0,DateTime,Open,High,Low,Close,Volume,EMA_5,EMA_20,WMA_5,WMA_20,...,ACVF_Lag_3,ACF_Lag_0,ACF_Lag_1,ACF_Lag_2,ACF_Lag_3,PACF_Lag_1,PACF_Lag_2,PACF_Lag_3,Corr_Close_EMA5,Cov_Close_EMA5
0,2022-01-03 04:00:00,175.3921,175.4708,174.9001,175.215,9876,175.215,175.215,,,...,,,,,,,,,,
1,2022-01-03 04:15:00,175.215,175.2543,174.9493,174.9592,9992,175.129733,175.190638,,,...,,,,,,,,,,
2,2022-01-03 04:30:00,175.0084,175.028,174.9001,175.028,9560,175.095822,175.175149,,,...,,,,,,,,,,
3,2022-01-03 04:45:00,175.0182,175.2347,174.9493,175.215,10339,175.135548,175.178944,,,...,,,,,,,,,,
4,2022-01-03 05:00:00,175.215,175.2937,175.1264,175.2248,10041,175.165299,175.183311,175.14676,,...,,,,,,,,,,
5,2022-01-03 05:15:00,175.2347,175.3527,175.1855,175.3134,12027,175.214666,175.195701,175.208427,,...,,,,,,,,,,
6,2022-01-03 05:30:00,175.3035,175.3527,175.2051,175.2347,6586,175.221344,175.199415,175.2373,,...,,,,,,,,,,
7,2022-01-03 05:45:00,175.2839,175.3035,175.2642,175.3035,6055,175.248729,175.209328,175.27074,,...,,,,,,,,,,
8,2022-01-03 06:00:00,175.3035,175.5495,175.2937,175.4708,12512,175.322753,175.23423,175.34158,,...,,,,,,,,,,
9,2022-01-03 06:15:00,175.4511,175.579,175.4019,175.5594,3963,175.401635,175.265199,175.4249,,...,,,,,,,,,,


In [66]:
# Preview the first 50 rows of the current df.
df.head(50)

Unnamed: 0,DateTime,Open,High,Low,Close,Volume,EMA_5,EMA_20,WMA_5,WMA_20,...,ACVF_Lag_3,ACF_Lag_0,ACF_Lag_1,ACF_Lag_2,ACF_Lag_3,PACF_Lag_1,PACF_Lag_2,PACF_Lag_3,Corr_Close_EMA5,Cov_Close_EMA5
0,2022-01-03 04:00:00,175.3921,175.4708,174.9001,175.215,9876,175.215,175.215,,,...,,,,,,,,,,
1,2022-01-03 04:15:00,175.215,175.2543,174.9493,174.9592,9992,175.129733,175.190638,,,...,,,,,,,,,,
2,2022-01-03 04:30:00,175.0084,175.028,174.9001,175.028,9560,175.095822,175.175149,,,...,,,,,,,,,,
3,2022-01-03 04:45:00,175.0182,175.2347,174.9493,175.215,10339,175.135548,175.178944,,,...,,,,,,,,,,
4,2022-01-03 05:00:00,175.215,175.2937,175.1264,175.2248,10041,175.165299,175.183311,175.14676,,...,,,,,,,,,,
5,2022-01-03 05:15:00,175.2347,175.3527,175.1855,175.3134,12027,175.214666,175.195701,175.208427,,...,,,,,,,,,,
6,2022-01-03 05:30:00,175.3035,175.3527,175.2051,175.2347,6586,175.221344,175.199415,175.2373,,...,,,,,,,,,,
7,2022-01-03 05:45:00,175.2839,175.3035,175.2642,175.3035,6055,175.248729,175.209328,175.27074,,...,,,,,,,,,,
8,2022-01-03 06:00:00,175.3035,175.5495,175.2937,175.4708,12512,175.322753,175.23423,175.34158,,...,,,,,,,,,,
9,2022-01-03 06:15:00,175.4511,175.579,175.4019,175.5594,3963,175.401635,175.265199,175.4249,,...,,,,,,,,,,


In [67]:
# Summarise column dtypes and non-null counts to see how many leading rows
# each engineered feature leaves empty.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33342 entries, 0 to 33341
Data columns (total 29 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   DateTime         33342 non-null  object 
 1   Open             33342 non-null  float64
 2   High             33342 non-null  float64
 3   Low              33342 non-null  float64
 4   Close            33342 non-null  float64
 5   Volume           33342 non-null  int64  
 6   EMA_5            33342 non-null  float64
 7   EMA_20           33342 non-null  float64
 8   WMA_5            33338 non-null  float64
 9   WMA_20           33323 non-null  float64
 10  Close_Lag_3      33339 non-null  float64
 11  EMA_5_Lag_3      33339 non-null  float64
 12  Close_Lag_9      33333 non-null  float64
 13  EMA_5_Lag_9      33333 non-null  float64
 14  Close_Lag_27     33315 non-null  float64
 15  EMA_5_Lag_27     33315 non-null  float64
 16  ACVF_Lag_0       33313 non-null  float64
 17  ACVF_Lag_1  

# Creating an H.BLSTM model for a multivariate process

In [68]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout

In [88]:
def prepare_data(df, target_col='Close', sequence_length=30):
    # Convert DateTime column to pandas datetime format
    df['DateTime'] = pd.to_datetime(df['DateTime'])

    # Extract time features (hour, day_of_week, month)
    df['hour'] = df['DateTime'].dt.hour
    df['day_of_week'] = df['DateTime'].dt.dayofweek
    df['month'] = df['DateTime'].dt.month

    # Apply cyclical encoding
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    # Drop original DateTime and non-numeric columns
    df.drop(columns=['DateTime', 'hour', 'day_of_week', 'month'], inplace=True)

    # Fill missing values
    df.fillna(method="ffill", inplace=True)  # Forward fill missing values
    df.fillna(method="bfill", inplace=True)  # Backward fill (if needed)

    # Separate features and target
    feature_cols = df.columns[df.columns != target_col]  # All columns except target
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    # Scale features
    df_scaled_X = pd.DataFrame(scaler_X.fit_transform(df[feature_cols]), columns=feature_cols)

    # Scale target separately
    df_scaled_y = scaler_y.fit_transform(df[[target_col]])  # Must be 2D

    # Convert to supervised learning format (3D tensor for LSTM)
    X, y = [], []
    for i in range(len(df) - sequence_length):
        X.append(df_scaled_X.iloc[i:i + sequence_length].values)  # Features over sequence
        y.append(df_scaled_y[i + sequence_length])  # Target

    X, y = np.array(X), np.array(y).reshape(-1, 1)  # Ensure y is 2D

    # Train-test split (70% train, 30% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

    return X_train, y_train, X_test, y_test, scaler_X, scaler_y

In [83]:
def build_hblstm_model(input_shape):
    """Build and compile a two-layer bidirectional LSTM regressor.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample; the caller passes
        ``(sequence_length, n_features)``.

    Returns
    -------
    A compiled ``Sequential`` model: BiLSTM(64, returning sequences) ->
    Dropout(0.2) -> BiLSTM(32) -> Dropout(0.2) -> Dense(16, relu) -> Dense(1),
    using the Adam optimizer, MSE loss and an MAE metric.
    """
    model = Sequential([
        # Declare the input explicitly instead of passing `input_shape=` to the
        # Bidirectional wrapper, which Keras warns about for Sequential models.
        tf.keras.Input(shape=input_shape),
        Bidirectional(LSTM(64, activation='relu', return_sequences=True)),
        Dropout(0.2),
        Bidirectional(LSTM(32, activation='relu')),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1),  # Regression output (predicting Close price)
    ])

    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [84]:
def train_model(model, X_train, y_train, batch_size=32, epochs=50):
    """Fit ``model`` on the training arrays and hand back the model and fit result.

    ``model.fit`` is called with ``validation_split=0.2`` and ``verbose=1`` in
    addition to the given ``batch_size`` and ``epochs``.

    Returns
    -------
    tuple
        ``(model, history)`` where ``history`` is whatever ``model.fit`` returned.
    """
    fit_options = {
        'validation_split': 0.2,
        'batch_size': batch_size,
        'epochs': epochs,
        'verbose': 1,
    }
    training_log = model.fit(X_train, y_train, **fit_options)
    return model, training_log


We train the LSTM model on all 30k records in the given dataset, which serve as both the training and the validation data. For actual testing, a new dataset has to be brought in.

In [90]:
# Hand prepare_data() a copy so the global df keeps its DateTime column,
# which the plotting cell reads later.
df_copy = df.copy()
# Build the scaled train/test sequence tensors and keep the fitted scalers.
X_train, y_train, X_test, y_test, scaler_X, scaler_y = prepare_data(df_copy)

# Define model: the input shape is the last two dimensions of the training tensor.
input_shape = (X_train.shape[1], X_train.shape[2])
model = build_hblstm_model(input_shape)


  df.fillna(method="ffill", inplace=True)  # Forward fill missing values
  df.fillna(method="bfill", inplace=True)  # Backward fill (if needed)
  super().__init__(**kwargs)


In [91]:
# Train model
# Train the model using train_model()'s default batch size and epoch count;
# keep the returned history for later inspection.
trained_model, history = train_model(model, X_train, y_train)

Epoch 1/50
[1m583/583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 24ms/step - loss: 0.0107 - mae: 0.0636 - val_loss: 0.0064 - val_mae: 0.0746
Epoch 2/50
[1m583/583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - loss: 0.0010 - mae: 0.0240 - val_loss: 0.0034 - val_mae: 0.0545
Epoch 3/50
[1m583/583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - loss: 6.2077e-04 - mae: 0.0190 - val_loss: 0.0040 - val_mae: 0.0593
Epoch 4/50
[1m583/583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - loss: 4.8572e-04 - mae: 0.0167 - val_loss: 0.0029 - val_mae: 0.0462
Epoch 5/50
[1m583/583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - loss: 4.0060e-04 - mae: 0.0151 - val_loss: 0.0018 - val_mae: 0.0371
Epoch 6/50
[1m583/583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - loss: 3.2561e-04 - mae: 0.0138 - val_loss: 0.0029 - val_mae: 0.0482
Epoch 7/50
[1m583/583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [92]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np
import plotly.express as px
import pandas as pd

In [93]:
# Inverse transform predictions to original scale
# Predict on the held-out test windows (output is still in scaled units).
y_pred = trained_model.predict(X_test)
# Undo the target scaling so predictions and truth are in original price units.
y_pred_original = scaler_y.inverse_transform(y_pred.reshape(-1, 1))  # Convert back to original scale
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1))  # Convert true values back to original scale

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step


In [94]:
# Compute error metrics
# Compute error metrics on the original (unscaled) price values
mae = mean_absolute_error(y_test_original, y_pred_original)
rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))  # root of MSE -> same units as the price
r2 = r2_score(y_test_original, y_pred_original)
mape = mean_absolute_percentage_error(y_test_original, y_pred_original) * 100  # Convert to percentage

# Print metrics (two decimal places each)
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")
print(f"MAPE: {mape:.2f}%")

MAE: 7.86
RMSE: 8.77
R-squared: -0.21
MAPE: 4.20%


In [95]:
# Prepare DataFrame for plotting
# Prepare DataFrame for plotting: actual vs. predicted Close, aligned with the
# timestamps of the last len(y_test_original) rows of df.
new_df = pd.DataFrame({
    'DateTime': df.iloc[-len(y_test_original):]['DateTime'],  # Use corresponding DateTime values
    'Actual_Close': y_test_original.flatten(),
    'Predicted_Close': y_pred_original.flatten()
})

# Create interactive plot: actual prices as scatter points
fig = px.scatter(new_df, x='DateTime', y='Actual_Close', title="Stock Market Predictions",
                 labels={'Actual_Close': "Close Price"})

# Add predicted values as a red line on the same axes
fig.add_scatter(x=new_df['DateTime'], y=new_df['Predicted_Close'], mode='lines', name="Predicted Line",
                line=dict(color="red"))

# Show interactive plot
fig.show()