In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("../data/stock_with_sentiment.csv",index_col="date")
df.head()

Unnamed: 0_level_0,Close,Volume,Target,RAV,volatility,Buy_Sell_Strength,Weighted_Strength,Trend,Returns,Log_returns,...,return_2,vol_3,close_z,market_mean_return,pol_mean,pol_sum,pos_count,neg_count,neu_count,has_news
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-02-06,26.495508,174826400,1,250822400.0,1.046604,0.266668,-0.162635,-0.007055,-0.008421,8.064544,...,,,,-0.007545,0.0,0.0,0.0,0.0,0.0,0
2015-02-06,36.218159,34616600,0,50972100.0,2.133874,0.406248,-0.06367,0.022419,-0.000942,10.08721,...,,,,-0.007545,0.0,0.0,0.0,0.0,0.0,0
2015-02-06,0.489517,210524000,0,215585700.0,0.010619,0.225,-0.268543,0.023777,-0.004392,-0.689525,...,-0.008421,1.063699,,-0.007545,0.0,0.0,0.0,0.0,0.0,0
2015-02-06,14.490667,48658500,1,54862930.0,0.608197,0.124639,-0.332912,0.016009,-0.016426,5.38673,...,-0.000942,0.917563,,-0.007545,0.0,0.0,0.0,0.0,0.0,0
2015-02-09,26.671497,155559200,1,224795800.0,1.096022,0.914897,0.28711,0.010177,0.006642,8.13906,...,,0.571613,,0.001259,0.0,0.0,0.0,1.0,0.0,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10804 entries, 2015-02-06 to 2025-10-31
Data columns (total 34 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Close               10804 non-null  float64
 1   Volume              10804 non-null  int64  
 2   Target              10804 non-null  int64  
 3   RAV                 10804 non-null  float64
 4   volatility          10804 non-null  float64
 5   Buy_Sell_Strength   10804 non-null  float64
 6   Weighted_Strength   10804 non-null  float64
 7   Trend               10804 non-null  float64
 8   Returns             10804 non-null  float64
 9   Log_returns         10804 non-null  float64
 10  AAPL                10804 non-null  bool   
 11  MSFT                10804 non-null  bool   
 12  NVDA                10804 non-null  bool   
 13  TSLA                10804 non-null  bool   
 14  market_return       10804 non-null  float64
 15  rel_return          10804 non-null  float64


In [7]:
X = df.drop(['Target','lag_market_return','close_z', 'vol_3', 'return_2', 'return_1',"market_mean_return"], axis="columns")
y = np.array(df['Target'])
print(len(X), len(y))

10804 10804


In [8]:
train_size = int(len(X) * 0.70)
val_size = int(len(X) * 0.85)
X_train, X_val, X_test = X[0:train_size], X[train_size:val_size], X[val_size:len(X)]
y_train, y_val, y_test = y[0:train_size], y[train_size:val_size], y[val_size:len(y)]
print(len(X_train), len(X_test), len(X_val), len(y_train), len(y_test), len(y_val))

7562 1621 1621 7562 1621 1621


In [9]:
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)
X_val_scaled  = scaler.transform(X_val)

In [10]:
def creat_sequence(X, y, time_stamp=5):
    Xs, ys = [], []
    for i in range(len(X) - time_stamp):
        Xs.append(X[i:(i + time_stamp)])
        ys.append(y[i + time_stamp])
    return np.array(Xs), np.array(ys)

In [11]:
time_stamp = 20
features = len(X.columns)
X_train_seq, y_train_seq = creat_sequence(X_train_scaled, y_train, time_stamp)
X_val_seq, y_val_seq = creat_sequence(X_val_scaled, y_val, time_stamp)
X_test_seq, y_test_seq = creat_sequence(X_test_scaled, y_test, time_stamp)

print(X_train_seq.shape, y_train_seq.shape)
print(X_val_seq.shape, y_val_seq.shape)
print(X_test_seq.shape, y_test_seq.shape)

(7542, 20, 27) (7542,)
(1601, 20, 27) (1601,)
(1601, 20, 27) (1601,)


In [12]:
for i in range(5):
    print(f"sequence {i}")
    print("X window indices:", list(range(i, i + time_stamp)))
    print("Target index:", i + time_stamp)
    print("y target:", y[i + time_stamp])
    print("-" * 130)

sequence 0
X window indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
Target index: 20
y target: 1
----------------------------------------------------------------------------------------------------------------------------------
sequence 1
X window indices: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
Target index: 21
y target: 1
----------------------------------------------------------------------------------------------------------------------------------
sequence 2
X window indices: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
Target index: 22
y target: 1
----------------------------------------------------------------------------------------------------------------------------------
sequence 3
X window indices: [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
Target index: 23
y target: 1
-------------------------------------------------------------------------------------

In [21]:
import os
import random
import numpy as np
import tensorflow as tf

SEED = 42

# Python
random.seed(SEED)

# NumPy
np.random.seed(SEED)

# TensorFlow
tf.random.set_seed(SEED)

# Force reproducibility inside TF
os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ["TF_DETERMINISTIC_OPS"] = "1"
os.environ["TF_CUDNN_DETERMINISTIC"] = "1"



#Define the model
model = Sequential()


model.add(LSTM(units=128, input_shape=(time_stamp, features), return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(units=64, return_sequences=False))
# model.add(Dropout(0.1))

# model.add(LSTM(units=32, return_sequences=False))
# model.add(Dropout(0.2))

model.add(Dense(32, activation="tanh"))
# Add a Dense output layer
model.add(Dense(1, activation="sigmoid"))

# Compile the model
from tensorflow.keras.optimizers import Adam
optimizer = Adam(learning_rate=0.001,  clipnorm=1.0)
model.compile(
    optimizer=optimizer, 
    loss='binary_crossentropy', 
    metrics=["accuracy"]
)

early_stop = EarlyStopping(
    monitor='val_loss', 
    patience=5,        
    restore_best_weights=True
)

model.fit(
    X_train_seq,
    y_train_seq,
    validation_data=(X_val_seq, y_val_seq),
    batch_size=64,
    epochs=50,
    shuffle=False,
    # callbacks=[early_stop]
)
model.summary()
print(model.loss, model.optimizer)
model.evaluate(X_test_seq,y_test_seq)

Epoch 1/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 99ms/step - accuracy: 0.5302 - loss: 0.6943 - val_accuracy: 0.5215 - val_loss: 0.6922
Epoch 2/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 88ms/step - accuracy: 0.5270 - loss: 0.6921 - val_accuracy: 0.5215 - val_loss: 0.6921
Epoch 3/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 89ms/step - accuracy: 0.5302 - loss: 0.6911 - val_accuracy: 0.5215 - val_loss: 0.6936
Epoch 4/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 90ms/step - accuracy: 0.5326 - loss: 0.6910 - val_accuracy: 0.5215 - val_loss: 0.6920
Epoch 5/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 89ms/step - accuracy: 0.5333 - loss: 0.6909 - val_accuracy: 0.5215 - val_loss: 0.6936
Epoch 6/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 91ms/step - accuracy: 0.5320 - loss: 0.6905 - val_accuracy: 0.5178 - val_loss: 0.6928
Epoch 7/50
[1m1

binary_crossentropy <keras.src.optimizers.adam.Adam object at 0x000002586B3CD070>
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.5353 - loss: 0.8051


[0.8051102161407471, 0.5352904200553894]

In [22]:
y_proba = model.predict(X_test_seq)  
y_pred = (y_proba.ravel() >= 0.5).astype(int)

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test_seq, y_pred))
print(confusion_matrix(y_test_seq, y_pred))
print(y_proba[:5])
print("LSTM",np.bincount(y_pred)) 

[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step
              precision    recall  f1-score   support

           0       0.49      0.48      0.49       733
           1       0.57      0.58      0.57       868

    accuracy                           0.54      1601
   macro avg       0.53      0.53      0.53      1601
weighted avg       0.53      0.54      0.53      1601

[[354 379]
 [365 503]]
[[0.45259878]
 [0.84189624]
 [0.90071005]
 [0.8987608 ]
 [0.401319  ]]
LSTM [719 882]


In [40]:
import os
import random
import numpy as np
import tensorflow as tf

SEED = 42

# Python
random.seed(SEED)

# NumPy
np.random.seed(SEED)

# TensorFlow
tf.random.set_seed(SEED)

# Force reproducibility inside TF
os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ["TF_DETERMINISTIC_OPS"] = "1"
os.environ["TF_CUDNN_DETERMINISTIC"] = "1"



#Define the model

model_1 = Sequential()


model_1.add(LSTM(units=128, input_shape=(time_stamp, features), return_sequences=True))
model_1.add(Dropout(0.1))

model_1.add(LSTM(units=64, return_sequences=False))
# model_1.add(Dropout(0.1))

# model_1.add(LSTM(units=32, return_sequences=False))
# model.add(Dropout(0.2))

model_1.add(Dense(32, activation="tanh"))
# Add a Dense output layer
model_1.add(Dense(1, activation="sigmoid"))

# Compile the model
from tensorflow.keras.optimizers import Adam
optimizer = Adam(learning_rate=0.001,  clipnorm=1.0)
model_1.compile(
    optimizer=optimizer, 
    loss='binary_crossentropy', 
    metrics=["accuracy"]
)

early_stop = EarlyStopping(
    monitor='val_loss', 
    patience=5,        
    restore_best_weights=True
)

model_1.fit(
    X_train_seq,
    y_train_seq,
    validation_data=(X_val_seq, y_val_seq),
    batch_size=64,
    epochs=30,
    shuffle=False,
    # callbacks=[early_stop]
)
model_1.summary()
print(model_1.loss, model_1.optimizer)
model_1.evaluate(X_test_seq,y_test_seq)

Epoch 1/30
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 77ms/step - accuracy: 0.5296 - loss: 0.6928 - val_accuracy: 0.5215 - val_loss: 0.6926
Epoch 2/30
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 60ms/step - accuracy: 0.5298 - loss: 0.6915 - val_accuracy: 0.5215 - val_loss: 0.6925
Epoch 3/30
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 64ms/step - accuracy: 0.5304 - loss: 0.6912 - val_accuracy: 0.5215 - val_loss: 0.6923
Epoch 4/30
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 59ms/step - accuracy: 0.5325 - loss: 0.6911 - val_accuracy: 0.5215 - val_loss: 0.6924
Epoch 5/30
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 62ms/step - accuracy: 0.5334 - loss: 0.6908 - val_accuracy: 0.5147 - val_loss: 0.6925
Epoch 6/30
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 59ms/step - accuracy: 0.5316 - loss: 0.6901 - val_accuracy: 0.5128 - val_loss: 0.6926
Epoch 7/30
[1m118/11

binary_crossentropy <keras.src.optimizers.adam.Adam object at 0x00000258207478C0>
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.4697 - loss: 0.7554


[0.7554135918617249, 0.4697064459323883]

In [41]:
y_proba_1 = model_1.predict(X_test_seq)  
y_pred_1 = (y_proba_1.ravel() >= 0.5).astype(int)

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test_seq, y_pred_1))
print(confusion_matrix(y_test_seq, y_pred_1))
print(y_proba_1[:5])
print("LSTM",np.bincount(y_pred_1)) 

[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step
              precision    recall  f1-score   support

           0       0.45      0.80      0.58       733
           1       0.53      0.19      0.28       868

    accuracy                           0.47      1601
   macro avg       0.49      0.50      0.43      1601
weighted avg       0.50      0.47      0.42      1601

[[583 150]
 [699 169]]
[[0.48642346]
 [0.44464108]
 [0.42095238]
 [0.40548056]
 [0.38535553]]
LSTM [1282  319]


In [23]:
import joblib
# joblib.dump(model,'../models/v3.1(sentiment)_lstm_stock_pediction.joblib')

['../models/v3.1(sentiment)_lstm_stock_pediction.joblib']

In [None]:
X_2_features = ['market_return', 'Weighted_Strength', 'mean_return_others','TSLA', 'AAPL', 'NVDA', 'MSFT','pol_mean', 'pol_sum', 'pos_count' ,'neg_count', 'neu_count', 'has_news']
X_2 = X[X_2_features]

train_size = int(len(X_2) * 0.70)
val_size = int(len(X_2) * 0.85)
X_train, X_val, X_test = X_2[0:train_size], X_2[train_size:val_size], X_2[val_size:len(X_2)]
y_train, y_val, y_test = y[0:train_size], y[train_size:val_size], y[val_size:len(y)]
print(len(X_train), len(X_test), len(X_val), len(y_train), len(y_test), len(y_val))

scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)
X_val_scaled  = scaler.transform(X_val)

time_stamp = 20
features = len(X_2.columns)
X_train_seq, y_train_seq = creat_sequence(X_train_scaled, y_train, time_stamp)
X_val_seq, y_val_seq = creat_sequence(X_val_scaled, y_val, time_stamp)
X_test_seq, y_test_seq = creat_sequence(X_test_scaled, y_test, time_stamp)
print(X_train_seq.shape, y_train_seq.shape)

model_2 = Sequential()

model_2.add(LSTM(units=128, input_shape=(time_stamp, features), return_sequences=True))
model_2.add(Dropout(0.2))
          
model_2.add(LSTM(units=64, return_sequences=True)) 
model_2.add(Dropout(0.2))

model_2.add(LSTM(units=32, return_sequences=False))
# model.add(Dropout(0.2))

model_2.add(Dense(64, activation="tanh"))
model_2.add(Dense(1, activation="sigmoid"))

from tensorflow.keras.optimizers import AdamW
optimizer = AdamW(learning_rate=0.005)

model_2.compile(
    optimizer=optimizer, 
    loss='binary_crossentropy', 
    metrics=["accuracy"]
)

early_stop = EarlyStopping(
    monitor='val_loss', 
    patience=5,        
    restore_best_weights=True
)


reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,        # reduce by half
    patience=3,         # after 3 bad epochs
    min_lr=1e-6         # never go below this
)


model_2.fit(
    X_train_seq,y_train_seq,
    validation_data=(X_val_seq, y_val_seq),
    batch_size=16,
    epochs=50,
    # callbacks=[early_stop]
)
model_2.evaluate(X_test_seq,y_test_seq)

y_proba = model_2.predict(X_test_seq)  
y_pred = (y_proba.ravel() >= 0.5).astype(int)

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test_seq, y_pred))
print(confusion_matrix(y_test_seq, y_pred))