Libaries 

In [7]:
import numpy as np
import pandas as pd
import math
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt


In [8]:
# stage 1 Dataset Selection:
base_dir = r"C:\Users\liory\Downloads\Stock Market Dataset"

print("stocks:", len(os.listdir(base_dir + r"\stocks")))
print("etfs:", len(os.listdir(base_dir + r"\etfs")))


NameError: name 'os' is not defined

In [None]:
stocks_dir = r"C:\Users\liory\Downloads\Stock Market Dataset\stocks"
etf_dir    = r"C:\Users\liory\Downloads\Stock Market Dataset\etfs"

stock_files = glob.glob(stocks_dir + r"\*.csv")
etf_files   = glob.glob(etf_dir   + r"\*.csv")

print("number of stock files:", len(stock_files))
print("number of ETF files:", len(etf_files))

df = pd.read_csv(stock_files[0])

In [None]:
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)

df.info()
df.describe()
df.head()

In [None]:
plt.figure()
plt.plot(df["Date"], df["Close"])
plt.xlabel("Date")
plt.ylabel("Close")
plt.title("Close Price Over Time")
plt.show()


In [None]:
## stage 2 Build Model:
values = df[["Close"]].values.astype("float32")

n = len(values)
train_end = int(n * 0.70)
val_end   = int(n * 0.85)

train_vals = values[:train_end]
val_vals   = values[train_end:val_end]
test_vals  = values[val_end:]


scaler = MinMaxScaler()

train_scaled = scaler.fit_transform(train_vals)
val_scaled   = scaler.transform(val_vals)
test_scaled  = scaler.transform(test_vals)

In [None]:
def make_windows(data, seq_len=30):
    X, y = [], []
    for i in range(len(data) - seq_len):
        X.append(data[i:i+seq_len])
        y.append(data[i+seq_len])
    return np.array(X), np.array(y)

SEQ_LEN = 30

X_train, y_train = make_windows(train_scaled, SEQ_LEN)
X_val,   y_val   = make_windows(val_scaled, SEQ_LEN)
X_test,  y_test  = make_windows(test_scaled, SEQ_LEN)

print(X_train.shape, y_train.shape)


In [None]:
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)  
        self.y = torch.tensor(y, dtype=torch.float32)  

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

BATCH_SIZE = 64

train_loader = DataLoader(TimeSeriesDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=False)
val_loader   = DataLoader(TimeSeriesDataset(X_val, y_val), batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(TimeSeriesDataset(X_test, y_test), batch_size=BATCH_SIZE, shuffle=False)

# בדיקה
xb, yb = next(iter(train_loader))
print("X batch:", xb.shape, "y batch:", yb.shape)


In [None]:
class LSTMRegressor(nn.Module):
    def __init__(self, input_size=1, hidden_size=64, num_layers=2, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)     
        last = out[:, -1, :]    
        y = self.fc(last)          
        return y


In [None]:
class GRURegressor(nn.Module):
    def __init__(self, input_size=1, hidden_size=64, num_layers=2, dropout=0.2):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.gru(x)
        last = out[:, -1, :]
        return self.fc(last)


In [None]:
# stage 3 model train and test
device = "cuda" if torch.cuda.is_available() else "cpu"
model = LSTMRegressor(input_size=1, hidden_size=64, num_layers=2, dropout=0.2).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def run_epoch(model, loader, train=True):
    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0

    for Xb, yb in loader:
        Xb, yb = Xb.to(device), yb.to(device)

        if train:
            optimizer.zero_grad()

        preds = model(Xb)                
        loss = criterion(preds, yb)      

        if train:
            loss.backward()
            optimizer.step()

        total_loss += loss.item() * Xb.size(0)

    return total_loss / len(loader.dataset)

history = {"train_loss": [], "val_loss": []}
EPOCHS = 20

for epoch in range(1, EPOCHS + 1):
    train_loss = run_epoch(model, train_loader, train=True)
    val_loss   = run_epoch(model, val_loader, train=False)

    history["train_loss"].append(train_loss)
    history["val_loss"].append(val_loss)

    print(f"Epoch {epoch:02d}/{EPOCHS} | train_loss={train_loss:.6f} | val_loss={val_loss:.6f}")


In [None]:
import matplotlib.pyplot as plt

plt.figure()
plt.plot(history["train_loss"], label="Train")
plt.plot(history["val_loss"], label="Validation")
plt.xlabel("Epoch")
plt.ylabel("MSE Loss")
plt.title("Training vs Validation Loss")
plt.legend()
plt.show()


In [None]:
model.eval()

preds_list = []
y_list = []

with torch.no_grad():
    for Xb, yb in test_loader:
        Xb = Xb.to(device)
        preds = model(Xb).cpu().numpy()  
        preds_list.append(preds)
        y_list.append(yb.numpy())        

preds_scaled = np.vstack(preds_list)  
y_scaled     = np.vstack(y_list)      


In [None]:
preds = scaler.inverse_transform(preds_scaled)
y_true = scaler.inverse_transform(y_scaled)


In [None]:
rmse = np.sqrt(np.mean((preds - y_true) ** 2))
print("Test RMSE:", rmse)


In [None]:
plt.figure()
plt.plot(y_true[:200], label="Actual")
plt.plot(preds[:200], label="Predicted")
plt.xlabel("Time step")
plt.ylabel("Close Price")
plt.title("Actual vs Predicted (first 200 test points)")
plt.legend()
plt.show()


In [None]:
#Report

Preprocessing:
I converted the Date column into a datetime format and sorted the dataset chronologically. Since this is a time-series task, I performed a time-based split into 70% training, 15% validation, and 15% testing, without shuffling to preserve temporal order.
For scaling, I used MinMaxScaler, fitting it only on the training set to prevent future data leakage, then transforming the validation and test sets using the same scaler.
After scaling, I generated sliding windows with seq_len=30, where each sample consists of 30 past time steps and the label represents the next value in the sequence.

Architecture:
I implemented an LSTM-based regressor with hidden_size=64, num_layers=2, and dropout=0.2 for regularization.
The final layer is a fully connected Linear unit that outputs a single predicted value for each input sequence.

Training & Validation:
The model was trained using MSELoss as the objective function and optimized with the Adam optimizer at a learning rate of lr=1e-3.
Training was performed with batch_size=64 for 20 epochs, evaluating performance on the validation set after each epoch to track loss improvement and avoid overfitting.

Results & Evaluation:
I logged training and validation loss values and visualized them using a train/val loss plot.
Final performance was evaluated on the test set using RMSE, and I also plotted Actual vs Predicted values over the first 200 test points to verify that the model follows the historical trend and produces reasonable future forecasts.