In [12]:
import pandas as pd

In [13]:
import yfinance as yf


In [14]:
# Parameters for the price-history download performed in the next cell.
symbol = "TSLA" # Tesla, Inc
start_date = "2023-01-01"
# NOTE: this value is passed as `end=` to yf.download; the last row shown by
# df.tail() further down is 2024-12-30, i.e. the end date itself is not included.
end_date = "2024-12-31"

In [15]:
# Fetch daily OHLCV bars for `symbol` from Yahoo Finance.
# `auto_adjust=True` is spelled out on purpose: the returned frame has no
# 'Adj Close' column, so adjusted prices are already what this notebook works
# with. Passing it explicitly pins that behaviour and avoids the warning that
# yfinance emits when the argument is left to its (recently changed) default.
df = yf.download(symbol, start=start_date, end=end_date, auto_adjust=True)

  df = yf.download(symbol, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


In [16]:
# Inspect the column labels: the download comes back with a two-level
# (Price, Ticker) MultiIndex, which the next cell flattens.
df.columns

MultiIndex([( 'Close', 'TSLA'),
            (  'High', 'TSLA'),
            (   'Low', 'TSLA'),
            (  'Open', 'TSLA'),
            ('Volume', 'TSLA')],
           names=['Price', 'Ticker'])

In [17]:
# Collapse the (Price, Ticker) MultiIndex to the plain price-field names
# ('Close', 'High', 'Low', 'Open', 'Volume').
# - The isinstance guard keeps the cell idempotent: joining labels with '_' on
#   an already-flat index would split each string into characters
#   ('Close' -> 'C_l_o_s_e') when the cell is re-run.
# - Taking level 0 instead of renaming '<field>_TSLA' means the cell keeps
#   working when `symbol` is changed.
# NOTE: this assumes a single ticker was downloaded; with several tickers the
# level-0 labels would be duplicated.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0).tolist()

print("Flattened and renamed columns:")
display(df.columns)
display(df.head())

Flattened and renamed columns:


Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-03,108.099998,118.800003,104.639999,118.470001,231402800
2023-01-04,113.639999,114.589996,107.519997,109.110001,180389000
2023-01-05,110.339996,111.75,107.160004,110.510002,157986300
2023-01-06,113.059998,114.389999,101.809998,103.0,220911100
2023-01-09,119.769997,123.519997,117.110001,118.959999,190284000


In [18]:
# Re-check the column labels after flattening (plain Index of field names).
df.columns

Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')

In [19]:
# Preview the first rows of the flattened price frame.
df.head()

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-03,108.099998,118.800003,104.639999,118.470001,231402800
2023-01-04,113.639999,114.589996,107.519997,109.110001,180389000
2023-01-05,110.339996,111.75,107.160004,110.510002,157986300
2023-01-06,113.059998,114.389999,101.809998,103.0,220911100
2023-01-09,119.769997,123.519997,117.110001,118.959999,190284000


# Creating new features "Moving average" and "RSI"

In [20]:
# 10-period Simple Moving Average (SMA) of the closing price.
# A fixed rolling window of 10 needs 10 observations, so the first 9 rows are NaN.
df['SMA'] = df["Close"].rolling(window=10).mean()

In [21]:
def compute_rsi(series: pd.Series, window: int = 14) -> pd.Series:
    """Relative Strength Index computed from simple rolling means.

    The period-to-period change of ``series`` is split into its upward part
    (gains) and the magnitude of its downward part (losses). Both parts are
    averaged with an unweighted rolling mean over ``window`` periods, and the
    ratio of the two averages is mapped onto the 0-100 RSI scale.

    Args:
        series: Values to analyse (e.g. closing prices), in chronological order.
        window: Number of periods in each rolling average.

    Returns:
        A Series aligned with ``series``. The first ``window`` entries are NaN
        (one value is lost to the diff, the rest to the rolling warm-up). A
        window without any loss gives 100; a window with neither gains nor
        losses gives NaN (0 / 0).
    """
    change = series.diff()

    def _rolling_avg(moves: pd.Series) -> pd.Series:
        # Unweighted mean over the trailing `window` observations.
        return moves.rolling(window=window).mean()

    mean_up = _rolling_avg(change.clip(lower=0))
    mean_down = _rolling_avg(-change.clip(upper=0))

    strength = mean_up / mean_down
    return 100 - (100 / (1 + strength))

# Add the 14-period RSI of the closing price as a new feature column.
df["RSI"] = compute_rsi(df['Close'], window=14)

In [22]:
# Check the most recent rows: SMA and RSI should both be populated here.
df.tail()

Unnamed: 0_level_0,Close,High,Low,Open,Volume,SMA,RSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-12-23,430.600006,434.51001,415.410004,431.0,72698100,435.092999,68.841618
2024-12-24,462.279999,462.779999,435.140015,435.899994,59551800,441.222,72.174769
2024-12-26,454.130005,465.329987,451.019989,465.160004,76366400,444.158002,68.250827
2024-12-27,431.660004,450.0,426.5,449.519989,82666800,445.514001,59.044415
2024-12-30,417.410004,427.0,415.75,419.399994,64941000,443.632001,55.561821


In [23]:
# Check the earliest rows: SMA and RSI are NaN during the rolling warm-up.
df.head()

Unnamed: 0_level_0,Close,High,Low,Open,Volume,SMA,RSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-01-03,108.099998,118.800003,104.639999,118.470001,231402800,,
2023-01-04,113.639999,114.589996,107.519997,109.110001,180389000,,
2023-01-05,110.339996,111.75,107.160004,110.510002,157986300,,
2023-01-06,113.059998,114.389999,101.809998,103.0,220911100,,
2023-01-09,119.769997,123.519997,117.110001,118.959999,190284000,,


In [24]:
# Confirm column dtypes before transforming (Volume is integer, the rest float).
print(df.dtypes)

Close     float64
High      float64
Low       float64
Open      float64
Volume      int64
SMA       float64
RSI       float64
dtype: object


In [25]:
import numpy as np

# Columns expressed in price units; they are later scaled together by one scaler.
cols_price = ['Close', 'High', 'Low', 'Open', "SMA"]
col_volume = "Volume"
col_rsi = "RSI"

# Compress the volume with log(1 + x) to tame its large magnitude and spread
df["Volume_log"] = np.log1p(df[col_volume])

# RSI lies on a 0-100 scale, so dividing by 100 brings it to 0-1 without fitting a scaler
df['RSI_scaled'] = df[col_rsi] / 100.0

# Features entering the model (after transformation); the order defines the
# feature axis of the arrays built from this list.
feature_cols = cols_price + ["Volume_log", "RSI_scaled"]

In [26]:
# Count missing values per model feature. The NaNs are the rolling warm-up
# rows of SMA and RSI (and therefore RSI_scaled).
# display() is required: only the last expression of a cell is echoed, so a
# bare expression followed by another statement is silently discarded.
display(df[feature_cols].isna().sum())

# Drop the warm-up rows. "Close" is already part of feature_cols, so it does
# not need to be listed again in `subset`.
# NOTE: reset_index(drop=True) discards the Date index; rows are addressed by
# position from here on.
df = df.dropna(subset=feature_cols).reset_index(drop=True)

# Splitting between train and test

In [27]:
# Chronological 80/20 split: the first 80% of rows train the model and the
# remaining, more recent rows are held out. Rows are not shuffled, so the test
# set lies strictly after the training period.
train_size = int(len(df) * 0.8)
# .copy() so the scaling applied later does not write into `df`
train_df = df.iloc[:train_size].copy()
test_df = df.iloc[train_size:].copy()

# Normalization

In [28]:
from sklearn.preprocessing import MinMaxScaler

# Two independent min-max scalers: one for the price-like columns, one for the
# log-volume column. Both map the range seen during fitting onto [0, 1].
scaler_price = MinMaxScaler(feature_range=(0, 1))
scaler_volume = MinMaxScaler(feature_range=(0, 1))


In [29]:
# Fit the price scaler on the training rows only, then reuse the fitted
# parameters for the test rows so no test-set statistics reach training.
train_df[cols_price] = scaler_price.fit_transform(train_df[cols_price])
# NOTE(review): transform() is not clipped here, so test prices outside the
# training min/max map outside [0, 1] — confirm this is acceptable for the model.
test_df[cols_price] = scaler_price.transform(test_df[cols_price])

In [30]:
# Fit the volume scaler on the train set only (fit_transform learns min/max there)
train_df[["Volume_log"]] = scaler_volume.fit_transform(train_df[["Volume_log"]])

# The test set is only transformed with the parameters learned from the train set
test_df[["Volume_log"]] = scaler_volume.transform(test_df[["Volume_log"]])

In [31]:
# Convert the feature columns to NumPy arrays of shape (rows, len(feature_cols)),
# with columns in feature_cols order.

X_train_all = train_df[feature_cols].values
X_test_all = test_df[feature_cols].values

In [32]:
# Column position of "Close" inside the price-column group.
close_index = cols_price.index("Close")

# Regression target: the (already scaled) closing price of every row.
y_train_all = train_df[cols_price].iloc[:, close_index].to_numpy()
y_test_all = test_df[cols_price].iloc[:, close_index].to_numpy()

# Creating sequences

In [33]:
def create_sequences(X, y, seq_length):
  """Build sliding-window samples for next-step prediction.

  Window ``k`` holds rows ``k`` to ``k + seq_length - 1`` of ``X`` and is
  paired with ``y[k + seq_length]``, the target of the row right after it.

  Args:
    X: Indexable sequence of per-row features.
    y: Indexable sequence of per-row targets, aligned with ``X``.
    seq_length: Number of consecutive rows in each window.

  Returns:
    Tuple ``(windows, targets)`` of NumPy arrays holding
    ``len(X) - seq_length`` samples each; both are empty when ``X`` has no
    more than ``seq_length`` rows.
  """
  window_starts = range(len(X) - seq_length)
  X_seqs = [X[start : start + seq_length] for start in window_starts]
  y_seqs = [y[start + seq_length] for start in window_starts]
  return np.array(X_seqs), np.array(y_seqs)

In [34]:
# Each sample is a window of 20 consecutive rows; its target is the scaled
# Close of the row immediately after the window.
seq_length = 20

# Windows are built separately per split, so no window straddles the
# train/test boundary. The first `seq_length` rows of each split only ever
# appear as inputs, never as targets.
X_train_seq, y_train = create_sequences(X_train_all, y_train_all, seq_length)
X_test_seq, y_test = create_sequences(X_test_all, y_test_all, seq_length)

# Converting to tensors

In [35]:
import torch

# Inputs: float32 tensors built from the windowed feature arrays.
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)

# Targets: float32 column vectors of shape (n_samples, 1), matching the
# model's (batch, 1) output so the loss compares like with like.
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

In [36]:
# Sanity check: report whether any NaN survived into the training tensors
# (a single NaN would turn the loss into NaN during training).
# NOTE(review): only the training tensors are checked; the test tensors are not.
print("NaN em X_train_tensor?", torch.isnan(X_train_tensor).any())
print("NaN em y_train_tensor?", torch.isnan(y_train_tensor).any())

NaN em X_train_tensor? tensor(False)
NaN em y_train_tensor? tensor(False)


# DataLoader

In [37]:
from torch.utils.data import TensorDataset, DataLoader

# Mini-batch size used for both training and evaluation.
batch_size = 32

# Pair every input window with its target.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Training samples are shuffled between epochs; each sample is a complete
# window, so shuffling does not disturb the time order inside a window.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Test samples stay in chronological order so predictions line up with time.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Preparing the model

In [38]:
import torch
import torch.nn as nn

class LSTMRegressor(nn.Module):
  """Stacked LSTM with a linear head that outputs one value per sequence.

  Args:
    input_size: Number of features per timestep.
    hidden_size: Width of the LSTM hidden state.
    num_layers: Number of stacked LSTM layers.
    dropout: Dropout probability handed to the LSTM; forced to 0.0 when
      there is only a single layer.
  """

  def __init__(self, input_size, hidden_size, num_layers, dropout):
    super().__init__()

    # LSTM dropout acts between stacked layers, so it is zeroed for one layer.
    inter_layer_dropout = 0.0 if num_layers <= 1 else dropout

    # batch_first=True -> inputs/outputs are laid out as (batch, seq, feature)
    self.lstm = nn.LSTM(
      input_size=input_size,
      hidden_size=hidden_size,
      num_layers=num_layers,
      batch_first=True,
      dropout=inter_layer_dropout,
    )

    # Regression head: hidden_size -> 1 value
    self.fc = nn.Linear(hidden_size, 1)

  def forward(self, x):
    """Map a batch of sequences to one prediction each.

    Args:
      x: Tensor of shape (batch_size, seq_length, input_size).

    Returns:
      Tensor of shape (batch_size, 1).
    """
    # sequence_out: (batch_size, seq_length, hidden_size); the final
    # hidden/cell states returned alongside it are not needed here.
    sequence_out, _final_state = self.lstm(x)

    # Keep only the last timestep: sequence_out[:, -1, :] -> (batch_size, hidden_size)
    final_step = sequence_out[:, -1, :]

    return self.fc(final_step)

# Instantiating and training the model

In [39]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Device:", device)

input_size = len(feature_cols)
print(input_size)

# Seed PyTorch's global RNG before the model is built. Both the random weight
# initialisation and the shuffle order drawn by the training DataLoader come
# from this generator, so without a seed every run yields different losses.
# (Some GPU kernels may still be non-deterministic.)
torch.manual_seed(42)

model = LSTMRegressor(input_size=input_size, hidden_size=64, num_layers=2, dropout=0.0)
model = model.to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 20

for epoch in range(num_epochs):
  model.train()
  epoch_loss = 0.0

  for X_batch, y_batch in train_loader:
    X_batch = X_batch.to(device)
    y_batch = y_batch.to(device)

    # Gradients accumulate by default, so clear them before every step
    optimizer.zero_grad()

    y_pred = model(X_batch)
    loss = criterion(y_pred, y_batch)

    loss.backward()
    optimizer.step()

    # criterion returns the batch mean; weight it by the batch size so the
    # epoch figure is a true per-sample average even if the last batch is smaller
    epoch_loss += loss.item() * X_batch.size(0)

  epoch_loss /= len(train_loader.dataset)
  print(f"Epoch {epoch+1}/{num_epochs} - train MSE: {epoch_loss:.6f}")


Device: cpu
7
Epoch 1/20 - train MSE: 0.178970
Epoch 2/20 - train MSE: 0.038546
Epoch 3/20 - train MSE: 0.026300
Epoch 4/20 - train MSE: 0.015111
Epoch 5/20 - train MSE: 0.010203
Epoch 6/20 - train MSE: 0.008981
Epoch 7/20 - train MSE: 0.007900
Epoch 8/20 - train MSE: 0.007276
Epoch 9/20 - train MSE: 0.006830
Epoch 10/20 - train MSE: 0.006340
Epoch 11/20 - train MSE: 0.006277
Epoch 12/20 - train MSE: 0.005836
Epoch 13/20 - train MSE: 0.005774
Epoch 14/20 - train MSE: 0.005396
Epoch 15/20 - train MSE: 0.005712
Epoch 16/20 - train MSE: 0.005290
Epoch 17/20 - train MSE: 0.004757
Epoch 18/20 - train MSE: 0.005191
Epoch 19/20 - train MSE: 0.004631
Epoch 20/20 - train MSE: 0.004198


# Evaluating the model

In [40]:
# Compute the mean squared error over the whole test set, in scaled units
# (targets are min-max scaled Close values, not USD).
model.eval()
test_loss = 0.0

# no_grad: evaluation only, no gradient tracking needed
with torch.no_grad():
  for X_batch, y_batch in test_loader:
    X_batch = X_batch.to(device)
    y_batch = y_batch.to(device)

    y_pred = model(X_batch)
    loss = criterion(y_pred, y_batch)

    # Weight the batch mean by the batch size so the final figure is a
    # per-sample average even when the last batch is smaller
    test_loss += loss.item() * X_batch.size(0)

test_loss /= len(test_loader.dataset)
print(f"Test MSE: {test_loss:.6f}")

Test MSE: 0.043317


In [41]:
# Run the model over the test set again, this time keeping every prediction
# and its target (still in scaled units) for later conversion back to prices.
model.eval()

all_preds_scaled = []
all_targets_scaled = []

with torch.no_grad():
  for X_batch, y_batch in test_loader:
    X_batch = X_batch.to(device)
    y_batch = y_batch.to(device)

    y_pred = model(X_batch)

    # Move results back to the CPU before converting to NumPy
    all_preds_scaled.append(y_pred.cpu().numpy())
    all_targets_scaled.append(y_batch.cpu().numpy())

# Stitch the per-batch arrays together along the sample axis; the loader is
# not shuffled, so the rows stay in chronological order.
y_pred_scaled = np.concatenate(all_preds_scaled, axis=0)
y_true_scaled = np.concatenate(all_targets_scaled, axis=0)

print("Shapes (scaled):", y_pred_scaled.shape, y_true_scaled.shape)


Shapes (scaled): (78, 1) (78, 1)


# Transforming the prediction back to USD

In [42]:
# scaler_price was fitted on all len(cols_price) columns, so inverse_transform
# expects an array of that width. The scaled Close values are therefore placed
# in their column of a zero-filled array, the array is inverted, and only the
# Close column is read back. Min-max scaling works column by column, so the
# placeholder zeros in the other columns do not affect the recovered Close.
n_test = y_pred_scaled.shape[0]

# Temporary array with len(cols_price)
temp_pred = np.zeros((n_test, len(cols_price)))
temp_true = np.zeros((n_test, len(cols_price)))

# Putting Close (scaled) in the correct column
temp_pred[:, close_index] = y_pred_scaled[:, 0]
temp_true[:, close_index] = y_true_scaled[:, 0]

# Inverting the scale
temp_pred_inv = scaler_price.inverse_transform(temp_pred)
temp_true_inv = scaler_price.inverse_transform(temp_true)

# Getting "Close" column in the original scale
y_pred_real = temp_pred_inv[:, close_index]
y_true_real = temp_true_inv[:, close_index]

print("Shapes (real):", y_pred_real.shape, y_true_real.shape)
print("5 first predicted values:", y_pred_real[:5])
print("5 first real values:", y_true_real[:5])

Shapes (real): (78,) (78,)
5 first predicted values: [217.38757214 217.89289964 218.90387026 220.73351147 222.25362403]
5 first real values: [226.16999783 228.13000629 229.81000065 230.28999775 226.78000148]
