In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.metrics import mean_squared_error
from tensorflow.keras.callbacks import EarlyStopping
import joblib

from keras.optimizers import Adam


In [14]:
# Load the raw price table; later cells read its 'date', 'close' and 'Name' columns.
df = pd.read_csv('all_stocks_5yr.csv')

In [15]:
# Parse the raw 'date' column so its calendar components can be extracted.
parsed_dates = pd.to_datetime(df['date'])

# Store year, month and day as their own columns.
for component in ('year', 'month', 'day'):
    df[component] = getattr(parsed_dates.dt, component)

# The three component columns now carry the date information, so drop 'date'.
df.drop(columns=['date'], inplace=True)





In [16]:
# Tickers kept for the analysis.
selected_stocks = ['AAPL']

# Boolean mask of the rows whose 'Name' is one of the selected tickers.
is_selected = df['Name'].isin(selected_stocks)
filtered_data = df[is_selected]

filtered_data

Unnamed: 0,open,high,low,close,volume,Name,year,month,day
1259,67.7142,68.4014,66.8928,67.8542,158168416,AAPL,2013,2,8
1260,68.0714,69.2771,67.6071,68.5614,129029425,AAPL,2013,2,11
1261,68.5014,68.9114,66.8205,66.8428,151829363,AAPL,2013,2,12
1262,66.7442,67.6628,66.1742,66.7156,118721995,AAPL,2013,2,13
1263,66.3599,67.3771,66.2885,66.6556,88809154,AAPL,2013,2,14
...,...,...,...,...,...,...,...,...,...
2513,167.1650,168.6200,166.7600,167.7800,47230787,AAPL,2018,2,1
2514,166.0000,166.8000,160.1000,160.5000,86593825,AAPL,2018,2,2
2515,159.1000,163.8800,156.0000,156.4900,72738522,AAPL,2018,2,5
2516,154.8300,163.7200,154.0000,163.0300,68243838,AAPL,2018,2,6


In [17]:
#reformat to split easily

# Re-encode the date as one sortable integer (YYYYMMDD) so that date ranges can
# be selected with plain numeric comparisons.
date_as_int = df['year'] * 10000 + df['month'] * 100 + df['day']
df['date'] = date_as_int

# NOTE: this removes the very columns the line above reads, so the cell can only
# be run once per load of `df`.
df.drop(columns=['year', 'day', 'month'], inplace=True)

# Gather the rows of every selected ticker, in the order the tickers are listed.
per_stock_frames = []
for stock in selected_stocks:
    per_stock_frames.append(df[df['Name'] == stock])
selected_data = pd.concat(per_stock_frames)

# Only the date and the closing price are used from here on.
selected_features = selected_data[['date', 'close']]
selected_features

Unnamed: 0,date,close
1259,20130208,67.8542
1260,20130211,68.5614
1261,20130212,66.8428
1262,20130213,66.7156
1263,20130214,66.6556
...,...,...
2513,20180201,167.7800
2514,20180202,160.5000
2515,20180205,156.4900
2516,20180206,163.0300


In [18]:

# Year boundaries of the chronological train/test split.
train_start_year = 2013
train_end_year = 2017
test_start_year = 2017
test_end_year = 2018

# 'date' is encoded as YYYYMMDD, so `year * 10000` sorts before every date in
# that year: start years are included, end years are effectively excluded.
in_training_range = selected_features['date'].between(train_start_year * 10000, train_end_year * 10000)
in_testing_range = selected_features['date'].between(test_start_year * 10000, test_end_year * 10000)

# Select the rows of each period.
training_data = selected_features[in_training_range]
testing_data = selected_features[in_testing_range]

testing_data

Unnamed: 0,date,close
2241,20170103,116.15
2242,20170104,116.02
2243,20170105,116.61
2244,20170106,117.91
2245,20170109,118.99
...,...,...
2487,20171222,175.01
2488,20171226,170.57
2489,20171227,170.60
2490,20171228,171.08


In [19]:

# Work on copies so the un-normalized training/testing frames stay intact.
training_data_normalized = training_data.copy()
testing_data_normalized = testing_data.copy()

# Scale 'close' into [0, 1] based on the training set's minimum and maximum.
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit the scaler on the training prices only.
training_data_normalized[['close']] = scaler.fit_transform(training_data_normalized['close'].values.reshape(-1, 1))

# Re-use the already fitted scaler for the test prices (transform, not
# fit_transform). Re-fitting here would leak test-set statistics into the
# evaluation and leave `scaler` holding the test min/max, which makes any later
# inverse_transform of training values wrong. Test values may fall outside
# [0, 1] when test prices exceed the training range.
testing_data_normalized[['close']] = scaler.transform(testing_data_normalized['close'].values.reshape(-1, 1))


training_data_normalized

Unnamed: 0,date,close
1259,20130208,0.156253
1260,20130211,0.165412
1261,20130212,0.143154
1262,20130213,0.141506
1263,20130214,0.140729
...,...,...
2236,20161223,0.786556
2237,20161227,0.796141
2238,20161228,0.789665
2239,20161229,0.789276


In [20]:
#function for creating sequences for training

def create_sequences(data, sequence_length):
    """Build sliding-window samples from the 'close' column of ``data``.

    Sample ``i`` is the ``sequence_length`` consecutive 'close' values starting
    at row ``i``; its target is the 'close' value of the row right after that
    window.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'close' column, in the row order the windows should follow.
    sequence_length : int
        Number of rows in each look-back window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sequences, targets)`` with shapes ``(n, sequence_length)`` and
        ``(n,)``, where ``n = max(len(data) - sequence_length, 0)``.

    Raises
    ------
    ValueError
        If ``sequence_length`` is negative.
    """
    if sequence_length < 0:
        raise ValueError("sequence_length must be non-negative")

    # Pull the column out once instead of slicing the DataFrame on every iteration.
    closes = data['close'].to_numpy()
    n_samples = len(closes) - sequence_length

    if n_samples <= 0:
        # Too few rows for even one window: return empty arrays that still have
        # the expected number of dimensions so callers can reshape them.
        return (np.empty((0, sequence_length), dtype=closes.dtype),
                np.empty((0,), dtype=closes.dtype))

    # One window per start position (x) ...
    sequences = np.array([closes[start:start + sequence_length] for start in range(n_samples)])
    # ... and the value that immediately follows each window (y).
    targets = closes[sequence_length:].copy()
    return sequences, targets

# Number of past closing prices the model sees for each prediction.
sequence_length = 10

# Build (window, next value) pairs from the normalized training and testing frames.
X_train, y_train = create_sequences(data=training_data_normalized, sequence_length=sequence_length)
X_test, y_test = create_sequences(data=testing_data_normalized, sequence_length=sequence_length)

# Append a trailing axis of size 1, because the LSTM input layer expects a 3-D array.
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))



In [21]:

# Sanity check on the training tensor produced by the reshape in the previous
# cell: (number of samples, sequence_length, 1).
X_train.shape


(972, 10, 1)

In [None]:

# Each LSTM unit is a memory cell with an input gate, an output gate and a
# forget gate that regulate the flow of memory within the model.
# Dropout layers ignore random neurons during training, which discourages the
# model from learning the training data too well and then performing poorly
# on new data (overfitting).

# Seed NumPy and TensorFlow so repeated runs of this cell are comparable.
np.random.seed(42)
tf.random.set_seed(42)

# Scaled training targets, taken from the normalized frame: the target of
# window i is the row right after it, i.e. rows sequence_length onwards.
# y_train is rebound to original-scale prices further down, so fitting on this
# local array keeps the cell safe to re-run on its own.
y_train_scaled = training_data_normalized['close'].to_numpy()[sequence_length:]

model = Sequential()

# Three stacked LSTM layers; all but the last return the full sequence so the
# next LSTM layer receives one vector per time step.
model.add(LSTM(units=10, return_sequences=True, input_shape=(sequence_length, 1)))
model.add(Dropout(0.2))
model.add(LSTM(units=10, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=10))
model.add(Dropout(0.2))
# Single output: the next (scaled) closing price.
model.add(Dense(units=1))

# Compile the model with the Adam optimizer. Adam is taken from tf.keras so the
# optimizer comes from the same Keras package as the layers and the model.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=.0001), loss='mean_squared_error')

model.summary()
print("First few sequences in X_train:", X_train[:3])

# Stop once val_loss has not improved for 20 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

history = model.fit(X_train, y_train_scaled, epochs=100, batch_size=32, validation_split=.1, callbacks=[early_stopping])

# Predict the closing prices on the testing set and map them back to prices.
y_pred_original_scale = model.predict(X_test)
y_pred_original_scale = scaler.inverse_transform(y_pred_original_scale)

# Original-scale targets, read straight from the un-normalized frames so they
# do not depend on which data the scaler was last fitted on.
y_train = training_data['close'].to_numpy()[sequence_length:].reshape(-1, 1)
y_test = testing_data['close'].to_numpy()[sequence_length:].reshape(-1, 1)

# Evaluate model
mse = mean_squared_error(y_test, y_pred_original_scale)
print(f'Mean Squared Error on Testing Data: {mse}')

print("Training Data Range:", training_data['close'].min(), training_data['close'].max())
print("Testing Data Range:", testing_data['close'].min(), testing_data['close'].max())

min_pred_value = np.min(y_pred_original_scale)
max_pred_value = np.max(y_pred_original_scale)
print(f'Minimum Predicted Closing Price: {min_pred_value}')
print(f'Maximum Predicted Closing Price: {max_pred_value}')

# Learning curves: training vs. validation loss per epoch.
plt.figure(figsize=(10, 6))

plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')

plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Mean Squared Error')
plt.legend()

plt.show()




2023-12-13 19:26:33.672274: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-13 19:26:36.055310: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14581 MB memory:  -> device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0000:d8:00.0, compute capability: 7.0


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 10, 10)            480       
                                                                 
 dropout (Dropout)           (None, 10, 10)            0         
                                                                 
 lstm_1 (LSTM)               (None, 10, 10)            840       
                                                                 
 dropout_1 (Dropout)         (None, 10, 10)            0         
                                                                 
 lstm_2 (LSTM)               (None, 10)                840       
                                                                 
 dropout_2 (Dropout)         (None, 10)                0         
                                                                 
 dense (Dense)               (None, 1)                 1

2023-12-13 19:26:42.684418: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
 1/28 [>.............................] - ETA: 0s - loss: 0.0718

In [None]:
# Experiment log: LSTM units per layer (layers 1, 2, 3) -> MSE, epochs
# units 50, 50, 50    -> MSE 16.66, 100 epochs
# units 50, 100, 50   -> MSE 18,     25 epochs
# units 100, 100, 100 -> MSE 16.69, 100 epochs
# units 25, 50, 100   -> MSE 24,     24 epochs
# units 15, 15, 15    -> MSE 19,    100 epochs

In [None]:
# First working run: mean squared error = 25790

In [None]:
# Report the shape of every array that goes into training and evaluation.
shape_report = (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
)
for caption, array in shape_report:
    print(caption, array.shape)

In [None]:
# Show the first five test-set targets.
print("Testing Labels (y_test):", y_test[:5])


In [None]:
plt.figure(figsize=(12, 6))

# Every target/prediction belongs to the row right after its look-back window,
# so the first `sequence_length` test dates have no prediction and are skipped.
# The dates are converted to strings in a local Series, leaving testing_data
# itself unmodified.
plot_dates = testing_data['date'].iloc[sequence_length:].astype(str)

# Plot actual prices
plt.plot(plot_dates, y_test, label='Actual Closing Prices', color='blue')

# Plot predicted prices
plt.plot(plot_dates, y_pred_original_scale, label='Predicted Closing Prices', color='red')

plt.title('Actual vs. Predicted Closing Prices')
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.legend()

# Label only every 30th plotted date so the axis stays readable.
plt.xticks(plot_dates.iloc[::30], rotation=45)

plt.show()