# In [None]:
#importing necessary libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
# Load the raw AAPL table; row 0 of the CSV supplies the column names.
csv_path = '../data/AAPL.csv'
df_AAPL = pd.read_csv(csv_path, header=0)

# Notebook-style preview; has no visible effect when run as a plain script.
df_AAPL.head()

print(df_AAPL.shape)


# exploring the data: summary statistics, then per-column missing-value counts
for report in (df_AAPL.describe(), df_AAPL.isnull().sum()):
    print(report)
# Convert the day-month-year strings in "Date" to timestamps, then use that
# column as the dataframe index.
parsed_dates = pd.to_datetime(df_AAPL['Date'], format='%d-%m-%Y')
df_AAPL['Date'] = parsed_dates


print(parsed_dates.dtype)

df_AAPL = df_AAPL.set_index(keys="Date")

# Notebook-style preview of the first 50 rows (no effect as a script).
df_AAPL.head(50)
#setting the frequency of the index to business days
print(df_AAPL.shape)

# asfreq() reindexes, which requires a unique index, so keep only the first
# row for every repeated date. The mask is computed once and reused.
duplicate_mask = df_AAPL.index.duplicated(keep='first')
duplicate_mask.sum()  # notebook-style inspection of how many rows are dropped
df_AAPL = df_AAPL[~duplicate_mask]

# 'B' is the canonical business-day alias; the lowercase spelling is
# deprecated in recent pandas releases. Business days absent from the source
# data are inserted as all-NaN rows.
AAPL_data = df_AAPL.asfreq('B')
print(AAPL_data.shape)

print(AAPL_data.tail(30))

print(AAPL_data.isnull().sum())


#getting the dates of the missing values
# A row counts as missing when at least one of its columns is NaN.
has_missing_value = AAPL_data.isnull().any(axis=1)
null_data = AAPL_data.loc[has_missing_value]

# Notebook-style preview (no effect as a script).
null_data.head()

# Plain Python list of the index entries of the incomplete rows.
null_dates = null_data.index.tolist()

import calendar
import datetime

# A complete list of Good Fridays for the relevant period
good_fridays_list = [
    datetime.date(2006, 4, 14), datetime.date(2007, 4, 6),
    datetime.date(2008, 3, 21), datetime.date(2009, 4, 10),
    datetime.date(2010, 4, 2),  datetime.date(2011, 4, 22),
    datetime.date(2012, 4, 6),  datetime.date(2013, 3, 29),
    datetime.date(2014, 4, 18), datetime.date(2015, 4, 3),
    datetime.date(2016, 3, 25), datetime.date(2017, 4, 14),
    datetime.date(2018, 3, 30), datetime.date(2019, 4, 19),
    datetime.date(2020, 4, 10),
]
good_fridays = [pd.to_datetime(date) for date in good_fridays_list]

# Special closures not covered by simple rules
special_closures = [
    pd.to_datetime('2007-01-02'),  # National Day of Mourning
    pd.to_datetime('2012-10-29'),  # Hurricane Sandy
    pd.to_datetime('2012-10-30'),  # Hurricane Sandy
    pd.to_datetime('2018-12-05'),  # National Day of Mourning
]

# Every explicitly listed closure in one set, so each membership test is a
# hash lookup instead of a scan over two lists.
_listed_closures = set(special_closures) | set(good_fridays)

# Integer weekday codes as returned by ``weekday()`` (Monday == 0). Comparing
# integers avoids depending on the locale-specific names in calendar.day_name.
_MONDAY, _THURSDAY, _FRIDAY = 0, 3, 4


def _is_rule_based_holiday(date):
    """Return True if *date* matches one of the recurring holiday rules.

    *date* must expose ``month``, ``day`` and ``weekday()`` (a pandas
    Timestamp or a datetime object). Months without a rule return False.
    """
    month, day, weekday = date.month, date.day, date.weekday()

    if month == 1:
        # New Year's Day (or Monday the 2nd), or MLK Day (third Monday)
        return (day == 1
                or (day == 2 and weekday == _MONDAY)
                or (weekday == _MONDAY and 15 <= day <= 21))
    if month == 2:
        # Presidents' Day (third Monday)
        return weekday == _MONDAY and 15 <= day <= 21
    if month == 5:
        # Memorial Day (last Monday)
        return weekday == _MONDAY and day >= 25
    if month == 7:
        # Independence Day, or the adjacent Monday 5th / Friday 3rd
        return (day == 4
                or (day == 5 and weekday == _MONDAY)
                or (day == 3 and weekday == _FRIDAY))
    if month == 9:
        # Labor Day (first Monday)
        return weekday == _MONDAY and day <= 7
    if month == 11:
        # Thanksgiving (fourth Thursday)
        return weekday == _THURSDAY and 22 <= day <= 28
    if month == 12:
        # Christmas, or the adjacent Friday 24th / Monday 26th
        return (day == 25
                or (day == 24 and weekday == _FRIDAY)
                or (day == 26 and weekday == _MONDAY))
    return False


# Missing dates explained by a listed closure or a recurring rule, kept in
# the same order as null_dates.
holidays = [
    date for date in null_dates
    if date in _listed_closures or _is_rule_based_holiday(date)
]


# Filter out the holidays to find any remaining non-holiday gaps
_holiday_set = set(holidays)
non_holidays = [date for date in null_dates if date not in _holiday_set]

print(f"Identified {len(holidays)} holidays within the missing dates.")
print(f"Found {len(non_holidays)} missing dates that were NOT holidays.")

if non_holidays:
    print("\nThe following missing dates were NOT identified as holidays:")
    for date in non_holidays:
        print(date.strftime('%Y-%m-%d'))
else:
    print("\nSuccess! All missing dates were confirmed to be holidays or special closures.")
#getting the modified data
print(AAPL_data.shape)

# Remove the rows that are empty only because the date was classified as a
# holiday; what is left are the gaps not explained by a holiday.
modified_df = AAPL_data.drop(holidays)
modified_df.shape  # notebook-style inspection; no effect as a script
print("Before filling missing values:\n",modified_df.isna().sum())

# Forward-fill: a missing session takes the most recent known values.
# Back-filling would copy the *following* session's prices into an earlier
# date, leaking future information into a series used for forecasting.
modified_df = modified_df.ffill(axis='rows')

print("\nAfter filling missing values:\n",modified_df.isna().sum())
#visualizing the AAPL data
def plotter(code):
    """Plot the ``Close`` column of ``modified_df`` and publish it globally.

    The closing prices are converted to a float32 column vector, stored in
    the module-level ``closing_stock`` and drawn as a blue line.

    Args:
        code: Ticker label; concatenated into the y-axis caption.
    """
    global closing_stock
    plt.subplot(211)

    # One float32 value per row, arranged as a column vector.
    prices = modified_df.Close.values.astype('float32').reshape(-1, 1)
    closing_stock = prices

    plt.xlabel('Time')
    plt.ylabel(code + " close stock prices")
    plt.title('prices Vs Time')
    plt.grid(True)
    plt.plot(prices, 'b')
    plt.show()

# plotter() also fills the global ``closing_stock`` that is split below.
plotter("AAPL")
#split the data
# Chronological split: first 80% for training, the rest halved into
# validation and test (test receives the extra sample on odd counts).
total_samples = len(closing_stock)
n_train = int(total_samples * 0.80)
n_remaining = total_samples - n_train

n_val = int(n_remaining*0.50)
n_test = n_remaining - n_val
print("Train samples:",n_train, "Validation Samples:",n_val,"Test Samples:", n_test)

val_end = n_train + n_val
train_data = closing_stock[:n_train]
val_data = closing_stock[n_train:val_end]
test_data = closing_stock[val_end:]

for subset in (train_data, val_data, test_data):
    print(subset.shape)
#feature scaling
from sklearn.preprocessing import MinMaxScaler

# Learn the min/max from the training split only, then apply that same
# mapping to all three splits.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(train_data)

train, val, test = (
    scaler.transform(split) for split in (train_data, val_data, test_data)
)
#creating the dataset
def create_dataset(data , n_features):
    """Build sliding-window samples from the first column of *data*.

    Sample ``i`` consists of ``data[i:i + n_features, 0]`` as input and
    ``data[i + n_features, 0]`` as target. Every window that still has a
    following value is used, including the one whose target is the last row
    (the previous loop bound stopped one sample early).

    Args:
        data: 2-D array-like of shape (n_rows, n_columns); only column 0 is read.
        n_features: Number of consecutive values in each input window.

    Returns:
        Tuple ``(dataX, dataY)`` with shapes ``(n_samples, n_features)`` and
        ``(n_samples,)``, where ``n_samples = len(data) - n_features``. When
        the data is too short for a single sample, correctly shaped empty
        arrays are returned so that ``dataX.shape[1]`` stays valid.
    """
    data = np.asarray(data)
    n_samples = len(data) - n_features
    if n_samples <= 0:
        return (np.empty((0, n_features), dtype=data.dtype),
                np.empty((0,), dtype=data.dtype))

    series = data[:, 0]
    dataX = np.array([series[i:i + n_features] for i in range(n_samples)])
    # np.array copies, so the targets do not alias the caller's array.
    dataY = np.array(series[n_features:])
    return dataX, dataY

# Number of consecutive scaled prices in each input window.
n_features = 2

(trainX, trainY), (valX, valY), (testX, testY) = (
    create_dataset(split, n_features) for split in (train, val, test)
)

print(trainX.shape , trainY.shape , valX.shape , valY.shape, testX.shape , testY.shape)

# Insert a length-1 middle axis: (samples, window) -> (samples, 1, window),
# matching the (1, n_features) input shape declared for the model.
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
valX = np.reshape(valX, (valX.shape[0], 1, valX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

print(trainX.shape , trainY.shape , valX.shape , valY.shape, testX.shape , testY.shape)
#building the model
import tensorflow as tf
tf.random.set_seed(42)

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import mean_squared_error

# Three stacked GRU layers, each followed by dropout, feeding one linear
# output unit. The input shape is declared with an explicit Input object
# rather than the legacy ``input_shape=`` layer argument, which newer Keras
# versions warn about.
model = keras.Sequential([
    keras.Input(shape=(1, n_features)),

    # First GRU layer (returns the full sequence for the next GRU)
    layers.GRU(units=100, return_sequences=True, activation='tanh'),
    layers.Dropout(0.2),

    # Second GRU layer
    layers.GRU(units=150, return_sequences=True, activation='tanh'),
    layers.Dropout(0.2),

    # Third GRU layer (returns only its final output)
    layers.GRU(units=100, activation='tanh'),
    layers.Dropout(0.2),

    # The output layer
    layers.Dense(units=1, kernel_initializer='he_uniform', activation='linear'),
])

# The metric duplicates the loss on purpose: with a metric configured,
# evaluate() returns a list, and model_score() indexes that result with [0].
model.compile(
    loss='mean_squared_error',
    optimizer=Adam(learning_rate=0.0005),
    metrics=['mean_squared_error'],
)

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

history = model.fit(
    trainX, trainY,
    epochs=100,
    batch_size=128,
    verbose=1,
    validation_data=(valX, valY),
)
#evaluating the model performance
import math

def model_score(model, X_train, y_train, X_val, y_val , X_test, y_test):
    """Print MSE and RMSE of *model* on the train, validation and test splits.

    For each split the heading is printed, ``model.evaluate`` is called with
    ``verbose=0``, and element 0 of its result is reported as the MSE
    together with its square root.

    Args:
        model: Object with an ``evaluate(X, y, verbose=...)`` method whose
            result can be indexed with ``[0]``.
        X_train, y_train: Training inputs and targets.
        X_val, y_val: Validation inputs and targets.
        X_test, y_test: Test inputs and targets.
    """
    splits = (
        ('Train Score:', X_train, y_train),
        ('Validation Score:', X_val, y_val),
        ('Test Score:', X_test, y_test),
    )
    for heading, inputs, targets in splits:
        print(heading)
        mse = model.evaluate(inputs, targets, verbose=0)[0]
        print(f"MSE: {mse:.5f} , RMSE: {math.sqrt(mse):.2f}")


model_score(model, trainX, trainY ,valX, valY , testX, testY)
#visualizing the loss vs epochs
training_log = history.history
print(training_log.keys())

# Training curve first, validation curve second, matching the legend order.
for curve_name in ('loss', 'val_loss'):
    plt.plot(training_log[curve_name])

plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()
#visualizing the predictions vs ground truth
# Predict on the test windows and undo the scaling applied before training.
pred = scaler.inverse_transform(model.predict(testX))
print(pred[:10])

# The scaler works on 2-D input, so turn the 1-D targets into a column first.
testY_actual = scaler.inverse_transform(testY.reshape(-1, 1))
print(testY_actual[:10])

# Actual values in blue, predictions in red, in the order the legend lists them.
for series, colour in ((testY_actual, 'b'), (pred, 'r')):
    plt.plot(series, colour)

plt.title('Check the performance of the model with time')
plt.xlabel('Time')
plt.ylabel('Stock Prices')
plt.legend(['Actual', 'Predicted'], loc='upper left')

plt.grid(True)
plt.show()