In [None]:
import pandas as pd
import numpy as np
import math
import datetime as dt
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import BaggingRegressor
from sklearn.datasets import make_regression

import tensorflow as tf
'''from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, GRU'''

from itertools import cycle


# ! pip install plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot
# Load the raw daily price history from the Excel export.
df = pd.read_excel('JKL (John Keells) 2022-2023.xlsx')

# Parse 'Day' as datetime; values that do not match the format become NaT
# (errors='coerce') and are therefore excluded by the range filter below.
df['Day'] = pd.to_datetime(df['Day'], format='%Y-%m-%d', errors='coerce')

# Inclusive date window covering 2022 and 2023.
start_date = pd.to_datetime('2022-01-01', format='%Y-%m-%d')
end_date = pd.to_datetime('2023-12-31', format='%Y-%m-%d')

# Take an explicit copy of the selected rows so later operations never act on
# a slice of `df` (avoids SettingWithCopyWarning and keeps `df` untouched).
in_window = (df['Day'] >= start_date) & (df['Day'] <= end_date)
filtered_df = df.loc[in_window].copy()

# Check the shape of the filtered data
print("Shape of filtered data:", filtered_df.shape)

# Sort chronologically without in-place mutation so re-running the cell is idempotent.
filtered_df = filtered_df.sort_values(by='Day')
filtered_df.head()


Shape of filtered data: (254, 7)


Unnamed: 0,Day,High,Low,Closing,No. of Trades,No. of Shares,Turnover(Rs.)
0,2022-01-03,71.9,68.5,68.6,16,5130,351687.2
1,2022-01-04,72.0,68.8,70.6,30,13046,934838.2
2,2022-01-05,72.0,70.5,70.8,19,12145,868850.5
3,2022-01-06,71.0,68.7,70.5,10,4264,302609.3
4,2022-01-07,75.0,69.0,74.3,33,31975,2358199.0


In [None]:
# Hard-coded bounds of the analysis window (these are the filter limits,
# not dates read back from the data).
start_date_str = '2022-01-01'
end_date_str = '2023-12-31'

start_date, end_date = (
    pd.to_datetime(bound, format='%Y-%m-%d')
    for bound in (start_date_str, end_date_str)
)
duration = end_date - start_date

print("Starting date: ", start_date)
print("Ending date: ", end_date)
print("Duration: ", duration)

# Keep only the date and the closing price.
closedf = filtered_df[['Day', 'Closing']]
print("Shape of close dataframe:", closedf.shape)

# Unscaled copy (dates + prices) kept for plotting later on.
close_stock = closedf.copy()

# Scale the closing price into [0, 1]. From here on `closedf` is no longer a
# DataFrame but the scaled values as a single-column NumPy array.
closedf = closedf.drop(columns='Day')
scaler = MinMaxScaler(feature_range=(0, 1))
closedf = scaler.fit_transform(closedf.to_numpy().reshape(-1, 1))
print(closedf.shape)




Starting date:  2022-01-01 00:00:00
Ending date:  2023-12-31 00:00:00
Duration:  729 days 00:00:00
Shape of close dataframe: (254, 2)
(254, 1)


In [None]:
# Chronological 80:20 split: the first 80% of rows are used for training and
# the remaining rows for testing (no shuffling, so row order is preserved).
training_size = int(len(closedf) * 0.8)
test_size = len(closedf) - training_size

# `closedf` is a single-column array, so plain row slicing is sufficient.
train_data, test_data = closedf[:training_size], closedf[training_size:]

print("train_data: ", train_data.shape)
print("test_data: ", test_data.shape)

#Create new dataset according to requirement of time-series prediction
# convert an array of values into a dataset matrix
def create_dataset(dataset, time_step=1):
    """Turn a 2-D series into sliding-window samples with next-step targets.

    Each sample is ``time_step`` consecutive values taken from column 0 of
    ``dataset``; its target is the column-0 value immediately after the window.

    Note: exactly ``len(dataset) - time_step - 1`` windows are generated, so
    the final window that still has a following value is not emitted.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays built from the windows and targets.
    """
    starts = range(len(dataset) - time_step - 1)
    windows = [dataset[start:start + time_step, 0] for start in starts]
    targets = [dataset[start + time_step, 0] for start in starts]
    return np.array(windows), np.array(targets)

from sklearn.ensemble import RandomForestRegressor

# Each sample is a window of `time_step` consecutive scaled closing prices;
# the target is the scaled closing price that follows the window.
time_step = 15
X_train, y_train = create_dataset(train_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)

# Drop test windows that contain missing values. The same row mask is applied
# to the targets so X_test and y_test stay aligned row-for-row (filtering
# X_test alone would silently pair predictions with the wrong targets).
complete_rows = ~pd.DataFrame(X_test).isna().any(axis=1).to_numpy()
X_test = X_test[complete_rows]
y_test = y_test[complete_rows]

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test", y_test.shape)

# Fixed random_state keeps the fitted forest reproducible across runs.
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor.fit(X_train, y_train)

train_data:  (203, 1)
test_data:  (51, 1)
X_train:  (187, 15)
y_train:  (187,)
X_test:  (35, 15)
y_test (35,)


In [None]:
# Predict on both splits and reshape to column vectors, the single-column
# layout the scaler was fitted on.
train_predict = regressor.predict(X_train).reshape(-1, 1)
test_predict = regressor.predict(X_test).reshape(-1, 1)

print("Train data prediction:", train_predict.shape)
print("Test data prediction:", test_predict.shape)

# Undo the min-max scaling so predictions and targets are back in price units.
train_predict, test_predict, original_ytrain, original_ytest = (
    scaler.inverse_transform(column)
    for column in (
        train_predict,
        test_predict,
        y_train.reshape(-1, 1),
        y_test.reshape(-1, 1),
    )
)

# Evaluation of the test predictions, measured in original price units.
print("-------------------------------------------------------------------------------------")

# Compare only the positions present in both arrays (guards against a length
# mismatch between targets and predictions), flattened to 1-D.
n_common = min(len(original_ytest), len(test_predict))
common_original_ytest = np.asarray(original_ytest[:n_common]).ravel()
common_test_predict = np.asarray(test_predict[:n_common]).ravel()

# Keep only pairs in which neither the target nor the prediction is NaN.
valid_indices = ~(np.isnan(common_original_ytest) | np.isnan(common_test_predict))
common_original_ytest = common_original_ytest[valid_indices]
common_test_predict = common_test_predict[valid_indices]

# Root mean squared error on the cleaned, aligned pairs.
rmse = math.sqrt(mean_squared_error(common_original_ytest, common_test_predict))
print("Test data RMSE with common data: ", rmse)
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.utils.validation import check_array

# Custom MAPE helper: returns the mean absolute percentage error scaled by 100 (i.e. as a percentage).
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between two series, in percent.

    Args:
        y_true: Array-like of actual values (list, tuple or NumPy array of any
            shape); it is flattened to a single column before comparison.
        y_pred: Array-like of predicted values with the same number of
            elements as ``y_true``.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` as a float.

    Note:
        No special handling is applied to zeros in ``y_true``: the division
        then follows NumPy semantics and yields ``inf`` or ``nan``.
    """
    # Coerce to column vectors so that mixed (n,) / (n, 1) inputs are compared
    # element-wise instead of being broadcast into an (n, n) matrix.
    y_true = np.asarray(y_true, dtype=float).reshape(-1, 1)
    y_pred = np.asarray(y_pred, dtype=float).reshape(-1, 1)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Remaining test-set metrics, all computed on the aligned, NaN-free pairs.
mse = mean_squared_error(common_original_ytest, common_test_predict)
mae = mean_absolute_error(common_original_ytest, common_test_predict)
r2 = r2_score(common_original_ytest, common_test_predict)
mape = mean_absolute_percentage_error(common_original_ytest, common_test_predict)

# Report in the order: squared error, absolute error, R², percentage error.
for metric_label, metric_value in (
    ("MSE: ", mse),
    ("MAE: ", mae),
    ("R-squared (R²): ", r2),
    ("MAPE: ", mape),
):
    print(metric_label, metric_value)







Train data prediction: (187, 1)
Test data prediction: (35, 1)
-------------------------------------------------------------------------------------
Test data RMSE with common data:  1.1978294178578688
MSE:  1.4347953142857208
MAE:  0.8558857142857168
R-squared (R²):  0.541166943382827
MAPE:  1.2715280513059675


In [None]:
# Place the predictions on the same row axis as the full series so they can be
# plotted against the original prices; rows without a prediction stay NaN.
look_back = time_step

# A window starting at row i predicts row i + look_back, so the training
# predictions begin at row `look_back`.
trainPredictPlot = np.full_like(closedf, np.nan)
trainPredictPlot[look_back:look_back + len(train_predict), :] = train_predict
print("Train predicted data: ", trainPredictPlot.shape)

# The first test window starts at row `training_size`, so its target is row
# `training_size + look_back`. Deriving the slice from the split point and the
# number of predictions keeps it correct regardless of how many windows the
# dataset builder emits.
test_start = training_size + look_back
testPredictPlot = np.full_like(closedf, np.nan)
testPredictPlot[test_start:test_start + len(test_predict), :] = test_predict
print("Test predicted data: ", testPredictPlot.shape)

# Legend labels, consumed in trace order when the traces are renamed below.
names = cycle(['Original close price','Train predicted close price','Test predicted close price'])

# One row per trading day: the actual close plus the two NaN-padded prediction series.
plotdf = pd.DataFrame({
    'Date': close_stock['Day'],
    'original_close': close_stock['Closing'],
    'train_predicted_close': trainPredictPlot.ravel().tolist(),
    'test_predicted_close': testPredictPlot.ravel().tolist(),
})

plotted_series = [
    plotdf[column]
    for column in ('original_close', 'train_predicted_close', 'test_predicted_close')
]
fig = px.line(
    plotdf,
    x=plotdf['Date'],
    y=plotted_series,
    labels={'value':'Stock price','Day': 'Date'},
)
fig.update_layout(
    title_text='Comparision between original close price vs predicted close price',
    plot_bgcolor='white',
    font_size=15,
    font_color='black',
    legend_title_text='Close Price',
)

# Replace the default trace names with the readable legend labels.
fig.for_each_trace(lambda trace: trace.update(name=next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

Train predicted data:  (254, 1)
Test predicted data:  (254, 1)


In [None]:
# Recursive multi-step forecast: predict one step ahead, append the prediction
# to the input window, drop the oldest value, and repeat.
pred_days = 10

# Seed window: the last `time_step` scaled closing prices of the test split.
temp_input = test_data[len(test_data) - time_step:].reshape(1, -1)[0].tolist()

lst_output = []
for _ in range(pred_days):
    # The model expects a single sample laid out as one row of `time_step` features.
    x_input = np.array(temp_input).reshape(1, -1)
    yhat = regressor.predict(x_input)

    lst_output.extend(yhat.tolist())

    # Slide the window forward: keep only the most recent `time_step` values,
    # the newest of which is the prediction just made.
    temp_input.extend(yhat.tolist())
    temp_input = temp_input[-time_step:]

print("Output of predicted next days: ", len(lst_output))


Output of predicted next days:  10


In [None]:
# Plot the full closing-price history followed by the forecast values.

# Stack the forecast underneath the scaled history, then map everything back
# to price units and flatten to a plain list for plotting.
forecast_column = np.array(lst_output).reshape(-1, 1)
rfdf = np.vstack([closedf, forecast_column])
rfdf = scaler.inverse_transform(rfdf).ravel().tolist()

names = cycle(['Close price'])

fig = px.line(rfdf, labels={'value': 'Stock price','index': 'Timestamp'})
fig.update_layout(
    title_text='Plotting whole closing stock price with prediction',
    plot_bgcolor='white',
    font_size=15,
    font_color='black',
    legend_title_text='Stock',
)

# Give the single trace a readable legend name.
fig.for_each_trace(lambda trace: trace.update(name=next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()
# Actual vs. predicted closing price on the test split.
# The previous version of this code referenced names that are never defined in
# this notebook (`Y_val`, `Y_val_pred`, `plt`) and failed with a NameError; it
# now uses the test targets and predictions computed earlier in the notebook.
import matplotlib.pyplot as plt  # the top-level import does not bind the `plt` alias

# Use only as many rows as both arrays provide.
n_pred = min(len(original_ytest), len(test_predict))

# The first test window starts at row `training_size`, so the first target
# (and its prediction) belongs to row `training_size + time_step`.
first_target_row = training_size + time_step
target_days = close_stock['Day'].iloc[first_target_row:first_target_row + n_pred]

df_pred = pd.DataFrame({
    'Day': pd.to_datetime(target_days.to_numpy()),
    'Actual': np.ravel(original_ytest)[:n_pred],
    'Predicted': np.ravel(test_predict)[:n_pred],
})
print(df_pred)
df_pred = df_pred.set_index('Day')

Actual = df_pred[['Actual']]
Predicted = df_pred[['Predicted']]

fig, ax = plt.subplots()
ax.plot(Actual, label='Actual stock price',color='r')
ax.plot(Predicted, label='Predicted stock price',color='g')
ax.legend(loc='best')
plt.show()


NameError: ignored