<a href="https://colab.research.google.com/github/JoviJoseph/Stock-Price-Prediction/blob/main/Time_Series_Forecasting_with_Yahoo_Stock_Price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import ttest_rel, f_oneway, shapiro, levene

In [2]:
# Load dataset
df = pd.read_csv("yahoo_stock.csv")

In [3]:
df.head()

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2015-11-23,2095.610107,2081.389893,2089.409912,2086.590088,3587980000.0,2086.590088
1,2015-11-24,2094.120117,2070.290039,2084.419922,2089.139893,3884930000.0,2089.139893
2,2015-11-25,2093.0,2086.300049,2089.300049,2088.870117,2852940000.0,2088.870117
3,2015-11-26,2093.0,2086.300049,2089.300049,2088.870117,2852940000.0,2088.870117
4,2015-11-27,2093.290039,2084.129883,2088.820068,2090.110107,1466840000.0,2090.110107


In [4]:
# Convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

In [5]:
# Set Date as index
df.set_index('Date', inplace=True)

Exploratory Data Analysis

In [6]:
# Check for missing values
print("Missing values:\n", df.isnull().sum())

Missing values:
 High         0
Low          0
Open         0
Close        0
Volume       0
Adj Close    0
dtype: int64


In [7]:
# Descriptive statistics
df.describe()

Unnamed: 0,High,Low,Open,Close,Volume,Adj Close
count,1825.0,1825.0,1825.0,1825.0,1825.0,1825.0
mean,2660.718673,2632.81758,2647.704751,2647.856284,3869627000.0,2647.856284
std,409.680853,404.310068,407.169994,407.301177,1087593000.0,407.301177
min,1847.0,1810.099976,1833.400024,1829.079956,1296540000.0,1829.079956
25%,2348.350098,2322.25,2341.97998,2328.949951,3257950000.0,2328.949951
50%,2696.25,2667.840088,2685.48999,2683.340088,3609740000.0,2683.340088
75%,2930.790039,2900.709961,2913.860107,2917.52002,4142850000.0,2917.52002
max,3645.98999,3600.159912,3612.090088,3626.909912,9044690000.0,3626.909912


In [8]:
# Compute moving averages
if len(df) >= 200:  # Ensure sufficient data for rolling calculations
    df['SMA_50'] = df['Close'].rolling(window=50).mean()
    df['SMA_200'] = df['Close'].rolling(window=200).mean()

    # Create trend plot
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df.index, y=df['Close'], mode='lines', name='Close Price'))
    fig.add_trace(go.Scatter(x=df.index, y=df['SMA_50'], mode='lines', name='50-day SMA'))
    fig.add_trace(go.Scatter(x=df.index, y=df['SMA_200'], mode='lines', name='200-day SMA'))

    fig.update_layout(
        title="Stock Price with 50-day & 200-day Moving Averages",
        xaxis_title="Date",
        yaxis_title="Stock Price",
        template='plotly_dark',
        height=600
    )

    fig.show()
else:
    print("Not enough data to compute 50-day and 200-day SMA.")



In [9]:
# Check if required columns exist
required_cols = {'Open', 'High', 'Low', 'Close'}
if required_cols.issubset(df.columns):
    # Create candlestick chart
    fig = go.Figure(
        data=[
            go.Candlestick(
                x=df.index,
                open=df['Open'],
                high=df['High'],
                low=df['Low'],
                close=df['Close'],
                name="Stock Price"
            )
        ]
    )

    fig.update_layout(
        title="Stock Price Candlestick Chart",
        xaxis_title="Date",
        yaxis_title="Price",
        template='plotly_dark',
        height=600
    )

    fig.show()
else:
    print("Missing required columns for candlestick chart.")


In [10]:
# Check if 'Close' column exists before computing rolling volatility
if 'Close' in df.columns:
    # Calculate rolling standard deviation (30-day window)
    df['Rolling_Volatility'] = df['Close'].rolling(window=30).std()

    # Plot rolling volatility
    fig = px.line(df, x=df.index, y='Rolling_Volatility', title="30-Day Rolling Volatility")

    fig.update_layout(
        title_x=0.5,
        template='plotly_dark',
        xaxis_title="Date",
        yaxis_title="Volatility",
        height=500
    )

    fig.show()
else:
    print("Missing required column: 'Close'. Skipping rolling volatility analysis.")


In [11]:
# Check if required columns exist before computing correlation
if 'Close' in df.columns and 'Volume' in df.columns:
    # Compute rolling correlation (30-day window)
    df['Rolling_Correlation'] = df['Close'].rolling(window=30).corr(df['Volume'])

    # Plot rolling correlation
    fig = px.line(df, x=df.index, y='Rolling_Correlation', title="30-Day Rolling Correlation (Close vs. Volume)")

    fig.update_layout(
        title_x=0.5,
        template='plotly_dark',
        xaxis_title="Date",
        yaxis_title="Correlation",
        height=500
    )

    fig.show()
else:
    print("Missing required columns: 'Close' and 'Volume'. Skipping correlation analysis.")


In [12]:
# Check if 'Close' column exists before computing daily returns
if 'Close' in df.columns:
    # Calculate daily percentage returns
    df['Daily_Return'] = df['Close'].pct_change() * 100  # Convert to percentage

    # Drop NaN values caused by pct_change()
    df.dropna(subset=['Daily_Return'], inplace=True)

    # Histogram of daily returns
    fig = px.histogram(df, x='Daily_Return', nbins=50, title="Daily Return Distribution")

    fig.update_layout(
        title_x=0.5,
        template='plotly_dark',
        xaxis_title="Daily Return (%)",
        yaxis_title="Frequency",
        height=500
    )

    fig.show()
else:
    print("Missing required column: 'Close'. Skipping daily return analysis.")


In [13]:
# Statistical Tests
# Perform paired T-test for 'High' and 'Low' prices
t_stat, p_value = ttest_rel(df['High'], df['Low'])

# Store results in a DataFrame
t_test_result = pd.DataFrame({'Statistic': [t_stat], 'p-value': [p_value]})

# Display results
t_test_result


Unnamed: 0,Statistic,p-value
0,47.091126,2.166482e-317


In [14]:
# Create a bar chart with annotations
fig = go.Figure()

# Add T-statistic bar
fig.add_trace(go.Bar(
    x=['T-Statistic'],
    y=[t_stat],
    name='T-Statistic',
    marker_color='royalblue',
    text=[round(t_stat, 2)],
    textposition='auto'
))

# Add p-value bar
fig.add_trace(go.Bar(
    x=['P-Value'],
    y=[p_value],
    name='P-Value',
    marker_color='orangered',
    text=[f"{p_value:.5f}"],
    textposition='auto'
))

# Update layout
fig.update_layout(
    title='T-Test Results: High vs Low Prices',
    title_x=0.5,
    template='plotly_dark',
    yaxis_title='Value',
    xaxis_title='Metric'
)

fig.show()


In [15]:
# Perform ANOVA Test
anova_stat, anova_p_value = f_oneway(df['Open'], df['High'], df['Low'], df['Close'])

# Check Normality using Shapiro-Wilk test
shapiro_tests = {col: shapiro(df[col])[1] for col in ['Open', 'High', 'Low', 'Close']}

# Check Homogeneity of Variance using Levene's test
levene_stat, levene_p = levene(df['Open'], df['High'], df['Low'], df['Close'])

# Store results in DataFrames
anova_df = pd.DataFrame({'Statistic': [anova_stat], 'p-value': [anova_p_value]})
shapiro_df = pd.DataFrame({'Feature': list(shapiro_tests.keys()), 'Shapiro p-value': list(shapiro_tests.values())})
levene_df = pd.DataFrame({'Levene Statistic': [levene_stat], 'p-value': [levene_p]})

# Display results
print("✅ ANOVA Results:")
display(anova_df)

print("\n📊 Normality Test (Shapiro-Wilk):")
display(shapiro_df)

print("\n⚖️ Homogeneity of Variance Test (Levene's Test):")
display(levene_df)


✅ ANOVA Results:


Unnamed: 0,Statistic,p-value
0,1.433086,0.231004



📊 Normality Test (Shapiro-Wilk):


Unnamed: 0,Feature,Shapiro p-value
0,Open,8.49322e-16
1,High,2.352021e-16
2,Low,3.340855e-15
3,Close,8.056606e-16



⚖️ Homogeneity of Variance Test (Levene's Test):


Unnamed: 0,Levene Statistic,p-value
0,0.085859,0.967797


In [16]:
# Replace zero p-values to avoid log issues
anova_df['p-value'] = anova_df['p-value'].replace(0, 1e-10)

# Plot with log scale for better visibility
fig = px.bar(anova_df, x=anova_df.index, y=['Statistic', 'p-value'],
             barmode='group', title='ANOVA Test Result: Open, High, Low, Close Prices')

fig.update_layout(title_text='ANOVA Test Result: Open, High, Low, Close Prices',
                  title_x=0.5, template='plotly_dark',
                  yaxis_type="log")  # Log scale applied

fig.show()


In [17]:
scaler = MinMaxScaler(feature_range=(0, 1))
df['Close'] = scaler.fit_transform(df[['Close']])


In [18]:
def create_dataset(data, time_step=1):
    X, y = [], []
    for i in range(len(data) - time_step):
        X.append(data[i:i + time_step, 0])
        y.append(data[i + time_step, 0])
    return np.array(X), np.array(y)

time_step = 60


In [19]:
train_size = int(len(df['Close']) * 0.8)
scaled_data = df[['Close']].values  # Ensure scaled_data is defined
train_data, test_data = scaled_data[:train_size, :], scaled_data[train_size:, :]


In [20]:
X_train, y_train = create_dataset(train_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)


In [21]:
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)


In [22]:
# Build the LSTM model
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(time_step, 1)))
model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(25))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')



Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



In [23]:
# Train the model
history = model.fit(X_train, y_train, batch_size=1, epochs=10)


Epoch 1/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 46ms/step - loss: 0.0067
Epoch 2/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 44ms/step - loss: 0.0017
Epoch 3/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 45ms/step - loss: 0.0014
Epoch 4/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 45ms/step - loss: 0.0012
Epoch 5/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 45ms/step - loss: 0.0013
Epoch 6/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 47ms/step - loss: 0.0011
Epoch 7/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 44ms/step - loss: 0.0012
Epoch 8/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 45ms/step - loss: 0.0011
Epoch 9/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 46ms/step - loss: 9.3951e-04
Epoch 10/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━

In [24]:
# Predictions
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


In [25]:
# Inverse transform to get actual values
train_predict = scaler.inverse_transform(train_predict)
test_predict = scaler.inverse_transform(test_predict)
y_train = scaler.inverse_transform([y_train])
y_test = scaler.inverse_transform([y_test])

In [26]:
# Calculate RMSE, MAE, MAPE, and R² Score
train_rmse = np.sqrt(mean_squared_error(y_train[0], train_predict[:, 0]))
train_mae = mean_absolute_error(y_train[0], train_predict[:, 0])
train_mape = np.mean(np.abs((y_train[0] - train_predict[:, 0]) / y_train[0])) * 100
train_r2 = r2_score(y_train[0], train_predict[:, 0])

test_rmse = np.sqrt(mean_squared_error(y_test[0], test_predict[:, 0]))
test_mae = mean_absolute_error(y_test[0], test_predict[:, 0])
test_mape = np.mean(np.abs((y_test[0] - test_predict[:, 0]) / y_test[0])) * 100
test_r2 = r2_score(y_test[0], test_predict[:, 0])

# Print results
print(f'Train RMSE: {train_rmse}, Train MAE: {train_mae}, Train MAPE: {train_mape}, Train R²: {train_r2}')
print(f'Test RMSE: {test_rmse}, Test MAE: {test_mae}, Test MAPE: {test_mape}, Test R²: {test_r2}')


Train RMSE: 33.045938183480814, Train MAE: 27.054217891406815, Train MAPE: 1.124297530542543, Train R²: 0.9889857445807458
Test RMSE: 71.66024642813828, Test MAE: 54.81807921362705, Test MAPE: 1.7855676476869622, Test R²: 0.9413544776594475


In [27]:
# Initialize empty arrays for plotting
train_plot = np.empty_like(scaled_data)
train_plot[:, :] = np.nan
train_plot[time_step:train_size, :] = train_predict  # Align train predictions

test_plot = np.empty_like(scaled_data)
test_plot[:, :] = np.nan
test_plot[train_size + time_step:len(scaled_data), :] = test_predict  # Align test predictions


In [28]:
# Ensure the train and test predictions are correctly positioned
train_plot = np.full_like(scaled_data, np.nan)
train_plot[time_step:train_size, 0] = train_predict[:, 0]  # Align train predictions

test_plot = np.full_like(scaled_data, np.nan)
test_plot[train_size + time_step:len(scaled_data), 0] = test_predict[:, 0]  # Align test predictions

# Create the plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['Close'], mode='lines', name='Actual Price', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=df.index, y=train_plot[:, 0], mode='lines', name='Train Predict', line=dict(color='green')))
fig.add_trace(go.Scatter(x=df.index, y=test_plot[:, 0], mode='lines', name='Test Predict', line=dict(color='red')))

fig.update_layout(title='Stock Price Prediction',
                  xaxis_title='Date',
                  yaxis_title='Stock Price',
                  template='plotly_dark')

fig.show()


In [29]:
def predict_future(model, data, scaler, time_step=60, future_steps=30):
    last_data = data[-time_step:]
    last_data = last_data.reshape(1, time_step, 1)

    future_predictions = []
    for _ in range(future_steps):
        next_pred = model.predict(last_data, verbose=0)
        future_predictions.append(next_pred[0, 0])
        last_data = np.append(last_data[:, 1:, :], [[[next_pred[0, 0]]]], axis=1)

    future_predictions = scaler.inverse_transform(np.array(future_predictions).reshape(-1, 1))

    return future_predictions

# Predict future prices
future_steps = 30
future_predictions = predict_future(model, scaled_data, scaler, time_step, future_steps)

# Generate future dates
future_dates = pd.date_range(start=df.index[-1], periods=future_steps, freq='D')

# Plot predictions
fig = go.Figure()

# Actual Prices
fig.add_trace(go.Scatter(x=df.index, y=df['Close'], mode='lines',
                         name='Actual Price', line=dict(color='blue', width=2)))

# Future Predictions
fig.add_trace(go.Scatter(x=future_dates, y=future_predictions[:, 0], mode='lines',
                         name='Future Predict', line=dict(color='orange', width=2, dash='dot')))

fig.update_layout(title='Future Stock Price Prediction',
                  xaxis_title='Date',
                  yaxis_title='Stock Price',
                  template='plotly_dark')

fig.show()
