In [1]:
%pip install tensorflow autokeras statsmodels prophet xgboost scikit-learn pandas numpy matplotlib keras-tuner plotly nbformat ipywidgets


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import sys
print(sys.executable)


c:\Program Files\Python311\python.exe


In [3]:
# %%
# Cell 1: Header and Dependencies
"""
Petrol Price Prediction Ensemble Forecast Notebook

Standalone Jupyter-style notebook split into cells,
for training, forecasting, and optional test-set evaluation.

Dependencies (install in Cell 0):
    tensorflow, autokeras, statsmodels, prophet, xgboost,
    scikit-learn, pandas, numpy, matplotlib, keras-tuner

USAGE as script:
    python petrol_price_prediction.py --train_file "D:/Kabilan/Final Year Project/train_data.csv" \
      --test_file "D:/Kabilan/Final Year Project/test_data.csv" --output_file "D:/Kabilan/Final Year Project/forecast_petrol_prices.csv"
"""
import os  # for file existence checks
import tensorflow as tf
import argparse
import pandas as pd
import numpy as npw
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import keras_tuner as kt
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
import plotly.graph_objects as go


In [4]:
# Cell 2: Argument Parsing and Helper Functions

import argparse
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
import keras_tuner as kt

def parse_args():
    parser = argparse.ArgumentParser(
        description="Super Hyper Realistic Petrol Price Predictor (Ensemble)"
    )
    parser.add_argument('--train_file',    type=str, default='train_data.csv',            help='Path to training CSV')
    parser.add_argument('--test_file',     type=str, default=None,                        help='Optional test CSV')
    parser.add_argument('--output_file',   type=str, default='forecast_petrol_prices.csv', help='Output CSV')
    parser.add_argument('--start_date',    type=str, default='1990-01-01',               help='Training start')
    parser.add_argument('--end_date',      type=str, default='2025-03-10',               help='Training end')
    parser.add_argument('--forecast_start',type=str, default='2025-03-11',               help='Forecast start')
    parser.add_argument('--forecast_end',  type=str, default='2025-12-31',               help='Forecast end')
    # Ignore Jupyter args
    args, _ = parser.parse_known_args()
    return args

def create_dataset(dataset: np.ndarray, time_step: int):
    X, Y = [], []
    for i in range(len(dataset) - time_step):
        X.append(dataset[i:i+time_step, 0])
        Y.append(dataset[i+time_step, 0])
    return np.array(X), np.array(Y)

def build_lstm_model(time_step, hp):
    model = Sequential()
    model.add(LSTM(units=hp.Int('lstm_units_1', 32, 256, 32),
                   return_sequences=True,
                   input_shape=(time_step, 1)))
    model.add(Dropout(rate=hp.Float('dropout_1', 0.0, 0.5, 0.1)))
    model.add(LSTM(units=hp.Int('lstm_units_2', 32, 256, 32),
                   return_sequences=True))
    model.add(Dropout(rate=hp.Float('dropout_2', 0.0, 0.5, 0.1)))
    model.add(LSTM(units=hp.Int('lstm_units_3', 32, 256, 32)))
    model.add(Dropout(rate=hp.Float('dropout_3', 0.0, 0.5, 0.1)))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

def recursive_forecast_reg(model, last_window, steps):
    preds = []
    window = last_window.copy()
    for _ in range(steps):
        p = model.predict(window.reshape(1, -1))[0]
        preds.append(p)
        window = np.append(window[1:], p)
    return np.array(preds)


In [5]:
# Cell 2.5: Notebook Arguments Override
class Args: pass
args = Args()
# If the CSVs live alongside this notebook, use:
args.train_file     = "train_data.csv"
args.test_file      = "test_data.csv"
args.output_file    = "forecast_petrol_prices.csv"
args.start_date     = "1990-01-01"
args.end_date       = "2025-05-05"
args.forecast_start = "2025-05-05"
args.forecast_end   = "2025-05-30"


In [6]:
# %%
# Cell 3: Main Script - Load and Preprocess Data
# (args defined in Cell 2.5)
import glob
# Auto-detect the correct training file by scanning for CSV/TXT with 'Petrol (USD)' column
search_dir = os.path.dirname(args.train_file) or os.getcwd()
candidates = glob.glob(os.path.join(search_dir, '*.csv')) + glob.glob(os.path.join(search_dir, '*.txt'))
train_path = None
for file in candidates:
    try:
        temp_df = pd.read_csv(file, nrows=5)
        if 'Petrol (USD)' in temp_df.columns or 'Petrol' in temp_df.columns:
            train_path = file
            print(f"Detected training file: {train_path}")
            break
    except Exception:
        continue
if not train_path:
    raise FileNotFoundError(f"No suitable training file found in {search_dir}")

# Load training data
df = pd.read_csv(train_path)
# Rename and prepare
if 'Petrol (USD)' in df.columns:
    df = df.rename(columns={'Petrol (USD)': 'Petrol'})
elif 'Petrol' in df.columns:
    df = df.rename(columns={'Petrol': 'Petrol'})
else:
    raise KeyError("Dataset does not contain 'Petrol' column.")
df = df.dropna(subset=['Petrol'])
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.sort_values('Date')
mask = (df['Date'] >= args.start_date) & (df['Date'] <= args.end_date)
df = df.loc[mask].reset_index(drop=True)

# Extract & scale
data = df['Petrol'].values.reshape(-1, 1)
scaler = MinMaxScaler((0, 1))
data_scaled = scaler.fit_transform(data)

# Create lagged features
time_step = 60
X_lstm, y_lstm = create_dataset(data_scaled, time_step)
X_lstm = X_lstm.reshape(-1, time_step, 1)
X_reg, y_reg = create_dataset(data_scaled, time_step)

# %%
time_step = 60
X_lstm, y_lstm = create_dataset(data_scaled, time_step)
X_lstm = X_lstm.reshape(-1, time_step, 1)
X_reg, y_reg = create_dataset(data_scaled, time_step)

Detected training file: d:\Kabilan's\Final Year Project\train_data.csv


In [7]:
# %%
# Cell 4: Hyperparameter Tuning for LSTM
split = int(len(X_lstm) * 0.9)
tuner = kt.RandomSearch(lambda hp: build_lstm_model(time_step, hp), 'val_loss', max_trials=10, executions_per_trial=1, directory='kt_dir', project_name='petrol')
tuner.search(X_lstm[:split], y_lstm[:split], validation_data=(X_lstm[split:], y_lstm[split:]), epochs=50, batch_size=32)
best_lstm = tuner.get_best_models(1)[0]


Reloading Tuner from kt_dir\petrol\tuner0.json



  super().__init__(**kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


In [8]:
# %%
# Cell 5: Train Base LSTM & Forecast
best_lstm.fit(X_lstm, y_lstm, epochs=50, batch_size=32)
forecast_dates = pd.date_range(start=args.forecast_start, end=args.forecast_end)
steps = len(forecast_dates)
seq = data_scaled[-time_step:].copy()
lstm_preds = []
for _ in range(steps):
    p = best_lstm.predict(seq.reshape(1, time_step,1))[0,0]
    lstm_preds.append(p)
    seq = np.append(seq, [[p]], axis=0)[1:]
lstm_forecast = scaler.inverse_transform(np.array(lstm_preds).reshape(-1,1))

Epoch 1/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 113ms/step - loss: 0.0012
Epoch 2/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 105ms/step - loss: 4.8376e-04
Epoch 3/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 111ms/step - loss: 6.2582e-04
Epoch 4/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 122ms/step - loss: 4.9468e-04
Epoch 5/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 110ms/step - loss: 4.9803e-04
Epoch 6/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 115ms/step - loss: 5.5666e-04
Epoch 7/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 132ms/step - loss: 4.9698e-04
Epoch 8/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 124ms/step - loss: 6.2544e-04
Epoch 9/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 158ms/step - loss: 5.0114e-04
Epoch 10/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [9]:
# %%
# Cell 6: ARIMA, Prophet & Regression Forecasts
# ARIMA
arima = ARIMA(df['Petrol'], order=(5,1,0)).fit()
arima_f = arima.forecast(steps=steps).values.reshape(-1,1)
# Prophet
df_prop = df.rename(columns={'Date':'ds','Petrol':'y'})[['ds','y']]
prophet = Prophet().fit(df_prop)
fut = prophet.make_future_dataframe(periods=steps)
prophet_f = prophet.predict(fut)['yhat'][-steps:].values.reshape(-1,1)
# Regression Models
models = {
    'XGB': XGBRegressor(n_estimators=100).fit(X_reg, y_reg),
    'RF': RandomForestRegressor(n_estimators=100).fit(X_reg, y_reg),
    'SVR': SVR().fit(X_reg, y_reg.ravel()),
    'LR': LinearRegression().fit(X_reg, y_reg)
}
last_win = data_scaled[-time_step:].flatten()
reg_preds = {name: scaler.inverse_transform(recursive_forecast_reg(m, last_win, steps).reshape(-1,1)) for name,m in models.items()}

18:51:30 - cmdstanpy - INFO - Chain [1] start processing
18:51:31 - cmdstanpy - INFO - Chain [1] done processing


In [10]:
# %%
# Cell 7: Ensemble & Optional Test Evaluation
all_preds = np.hstack([lstm_forecast, arima_f, prophet_f] + list(reg_preds.values()))
ensemble = all_preds.dot(np.ones((len(models) + 3, 1)) / (len(models) + 3))

if args.test_file:
    # Auto-detect or skip missing test file
    test_path = args.test_file
    if not os.path.isfile(test_path):
        base = os.path.basename(test_path).split('.')[0]
        search_dir = os.path.dirname(test_path) or os.getcwd()
        candidates = glob.glob(os.path.join(search_dir, f"{base}*.csv"))
        if candidates:
            print(f"Auto-detected test file: {candidates[0]}")
            test_path = candidates[0]
        else:
            print(f"Test file not found at {test_path}. Skipping test evaluation.")
            test_path = None
    if test_path:
        test_df = pd.read_csv(test_path)
        # Rename and prepare test data
        if 'Petrol (USD)' in test_df.columns:
            test_df = test_df.rename(columns={'Petrol (USD)': 'Petrol'})
        elif 'Prediction' in test_df.columns:
            # use 'Prediction' as actual petrol values
            test_df = test_df.rename(columns={'Prediction': 'Petrol'})
        elif 'Petrol' not in test_df.columns:
            raise KeyError("Test data does not contain 'Petrol' or 'Prediction' column.")
        test_df = test_df.dropna(subset=['Petrol'])
        test_df['Date'] = pd.to_datetime(test_df['Date'], errors='coerce')
        vals = test_df.sort_values('Date')['Petrol'].values.reshape(-1, 1)
        scaled_vals = scaler.transform(vals)
        X_t, y_t = create_dataset(scaled_vals, time_step)
        rmse = lambda preds: np.sqrt(mean_squared_error(vals[time_step:], preds.flatten()))
        print('LSTM Test RMSE:', rmse(scaler.inverse_transform(best_lstm.predict(X_t.reshape(-1, time_step, 1)))))
        for nm, m in models.items():
            pr = scaler.inverse_transform(
                recursive_forecast_reg(m, scaled_vals[-time_step:].flatten(), len(y_t)).reshape(-1, 1)
            )
            print(f"{nm} Test RMSE: {rmse(pr):.3f}")
# %%():
            pr = scaler.inverse_transform(
                recursive_forecast_reg(m, scaled_vals[-time_step:].flatten(), len(y_t)).reshape(-1, 1)
            )
            print(f"{nm} Test RMSE: {rmse(pr):.3f}")


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step
LSTM Test RMSE: 0.129178012917996
XGB Test RMSE: 0.191
XGB Test RMSE: 0.191
RF Test RMSE: 0.198
RF Test RMSE: 0.198
SVR Test RMSE: 0.331
SVR Test RMSE: 0.331
LR Test RMSE: 0.330
LR Test RMSE: 0.330


In [11]:
# %%
# Cell 8: Interactive Plot & Table
# Prepare in-sample LSTM series for plotting
train_pred   = scaler.inverse_transform(best_lstm.predict(X_lstm))
train_dates  = df['Date'][time_step:]

fig=go.Figure()
fig.add_trace(go.Scatter(x=df['Date'],y=data.flatten(),mode='lines',name='Original'))
fig.add_trace(go.Scatter(x=train_dates,y=train_pred.flatten(),mode='lines',name='LSTM In-Sample'))
fig.add_trace(go.Scatter(x=forecast_dates,y=ensemble.flatten(),mode='lines',name='Ensemble Forecast'))
fig.update_layout(
    title='Petrol Price Prediction',
    xaxis=dict(rangeselector=dict(buttons=[dict(count=1,label='1m',step='month',stepmode='backward'),dict(count=6,label='6m',step='month',stepmode='backward'),dict(count=1,label='Y',step='year',stepmode='backward'),dict(count=5,label='5y',step='year',stepmode='backward'),dict(step='all')]),rangeslider=dict(visible=True),type='date'),
    yaxis=dict(title='Petrol Price (USD)')
)
fig.show()

forecast_df=pd.DataFrame({'Date':forecast_dates,'Predicted Petrol Price (USD)':ensemble.flatten()})
forecast_df.to_csv(args.output_file,index=False)
print("\nEnsemble Forecast Table:")
print(forecast_df.to_string(index=False))

[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step



Ensemble Forecast Table:
      Date  Predicted Petrol Price (USD)
2025-05-05                      3.133849
2025-05-06                      3.150037
2025-05-07                      3.161333
2025-05-08                      3.168448
2025-05-09                      3.177949
2025-05-10                      3.189814
2025-05-11                      3.203545
2025-05-12                      3.215075
2025-05-13                      3.224586
2025-05-14                      3.234211
2025-05-15                      3.244340
2025-05-16                      3.255197
2025-05-17                      3.267048
2025-05-18                      3.279953
2025-05-19                      3.288173
2025-05-20                      3.298700
2025-05-21                      3.308473
2025-05-22                      3.317628
2025-05-23                      3.327757
2025-05-24                      3.335473
2025-05-25                      3.342421
2025-05-26                      3.349105
2025-05-27                     