In [1]:
import os
import pandas as pd
import numpy as np
import xgboost
import matplotlib.pyplot as plt
from xgboost import plot_importance
from sklearn import metrics

In [2]:
import pandas as pd
# Load the dataset
dataframe = pd.read_csv('Petrol CSV w Date.csv')
# Convert 'Date' column to DatetimeIndex with the correct format
dataframe['date'] = pd.to_datetime(dataframe['date'], format='%Y')
# Now you can work with the DataFrame as needed
print(dataframe.head())

        date  consumption
0 1980-01-01     542254.0
1 1981-01-01     589017.0
2 1982-01-01     644907.0
3 1983-01-01     690214.0
4 1984-01-01     754725.0


In [3]:
df1 = dataframe.copy()
del df1['date']
df1

Unnamed: 0,consumption
0,542254.0
1,589017.0
2,644907.0
3,690214.0
4,754725.0
5,783807.0
6,822735.0
7,874255.0
8,940358.0
9,989211.0


In [4]:
print(dataframe)

         date  consumption
0  1980-01-01   542254.000
1  1981-01-01   589017.000
2  1982-01-01   644907.000
3  1983-01-01   690214.000
4  1984-01-01   754725.000
5  1985-01-01   783807.000
6  1986-01-01   822735.000
7  1987-01-01   874255.000
8  1988-01-01   940358.000
9  1989-01-01   989211.000
10 1990-01-01  1069662.000
11 1991-01-01  1066161.000
12 1992-01-01  1116531.000
13 1993-01-01  1187036.000
14 1994-01-01  1180097.000
15 1995-01-01  1157853.000
16 1996-01-01  1220068.000
17 1997-01-01  1276282.000
18 1998-01-01  1300373.000
19 1999-01-01  1245994.000
20 2000-01-01  1259601.000
21 2001-01-01  1103762.000
22 2002-01-01  1092482.000
23 2003-01-01  1098342.000
24 2004-01-01  1243675.000
25 2005-01-01  1330538.000
26 2006-01-01  1187530.000
27 2007-01-01  1147886.000
28 2008-01-01  1459570.000
29 2009-01-01  1527953.000
30 2010-01-01  1935678.000
31 2011-01-01  2257122.000
32 2012-01-01  2764843.000
33 2013-01-01  3353134.000
34 2014-01-01  3877984.000
35 2015-01-01  4754295.000
3

In [5]:
df = df1[:44]
df

Unnamed: 0,consumption
0,542254.0
1,589017.0
2,644907.0
3,690214.0
4,754725.0
5,783807.0
6,822735.0
7,874255.0
8,940358.0
9,989211.0


In [6]:
split_date = 34
train = df.loc[df.index <= split_date].copy()
test = df.loc[df.index > split_date].copy()

In [7]:
len(train)

35

In [8]:
len(test)

9

In [9]:
df.shape

(44, 1)

In [11]:
def create_features(df, label=None):
    """
    Creates time series features from datetime index
    """
    # Ensure the index is datetime
    if not pd.api.types.is_datetime64_any_dtype(df.index):
        df.index = pd.to_datetime(df.index)
    
    # Create a 'date' column from the index
    df['date'] = df.index
    
    # Ensure 'date' column is datetime
    if not pd.api.types.is_datetime64_any_dtype(df['date']):
        df['date'] = pd.to_datetime(df['date'])

    # Extract datetime features
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.isocalendar().week  # Use isocalendar for week numbers

    X = df[['hour', 'dayofweek', 'quarter', 'month', 'year',
            'dayofyear', 'dayofmonth', 'weekofyear']]
    if label:
        y = df[label]
        return X, y
    return X


# Create features and labels for training data
X_train, y_train = create_features(train, label='consumption')

# Create features and labels for test data
X_test, y_test = create_features(test, label='consumption')

In [12]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
# Define a parameter grid for hyperparameter tuning
param_grid = {
 'max_depth': [3, 4, 5, 6],
 'learning_rate': [0.01, 0.02, 0.03],
 'n_estimators': [150, 200, 250],
 'colsample_bytree': [0.5, 0.7, 0.9]
}
# Create an XGBoost regressor with 'reg:squarederror' as the objective
xgb = XGBRegressor(objective='reg:squarederror')
# Create a GridSearchCV object with custom scoring functions
scoring = {
 'MSE': 'neg_mean_squared_error',
 'MAE': 'neg_mean_absolute_error',
 'MAPE': 'neg_mean_absolute_percentage_error',
 'RMSE': 'neg_root_mean_squared_error'
}
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring=scoring, cv=4, refit='MSE', verbose=1)
# Fit the grid search to your data
grid_search.fit(X_train, y_train)
# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_
# Create a new XGBoost model with the best hyperparameters
best_xgb = XGBRegressor(objective='reg:squarederror', **best_params,  early_stopping_rounds=50,)
# Fit the model with early stopping
best_xgb.fit(X_train, y_train,
 eval_set=[(X_train, y_train), (X_test, y_test)],

 verbose=False)
# Evaluate the model on the test data
y_pred = best_xgb.predict(X_test)
# Calculate the evaluation metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred) * 100
mpe = ((y_test - y_pred) / y_test).mean() * 100
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Best Hyperparameters:", best_params)
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("Mean Absolute Percentage Error (MAPE):", mape)
print("Mean Percentage Error (MPE):", mpe)
print("Root Mean Squared Error (RMSE):", rmse)

Fitting 4 folds for each of 108 candidates, totalling 432 fits
Best Hyperparameters: {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 150}
Mean Squared Error (MSE): 36771017388237.04
Mean Absolute Error (MAE): 5931803.984222222
Mean Absolute Percentage Error (MAPE): 81.17750591573298
Mean Percentage Error (MPE): 81.17750591573298
Root Mean Squared Error (RMSE): 6063911.063681346




In [13]:
X_train.head()

Unnamed: 0,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
1970-01-01 00:00:00.000000000,0,3,1,1,1970,1,1,1
1970-01-01 00:00:00.000000001,0,3,1,1,1970,1,1,1
1970-01-01 00:00:00.000000002,0,3,1,1,1970,1,1,1
1970-01-01 00:00:00.000000003,0,3,1,1,1970,1,1,1
1970-01-01 00:00:00.000000004,0,3,1,1,1970,1,1,1


In [14]:
y_train.head()

1970-01-01 00:00:00.000000000    542254.0
1970-01-01 00:00:00.000000001    589017.0
1970-01-01 00:00:00.000000002    644907.0
1970-01-01 00:00:00.000000003    690214.0
1970-01-01 00:00:00.000000004    754725.0
Name: consumption, dtype: float64

In [15]:
from xgboost import XGBRegressor
import numpy as np
xgb = XGBRegressor(objective='reg:linear', max_depth=4, learning_rate=0.03, n_estimators=250, colsample_bytree=0.7)
xgb
xgb.fit(X_train, y_train,
 eval_set=[(X_train, y_train), (X_test, y_test)],
 verbose=False)



In [16]:
plot_importance(xgb, height=0.8, color='blue');
plt.title("Feature Importance (For PETROLEUM ENERGY PRODUCTS CONSUMPTION BY FUEL-Motor Spirit Sector)")

ValueError: Booster.get_score() results in empty.  This maybe caused by having all trees as decision dumps.

In [17]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
# Create features from datetime index
def create_features(df):
    df['date'] = df.index
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.weekofyear

    X = df[['hour','dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]
    return X
# Define your training data
X_train = create_features(df)
y_train = df['domestic']
# Define your XGBoost model
xgb_model = XGBRegressor(objective='reg:squarederror', max_depth=4, learning_rate=0.03, n_estimators=250, colsample_bytree=0.7)
# Train your XGBoost model
xgb_model.fit(X_train, y_train)
# Create features for future dates (adjust this as needed)
future_dates = pd.date_range(start=df.index[-1], periods=36, freq='M') # Assuming you want to forecast 365 days into the future
df_future_dates = pd.DataFrame(index=future_dates)
X_test_future = create_features(df_future_dates)
# Predict future results
predicted_results_future = xgb_model.predict(X_test_future)
# Plot the predicted results
plt.figure(figsize=(13, 8))
plt.plot(predicted_results_future)
plt.title("Predicted Future Sale")
plt.ylabel("domestic ")
plt.legend(('Predicted',))
plt.show()
# Add the predicted results to your future dates DataFrame copy
df_future_dates_copy = df_future_dates.copy()
df_future_dates_copy['Prediction'] = predicted_results_future
# Concatenate the predicted results with your original data
Irr_all_future = pd.concat([df, df_future_dates_copy], sort=False)
# Plot the combined data and predictions
Overview_Complete_Data_And_Prediction_future = Irr_all_future[['domestic', 'Prediction']].plot(figsize=(15, 5))
plt.title("Combined Data and Predictions (Including Future)")
plt.ylabel("Demand")
plt.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = df.index


AttributeError: Can only use .dt accessor with datetimelike values

In [42]:
predicted_results_future

NameError: name 'predicted_results_future' is not defined

In [44]:
import numpy as np
import matplotlib.pyplot as plt

# Assuming you have a Pandas Series 'f1' from your previous calculations
f1 = predicted_results_future


# Reshape 'f1_array' to a 2D array with one column
f1 = f1.reshape(-1, 1)
f1

NameError: name 'predicted_results_future' is not defined

In [46]:
import pandas as pd
import numpy as np

# Assuming you have the 'forecasted_values_first' array
forecasted_values_first = np.array([[20022.764],
       [19343.422],
       [19761.19 ],
       [19615.47 ],
       [20679.953],
       [22981.383],
       [29576.69 ],
       [37005.305],
       [30973.584],
       [26211.174],
       [21430.709],
       [20825.102],
       [19873.13 ],
       [19622.541],
       [19761.19 ],
       [19597.936],
       [20696.848],
       [22964.389],
       [30202.451],
       [37700.363],
       [30972.018],
       [26248.355],
       [22060.287],
       [20825.102],
       [19875.209],
       [19644.533],
       [19624.412],
       [19424.512],
       [20723.473],
       [22936.963],
       [30402.096],
       [36687.18 ],
       [30581.762],
       [25799.37 ],
       [21960.703],
       [20340.5  ]], dtype=float)
# Create a DataFrame with the 'Projected Daily Demand' column
data = pd.DataFrame({'Projected Sale': forecasted_values_first.ravel()})

# Print the first few rows of the new dataset
print(data.head())

   Projected Sale
0       20022.764
1       19343.422
2       19761.190
3       19615.470
4       20679.953


In [48]:
import numpy as np
import matplotlib.pyplot as plt

# Plot only forecasted values
plt.figure(figsize=(15, 5))
plt.plot(f1, label='Projected Values', linestyle='-', color='purple')

# Add gridlines
plt.grid(True)

# Add legend
plt.legend()

# Show the plot
plt.show()

NameError: name 'f1' is not defined

<Figure size 1500x500 with 0 Axes>

In [50]:
f2 = df1[150:186]

# Add 'Forecasted_Values' column to 'f2' and populate it with forecasted values
f2['Projected Sale'] = data['Projected Sale'].values
f2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f2['Projected Sale'] = data['Projected Sale'].values


Unnamed: 0,PETROLEUM ENERGY PRODUCTS CONSUMPTION BY FUEL-Motor Spirit,Projected Sale
0,,20022.764
1,,19343.422
2,,19761.19
3,,19615.47
4,,20679.953
5,,22981.383
6,,29576.69
7,,37005.305
8,,30973.584
9,,26211.174


In [52]:
import matplotlib.pyplot as plt

# Assuming 'f2' is a DataFrame containing 'Forecasted_Values' and 'Demand' columns

# Calculate MAPE and MPE
mape = ((f2['domestic'] - f2['Projected Sale']) / f2['domestic']).abs().mean() * 100
mpe = ((f2['domestic'] - f2['Projected Sale']) / f2['domestic']).mean() * 100

# Plot the 'Forecasted_Values' and 'Demand' series
plt.figure(figsize=(9, 3))
plt.plot(f2.index, f2['Projected Sale'], label='Projected Sale', linestyle='-')
plt.plot(f2.index, f2['domestic'], label='domestic', linestyle='-')

# Add labels and legend
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()

# Add MAPE and MPE as text annotations with adjusted positions
mape_x = f2.index[10]  # Adjust the x-coordinate for MAPE
mape_y = f2['domestic'].max() - 1500  # Adjust the y-coordinate for MAPE
plt.text(mape_x, mape_y, f'MAPE: {mape:.2f}%', fontsize=16, color='red')

mpe_x = f2.index[20]  # Adjust the x-coordinate for MPE
mpe_y = f2['domestic'].max() - 1500  # Adjust the y-coordinate for MPE
plt.text(mpe_x, mpe_y, f'MPE: {mpe:.2f}%', fontsize=16, color='blue')

# Show the plot
plt.grid(True)
plt.tight_layout()
plt.show()

KeyError: 'domestic'