In [16]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import eia

# Set plotting style
sns.set(style='whitegrid')

In [17]:
# Load the data from the text file
data = pd.read_csv('data/raw/PET/PET.txt', sep='\t', header=None, names=['json_str'])

# Function to parse JSON strings
def parse_json_str(json_str):
    return json.loads(json_str)

# Apply the function to parse the JSON strings
parsed_data = data['json_str'].apply(parse_json_str)

# Create a DataFrame from the parsed JSON data
df = pd.json_normalize(parsed_data)

# Display the first few rows of the DataFrame
print(df.head())

                       series_id  \
0  PET.EMM_EPMPR_PTE_Y35NY_DPG.W   
1  PET.EMM_EPMPR_PTE_Y44HO_DPG.W   
2  PET.EMM_EPMMR_PTE_R5XCA_DPG.W   
3  PET.EMM_EPMMR_PTE_Y05LA_DPG.W   
4  PET.EMM_EPMMR_PTE_Y05SF_DPG.W   

                                                name               units  f  \
0  New York Harbor Premium Reformulated Retail Ga...  Dollars per Gallon  W   
1  Houston, TX Premium Reformulated Retail Gasoli...  Dollars per Gallon  W   
2  West Coast (PADD 5) Except California Midgrade...  Dollars per Gallon  W   
3  Los Angeles, CA Midgrade Reformulated Retail G...  Dollars per Gallon  W   
4  San Francisco, CA Midgrade Reformulated Retail...  Dollars per Gallon  W   

  unitsshort                                        description copyright  \
0      $/gal  New York Harbor Premium Reformulated Retail Ga...      None   
1      $/gal  Houston, TX Premium Reformulated Retail Gasoli...      None   
2      $/gal  West Coast (PADD 5) Except California Midgrade...      None   


In [18]:
df.columns

Index(['series_id', 'name', 'units', 'f', 'unitsshort', 'description',
       'copyright', 'source', 'iso3166', 'geography', 'start', 'end',
       'last_updated', 'data', 'geography2', 'category_id',
       'parent_category_id', 'notes', 'childseries'],
      dtype='object')

In [19]:
# Sample this list to see ten random elements
df_list = df['data'].tolist()
print(df_list[:2])

[[['20240617', 4.306], ['20240610', 4.328], ['20240603', 4.367], ['20240527', 4.412], ['20240520', 4.399], ['20240513', 4.434], ['20240506', 4.457], ['20240429', 4.469], ['20240422', 4.472], ['20240415', 4.224], ['20240408', 4.177], ['20240401', 4.152], ['20240325', 4.185], ['20240318', 4.108], ['20240311', 4.098], ['20240304', 4.101], ['20240226', 4.067], ['20240219', 4.085], ['20240212', 4.032], ['20240205', 4.031], ['20240129', 4.041], ['20240122', 3.988], ['20240115', 3.986], ['20240108', 4.038], ['20240101', 4.053], ['20231225', 4.074], ['20231218', 4.059], ['20231211', 4.127], ['20231204', 4.2], ['20231127', 4.231], ['20231120', 4.226], ['20231113', 4.236], ['20231106', 4.295], ['20231030', 4.346], ['20231023', 4.384], ['20231016', 4.444], ['20231009', 4.473], ['20231002', 4.529], ['20230925', 4.568], ['20230918', 4.632], ['20230911', 4.561], ['20230904', 4.544], ['20230828', 4.571], ['20230821', 4.58], ['20230814', 4.576], ['20230807', 4.576], ['20230731', 4.538], ['20230724', 4

In [20]:
# Explode the 'data' column to separate rows for each date-value pair
df = df.explode('data')
df.head()

Unnamed: 0,series_id,name,units,f,unitsshort,description,copyright,source,iso3166,geography,start,end,last_updated,data,geography2,category_id,parent_category_id,notes,childseries
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240617, 4.306]",,,,,
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240610, 4.328]",,,,,
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240603, 4.367]",,,,,
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240527, 4.412]",,,,,
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240520, 4.399]",,,,,


In [21]:
# Select distinct series_id, name, units
df_series = df[['series_id', 'name', 'units']].drop_duplicates()
df_series.head()

Unnamed: 0,series_id,name,units
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon
1,PET.EMM_EPMPR_PTE_Y44HO_DPG.W,"Houston, TX Premium Reformulated Retail Gasoli...",Dollars per Gallon
2,PET.EMM_EPMMR_PTE_R5XCA_DPG.W,West Coast (PADD 5) Except California Midgrade...,Dollars per Gallon
3,PET.EMM_EPMMR_PTE_Y05LA_DPG.W,"Los Angeles, CA Midgrade Reformulated Retail G...",Dollars per Gallon
4,PET.EMM_EPMMR_PTE_Y05SF_DPG.W,"San Francisco, CA Midgrade Reformulated Retail...",Dollars per Gallon


In [22]:
# Filter df to only include name containing 'Louisiana' and units in Dollars per Gallon
df = df[df['name'].str.contains('Louisiana') & df['units'].str.contains('Dollars per Gallon')]
df

Unnamed: 0,series_id,name,units
91197,PET.EMA_EPMPR_PBS_SLA_DPG.M,Louisiana Reformulated Gasoline Premium Bulk S...,Dollars per Gallon
91248,PET.EMA_EPMPU_PDS_SLA_DPG.M,Louisiana Conventional Gasoline Premium DTW Sa...,Dollars per Gallon
91283,PET.EMA_EPMPR_PTA_SLA_DPG.M,Louisiana Reformulated Gasoline Premium Retail...,Dollars per Gallon
91336,PET.EMA_EPMPU_PTC_SLA_DPG.M,Louisiana Conventional Gasoline Premium Throug...,Dollars per Gallon
91581,PET.EMA_EPMPR_PRA_SLA_DPG.M,Louisiana Reformulated Gasoline Premium Rack S...,Dollars per Gallon
...,...,...,...
177004,PET.EMA_EPM0X_PDS_SLA_DPG.M,Louisiana Oxygenated Gasoline DTW Sales Price ...,Dollars per Gallon
177017,PET.EMA_EPMRX_PTC_SLA_DPG.M,Louisiana Oxygenated Gasoline Regular Through ...,Dollars per Gallon
177215,PET.EMA_EPMMX_PTC_SLA_DPG.M,Louisiana Oxygenated Gasoline Midgrade Through...,Dollars per Gallon
177223,PET.EMA_EPMMX_PTA_SLA_DPG.M,Louisiana Oxygenated Gasoline Midgrade Retail ...,Dollars per Gallon


In [23]:
# Drop rows where 'data' is NaN or not a list
df = df.dropna(subset=['data'])
df = df[df['data'].apply(lambda x: isinstance(x, list) and len(x) == 2)]
df.head()

Unnamed: 0,series_id,name,units,f,unitsshort,description,copyright,source,iso3166,geography,start,end,last_updated,data,geography2,category_id,parent_category_id,notes,childseries
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240617, 4.306]",,,,,
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240610, 4.328]",,,,,
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240603, 4.367]",,,,,
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240527, 4.412]",,,,,
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240520, 4.399]",,,,,


In [24]:
# Split 'data' column into 'date' and 'value'
df[['date', 'value']] = pd.DataFrame(df['data'].tolist(), index=df.index)

In [25]:
df

Unnamed: 0,series_id,name,units,f,unitsshort,description,copyright,source,iso3166,geography,...,end,last_updated,data,geography2,category_id,parent_category_id,notes,childseries,date,value
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,...,20240617,2024-06-17T22:10:18-04:00,"[20240617, 4.306]",,,,,,20240617,4.306
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,...,20240617,2024-06-17T22:10:18-04:00,"[20240610, 4.328]",,,,,,20240610,4.328
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,...,20240617,2024-06-17T22:10:18-04:00,"[20240603, 4.367]",,,,,,20240603,4.367
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,...,20240617,2024-06-17T22:10:18-04:00,"[20240527, 4.412]",,,,,,20240527,4.412
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,...,20240617,2024-06-17T22:10:18-04:00,"[20240520, 4.399]",,,,,,20240520,4.399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177386,PET.METSNP51.M,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,Thousand Barrels,M,Mbbl,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,,"EIA, U.S. Energy Information Administration",,USA-AK+USA-AZ+USA-CA+USA-HI+USA-NV+USA-OR+USA-WA,...,200412,2013-08-13T11:49:51-04:00,"[200205, 1]",,,,,,200205,1.000
177386,PET.METSNP51.M,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,Thousand Barrels,M,Mbbl,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,,"EIA, U.S. Energy Information Administration",,USA-AK+USA-AZ+USA-CA+USA-HI+USA-NV+USA-OR+USA-WA,...,200412,2013-08-13T11:49:51-04:00,"[200204, 1]",,,,,,200204,1.000
177386,PET.METSNP51.M,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,Thousand Barrels,M,Mbbl,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,,"EIA, U.S. Energy Information Administration",,USA-AK+USA-AZ+USA-CA+USA-HI+USA-NV+USA-OR+USA-WA,...,200412,2013-08-13T11:49:51-04:00,"[200203, 1]",,,,,,200203,1.000
177386,PET.METSNP51.M,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,Thousand Barrels,M,Mbbl,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,,"EIA, U.S. Energy Information Administration",,USA-AK+USA-AZ+USA-CA+USA-HI+USA-NV+USA-OR+USA-WA,...,200412,2013-08-13T11:49:51-04:00,"[200202, 1]",,,,,,200202,1.000


In [26]:
# Convert 'date' and 'value' to the correct types
# Convert 'date' to datetime, coercing errors to NaT
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d', errors='coerce')
# Convert 'value' to a numeric type, coercing errors to NaN
df['value'] = pd.to_numeric(df['value'], errors='coerce')
df

Unnamed: 0,series_id,name,units,f,unitsshort,description,copyright,source,iso3166,geography,...,end,last_updated,data,geography2,category_id,parent_category_id,notes,childseries,date,value
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,...,20240617,2024-06-17T22:10:18-04:00,"[20240617, 4.306]",,,,,,2024-06-17,4.306
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,...,20240617,2024-06-17T22:10:18-04:00,"[20240610, 4.328]",,,,,,2024-06-10,4.328
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,...,20240617,2024-06-17T22:10:18-04:00,"[20240603, 4.367]",,,,,,2024-06-03,4.367
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,...,20240617,2024-06-17T22:10:18-04:00,"[20240527, 4.412]",,,,,,2024-05-27,4.412
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,...,20240617,2024-06-17T22:10:18-04:00,"[20240520, 4.399]",,,,,,2024-05-20,4.399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177386,PET.METSNP51.M,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,Thousand Barrels,M,Mbbl,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,,"EIA, U.S. Energy Information Administration",,USA-AK+USA-AZ+USA-CA+USA-HI+USA-NV+USA-OR+USA-WA,...,200412,2013-08-13T11:49:51-04:00,"[200205, 1]",,,,,,NaT,1.000
177386,PET.METSNP51.M,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,Thousand Barrels,M,Mbbl,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,,"EIA, U.S. Energy Information Administration",,USA-AK+USA-AZ+USA-CA+USA-HI+USA-NV+USA-OR+USA-WA,...,200412,2013-08-13T11:49:51-04:00,"[200204, 1]",,,,,,NaT,1.000
177386,PET.METSNP51.M,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,Thousand Barrels,M,Mbbl,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,,"EIA, U.S. Energy Information Administration",,USA-AK+USA-AZ+USA-CA+USA-HI+USA-NV+USA-OR+USA-WA,...,200412,2013-08-13T11:49:51-04:00,"[200203, 1]",,,,,,NaT,1.000
177386,PET.METSNP51.M,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,Thousand Barrels,M,Mbbl,West Coast (PADD 5) Ethane-Ethylene Stocks at ...,,"EIA, U.S. Energy Information Administration",,USA-AK+USA-AZ+USA-CA+USA-HI+USA-NV+USA-OR+USA-WA,...,200412,2013-08-13T11:49:51-04:00,"[200202, 1]",,,,,,NaT,1.000


In [27]:
# Drop the original 'data' column if no longer needed
df.drop(columns=['data'], inplace=True)

In [28]:
# Get distinct series_id, name pairs
series_names = df[['series_id', 'name']].drop_duplicates()
series_names

Unnamed: 0,series_id,name
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...
1,PET.EMM_EPMPR_PTE_Y44HO_DPG.W,"Houston, TX Premium Reformulated Retail Gasoli..."
2,PET.EMM_EPMMR_PTE_R5XCA_DPG.W,West Coast (PADD 5) Except California Midgrade...
3,PET.EMM_EPMMR_PTE_Y05LA_DPG.W,"Los Angeles, CA Midgrade Reformulated Retail G..."
4,PET.EMM_EPMMR_PTE_Y05SF_DPG.W,"San Francisco, CA Midgrade Reformulated Retail..."
...,...,...
177382,PET.MGAEXP22.M,"Midwest (PADD 2) Exports of Aviation Gasoline,..."
177383,PET.MGAEXP31.M,Gulf Coast (PADD 3) Exports of Aviation Gasoli...
177384,PET.MGAEXP32.M,Gulf Coast (PADD 3) Exports of Aviation Gasoli...
177385,PET.MGAEXP41.M,Rocky Mountain (PADD 4) Exports of Aviation Ga...


In [None]:
# Split the 'data' column into separate columns
df[['date', 'value']] = pd.DataFrame(df['data'].tolist(), index=df.index)

# Convert 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')

# Display the first few rows of the DataFrame
df.head()

In [None]:
# Plot time series data for different series
plt.figure(figsize=(14, 7))
for series in df['series_id'].unique():
    subset = df[df['series_id'] == series]
    plt.plot(subset['date'], subset['value'], label=subset['name'].iloc[0])

plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.title('Oil Prices Over Time')
plt.legend()
plt.show()

# Summary statistics
print(df.describe())

In [None]:
# Filter the dataset for a specific series_id for forecasting
series_id = 'PET.EMM_EPMPR_PTE_Y35NY_DPG.W'  # Example series_id
df_series = df[df['series_id'] == series_id][['date', 'value']].rename(columns={'date': 'ds', 'value': 'y'})

# Display the first few rows of the filtered DataFrame
print(df_series.head())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Assuming df_series is a DataFrame with two columns: 'ds' for dates and 'y' for values
# Convert 'ds' from datetime to ordinal
df_series['ds_ordinal'] = df_series['ds'].apply(lambda x: x.toordinal())

# Split data into features and target
X = df_series[['ds_ordinal']]
y = df_series['y']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict using the model
y_pred = model.predict(X_test)

# To forecast future dates, create a DataFrame for future dates and convert to ordinal
future_dates = pd.date_range(start=df_series['ds'].max(), periods=365, freq='D')
future_dates_ordinal = future_dates.to_series().apply(lambda x: x.toordinal()).values.reshape(-1, 1)

# Forecast future values
future_forecast = model.predict(future_dates_ordinal)

# Plot the forecast
plt.figure(figsize=(10, 6))
plt.scatter(df_series['ds'], y, color='black', label='Actual')
plt.plot(future_dates, future_forecast, color='blue', label='Forecast')
plt.title('Oil Price Forecast')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.legend()
plt.show()

In [None]:
# Convert 'ds' to ordinal
df_series['ds_ordinal'] = df_series['ds'].apply(lambda x: x.toordinal())

# Split data into training and testing sets
train = df_series[df_series['ds'] < '2023-01-01']
test = df_series[df_series['ds'] >= '2023-01-01']

# Prepare the features and target variables
X_train = train[['ds_ordinal']]
y_train = train['y']
X_test = test[['ds_ordinal']]
y_test = test['y']

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate MAE and RMSE
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f'MAE: {mae}')
print(f'RMSE: {rmse}')

# Plot actual vs predicted values
plt.figure(figsize=(14, 7))
plt.plot(test['ds'], y_test, label='Actual')
plt.plot(test['ds'], y_pred, label='Predicted')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.title('Actual vs Predicted Oil Prices')
plt.legend()
plt.show()

In [None]:
# # Save the model
# import joblib
# joblib.dump(model, 'models/oil_price_forecast_model.pkl')

In [None]:
# Save the parsed DataFrame to a CSV file
#df_parsed.to_csv('parsed_petroleum_data.csv', index=False)