In [None]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA

# Set the seaborn 'whitegrid' style for the plots drawn in later cells
sns.set(style='whitegrid')

In [None]:
# Read the raw text file into a one-column frame: tab separator, no header
# row, and the single column labelled 'json_str'
data = pd.read_csv(
    'data/raw/PET/PET.txt',
    sep='\t',
    header=None,
    names=['json_str'],
)

# Helper that decodes one JSON document
def parse_json_str(json_str):
    """Decode a single JSON document and return the resulting Python object.

    Any exception raised by ``json.loads`` propagates to the caller unchanged.
    """
    decoded = json.loads(json_str)
    return decoded

# Decode every raw line into a Python object
parsed_data = data['json_str'].apply(parse_json_str)

# Flatten the decoded records into a DataFrame. A plain list is passed so the
# result always gets a fresh 0..n-1 index regardless of how the installed
# pandas version treats Series input.
df = pd.json_normalize(parsed_data.tolist())

# Preview the first rows. Leaving the frame as the cell's last expression uses
# the notebook's rich table rendering instead of print()'s plain-text dump.
df.head()

In [None]:
# List the column labels of the normalized frame
df.columns

In [None]:
# Explode the 'data' column so each element of a record's list gets its own row.
#
# The guard keeps this cell safe to re-run: before the explode each cell holds a
# list of nested lists, afterwards it holds a single inner list. Without the
# guard a second run would explode those inner lists into scalar rows.
holds_nested_pairs = df['data'].map(
    lambda cell: isinstance(cell, list) and any(isinstance(item, list) for item in cell)
)
if holds_nested_pairs.any():
    # ignore_index=True gives every exploded row its own label instead of
    # repeating the parent record's label once per element.
    df = df.explode('data', ignore_index=True)
df.head()

In [None]:
# Build the catalogue of unique (series_id, name, units) combinations
df_series = df.loc[:, ['series_id', 'name', 'units']].drop_duplicates()
df_series.head()

In [None]:
# Tally how many catalogue rows use each unit
df_series.loc[:, 'units'].value_counts()

In [None]:
# Keep rows whose name contains 'Louisiana Total' and whose units contain
# 'Dollars per Gallon'.
# - regex=False: both are literal substring tests, not patterns.
# - na=False: a row with a missing name/units counts as a non-match, so each
#   mask is strictly boolean instead of carrying NaN into the `&`.
# NOTE(review): more than one series may satisfy this filter; if so their
# observations are mixed together downstream — confirm that is intended.
is_louisiana_total = df['name'].str.contains('Louisiana Total', regex=False, na=False)
is_price_per_gallon = df['units'].str.contains('Dollars per Gallon', regex=False, na=False)
df_louisiana = df[is_louisiana_total & is_price_per_gallon]
df_louisiana

In [None]:
# Discard rows with a missing 'data' entry, then keep only the rows whose
# 'data' entry is a list of exactly two elements
df_louisiana = (
    df_louisiana
    .dropna(subset=['data'])
    .pipe(
        lambda frame: frame[
            frame['data'].apply(lambda pair: isinstance(pair, list) and len(pair) == 2)
        ]
    )
)
df_louisiana.head()

In [None]:
# Split each two-element 'data' list into separate 'date' and 'value' columns.
#
# The helper frame names its columns explicitly so an empty selection still
# yields two (empty) columns rather than failing on a column-count mismatch.
pair_columns = pd.DataFrame(
    df_louisiana['data'].tolist(),
    index=df_louisiana.index,
    columns=['date', 'value'],
)
# assign() returns a new frame instead of writing into a filtered slice of
# `df`, which avoids chained-assignment warnings.
df_louisiana = df_louisiana.assign(
    date=pair_columns['date'],
    value=pair_columns['value'],
)
df_louisiana

In [None]:
# Convert 'date' to datetime and 'value' to numeric; anything unparseable
# becomes NaT / NaN.
parsed_dates = df_louisiana['date']
if not pd.api.types.is_datetime64_any_dtype(parsed_dates):
    # Only exact 8-digit codes are handed to the '%Y%m%d' parser. strptime-style
    # matching treats leading zeros as optional, so a shorter code (e.g. a
    # six-digit 'YYYYMM' such as '199412') could otherwise be read as a valid
    # but wrong day instead of being rejected.
    # NOTE(review): if the selected series are not all day-stamped, those rows
    # end up NaT here — confirm which frequency is wanted.
    date_text = parsed_dates.astype(str)
    is_full_date = date_text.str.fullmatch(r'\d{8}')
    parsed_dates = pd.to_datetime(
        date_text.where(is_full_date), format='%Y%m%d', errors='coerce'
    )
# The dtype check above keeps the cell re-runnable: an already-converted
# column is left untouched. assign() avoids mutating a filtered slice in place.
df_louisiana = df_louisiana.assign(
    date=parsed_dates,
    value=pd.to_numeric(df_louisiana['value'], errors='coerce'),
)
df_louisiana

In [None]:
# Reduce the frame to a tidy, chronologically ordered Date/Price table.
#
# - rename() instead of copying columns: a missing source column is ignored,
#   so re-running this cell after 'date'/'value' are gone no longer raises
#   KeyError.
# - rows whose date or price could not be parsed (NaT / NaN from the coercing
#   conversions) are dropped so they do not leak into the plots or the
#   rolling mean.
df_louisiana = (
    df_louisiana
    .rename(columns={'date': 'Date', 'value': 'Price'})
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .loc[:, ['Date', 'Price']]
    .dropna(subset=['Date', 'Price'])
    .sort_values(by='Date')
    .reset_index(drop=True)
)
df_louisiana.tail()

In [None]:
# Draw the price history as a single line chart
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(df_louisiana['Date'], df_louisiana['Price'], label='Price')
ax.set_title('Historical Gasoline Prices')
ax.set_xlabel('Date')
ax.set_ylabel('Price ($/gal)')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Moving average over a fixed number of consecutive observations.
# rolling(window=n) counts rows, not calendar time, so the legend label is
# derived from the window size rather than asserting a month-based span.
ma_window = 12
df_louisiana['Price_MA'] = df_louisiana['Price'].rolling(window=ma_window).mean()

# Overlay the smoothed series on the raw prices
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(df_louisiana['Date'], df_louisiana['Price'], label='Price')
ax.plot(
    df_louisiana['Date'],
    df_louisiana['Price_MA'],
    label=f'{ma_window}-Period Moving Average',
    color='red',
)
ax.set_title('Gasoline Prices with Moving Average')
ax.set_xlabel('Date')
ax.set_ylabel('Price ($/gal)')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Oil quantities: keep rows whose units contain "Million Barrels".
# regex=False makes this a literal substring test; na=False treats rows with
# missing units as non-matches so the mask is strictly boolean (a mask
# containing NaN cannot be used for boolean indexing).
df_oil = df[df['units'].str.contains('Million Barrels', regex=False, na=False)]
df_oil

In [None]:
# TODO: save the fitted model once one exists; no `model` is defined in the cells above.
# import joblib
# joblib.dump(model, 'models/oil_price_forecast_model.pkl')