In [None]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA

# Apply seaborn's 'whitegrid' theme to every figure drawn below
# (set_theme is the preferred name; sns.set is an alias for it)
sns.set_theme(style='whitegrid')

In [None]:
# Load the data from the text file: one JSON document per line.
# The lines are read as plain text instead of going through the CSV parser, so
# a tab or quote character inside a line can never split or re-quote the JSON.
# Blank lines are skipped; 'utf-8-sig' also tolerates a leading byte-order mark.
with open('data/raw/PET/PET.txt', encoding='utf-8-sig') as raw_file:
    json_lines = [line.strip() for line in raw_file if line.strip()]

# Same shape as before: a single 'json_str' column holding the raw JSON text
data = pd.DataFrame({'json_str': json_lines})

# Helper that decodes a single JSON document
def parse_json_str(json_str):
    """Decode JSON text and return the resulting Python object.

    Parameters
    ----------
    json_str : str
        Text containing exactly one JSON document.

    Returns
    -------
    object
        Whatever ``json.loads`` produces for the text (dict, list, str,
        number, bool or None).

    Raises
    ------
    json.JSONDecodeError
        If the text is not valid JSON.
    """
    decoded = json.loads(json_str)
    return decoded

# Decode every raw JSON string into a Python object (one parsed record per row)
parsed_data = data['json_str'].apply(parse_json_str)

# Flatten the parsed records into a DataFrame: one row per record, one column
# per (possibly nested) key
df = pd.json_normalize(parsed_data)

# Preview the first few rows. Ending the cell with the expression (rather than
# print) lets the notebook render the frame with its rich table display.
df.head()

In [None]:
# Show the column labels of the flattened frame
df.columns

In [None]:
# Collect the 'data' column into a plain Python list and print its first two
# elements (a fixed slice, not a random sample) to inspect their structure
df_list = df['data'].tolist()
print(df_list[:2])

In [None]:
# Expand the list stored in 'data' so that every list element gets its own row;
# the remaining columns (and the index label) are repeated for each element.
# NOTE(review): the result is written back to `df`, so re-running this cell on
# the already exploded frame would explode the inner elements again.
exploded = df.explode(column='data')
df = exploded
df.head()

In [None]:
# Build a table of the unique (series_id, name, units) combinations
series_columns = ['series_id', 'name', 'units']
df_series = df.loc[:, series_columns].drop_duplicates()
df_series.head()

In [None]:
# Keep only rows whose name contains 'Louisiana Total' and whose units contain
# 'Dollars per Gallon'.
# - regex=False: both patterns are literal text, not regular expressions.
# - na=False: rows with a missing name or units count as "no match" instead of
#   putting NaN into the boolean mask.
# NOTE(review): more than one series may satisfy this filter (for example
# different products or reporting frequencies) — confirm that combining them
# in one frame is intended.
name_matches = df['name'].str.contains('Louisiana Total', regex=False, na=False)
units_match = df['units'].str.contains('Dollars per Gallon', regex=False, na=False)
df = df[name_matches & units_match]
df

In [None]:
# Discard rows with no 'data' entry, then keep only the rows whose 'data'
# entry is a two-element list
df = df.loc[df['data'].notna()]
is_pair = [isinstance(cell, list) and len(cell) == 2 for cell in df['data']]
df = df.loc[is_pair]
df.head()

In [None]:
# Split each two-element 'data' entry into separate 'date' and 'value' columns.
# Naming the columns up front keeps this working when the frame is empty (an
# unnamed empty frame would have zero columns and the assignment would fail).
pairs = pd.DataFrame(df['data'].tolist(), index=df.index, columns=['date', 'value'])

# assign() builds a new frame instead of writing into the filtered slice, which
# can trigger SettingWithCopyWarning; the values are passed as arrays so they
# are placed by position and the repeated index labels are never aligned.
df = df.assign(date=pairs['date'].to_numpy(), value=pairs['value'].to_numpy())
df

In [None]:
# Convert 'date' to datetime, coercing anything unparseable to NaT.
# The format is picked from the length of the date text, because forcing
# '%Y%m%d' onto a 6-character period either yields NaT or can mis-read it
# (e.g. '199412' can match as year 1994, month 1, day 2):
#   8 characters -> '%Y%m%d'  (unchanged from before)
#   6 characters -> '%Y%m'    (first day of that month)
#   anything else -> NaT      (unchanged from before)
# NOTE(review): assumes the source encodes monthly periods as 'YYYYMM' and
# daily/weekly dates as 'YYYYMMDD' — confirm against the data file.
date_text = df['date'].astype(str).str.strip()
date_len = date_text.str.len().to_numpy()
as_daily = pd.to_datetime(date_text, format='%Y%m%d', errors='coerce').to_numpy()
as_monthly = pd.to_datetime(date_text, format='%Y%m', errors='coerce').to_numpy()
# Plain arrays are combined by position, so repeated index labels do not matter
df['date'] = np.where(
    date_len == 8,
    as_daily,
    np.where(date_len == 6, as_monthly, np.datetime64('NaT')),
)

# Convert 'value' to a numeric type, coercing errors to NaN
df['value'] = pd.to_numeric(df['value'], errors='coerce')
df

In [None]:
# Reduce the frame to the two analysis columns, 'Date' and 'Price'.
# Rows whose date or price is missing (the earlier conversions coerce
# unparseable entries to NaT / NaN) are dropped here: they cannot be placed on
# the time axis, and the rolling mean and seasonal decomposition further down
# do not tolerate missing values. The result is ordered by date and given a
# fresh 0..n-1 index.
df = (
    df.assign(Date=pd.to_datetime(df['date']), Price=df['value'])
      [['Date', 'Price']]
      .dropna(subset=['Date', 'Price'])
      .sort_values(by='Date')
      .reset_index(drop=True)
)
df.tail()

In [None]:
# Line chart of the price series over time, drawn on an explicit Axes object
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(df['Date'], df['Price'], label='Price')
ax.set_title('Historical Gasoline Prices')
ax.set_xlabel('Date')
ax.set_ylabel('Price ($/gal)')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Moving average
# The window length is defined once so the computation and the legend label
# cannot drift apart. The window counts consecutive rows; the label calls them
# months, which assumes one observation per month — TODO confirm.
MA_WINDOW = 12

# The first MA_WINDOW - 1 rows have no full window and therefore stay NaN
df['Price_MA'] = df['Price'].rolling(window=MA_WINDOW).mean()

# Overlay the smoothed series (red) on the raw prices
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(df['Date'], df['Price'], label='Price')
ax.plot(df['Date'], df['Price_MA'], label=f'{MA_WINDOW}-Month Moving Average', color='red')
ax.set_title('Gasoline Prices with Moving Average')
ax.set_xlabel('Date')
ax.set_ylabel('Price ($/gal)')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Moving average: mean of each run of 12 consecutive rows, stored as 'Price_MA'
# NOTE(review): this cell computes and plots the same moving average as the
# preceding cell — consider deleting one of the two.
smoothed = df['Price'].rolling(12).mean()
df['Price_MA'] = smoothed

# Raw prices with the smoothed series drawn on top in red
ma_fig, ma_ax = plt.subplots(figsize=(14, 7))
ma_ax.plot(df['Date'], df['Price'], label='Price')
ma_ax.plot(df['Date'], df['Price_MA'], label='12-Month Moving Average', color='red')
ma_ax.set_title('Gasoline Prices with Moving Average')
ma_ax.set_xlabel('Date')
ma_ax.set_ylabel('Price ($/gal)')
ma_ax.legend()
ma_ax.grid(True)
plt.show()

In [None]:
# Decomposition
# Additive split of the price series into trend, seasonal and residual parts,
# using a cycle length of 12 observations.
# seasonal_decompose rejects input containing missing values, so rows without
# a date or price are removed first; indexing by 'Date' makes the component
# plots show calendar dates on the x-axis instead of row numbers.
price_series = df.dropna(subset=['Date', 'Price']).set_index('Date')['Price']
decompose_result = seasonal_decompose(price_series, model='additive', period=12)
decompose_result.plot()
plt.show()

In [None]:
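# TODO: Save the model once one has been trained.
# Disabled for now: no `model` object is defined in the cells shown here.
# import joblib
# joblib.dump(model, 'models/oil_price_forecast_model.pkl')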