In [61]:
# Import necessary libraries
import json
import os
from datetime import timedelta

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from pandas.tseries.offsets import MonthEnd
# Set plotting style: apply seaborn's 'whitegrid' style preset
sns.set(style='whitegrid')

In [62]:
# Read the raw file as a single string column: sep='\t' with one column name
# keeps each line intact in 'json_str'.
data = pd.read_csv(
    'data/raw/PET/PET.txt',
    sep='\t',
    header=None,
    names=['json_str'],
)


def parse_json_str(json_str):
    """Decode one JSON-encoded string into the corresponding Python object."""
    return json.loads(json_str)


# Decode every line of the file
parsed_data = data['json_str'].apply(parse_json_str)

# Flatten the decoded records into a tabular DataFrame
df = pd.json_normalize(parsed_data)

# Preview the first rows
df.head()

In [None]:
# List the column labels of the flattened frame
df.columns

In [None]:
# Explode the 'data' column to separate rows for each date-value pair.
#
# Guard against re-running this cell: explode() repeats the parent row's index
# label for every element, so a non-unique index means the frame was already
# exploded. Exploding a second time would split each [date, value] pair into two
# scalar rows, and the later "list of length 2" filter would then drop everything.
# (Assumes at least one series holds more than one observation; otherwise the
# index stays unique and the guard cannot tell the two states apart.)
if df.index.is_unique:
    df = df.explode('data')
df.head(10)

In [None]:
# One row per distinct (series_id, name, units, unitsshort) combination
df_series = df.loc[:, ['series_id', 'name', 'units', 'unitsshort']].drop_duplicates()
df_series.head(10)

In [None]:
# Keep only series that have both a 'series_id' and a 'units' value
df_series_nonas = df_series.dropna(subset=['series_id', 'units'], how='any')
df_series_nonas.head(10)

In [None]:
# Smallest and largest 'end' value recorded for each series name
date_range_all = (
    df
    .groupby('name')['end']
    .agg(['min', 'max'])
)
# Discard names for which either bound is missing
date_range_all_nonas = date_range_all.dropna(subset=['min', 'max'], how='any')
date_range_all_nonas.head(10)

In [None]:
# Count the distinct series (rows of the de-duplicated df_series) per unit of measure
df_series['units'].value_counts()

In [None]:
# Keep rows whose name contains 'Louisiana Total' and whose units contain
# 'Dollars per Gallon'.
# na=False: rows with a missing name/units are explicitly treated as non-matches
#           instead of relying on how NaN propagates through the `&`.
# regex=False: both patterns are plain literals, so skip regex interpretation.
df_louisiana = df[
    df['name'].str.contains('Louisiana Total', na=False, regex=False)
    & df['units'].str.contains('Dollars per Gallon', na=False, regex=False)
]
df_louisiana.head()

In [None]:
# Drop rows where 'data' is NaN or not a two-element list.
df_louisiana = df_louisiana.dropna(subset=['data'])
# astype(bool) keeps the mask boolean even when the frame is empty (apply() on an
# empty column returns an object-dtype result). The explicit copy() makes the
# result an independent frame, so the column assignments in the following cells
# do not write into a filtered slice (SettingWithCopyWarning).
df_louisiana = df_louisiana[
    df_louisiana['data']
    .apply(lambda x: isinstance(x, list) and len(x) == 2)
    .astype(bool)
].copy()
df_louisiana.head()

In [None]:
# Split 'data' column into 'date' and 'value'.
# The helper frame is built with df_louisiana's own index so the two new columns
# line up with the existing rows.
df_louisiana[['date', 'value']] = pd.DataFrame(df_louisiana['data'].tolist(), index=df_louisiana.index)
df_louisiana.head()

In [None]:
# Coerce both columns in one step: unparseable dates become NaT and non-numeric
# values become NaN.
# NOTE(review): only the 8-digit '%Y%m%d' layout is requested here; if some series
# use shorter period strings (e.g. monthly or annual), they will not be parsed as
# intended — confirm against the raw data.
df_louisiana = df_louisiana.assign(
    date=pd.to_datetime(df_louisiana['date'], format='%Y%m%d', errors='coerce'),
    value=pd.to_numeric(df_louisiana['value'], errors='coerce'),
)
df_louisiana.head()

In [None]:
# Summarize column dtypes and non-null counts after the conversions
df_louisiana.info()

In [None]:
# Derive the analysis columns ('Date', 'Price'), keep only the fields of interest,
# order the rows by date and renumber them from zero.
df_louisiana = (
    df_louisiana
    .assign(
        Date=lambda frame: pd.to_datetime(frame['date']),
        Price=lambda frame: frame['value'],
    )
    .loc[:, ['Date', 'Price', 'unitsshort', 'series_id', 'name', 'last_updated']]
    .sort_values(by='Date')
    .reset_index(drop=True)
)
df_louisiana.head()

In [None]:
# Earliest and latest 'Date' for every (name, series_id) combination
date_range_la = (
    df_louisiana
    .groupby(['name', 'series_id'])['Date']
    .agg(['min', 'max'])
)
date_range_la.head(10)

In [None]:
# Keep only the series for which both the earliest and latest date are known (not NaT)
date_range_la_nonas = date_range_la.dropna(subset=['min', 'max'], how='any')
date_range_la_nonas

In [None]:
# Load and prepare the dataset
def load_gas_price_data(filepath):
    """
    Load a raw text file holding one JSON document per line into a DataFrame.

    Parameters:
    - filepath: Path to the .txt file containing the raw data.

    Returns:
    - A DataFrame with one row per JSON document and one column per (flattened)
      JSON key. Nested list values (such as a 'data' list of observations) are
      kept as-is; use prepare_data() to expand them into 'date'/'value' rows.
      If the documents carry a top-level 'date' key, that column is converted
      to datetime (unparseable entries become NaT).
    """
    # sep='\t' with a single column name keeps every line as one string cell
    data = pd.read_csv(filepath, sep='\t', header=None, names=['json_str'])

    # Decode each line, then flatten the decoded records into columns.
    # tolist() hands json_normalize a plain list of records.
    parsed_data = data['json_str'].apply(json.loads)
    df = pd.json_normalize(parsed_data.tolist())

    # The 'date' key is optional: indexing df['date'] unconditionally raised
    # KeyError for files whose observations live inside a nested 'data' list.
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'], errors='coerce')

    return df

# Prepare data for long format and additional transformations
def _parse_period_strings(raw):
    """Parse 8-digit (YYYYMMDD), 6-digit (YYYYMM) and 4-digit (YYYY) period strings.

    Each entry is matched by its exact digit count and parsed with the matching
    format; anything else becomes NaT. Monthly and annual periods map to the
    first day of the period. Returns a datetime Series sharing raw's index.
    """
    # assumes the period strings follow the YYYY / YYYYMM / YYYYMMDD layouts —
    # TODO confirm against the raw file for every frequency in use
    text = raw.astype(str).str.strip()
    stamps = np.full(len(text), np.datetime64('NaT'), dtype='datetime64[ns]')
    text_values = text.to_numpy()
    for width, fmt in ((8, '%Y%m%d'), (6, '%Y%m'), (4, '%Y')):
        matches = text.str.fullmatch(r'\d{%d}' % width).to_numpy(dtype=bool)
        if matches.any():
            # Work on positional arrays: the caller's index may hold duplicate
            # labels (explode() repeats them), which label-based assignment
            # cannot align.
            stamps[matches] = pd.to_datetime(
                text_values[matches], format=fmt, errors='coerce'
            ).to_numpy()
    return pd.Series(stamps, index=raw.index)


def prepare_data(df, series_id="PET.EMA_EPM0_PBS_SLA_DPG.M"):
    """
    Prepare the dataset for analysis by performing several transformations.

    Parameters:
    - df: DataFrame to be transformed. Must have 'series_id', 'units' and a
      'data' column holding, per row, a list of [date, value] pairs.
    - series_id: The series ID to filter the DataFrame by. Default is "PET.EMA_EPM0_PBS_SLA_DPG.M".

    Returns:
    - A new DataFrame (the input is not modified) with one row per observation,
      sorted by 'date' and indexed 0..n-1. The 'data' column is replaced by:
        'date'         datetime; NaT when the period string is not recognized
        'value'        numeric; NaN when not convertible
        'log_price'    natural log of 'value'; NaN when 'value' is not positive
        'price_change' first difference of 'log_price'

    Raises:
    - ValueError: if no [date, value] observations exist for series_id.
    """
    # Filter based on series_id and non-NA 'units' column
    selected = df[df['series_id'] == series_id].dropna(subset=['units'])

    # Explode the 'data' column to separate rows for each date-value pair
    exploded = selected.explode('data').dropna(subset=['data'])

    # Keep only well-formed [date, value] pairs. astype(bool) keeps the mask
    # boolean even when there are no rows; copy() gives an independent frame so
    # the column assignments below do not write into a filtered slice.
    is_pair = exploded['data'].apply(
        lambda item: isinstance(item, list) and len(item) == 2
    ).astype(bool)
    pairs = exploded[is_pair].copy()

    if pairs.empty:
        # Previously this surfaced as "Columns must be same length as key"
        raise ValueError(
            f"No [date, value] observations found for series_id {series_id!r}"
        )

    # Split 'data' into 'date' and 'value'
    split = pd.DataFrame(
        pairs['data'].tolist(), index=pairs.index, columns=['date', 'value']
    )

    # A single '%Y%m%d' format cannot represent the 6-digit periods of monthly
    # series (the default series_id is monthly) or 4-digit annual ones, so parse
    # by digit count instead. Assign positionally (to_numpy) because the index
    # still carries duplicate labels at this point.
    pairs['date'] = _parse_period_strings(split['date']).to_numpy()
    pairs['value'] = pd.to_numeric(split['value'], errors='coerce').to_numpy()

    # Remove the 'data' column
    pairs = pairs.drop(columns=['data'])

    # Chronological order with a clean 0..n-1 index (explode() left every row
    # carrying its parent's label); a stable sort keeps tied dates in input order.
    pairs = pairs.sort_values('date', kind='mergesort').reset_index(drop=True)

    # Log price and its first difference. Non-positive prices are masked to NaN
    # first so the log does not produce -inf.
    positive_values = pairs['value'].where(pairs['value'] > 0)
    pairs['log_price'] = np.log(positive_values)
    pairs['price_change'] = pairs['log_price'].diff()

    return pairs

# Function to perform seasonal ARIMA (SARIMAX) forecasting with a fixed model order
def forecast_prices(df, cutoff_date, steps=13):
    """
    Fit a seasonal ARIMA model on log prices observed before a cutoff date and
    forecast the following months.

    Parameters:
    - df: DataFrame with a datetime 'date' column and a numeric 'value' column.
      It is not modified.
    - cutoff_date: Only rows with 'date' strictly before this date are used
      for fitting. Anything accepted by pd.to_datetime.
    - steps: Number of monthly periods to forecast. Default is 13.

    Returns:
    - DataFrame indexed 0..steps-1 with columns:
        'date'            month-end dates, starting with the month after the
                          last observation used for fitting
        'forecast_price'  forecast converted back from log scale

    Raises:
    - ValueError: if steps is smaller than 1, or no rows fall before cutoff_date.
    """
    if steps < 1:
        raise ValueError(f"steps must be at least 1, got {steps!r}")

    # Boolean indexing plus sort/reset yields a new frame, so nothing below
    # writes into a slice of the caller's data.
    cutoff = pd.to_datetime(cutoff_date)
    history = (
        df.loc[df['date'] < cutoff]
        .sort_values('date', kind='mergesort')
        .reset_index(drop=True)
    )
    if history.empty:
        raise ValueError(
            f"No observations before {cutoff_date!r}; cannot fit a forecasting model"
        )

    # Model log prices to stabilize variance
    log_price = np.log(history['value'])

    # Fixed-order seasonal ARIMA with a 12-period season
    model = SARIMAX(log_price, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
    results = model.fit()

    # First forecast period = month after the last observed one.
    # MonthEnd(0) rolls the last date forward to its own month end (a no-op when
    # it already is one); MonthEnd(1) then steps to the next month end. Adding
    # MonthEnd(1) alone labelled the first forecast with the *same* month
    # whenever the last observation was not dated on a month end.
    last_observed = history['date'].max()
    first_forecast = last_observed + MonthEnd(0) + MonthEnd(1)
    # Pass the offset object rather than the 'M' string alias (deprecated in
    # recent pandas releases).
    future_dates = pd.date_range(first_forecast, periods=steps, freq=MonthEnd())

    # Forecast on the log scale, then convert back to price level.
    # np.asarray drops the model's own index so both columns pair up by position.
    forecast_log = np.asarray(results.forecast(steps=steps), dtype=float)
    predicted = np.exp(forecast_log)

    return pd.DataFrame({
        'date': future_dates,
        'forecast_price': predicted,
    })

# Visualization function
def plot_forecast(df, forecast_df):
    """
    Overlay observed and forecasted prices in a single layered Altair chart.

    Parameters:
    - df: Data with 'date' and 'value' fields (drawn as a blue line with red points).
    - forecast_df: Data with 'date' and 'forecast_price' fields (drawn as a green line).

    Returns:
    - The layered chart combining the three marks.
    """
    # Shared encoding and size for both marks of the observed series
    observed = (
        alt.Chart(df)
        .encode(x='date:T', y='value:Q')
        .properties(width=700, height=600)
    )
    observed_line = observed.mark_line(color='blue', size=3)
    observed_points = observed.mark_point(color='red')

    predicted_line = (
        alt.Chart(forecast_df)
        .mark_line(color='green')
        .encode(x='date:T', y='forecast_price:Q')
    )

    return observed_line + observed_points + predicted_line

In [None]:
# Reload the raw file through the reusable loader. This rebinds `df`, replacing
# the frame built in the exploratory cells above.
df = load_gas_price_data('data/raw/PET/PET.txt')
df

In [None]:
# Build the per-observation frame for the default series_id of prepare_data()
df_prepared = prepare_data(df)
df_prepared.head()

In [None]:
# Forecast from the prepared frame: forecast_prices() reads the 'date' and 'value'
# columns, which only exist after prepare_data() has expanded the raw 'data' lists.
forecast_df = forecast_prices(df_prepared, '2021-01-01')
forecast_df.tail()

In [None]:
# Plot the prepared observations: the chart encodes 'date' and 'value', which are
# produced by prepare_data(), not by the raw loader.
plot_forecast(df_prepared, forecast_df)

In [None]:
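# TODO: save the fitted model. Left disabled: `model` is a local variable inside
# forecast_prices(), so that function would first have to return the fitted
# results before anything can be dumped from this cell.
# import joblib
# joblib.dump(model, 'models/oil_price_forecast_model.pkl')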