In [1]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import eia

# Set plotting style: apply seaborn's 'whitegrid' style for the figures
# drawn later in the notebook
sns.set(style='whitegrid')

In [None]:
# Read the raw PET text file into a single-column frame named 'json_str'.
# The file has no header row and fields are split on tab characters
# (presumably each line holds one JSON document -- confirm against the file).
data = pd.read_csv(
    'data/raw/PET/PET.txt',
    names=['json_str'],
    header=None,
    sep='\t',
)

# Helper that decodes one raw JSON string
def parse_json_str(json_str):
    """Decode a JSON-encoded string into the corresponding Python object.

    Parameters
    ----------
    json_str : str
        Text containing a single JSON document.

    Returns
    -------
    object
        Whatever ``json.loads`` produces for the text (dict, list, str, ...).

    Raises
    ------
    json.JSONDecodeError
        If the text is not valid JSON.
    """
    decoded = json.loads(json_str)
    return decoded

# Decode every raw string in 'json_str' with parse_json_str
parsed_data = (
    data['json_str']
    .apply(parse_json_str)
)

# Flatten the decoded records into a tabular DataFrame
df = pd.json_normalize(parsed_data)

# Preview the first five rows as plain text
print(df.head(5))

In [None]:
# Inspect the column labels of the normalized frame
df.columns

In [None]:
# Convert the 'data' column to a plain Python list and print its first two
# entries (a head-of-list peek, not a random sample)
df_list = df['data'].tolist()
print(df_list[:2])

In [None]:
# Explode the 'data' column so that every element of a row's list becomes its
# own row (presumably one [date, value] pair per element -- the cells below
# rely on that shape).
# explode() repeats the parent row's index label for each element, so the
# index is rebuilt to keep labels unique for later index-aligned assignments.
# NOTE: this cell is not idempotent -- running it a second time would explode
# the pairs themselves. Re-run the loading cell before re-running this one.
df = df.explode('data').reset_index(drop=True)
df.head()

In [None]:
# Build a lookup of the unique (series_id, name, units) combinations
df_series = df.loc[:, ['series_id', 'name', 'units']].drop_duplicates()
df_series.head()

In [None]:
# Keep only the rows whose name contains 'Louisiana Total' and whose units
# contain 'Dollars per Gallon'.
#   regex=False -> the patterns are matched as plain substrings
#   na=False    -> rows with a missing name/units are explicitly non-matching
#                  instead of relying on how NaN is coerced inside `&`
name_matches = df['name'].str.contains('Louisiana Total', regex=False, na=False)
units_match = df['units'].str.contains('Dollars per Gallon', regex=False, na=False)
df = df[name_matches & units_match]
df

In [None]:
# Keep only rows whose 'data' entry is a two-element list. The isinstance
# check already rejects NaN, so no separate dropna pass is needed.
# astype(bool) guarantees a boolean mask even when df is empty (an empty
# object-dtype result would otherwise be treated as a column selector).
# .copy() makes df an independent frame so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
is_pair = df['data'].apply(lambda x: isinstance(x, list) and len(x) == 2).astype(bool)
df = df[is_pair].copy()
df.head()

In [None]:
# Split each two-element 'data' entry into separate 'date' and 'value' columns.
# The column names are passed explicitly so the helper frame still has two
# columns when df is empty; without them an empty list yields a zero-column
# frame and the two-column assignment fails.
pairs = pd.DataFrame(df['data'].tolist(), index=df.index, columns=['date', 'value'])
df[['date', 'value']] = pairs
df

In [None]:
# Show the current state of df
df

In [None]:
# Parse 'date' as YYYYMMDD timestamps; anything that does not fit becomes NaT.
# NOTE(review): entries that are not 8-digit dates (e.g. a monthly 'YYYYMM'
# form, if the selected series use one) would be coerced or misparsed here --
# confirm the date format of the filtered series.
parsed_dates = pd.to_datetime(df['date'], format='%Y%m%d', errors='coerce')
# Cast 'value' to a numeric dtype; anything non-numeric becomes NaN
numeric_values = pd.to_numeric(df['value'], errors='coerce')

df['date'] = parsed_dates
df['value'] = numeric_values
df

In [None]:
# Add analysis-friendly 'Date' / 'Price' aliases and order the rows by date.
# All existing columns are kept on purpose: the cells below drop 'data', list
# (series_id, name) pairs, plot per series_id using 'date'/'value'/'name', and
# filter on series_id. Narrowing df to ['Date', 'Price'] here made every one
# of those cells fail with a KeyError.
df['Date'] = pd.to_datetime(df['date'])
df['Price'] = df['value']
df = df.sort_values(by='Date').reset_index(drop=True)

df.tail()

In [None]:
# Drop the raw 'data' column if it is still present.
# errors='ignore' plus rebinding (instead of inplace=True) keeps this cell
# re-runnable: a second run, or a df that no longer has 'data', is a no-op
# rather than a KeyError.
df = df.drop(columns=['data'], errors='ignore')
df

In [None]:
# Unique (series_id, name) combinations present in df
series_names = df.loc[:, ['series_id', 'name']].drop_duplicates()
series_names

In [None]:
# Draw one line per series_id, labelled with the first 'name' of that series
fig, ax = plt.subplots(figsize=(14, 7))
for current_id in df['series_id'].unique():
    rows = df.loc[df['series_id'] == current_id]
    ax.plot(rows['date'], rows['value'], label=rows['name'].iloc[0])

ax.set_xlabel('Date')
ax.set_ylabel('Price (Dollars per Gallon)')
ax.set_title('Oil Prices Over Time')
ax.legend()
plt.show()

# Descriptive statistics of df, printed as plain text
print(df.describe())

In [None]:
# Choose the series to forecast.
series_id = 'PET.EMM_EPMPR_PTE_Y35NY_DPG.W'  # Example series_id

# df was narrowed by the earlier name/units filter, so the example id may not
# be among the remaining series. An empty selection would only surface later
# as an opaque model-fitting error, so fall back to the first available
# series and say so.
available_ids = df['series_id'].dropna().unique()
if len(available_ids) == 0:
    raise ValueError('df contains no series to forecast; check the filtering cells above.')
if series_id not in available_ids:
    print(f'series_id {series_id!r} not found in df; using {available_ids[0]!r} instead.')
    series_id = available_ids[0]

# Keep only the date/value columns of that series, renamed to 'ds' / 'y'
df_series = (
    df.loc[df['series_id'] == series_id, ['date', 'value']]
    .rename(columns={'date': 'ds', 'value': 'y'})
)

# Display the first few rows of the filtered DataFrame
print(df_series.head())

In [None]:
# Fit a linear trend (price vs. calendar-day ordinal) and extrapolate it 365
# days past the last observation.
# df_series is expected to have two columns: 'ds' (datetime) and 'y' (numeric).

# Rows with a NaT date or NaN price (possible after the errors='coerce'
# conversions) cannot be turned into ordinals or fitted, so drop them first.
df_series = df_series.dropna(subset=['ds', 'y']).copy()

# Convert 'ds' from datetime to ordinal
df_series['ds_ordinal'] = df_series['ds'].apply(lambda x: x.toordinal())

# Split data into features and target
X = df_series[['ds_ordinal']]
y = df_series['y']

# Split the data into training and testing sets.
# NOTE: this is a random (shuffled) split, not a chronological hold-out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict using the model
y_pred = model.predict(X_test)

# Future dates start the day after the last observation so the forecast does
# not overlap the observed data.
forecast_start = df_series['ds'].max() + pd.Timedelta(days=1)
future_dates = pd.date_range(start=forecast_start, periods=365, freq='D')
future_dates_ordinal = future_dates.to_series().apply(lambda x: x.toordinal()).values.reshape(-1, 1)

# Wrap the ordinals in a frame with the training column name; predicting on a
# bare array after fitting on a DataFrame makes scikit-learn warn about
# missing feature names.
future_X = pd.DataFrame(future_dates_ordinal, columns=X.columns)

# Forecast future values
future_forecast = model.predict(future_X)

# Plot the forecast
plt.figure(figsize=(10, 6))
plt.scatter(df_series['ds'], y, color='black', label='Actual')
plt.plot(future_dates, future_forecast, color='blue', label='Forecast')
plt.title('Oil Price Forecast')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.legend()
plt.show()

In [None]:
# Evaluate the linear trend on a chronological hold-out: fit on observations
# before the split date, score on observations from the split date onward.

# Convert 'ds' to ordinal (recomputed here so the cell does not depend on the
# previous modelling cell having been run)
df_series['ds_ordinal'] = df_series['ds'].apply(lambda x: x.toordinal())

# Split data into training and testing sets
split_date = '2023-01-01'
train = df_series[df_series['ds'] < split_date]
test = df_series[df_series['ds'] >= split_date]
if train.empty or test.empty:
    # Fail early with an actionable message instead of scikit-learn's generic
    # "0 sample(s)" error.
    raise ValueError(
        f'Splitting at {split_date} leaves {len(train)} training and '
        f'{len(test)} test rows; choose a split date inside the data range.'
    )

# Prepare the features and target variables
X_train = train[['ds_ordinal']]
y_train = train['y']
X_test = test[['ds_ordinal']]
y_test = test['y']

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate MAE and RMSE.
# RMSE is taken as the square root of the MSE: the `squared=False` argument
# of mean_squared_error is deprecated and removed in recent scikit-learn
# releases, while this form works on every version.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'MAE: {mae}')
print(f'RMSE: {rmse}')

# Plot actual vs predicted values
plt.figure(figsize=(14, 7))
plt.plot(test['ds'], y_test, label='Actual')
plt.plot(test['ds'], y_pred, label='Predicted')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.title('Actual vs Predicted Oil Prices')
plt.legend()
plt.show()

In [None]:
# Optional: save the fitted model to disk (disabled -- uncomment the two
# lines below to enable; presumably the 'models/' directory must already
# exist -- TODO confirm).
# import joblib
# joblib.dump(model, 'models/oil_price_forecast_model.pkl')