In [20]:
# Import necessary libraries
import json
import os
import warnings
from datetime import timedelta

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from pandas.tseries.offsets import MonthEnd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.sm_exceptions import ValueWarning

# Fix for MaxRowsError
alt.data_transformers.disable_max_rows()

# Set plotting style
sns.set(style='whitegrid')

In [21]:
# Load the data from the text file
data = pd.read_csv('data/raw/NG/NG.txt', sep='\t', header=None, names=['json_str'])
data['json_str']

0        {"series_id":"NG.NW2_EPG0_SNO_R33_BCF.W","name...
1        {"series_id":"NG.NW2_EPG0_SSO_R33_BCF.W","name...
2        {"series_id":"NG.NW2_EPG0_SWO_R31_BCF.W","name...
3        {"series_id":"NG.NW2_EPG0_SWO_R32_BCF.W","name...
4        {"series_id":"NG.NW2_EPG0_SWO_R33_BCF.W","name...
                               ...                        
17598    {"category_id":"483303","parent_category_id":"...
17599    {"category_id":"483304","parent_category_id":"...
17600    {"category_id":"483357","parent_category_id":"...
17601    {"category_id":"483359","parent_category_id":"...
17602    {"category_id":"483361","parent_category_id":"...
Name: json_str, Length: 17603, dtype: object

In [22]:
def parse_json_str(json_str):
    """
    Parse a JSON string and return a dictionary.

    Parameters:
    json_str (str): A string in JSON format.

    Returns:
    dict: A dictionary parsed from the JSON string. If the input string is empty or invalid, an empty dictionary is returned.
    """
    if not json_str:
        return {}
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        return {}

# Apply the function to parse the JSON strings
parsed_data = data['json_str'].apply(parse_json_str)

# Create a DataFrame from the parsed JSON data
df = pd.json_normalize(parsed_data)

df

Unnamed: 0,series_id,name,units,f,unitsshort,description,copyright,source,start,end,last_updated,data,iso3166,geography,geography2,category_id,parent_category_id,notes,childseries
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,W,BCF,Nonsalt South Central Region Natural Gas Worki...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[[20240607, 823], [20240531, 817], [20240524, ...",,,,,,,
1,NG.NW2_EPG0_SSO_R33_BCF.W,Weekly Salt Region Natural Gas Working Undergr...,Billion Cubic Feet,W,BCF,Salt South Central Region Natural Gas Working ...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[[20240607, 336], [20240531, 330], [20240524, ...",,,,,,,
2,NG.NW2_EPG0_SWO_R31_BCF.W,Weekly East Region Natural Gas Working Undergr...,Billion Cubic Feet,W,BCF,East Region Natural Gas Working Underground St...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[[20240607, 603], [20240531, 575], [20240524, ...",,,,,,,
3,NG.NW2_EPG0_SWO_R32_BCF.W,Weekly Midwest Region Natural Gas Working Unde...,Billion Cubic Feet,W,BCF,Midwest Region Natural Gas Working Underground...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[[20240607, 712], [20240531, 688], [20240524, ...",,,,,,,
4,NG.NW2_EPG0_SWO_R33_BCF.W,Weekly South Central Region Natural Gas Worki...,Billion Cubic Feet,W,BCF,South Central Region Natural Gas Working Unde...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[[20240607, 1159], [20240531, 1146], [20240524...",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17598,,by Data Series,,,,,,,,,,,,,,483303,483194,,[]
17599,,Delivered to Consumers,,,,,,,,,,,,,,483304,483303,,"[NG.NGA_EPG0_VGTH_NUS_BTUCF.A, NG.NGA_EPG0_VGT..."
17600,,Total Consumption,,,,,,,,,,,,,,483357,483303,,[NG.NGA_EPG0_VC0H_NUS_BTUCF.A]
17601,,Electric Power,,,,,,,,,,,,,,483359,483303,,[NG.NGA_EPG0_VEUH_NUS_BTUCF.A]


In [23]:
df.columns

Index(['series_id', 'name', 'units', 'f', 'unitsshort', 'description',
       'copyright', 'source', 'start', 'end', 'last_updated', 'data',
       'iso3166', 'geography', 'geography2', 'category_id',
       'parent_category_id', 'notes', 'childseries'],
      dtype='object')

In [24]:
# Explode the 'data' column to separate rows for each date-value pair
df = df.explode('data')
df.head(10)

Unnamed: 0,series_id,name,units,f,unitsshort,description,copyright,source,start,end,last_updated,data,iso3166,geography,geography2,category_id,parent_category_id,notes,childseries
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,W,BCF,Nonsalt South Central Region Natural Gas Worki...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[20240607, 823]",,,,,,,
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,W,BCF,Nonsalt South Central Region Natural Gas Worki...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[20240531, 817]",,,,,,,
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,W,BCF,Nonsalt South Central Region Natural Gas Worki...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[20240524, 804]",,,,,,,
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,W,BCF,Nonsalt South Central Region Natural Gas Worki...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[20240517, 793]",,,,,,,
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,W,BCF,Nonsalt South Central Region Natural Gas Worki...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[20240510, 784]",,,,,,,
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,W,BCF,Nonsalt South Central Region Natural Gas Worki...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[20240503, 773]",,,,,,,
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,W,BCF,Nonsalt South Central Region Natural Gas Worki...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[20240426, 759]",,,,,,,
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,W,BCF,Nonsalt South Central Region Natural Gas Worki...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[20240419, 749]",,,,,,,
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,W,BCF,Nonsalt South Central Region Natural Gas Worki...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[20240412, 729]",,,,,,,
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,W,BCF,Nonsalt South Central Region Natural Gas Worki...,,"EIA, U.S. Energy Information Administration",20100101,20240607,2024-06-13T15:15:06-04:00,"[20240405, 714]",,,,,,,


In [25]:
# Select distinct series_id, name, units
df_series = df[['series_id', 'name', 'units', 'unitsshort']].drop_duplicates()
df_series.head(10)

Unnamed: 0,series_id,name,units,unitsshort
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,BCF
1,NG.NW2_EPG0_SSO_R33_BCF.W,Weekly Salt Region Natural Gas Working Undergr...,Billion Cubic Feet,BCF
2,NG.NW2_EPG0_SWO_R31_BCF.W,Weekly East Region Natural Gas Working Undergr...,Billion Cubic Feet,BCF
3,NG.NW2_EPG0_SWO_R32_BCF.W,Weekly Midwest Region Natural Gas Working Unde...,Billion Cubic Feet,BCF
4,NG.NW2_EPG0_SWO_R33_BCF.W,Weekly South Central Region Natural Gas Worki...,Billion Cubic Feet,BCF
5,NG.NW2_EPG0_SWO_R34_BCF.W,Weekly Mountain Region Natural Gas Working Und...,Billion Cubic Feet,BCF
6,NG.NW2_EPG0_SWO_R48_BCF.W,Weekly Lower 48 States Natural Gas Working Und...,Billion Cubic Feet,BCF
7,NG.NW2_EPG0_SWO_R35_BCF.W,Weekly Pacific Region Natural Gas Working Unde...,Billion Cubic Feet,BCF
8,NG.RNGWHHD.D,"Henry Hub Natural Gas Spot Price, Daily",Dollars per Million Btu,$/MMBTU
9,NG.RNGWHHD.W,"Henry Hub Natural Gas Spot Price, Weekly",Dollars per Million Btu,$/MMBTU


In [26]:
# Filter rows based on columns: 'series_id', 'units'
df_series_nonas = df_series[(df_series['series_id'].notna()) & (df_series['units'].notna())]
df_series_nonas.head(10)

Unnamed: 0,series_id,name,units,unitsshort
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,BCF
1,NG.NW2_EPG0_SSO_R33_BCF.W,Weekly Salt Region Natural Gas Working Undergr...,Billion Cubic Feet,BCF
2,NG.NW2_EPG0_SWO_R31_BCF.W,Weekly East Region Natural Gas Working Undergr...,Billion Cubic Feet,BCF
3,NG.NW2_EPG0_SWO_R32_BCF.W,Weekly Midwest Region Natural Gas Working Unde...,Billion Cubic Feet,BCF
4,NG.NW2_EPG0_SWO_R33_BCF.W,Weekly South Central Region Natural Gas Worki...,Billion Cubic Feet,BCF
5,NG.NW2_EPG0_SWO_R34_BCF.W,Weekly Mountain Region Natural Gas Working Und...,Billion Cubic Feet,BCF
6,NG.NW2_EPG0_SWO_R48_BCF.W,Weekly Lower 48 States Natural Gas Working Und...,Billion Cubic Feet,BCF
7,NG.NW2_EPG0_SWO_R35_BCF.W,Weekly Pacific Region Natural Gas Working Unde...,Billion Cubic Feet,BCF
8,NG.RNGWHHD.D,"Henry Hub Natural Gas Spot Price, Daily",Dollars per Million Btu,$/MMBTU
9,NG.RNGWHHD.W,"Henry Hub Natural Gas Spot Price, Weekly",Dollars per Million Btu,$/MMBTU


In [27]:
date_range_all = df.groupby('name')['end'].agg(['min', 'max'])
date_range_all_nonas = date_range_all[(date_range_all['min'].notna()) & (date_range_all['max'].notna())]
date_range_all_nonas.head(10)

Unnamed: 0_level_0,min,max
name,Unnamed: 1_level_1,Unnamed: 2_level_1
"\r\nSweetgrass, MT Compressed Natural Gas Exports to Canada, Annual",2023,2023
"\r\nSweetgrass, MT Compressed Natural Gas Exports to Canada, Monthly",202303,202303
"AGA Eastern Consuming Region Natural Gas Underground Storage Withdrawals, Annual",2014,2014
"AGA Eastern Consuming Region Natural Gas Underground Storage Withdrawals, Monthly",201412,201412
"AGA Eastern Consuming Region Natural Gas Count of Underground Storage Capacity, Monthly",201312,201312
"AGA Eastern Consuming Region Natural Gas Injections into Underground Storage, Annual",2014,2014
"AGA Eastern Consuming Region Natural Gas Injections into Underground Storage, Monthly",201412,201412
"AGA Eastern Consuming Region Natural Gas Total Underground Storage Capacity, Monthly",201312,201312
"AGA Eastern Consuming Region Natural Gas Underground Storage Volume, Monthly",201412,201412
"AGA Eastern Consuming Region Natural Gas Working Underground Storage Capacity, Monthly",201312,201312


In [28]:
# Return a count of the number of records by units 
df_series['units'].value_counts()

units
Million Cubic Feet                         6334
Billion Cubic Feet                         3507
Dollars per Thousand Cubic Feet            1896
Million Barrels                            1844
Percent                                     773
Number of Elements                          691
BTU per Cubic Foot                          107
Thousand Cubic Feet                         104
Thousand Barrels                             92
Million Cubic Feet per Day                   37
Cost                                         33
Nominal Dollars per Thousand Cubic Feet      23
Dollars per Million Btu                      22
Thousand Feet                                14
Feet per Well                                12
Dollars per Foot                              5
Thousand Dollars per Well                     5
(Dollars per Thousand Cubic Feet)             4
Count                                         1
Thousand Dollars                              1
Name: count, dtype: int64

In [29]:
df_series

Unnamed: 0,series_id,name,units,unitsshort
0,NG.NW2_EPG0_SNO_R33_BCF.W,Weekly Nonsalt Region Natural Gas Working Unde...,Billion Cubic Feet,BCF
1,NG.NW2_EPG0_SSO_R33_BCF.W,Weekly Salt Region Natural Gas Working Undergr...,Billion Cubic Feet,BCF
2,NG.NW2_EPG0_SWO_R31_BCF.W,Weekly East Region Natural Gas Working Undergr...,Billion Cubic Feet,BCF
3,NG.NW2_EPG0_SWO_R32_BCF.W,Weekly Midwest Region Natural Gas Working Unde...,Billion Cubic Feet,BCF
4,NG.NW2_EPG0_SWO_R33_BCF.W,Weekly South Central Region Natural Gas Worki...,Billion Cubic Feet,BCF
...,...,...,...,...
17541,,% of All Comm. Deliveries for the Acct. of Others,,
17542,,Industrial Deliveries,,
17543,,% of All Ind. Deliveries for the Acct. of Others,,
17544,,Heat Content of Natural Gas Consumed,,


In [30]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(df_series):
    # Filter rows based on column: 'units'
    df_series = df_series[df_series['units'].str.contains("Million Cubic Feet", regex=False, na=False)]
    return df_series

df_series_clean = clean_data(df.copy())
df_series_clean

Unnamed: 0,series_id,name,units,f,unitsshort,description,copyright,source,start,end,last_updated,data,iso3166,geography,geography2,category_id,parent_category_id,notes,childseries
11,NG.NGA_EPG0_ENG_YFPT-NJA_MMCF.M,"Freeport, TX Exports to Japan Liquefied Natura...",Million Cubic Feet,M,MMcf,"Freeport, TX Exports to Japan Liquefied Natura...",,"EIA, U.S. Energy Information Administration",201912,202403,2024-05-31T16:21:39-04:00,"[202403, 7473]",JPN,JPN,USA-TX,,,,
11,NG.NGA_EPG0_ENG_YFPT-NJA_MMCF.M,"Freeport, TX Exports to Japan Liquefied Natura...",Million Cubic Feet,M,MMcf,"Freeport, TX Exports to Japan Liquefied Natura...",,"EIA, U.S. Energy Information Administration",201912,202403,2024-05-31T16:21:39-04:00,"[202402, 2047]",JPN,JPN,USA-TX,,,,
11,NG.NGA_EPG0_ENG_YFPT-NJA_MMCF.M,"Freeport, TX Exports to Japan Liquefied Natura...",Million Cubic Feet,M,MMcf,"Freeport, TX Exports to Japan Liquefied Natura...",,"EIA, U.S. Energy Information Administration",201912,202403,2024-05-31T16:21:39-04:00,"[202401, 1917]",JPN,JPN,USA-TX,,,,
11,NG.NGA_EPG0_ENG_YFPT-NJA_MMCF.M,"Freeport, TX Exports to Japan Liquefied Natura...",Million Cubic Feet,M,MMcf,"Freeport, TX Exports to Japan Liquefied Natura...",,"EIA, U.S. Energy Information Administration",201912,202403,2024-05-31T16:21:39-04:00,"[202312, 9951]",JPN,JPN,USA-TX,,,,
11,NG.NGA_EPG0_ENG_YFPT-NJA_MMCF.M,"Freeport, TX Exports to Japan Liquefied Natura...",Million Cubic Feet,M,MMcf,"Freeport, TX Exports to Japan Liquefied Natura...",,"EIA, U.S. Energy Information Administration",201912,202403,2024-05-31T16:21:39-04:00,"[202311, 3581]",JPN,JPN,USA-TX,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15500,NG.NGM_EPG0_IMB_SMS-NTD_MMCF.A,Mississippi Natural Gas Net International Rece...,Million Cubic Feet,A,MMcf,Mississippi Natural Gas Net International Rece...,,"EIA, U.S. Energy Information Administration",2011,2011,2013-08-13T11:49:51-04:00,"[2011, 2820]",TTO,TTO,USA-MS,,,,
15501,NG.NGM_EPG0_IMB_SMS-NEG_MMCF.A,Mississippi Natural Gas Net International Rece...,Million Cubic Feet,A,MMcf,Mississippi Natural Gas Net International Rece...,,"EIA, U.S. Energy Information Administration",2011,2011,2013-08-13T11:49:51-04:00,"[2011, 2954]",EGY,EGY,USA-MS,,,,
15502,NG.NGM_EPG0_IMB_SAK-NCH_MMCF.A,Alaska Natural Gas Net International Receipts ...,Million Cubic Feet,A,MMcf,Alaska Natural Gas Net International Receipts ...,,"EIA, U.S. Energy Information Administration",2011,2011,2013-08-13T11:49:51-04:00,"[2011, -1127]",CHN,CHN,USA-AK,,,,
15503,NG.NGM_EPG0_MIN_SWV-STX_MMCF.A,West Virginia Natural Gas Net Receipts From Te...,Million Cubic Feet,A,MMcf,West Virginia Natural Gas Net Receipts From Texas,,"EIA, U.S. Energy Information Administration",2010,2010,2013-08-13T11:49:51-04:00,"[2010, -417862]",USA-TX,USA-TX,USA-WV,,,,


In [31]:
df_series_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 506026 entries, 11 to 15504
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   series_id           506026 non-null  object
 1   name                506026 non-null  object
 2   units               506026 non-null  object
 3   f                   506026 non-null  object
 4   unitsshort          506026 non-null  object
 5   description         506026 non-null  object
 6   copyright           506026 non-null  object
 7   source              506026 non-null  object
 8   start               506026 non-null  object
 9   end                 506026 non-null  object
 10  last_updated        506026 non-null  object
 11  data                506026 non-null  object
 12  iso3166             481812 non-null  object
 13  geography           481812 non-null  object
 14  geography2          67789 non-null   object
 15  category_id         0 non-null       object
 16  parent_

In [32]:
# Extract relevant columns and preprocess the data
df_series_clean['Date'] = pd.to_datetime(df_series_clean['last_updated'])
df_series_clean

  df_series_clean['Date'] = pd.to_datetime(df_series_clean['last_updated'])


Unnamed: 0,series_id,name,units,f,unitsshort,description,copyright,source,start,end,last_updated,data,iso3166,geography,geography2,category_id,parent_category_id,notes,childseries,Date
11,NG.NGA_EPG0_ENG_YFPT-NJA_MMCF.M,"Freeport, TX Exports to Japan Liquefied Natura...",Million Cubic Feet,M,MMcf,"Freeport, TX Exports to Japan Liquefied Natura...",,"EIA, U.S. Energy Information Administration",201912,202403,2024-05-31T16:21:39-04:00,"[202403, 7473]",JPN,JPN,USA-TX,,,,,2024-05-31 16:21:39-04:00
11,NG.NGA_EPG0_ENG_YFPT-NJA_MMCF.M,"Freeport, TX Exports to Japan Liquefied Natura...",Million Cubic Feet,M,MMcf,"Freeport, TX Exports to Japan Liquefied Natura...",,"EIA, U.S. Energy Information Administration",201912,202403,2024-05-31T16:21:39-04:00,"[202402, 2047]",JPN,JPN,USA-TX,,,,,2024-05-31 16:21:39-04:00
11,NG.NGA_EPG0_ENG_YFPT-NJA_MMCF.M,"Freeport, TX Exports to Japan Liquefied Natura...",Million Cubic Feet,M,MMcf,"Freeport, TX Exports to Japan Liquefied Natura...",,"EIA, U.S. Energy Information Administration",201912,202403,2024-05-31T16:21:39-04:00,"[202401, 1917]",JPN,JPN,USA-TX,,,,,2024-05-31 16:21:39-04:00
11,NG.NGA_EPG0_ENG_YFPT-NJA_MMCF.M,"Freeport, TX Exports to Japan Liquefied Natura...",Million Cubic Feet,M,MMcf,"Freeport, TX Exports to Japan Liquefied Natura...",,"EIA, U.S. Energy Information Administration",201912,202403,2024-05-31T16:21:39-04:00,"[202312, 9951]",JPN,JPN,USA-TX,,,,,2024-05-31 16:21:39-04:00
11,NG.NGA_EPG0_ENG_YFPT-NJA_MMCF.M,"Freeport, TX Exports to Japan Liquefied Natura...",Million Cubic Feet,M,MMcf,"Freeport, TX Exports to Japan Liquefied Natura...",,"EIA, U.S. Energy Information Administration",201912,202403,2024-05-31T16:21:39-04:00,"[202311, 3581]",JPN,JPN,USA-TX,,,,,2024-05-31 16:21:39-04:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15500,NG.NGM_EPG0_IMB_SMS-NTD_MMCF.A,Mississippi Natural Gas Net International Rece...,Million Cubic Feet,A,MMcf,Mississippi Natural Gas Net International Rece...,,"EIA, U.S. Energy Information Administration",2011,2011,2013-08-13T11:49:51-04:00,"[2011, 2820]",TTO,TTO,USA-MS,,,,,2013-08-13 11:49:51-04:00
15501,NG.NGM_EPG0_IMB_SMS-NEG_MMCF.A,Mississippi Natural Gas Net International Rece...,Million Cubic Feet,A,MMcf,Mississippi Natural Gas Net International Rece...,,"EIA, U.S. Energy Information Administration",2011,2011,2013-08-13T11:49:51-04:00,"[2011, 2954]",EGY,EGY,USA-MS,,,,,2013-08-13 11:49:51-04:00
15502,NG.NGM_EPG0_IMB_SAK-NCH_MMCF.A,Alaska Natural Gas Net International Receipts ...,Million Cubic Feet,A,MMcf,Alaska Natural Gas Net International Receipts ...,,"EIA, U.S. Energy Information Administration",2011,2011,2013-08-13T11:49:51-04:00,"[2011, -1127]",CHN,CHN,USA-AK,,,,,2013-08-13 11:49:51-04:00
15503,NG.NGM_EPG0_MIN_SWV-STX_MMCF.A,West Virginia Natural Gas Net Receipts From Te...,Million Cubic Feet,A,MMcf,West Virginia Natural Gas Net Receipts From Texas,,"EIA, U.S. Energy Information Administration",2010,2010,2013-08-13T11:49:51-04:00,"[2010, -417862]",USA-TX,USA-TX,USA-WV,,,,,2013-08-13 11:49:51-04:00


In [33]:
df_series_clean = df_series_clean.sort_values(by='Date').reset_index(drop=True)
df_series_clean.head()

Unnamed: 0,series_id,name,units,f,unitsshort,description,copyright,source,start,end,last_updated,data,iso3166,geography,geography2,category_id,parent_category_id,notes,childseries,Date
0,NG.NGM_EPG0_ENG_YENA-NCH_MMCF.M,"Kenai, AK Liquefied Natural Gas Exports to Chi...",Million Cubic Feet,M,MMcf,"Kenai, AK Liquefied Natural Gas Exports to China",,"EIA, U.S. Energy Information Administration",201105,201105,2013-08-13T11:49:51-04:00,"[201105, 1127]",CHN,CHN,USA-AK,,,,,2013-08-13 11:49:51-04:00
1,NG.N5050RI2.M,Rhode Island Natural Gas Underground Storage I...,Million Cubic Feet,M,MMcf,Rhode Island Natural Gas Underground Storage I...,,"EIA, U.S. Energy Information Administration",199401,199612,2013-08-13T11:49:51-04:00,"[199605, 0]",USA-RI,USA-RI,,,,,,2013-08-13 11:49:51-04:00
2,NG.N5050RI2.M,Rhode Island Natural Gas Underground Storage I...,Million Cubic Feet,M,MMcf,Rhode Island Natural Gas Underground Storage I...,,"EIA, U.S. Energy Information Administration",199401,199612,2013-08-13T11:49:51-04:00,"[199606, 0]",USA-RI,USA-RI,,,,,,2013-08-13 11:49:51-04:00
3,NG.N5050RI2.M,Rhode Island Natural Gas Underground Storage I...,Million Cubic Feet,M,MMcf,Rhode Island Natural Gas Underground Storage I...,,"EIA, U.S. Energy Information Administration",199401,199612,2013-08-13T11:49:51-04:00,"[199607, 0]",USA-RI,USA-RI,,,,,,2013-08-13 11:49:51-04:00
4,NG.N5050RI2.M,Rhode Island Natural Gas Underground Storage I...,Million Cubic Feet,M,MMcf,Rhode Island Natural Gas Underground Storage I...,,"EIA, U.S. Energy Information Administration",199401,199612,2013-08-13T11:49:51-04:00,"[199608, 0]",USA-RI,USA-RI,,,,,,2013-08-13 11:49:51-04:00


In [34]:
date_range_la = df_series_clean.groupby(['name', 'series_id'])['Date'].agg(['min', 'max'])
date_range_la

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max
name,series_id,Unnamed: 2_level_1,Unnamed: 3_level_1
"\r\nSweetgrass, MT Compressed Natural Gas Exports to Canada, Annual",NG.NGM_EPG0_ENC_YSWGR-NCA_MMCF.A,2024-02-29 18:56:24-05:00,2024-02-29 18:56:24-05:00
"\r\nSweetgrass, MT Compressed Natural Gas Exports to Canada, Monthly",NG.NGM_EPG0_ENC_YSWGR-NCA_MMCF.M,2023-10-25 13:20:41-04:00,2023-10-25 13:20:41-04:00
"AGA Eastern Consuming Region Natural Gas Underground Storage Withdrawals, Annual",NG.N5060882.A,2015-03-02 07:26:05-05:00,2015-03-02 07:26:05-05:00
"AGA Eastern Consuming Region Natural Gas Underground Storage Withdrawals, Monthly",NG.N5060882.M,2016-05-16 16:56:05-04:00,2016-05-16 16:56:05-04:00
"AGA Eastern Consuming Region Natural Gas Injections into Underground Storage, Annual",NG.N5050882.A,2015-10-01 12:49:41-04:00,2015-10-01 12:49:41-04:00
...,...,...,...
"Wyoming Working Natural Gas Underground Storage Capacity, Monthly",NG.NGA_EPG0_SACW0_SWY_MMCF.M,2024-05-31 16:21:39-04:00,2024-05-31 16:21:39-04:00
"Wyoming Working Natural Gas Underground Storage Depleted Fields Capacity, Annual",NG.NGA_EPG0_SACWD_SWY_MMCF.A,2023-09-29 12:50:45-04:00,2023-09-29 12:50:45-04:00
"Wyoming Working Natural Gas Underground Storage Salt Caverns Capacity, Annual",NG.NGA_EPG0_SACWS_SWY_MMCF.A,2023-09-29 12:50:45-04:00,2023-09-29 12:50:45-04:00
"the District of Columbia Natural Gas Industrial Consumption, Annual",NG.N3035DC2.A,2024-02-29 18:56:24-05:00,2024-02-29 18:56:24-05:00


In [35]:
# Return the table when both min and max are not equal to NaT 
date_range_la_nonas = date_range_la[(date_range_la['min'].notna()) & (date_range_la['max'].notna())]
date_range_la_nonas

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max
name,series_id,Unnamed: 2_level_1,Unnamed: 3_level_1
"\r\nSweetgrass, MT Compressed Natural Gas Exports to Canada, Annual",NG.NGM_EPG0_ENC_YSWGR-NCA_MMCF.A,2024-02-29 18:56:24-05:00,2024-02-29 18:56:24-05:00
"\r\nSweetgrass, MT Compressed Natural Gas Exports to Canada, Monthly",NG.NGM_EPG0_ENC_YSWGR-NCA_MMCF.M,2023-10-25 13:20:41-04:00,2023-10-25 13:20:41-04:00
"AGA Eastern Consuming Region Natural Gas Underground Storage Withdrawals, Annual",NG.N5060882.A,2015-03-02 07:26:05-05:00,2015-03-02 07:26:05-05:00
"AGA Eastern Consuming Region Natural Gas Underground Storage Withdrawals, Monthly",NG.N5060882.M,2016-05-16 16:56:05-04:00,2016-05-16 16:56:05-04:00
"AGA Eastern Consuming Region Natural Gas Injections into Underground Storage, Annual",NG.N5050882.A,2015-10-01 12:49:41-04:00,2015-10-01 12:49:41-04:00
...,...,...,...
"Wyoming Working Natural Gas Underground Storage Capacity, Monthly",NG.NGA_EPG0_SACW0_SWY_MMCF.M,2024-05-31 16:21:39-04:00,2024-05-31 16:21:39-04:00
"Wyoming Working Natural Gas Underground Storage Depleted Fields Capacity, Annual",NG.NGA_EPG0_SACWD_SWY_MMCF.A,2023-09-29 12:50:45-04:00,2023-09-29 12:50:45-04:00
"Wyoming Working Natural Gas Underground Storage Salt Caverns Capacity, Annual",NG.NGA_EPG0_SACWS_SWY_MMCF.A,2023-09-29 12:50:45-04:00,2023-09-29 12:50:45-04:00
"the District of Columbia Natural Gas Industrial Consumption, Annual",NG.N3035DC2.A,2024-02-29 18:56:24-05:00,2024-02-29 18:56:24-05:00


In [36]:
# Load and prepare the dataset
def load_gas_price_data(filepath):
    """
    Load and prepare the dataset from a raw text file containing JSON strings.
    
    Parameters:
    - filepath: Path to the .txt file containing the raw data.
    
    Returns:
    - A DataFrame with the data extracted from JSON strings, focusing on 'date' and 'value' columns.
    """
    # Load the data from the text file
    data = pd.read_csv(filepath, sep='\t', header=None, names=['json_str'])
    
    # Function to parse JSON strings
    def parse_json_str(json_str):
        return json.loads(json_str)
    
    # Apply the function to parse the JSON strings
    parsed_data = data['json_str'].apply(parse_json_str)
    
    # Create a DataFrame from the parsed JSON data
    df = pd.json_normalize(parsed_data)

    # Keep only necessary columns and drop NA values
    #df = df[['date', 'value']].dropna()
    
    return df


In [56]:

# Prepare data for long format and additional transformations
def prepare_data(df):
    """
    Prepare the dataset for analysis by performing several transformations.
    
    Parameters:
    - df: DataFrame to be transformed.
    - series_id: The series ID to filter the DataFrame by. Default is "PET.EMA_EPM0_PBS_SLA_DPG.M".
    
    Returns:
    - Transformed DataFrame.
    """
    # Filter based on series_id and non-NA 'units' column
    # Only select relevant columns used in downstream analysis
    df = df[['name', 'units', 'data', 'end']]
    

    # Explode the 'data' column to separate rows for each date-value pair
    df = df.explode('data')
    
    # Drop rows where 'data' is NaN or not a list
    df = df.dropna(subset=['data'])
    df = df[df['data'].apply(lambda x: isinstance(x, list) and len(x) == 2)]
    
    # # Split 'data' column into 'date' and 'value'
    df[['date', 'value']] = pd.DataFrame(df['data'].tolist(), index=df.index)
    
    # # Remove the 'data' column
    df = df.drop(columns=['data'])
    
    # # Convert 'date' to datetime, coercing errors to NaT
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d', errors='coerce')
    
    # # Convert 'value' to a numeric type, coercing errors to NaN
    df['value'] = pd.to_numeric(df['value'], errors='coerce')
    
    # Sort by 'date' to ensure chronological order
    df = df.sort_values('date')
    
    # Calculate log of 'value' and the difference in log_price
    df['log_price'] = np.log(df['value'])
    df['price_change'] = df['log_price'].diff()
    
    return df

df_prepared = prepare_data(df)
df_prepared

Unnamed: 0,name,units,end,date,value,log_price,price_change
10761,Imports of heavy sour crude oil from Canada to...,thousand barrels,202312,2009-01-01,83,4.418841,
39746,Imports of heavy sweet crude oil from Cameroon...,thousand barrels,201210,2009-01-01,1887,7.542744,3.123903
3224,Imports of heavy sour crude oil from Canada to...,thousand barrels,202403,2009-01-01,50,3.912023,-3.630721
19125,Imports of light sour crude oil from Canada to...,thousand barrels,202106,2009-01-01,460,6.131226,2.219203
3222,Imports of heavy sour crude oil from Canada to...,thousand barrels,202403,2009-01-01,7134,8.872627,2.741401
...,...,...,...,...,...,...,...
45066,Imports of light sweet crude oil from United K...,thousand barrels,201011,NaT,592,6.383507,0.377153
45066,Imports of light sweet crude oil from United K...,thousand barrels,201011,NaT,598,6.393591,0.010084
45066,Imports of light sweet crude oil from United K...,thousand barrels,201011,NaT,494,6.202536,-0.191055
45066,Imports of light sweet crude oil from United K...,thousand barrels,201011,NaT,523,6.259581,0.057046


In [57]:
#  Function to perform AutoARIMA forecasting
def forecast_prices(df_filtered):
    with warnings.catch_warnings():
        # Ignore specific warnings
        warnings.simplefilter("ignore", ValueWarning)
        warnings.simplefilter("ignore", FutureWarning)

        # Define the model
        model = SARIMAX(df_filtered['log_price'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
        
        # Fit the model
        results = model.fit()
        
        # Generate future dates with 'ME' instead of deprecated 'M'
        future_dates = pd.date_range(df_filtered['date'].max() + MonthEnd(1), periods=13, freq='ME')
        
        # Forecast future log prices
        forecast_log_prices = results.forecast(steps=13)
        
        # Convert log prices back to regular prices
        forecast_prices = np.exp(forecast_log_prices)
        
        # Create a DataFrame for the forecasted prices
        forecast_df = pd.DataFrame({
            'date': future_dates,
            'forecast_price': forecast_prices
        })
        
        return forecast_df

In [59]:
# Visualization function
def plot_forecast(df, forecast_df):
    """
    Plots the historical oil prices along with the forecasted prices.

    Parameters:
    - df: A pandas DataFrame containing the historical data with columns 'date' and 'value'.
    - forecast_df: A pandas DataFrame containing the forecasted data with columns 'date' and 'forecast_price'.

    Returns:
    - An Altair chart object that visualizes the historical and forecasted oil prices.
    """
    # Base chart for historical data
    base = alt.Chart(df).encode(
        x=alt.X('date:T', title='Date'),
        y=alt.Y('value:Q', title='Price')
    ).properties(
        width=700,
        height=400
    )

    # Line chart for historical data
    line = base.mark_line(color='blue', size=2, opacity=0.7).properties(
        title="Historical and Forecasted Oil Prices"
    )

    # Points for historical data
    points = base.mark_point(color='red', size=50, opacity=0.5)

    # Line chart for forecasted data
    forecast_chart = alt.Chart(forecast_df).mark_line(color='green', size=2, opacity=0.7).encode(
        x='date:T',
        y=alt.Y('forecast_price:Q', title='Forecast Price')
    )

    # Combine the charts
    chart = line + points + forecast_chart

    return chart

In [41]:
df = load_gas_price_data('data/raw/PET_IMPORTS/PET_IMPORTS.txt')
df

Unnamed: 0,series_id,name,units,f,copyright,source,lat,lon,geography,geography2,...,parent_category_id,notes,childseries,relation_id,bar_facets,stack_facets,geoset_ids,summable,vertex.name,vertex.geoset_id
0,PET_IMPORTS.OPN_N-PT_2002-ALL.M,Imports of all grades of crude oil from Non-OP...,thousand barrels,M,,"EIA, U.S. Energy Information Administration",29.9728,-90.059,ABW+AGO+ALB+ARE+ARG+AUS+AUT+AZE+BEL+BEN+BGR+BH...,USA-LA,...,,,,,,,,,,
1,PET_IMPORTS.CTY_UK-PP_3-ALL.M,Imports of all grades of crude oil from United...,thousand barrels,M,,"EIA, U.S. Energy Information Administration",,,GBR,USA-AL+USA-AR+USA-LA+USA-MS+USA-NM+USA-TX,...,,,,,,,,,,
2,PET_IMPORTS.CTY_CA-RP_4-ALL.M,Imports of all grades of crude oil from Canada...,thousand barrels,M,,"EIA, U.S. Energy Information Administration",,,CAN,USA-CO+USA-ID+USA-MT+USA-UT+USA-WY,...,,,,,,,,,,
3,PET_IMPORTS.CTY_CA-RF_92-ALL.M,Imports of all grades of crude oil from Canada...,thousand barrels,M,,"EIA, U.S. Energy Information Administration",44.85094,-92.998924,CAN,USA-MN,...,,,,,,,,,,
4,PET_IMPORTS.CTY_GH-PS_DE-ALL.M,Imports of all grades of crude oil from Ghana ...,thousand barrels,M,,"EIA, U.S. Energy Information Administration",,,GHA,USA-DE,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51408,,Imports of crude oil to Wyoming by grade (Annual),thousand barrels,A,,,,,,,...,,,,PET_IMPORTS.RS_WY-ALL.A|GRADE,"[light sweet, light sour, medium, heavy sweet,...",,"[PET_IMPORTS.RS_WY-LSW.A, PET_IMPORTS.RS_WY-LS...",Y,Imports of all grades of crude oil to refineri...,PET_IMPORTS.RS_WY-ALL.A
51409,,Imports of crude oil to Wyoming by grade (Mont...,thousand barrels,M,,,,,,,...,,,,PET_IMPORTS.RS_WY-ALL.M|GRADE,"[light sweet, light sour, medium, heavy sweet,...",,"[PET_IMPORTS.RS_WY-LSW.M, PET_IMPORTS.RS_WY-LS...",Y,Imports of all grades of crude oil to refineri...,PET_IMPORTS.RS_WY-ALL.M
51410,,Imports of crude oil to Unkown State by grade ...,thousand barrels,M,,,,,,,...,,,,PET_IMPORTS.RS_XX-ALL.M|GRADE,"[light sweet, light sour, medium, heavy sweet,...",,"[PET_IMPORTS.RS_XX-LSW.M, PET_IMPORTS.RS_XX-LS...",Y,Imports of all grades of crude oil to refineri...,PET_IMPORTS.RS_XX-ALL.M
51411,,Imports of crude oil to Total U.S. by grade (A...,thousand barrels,A,,,,,,,...,,,,PET_IMPORTS.US-ALL.A|GRADE,"[light sweet, light sour, medium, heavy sweet,...",,"[PET_IMPORTS.US-LSW.A, PET_IMPORTS.US-LSO.A, P...",Y,Imports of all grades of crude oil to Total U....,PET_IMPORTS.US-ALL.A


In [60]:
df_prepared = prepare_data(df)
df_prepared

Unnamed: 0,name,units,end,date,value,log_price,price_change
10761,Imports of heavy sour crude oil from Canada to...,thousand barrels,202312,2009-01-01,83,4.418841,
39746,Imports of heavy sweet crude oil from Cameroon...,thousand barrels,201210,2009-01-01,1887,7.542744,3.123903
3224,Imports of heavy sour crude oil from Canada to...,thousand barrels,202403,2009-01-01,50,3.912023,-3.630721
19125,Imports of light sour crude oil from Canada to...,thousand barrels,202106,2009-01-01,460,6.131226,2.219203
3222,Imports of heavy sour crude oil from Canada to...,thousand barrels,202403,2009-01-01,7134,8.872627,2.741401
...,...,...,...,...,...,...,...
45066,Imports of light sweet crude oil from United K...,thousand barrels,201011,NaT,592,6.383507,0.377153
45066,Imports of light sweet crude oil from United K...,thousand barrels,201011,NaT,598,6.393591,0.010084
45066,Imports of light sweet crude oil from United K...,thousand barrels,201011,NaT,494,6.202536,-0.191055
45066,Imports of light sweet crude oil from United K...,thousand barrels,201011,NaT,523,6.259581,0.057046


In [61]:
forecast_df = forecast_prices(df_prepared)
forecast_df

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.62083D+00    |proj g|=  2.65068D-01


 This problem is unconstrained.



At iterate    5    f=  1.47854D+00    |proj g|=  1.44932D-01

At iterate   10    f=  1.45387D+00    |proj g|=  9.96695D-03

At iterate   15    f=  1.44818D+00    |proj g|=  2.81242D-03

At iterate   20    f=  1.44742D+00    |proj g|=  2.42727D-03

At iterate   25    f=  1.44718D+00    |proj g|=  1.03788D-03

At iterate   30    f=  1.44714D+00    |proj g|=  5.02433D-04
  ys=-4.043E-05  -gs= 3.152E-06 BFGS update SKIPPED

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    5     32     58      1     1     0   1.043D-04   1.447D+00
  F =   1.4471410477753044     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


   evaluations in the last line search.  Termination
   may possibly be caused by a bad search direction.


: 

In [None]:
plot_forecast(df, forecast_df)

In [None]:
# # Save the model
# import joblib
# joblib.dump(model, 'models/oil_price_forecast_model.pkl')