In [1]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
try:
    from prophet import Prophet
except ImportError:
    print('Prophet is not installed.')
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import eia

In [3]:
# Load the data from the text file
data = pd.read_csv('data/raw/PET/PET.txt', sep='\t', header=None, names=['json_str'])

# Function to parse JSON strings
def parse_json_str(json_str):
    return json.loads(json_str)

# Apply the function to parse the JSON strings
parsed_data = data['json_str'].apply(parse_json_str)

# Create a DataFrame from the parsed JSON data
df = pd.json_normalize(parsed_data)

# Display the first few rows of the DataFrame
print(df.head())

                       series_id  \
0  PET.EMM_EPMPR_PTE_Y35NY_DPG.W   
1  PET.EMM_EPMPR_PTE_Y44HO_DPG.W   
2  PET.EMM_EPMMR_PTE_R5XCA_DPG.W   
3  PET.EMM_EPMMR_PTE_Y05LA_DPG.W   
4  PET.EMM_EPMMR_PTE_Y05SF_DPG.W   

                                                name               units  f  \
0  New York Harbor Premium Reformulated Retail Ga...  Dollars per Gallon  W   
1  Houston, TX Premium Reformulated Retail Gasoli...  Dollars per Gallon  W   
2  West Coast (PADD 5) Except California Midgrade...  Dollars per Gallon  W   
3  Los Angeles, CA Midgrade Reformulated Retail G...  Dollars per Gallon  W   
4  San Francisco, CA Midgrade Reformulated Retail...  Dollars per Gallon  W   

  unitsshort                                        description copyright  \
0      $/gal  New York Harbor Premium Reformulated Retail Ga...      None   
1      $/gal  Houston, TX Premium Reformulated Retail Gasoli...      None   
2      $/gal  West Coast (PADD 5) Except California Midgrade...      None   


In [4]:
df.columns

Index(['series_id', 'name', 'units', 'f', 'unitsshort', 'description',
       'copyright', 'source', 'iso3166', 'geography', 'start', 'end',
       'last_updated', 'data', 'geography2', 'category_id',
       'parent_category_id', 'notes', 'childseries'],
      dtype='object')

In [5]:
# Get distinct series_id, name pairs
series_names = df[['series_id', 'name']].drop_duplicates()
series_names

Unnamed: 0,series_id,name
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...
1,PET.EMM_EPMPR_PTE_Y44HO_DPG.W,"Houston, TX Premium Reformulated Retail Gasoli..."
2,PET.EMM_EPMMR_PTE_R5XCA_DPG.W,West Coast (PADD 5) Except California Midgrade...
3,PET.EMM_EPMMR_PTE_Y05LA_DPG.W,"Los Angeles, CA Midgrade Reformulated Retail G..."
4,PET.EMM_EPMMR_PTE_Y05SF_DPG.W,"San Francisco, CA Midgrade Reformulated Retail..."
...,...,...
191350,,"Off-Highway - Distillate F.O., Non-Construction"
191351,,All Other - Distillate Fuel Oil
191352,,All Other - Residual Fuel Oil
191353,,All Other - Kerosene


In [6]:
# Use the pandas DataFrame method `to_clipboard` to copy the first 5 rows of the DataFrame `df` to the clipboard.
df.head(2).to_clipboard(index=False)

In [7]:
# Unnest the 'data' column
df_parsed = df.explode('data')
df_parsed

Unnamed: 0,series_id,name,units,f,unitsshort,description,copyright,source,iso3166,geography,start,end,last_updated,data,geography2,category_id,parent_category_id,notes,childseries
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240617, 4.306]",,,,,
1,PET.EMM_EPMPR_PTE_Y44HO_DPG.W,"Houston, TX Premium Reformulated Retail Gasoli...",Dollars per Gallon,W,$/gal,"Houston, TX Premium Reformulated Retail Gasoli...",,"EIA, U.S. Energy Information Administration",USA-TX,USA-TX,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240617, 3.817]",,,,,
2,PET.EMM_EPMMR_PTE_R5XCA_DPG.W,West Coast (PADD 5) Except California Midgrade...,Dollars per Gallon,W,$/gal,West Coast (PADD 5) Except California Midgrade...,,"EIA, U.S. Energy Information Administration",USA-CA,USA-CA,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240617, 4.137]",,,,,
3,PET.EMM_EPMMR_PTE_Y05LA_DPG.W,"Los Angeles, CA Midgrade Reformulated Retail G...",Dollars per Gallon,W,$/gal,"Los Angeles, CA Midgrade Reformulated Retail G...",,"EIA, U.S. Energy Information Administration",USA-CA,USA-CA,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240617, 4.865]",,,,,
4,PET.EMM_EPMMR_PTE_Y05SF_DPG.W,"San Francisco, CA Midgrade Reformulated Retail...",Dollars per Gallon,W,$/gal,"San Francisco, CA Midgrade Reformulated Retail...",,"EIA, U.S. Energy Information Administration",USA-CA,USA-CA,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240617, 5.002]",,,,,


In [8]:
# Split the 'data' column into separate columns
df_parsed[['date', 'value']] = pd.DataFrame(df_parsed['data'].tolist(), index=df_parsed.index)

# Convert 'date' column to datetime format
df_parsed['date'] = pd.to_datetime(df_parsed['date'], format='%Y%m%d')

# Display the first few rows of the DataFrame
df_parsed.head()

In [9]:
# Save the parsed DataFrame to a CSV file
df_parsed.to_csv('parsed_petroleum_data.csv', index=False)