In [2]:
#%pip install pandas numpy scikit-learn matplotlib seaborn statsmodels prophet --quiet

In [3]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from prophet import Prophet
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import eia

Importing plotly failed. Interactive plots will not work.


In [4]:
# Read the EIA API key from the environment instead of embedding it in the notebook,
# so the secret is never saved, committed, or shared together with the file.
# NOTE(review): the key that used to be hard-coded here should be revoked and re-issued.
api_key = os.environ.get("EIA_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the EIA_API_KEY environment variable to your EIA API key before running this notebook."
    )
api = eia.API(api_key)

In [5]:
# Load the data from the text file (one JSON document per line, read into a single column).
# Forward slashes keep the path portable: the previous 'data\PET\PET.txt' literal contained
# '\P', an invalid escape sequence, and only resolved on Windows.
data = pd.read_csv('data/PET/PET.txt', sep='\t', header=None, names=['json_str'])

# Decode one JSON-encoded string into the Python object it represents
def parse_json_str(json_str):
    """Return the Python object encoded by the JSON text ``json_str``."""
    decoded = json.loads(json_str)
    return decoded

# Decode every raw line of the 'json_str' column into its JSON object
parsed_data = data['json_str'].map(parse_json_str)

# Flatten the decoded objects into a tabular DataFrame (nested keys become dotted columns)
df = pd.json_normalize(parsed_data)

# Preview the first five rows of the flattened frame
print(df.head(5))

                       series_id  \
0  PET.EMM_EPMPR_PTE_Y35NY_DPG.W   
1  PET.EMM_EPMPR_PTE_Y44HO_DPG.W   
2  PET.EMM_EPMMR_PTE_R5XCA_DPG.W   
3  PET.EMM_EPMMR_PTE_Y05LA_DPG.W   
4  PET.EMM_EPMMR_PTE_Y05SF_DPG.W   

                                                name               units  f  \
0  New York Harbor Premium Reformulated Retail Ga...  Dollars per Gallon  W   
1  Houston, TX Premium Reformulated Retail Gasoli...  Dollars per Gallon  W   
2  West Coast (PADD 5) Except California Midgrade...  Dollars per Gallon  W   
3  Los Angeles, CA Midgrade Reformulated Retail G...  Dollars per Gallon  W   
4  San Francisco, CA Midgrade Reformulated Retail...  Dollars per Gallon  W   

  unitsshort                                        description copyright  \
0      $/gal  New York Harbor Premium Reformulated Retail Ga...      None   
1      $/gal  Houston, TX Premium Reformulated Retail Gasoli...      None   
2      $/gal  West Coast (PADD 5) Except California Midgrade...      None   


In [6]:
# Show the column labels of `df`
df.columns

Index(['series_id', 'name', 'units', 'f', 'unitsshort', 'description',
       'copyright', 'source', 'iso3166', 'geography', 'start', 'end',
       'last_updated', 'data', 'geography2', 'category_id',
       'parent_category_id', 'notes', 'childseries'],
      dtype='object')

In [7]:
# Unique (series_id, name) combinations present in `df`
series_names = df.loc[:, ['series_id', 'name']].drop_duplicates()
series_names

Unnamed: 0,series_id,name
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...
1,PET.EMM_EPMPR_PTE_Y44HO_DPG.W,"Houston, TX Premium Reformulated Retail Gasoli..."
2,PET.EMM_EPMMR_PTE_R5XCA_DPG.W,West Coast (PADD 5) Except California Midgrade...
3,PET.EMM_EPMMR_PTE_Y05LA_DPG.W,"Los Angeles, CA Midgrade Reformulated Retail G..."
4,PET.EMM_EPMMR_PTE_Y05SF_DPG.W,"San Francisco, CA Midgrade Reformulated Retail..."
...,...,...
191350,,"Off-Highway - Distillate F.O., Non-Construction"
191351,,All Other - Distillate Fuel Oil
191352,,All Other - Residual Fuel Oil
191353,,All Other - Kerosene


In [10]:
# Copy the first 2 rows of `df` (without the index) to the system clipboard via `DataFrame.to_clipboard`.
df.head(2).to_clipboard(index=False)

In [12]:
# Expand the list-valued 'data' column so that each element gets its own row.
# The original row label is repeated for every element (the index is not reset).
df_parsed = df.explode(column='data')
df_parsed

Unnamed: 0,series_id,name,units,f,unitsshort,description,copyright,source,iso3166,geography,start,end,last_updated,data,geography2,category_id,parent_category_id,notes,childseries
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240617, 4.306]",,,,,
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240610, 4.328]",,,,,
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240603, 4.367]",,,,,
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240527, 4.412]",,,,,
0,PET.EMM_EPMPR_PTE_Y35NY_DPG.W,New York Harbor Premium Reformulated Retail Ga...,Dollars per Gallon,W,$/gal,New York Harbor Premium Reformulated Retail Ga...,,"EIA, U.S. Energy Information Administration",USA-NY,USA-NY,20000605,20240617,2024-06-17T22:10:18-04:00,"[20240520, 4.399]",,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191459,,"Off-Highway - Distillate F.O., Construction",,,,,,,,,,,,,,455865,453302,,"[PET.K2DVAGNUS1.A, PET.K2DVAGR0X1.A, PET.K2DVA..."
191460,,"Off-Highway - Distillate F.O., Non-Construction",,,,,,,,,,,,,,455926,453302,,"[PET.K2DVAYNUS1.A, PET.K2DVAYR0X1.A, PET.K2DVA..."
191461,,All Other - Distillate Fuel Oil,,,,,,,,,,,,,,455987,453302,,"[PET.KD0VAXNUS1.A, PET.KD0VAXR0X1.A, PET.KD0VA..."
191462,,All Other - Residual Fuel Oil,,,,,,,,,,,,,,456048,453302,,"[PET.KPRVAXNUS1.A, PET.KPRVAXR0X1.A, PET.KPRVA..."


In [None]:
# Split the 'data' column into separate 'date' and 'value' columns.
#
# Rows that carry no observation (e.g. the category rows) hold NaN in 'data' after the
# explode, and pd.DataFrame(list_of_pairs) raises TypeError when it meets such a scalar,
# so those rows are padded with an empty [None, None] pair instead.
#
# The guard keeps the cell re-runnable: 'data' is dropped at the end, so a second
# execution would otherwise fail with a KeyError.
if 'data' in df_parsed.columns:
    observations = [
        obs if isinstance(obs, (list, tuple)) else [None, None]
        for obs in df_parsed['data']
    ]
    # Reuse df_parsed's own (repeated) index so the assignment is positional, not re-aligned.
    df_parsed[['date', 'value']] = pd.DataFrame(
        observations, index=df_parsed.index, columns=['date', 'value']
    )

    # Drop the original 'data' column
    df_parsed = df_parsed.drop(columns=['data'])
df_parsed.head()

In [16]:

def load_datasets(base_path='data/'):
    """
    Load all datasets from subfolders within the specified base path into a dictionary.

    Every '.txt' file found while walking ``base_path`` is read as JSON Lines: each non-blank
    line is decoded with ``json.loads`` and the decoded records are flattened with
    ``pandas.json_normalize``. Lines are decoded directly rather than routed through a CSV
    reader, so tabs or quote characters inside a line cannot corrupt the JSON text.

    The dictionary key is the name of the folder that contains the file. If one folder holds
    several '.txt' files, the alphabetically last one wins.

    Parameters:
    - base_path (str): The base directory path containing the dataset subfolders.

    Returns:
    - dict: A dictionary with dataset names as keys and the corresponding DataFrames as values.

    Raises:
    - json.JSONDecodeError: If a non-blank line of a '.txt' file is not valid JSON.
    """
    datasets = {}
    for root, _dirs, files in os.walk(base_path):
        # normpath strips a trailing separator, so files sitting directly in
        # base_path='data/' are keyed 'data' instead of the empty string.
        dataset_name = os.path.basename(os.path.normpath(root))
        # Sorted so that the "last file wins" rule does not depend on OS listing order.
        for file in sorted(files):
            if not file.endswith('.txt'):
                continue
            # Construct the full file path
            file_path = os.path.join(root, file)
            # Decode one JSON document per line, skipping blank lines
            with open(file_path, encoding='utf-8') as fh:
                records = [json.loads(line) for line in fh if line.strip()]
            # Normalize the JSON data into a DataFrame stored under the folder name
            datasets[dataset_name] = pd.json_normalize(records)
    return datasets

In [17]:
# Load every dataset found under 'data' and show the names they were stored under
all_datasets = load_datasets(base_path='data')
all_datasets.keys()