In [1]:
%load_ext lineapy 
%load_ext nb_black



<IPython.core.display.Javascript object>

In [32]:
from statsforecast.models import AutoARIMA
from statsmodels.tsa.stattools import acf 
import pandas as pd 
import lineapy
import requests 
import re 
import numpy as np
from numpy.linalg import svd
import altair as alt

<IPython.core.display.Javascript object>

In [13]:
response = requests.get("https://www.eia.gov/petroleum/gasdiesel/xls/pswrgvwall.xls")
df = pd.read_excel(
    response.content,
    sheet_name="Data 12",
    index_col=0,
    skiprows=2,
    parse_dates=["Date"],
).rename(
    columns=lambda c: re.sub(
        "\(PADD 1[A-C]\)",
        "",
        c.replace("Weekly ", "").replace(
            " All Grades All Formulations Retail Gasoline Prices  (Dollars per Gallon)",
            "",
        ),
    ).strip()
)
lineapy.save(df, "weekly_gas_price_data")

LineaArtifact(name='weekly_gas_price_data', _version=0)

<IPython.core.display.Javascript object>

In [16]:
df_long = (
    df.reset_index()
    .melt(id_vars=["Date"], var_name="region", value_name="price")
    .rename(columns={"Date": "week"})
    .sort_values(["region", "week"])
    .assign(
        # if we're missing one value, just use the last value
        # (happens twice)
        price=lambda x: x["price"].combine_first(x.groupby("region")["price"].shift(1)),
        # we'll forecast log(price) and then transform
        log_price=lambda x: np.log(x["price"]),
        # percentage price changes are approximately the difference in log(price)
        price_change=lambda x: (
            x["log_price"] - x.groupby("region")["log_price"].shift(1)
        ),
    )
    .query("price == price")  # filter out NAs
)

lineapy.save(df_long, "weekly_gas_price_data_long")
df_long.head()

Unnamed: 0,week,region,price,log_price,price_change
28249,2003-05-26,"Boston, MA",1.555,0.441476,
28250,2003-06-02,"Boston, MA",1.547,0.436318,-0.005158
28251,2003-06-09,"Boston, MA",1.534,0.427879,-0.008439
28252,2003-06-16,"Boston, MA",1.549,0.43761,0.009731
28253,2003-06-23,"Boston, MA",1.544,0.434376,-0.003233


<IPython.core.display.Javascript object>

In [18]:
df_long.groupby("region")["price"].count().reset_index().pipe(alt.Chart).encode(
    x=alt.X("price", title="Cases"), y=alt.Y("region", sort=alt.SortField("price"))
).mark_bar()

<IPython.core.display.Javascript object>

In [19]:
df_long.groupby("week")["price"].count().reset_index().pipe(alt.Chart).encode(
    x="week", y=alt.Y("price", title="Count")
).mark_line()

<IPython.core.display.Javascript object>

In [20]:
df_long["price_change"].mean() * 52

0.043646459802179696

<IPython.core.display.Javascript object>

In [21]:
(
    df_long.query("price_change == price_change")
    .sample(5000)
    .pipe(alt.Chart)
    .transform_density("price_change")
    .encode(x="value:Q", y="density:Q")
    .mark_area()
)

<IPython.core.display.Javascript object>

In [22]:
all_regions = df_long["region"].unique().tolist()
lineapy.save(all_regions, "all_regions")
num_regions = len(all_regions)
num_regions

28

<IPython.core.display.Javascript object>

In [23]:
complete_case_date = (
    df_long.groupby("week")["price"]
    .count()
    .reset_index()
    .query(f"price == {num_regions}")["week"]
    .min()
).strftime("%Y-%m-%d")
complete_case_date

'2003-05-26'

<IPython.core.display.Javascript object>

In [24]:
(
    df_long.groupby("region")["price_change"]
    .mean()
    .reset_index()
    .assign(annual_price_change=lambda x: x["price_change"] * 52)
    .pipe(alt.Chart)
    .encode(
        x=alt.X("region", sort=alt.SortField("annual_price_change")),
        y=alt.Y("annual_price_change", title="Annual Price Growth"),
    )
    .mark_bar()
)

<IPython.core.display.Javascript object>

In [28]:
wide = (
    df_long.query(f"week > '{complete_case_date}'")[["week", "region", "price_change"]]
    .set_index("week")
    .pivot(columns="region", values="price_change")
)
matrix = wide.values
print(matrix.shape)
u, d, v = svd(matrix)

(1010, 28)


<IPython.core.display.Javascript object>

In [29]:
scree_plot = (
    pd.DataFrame({"eigenvalue": d, "index": np.arange(d.shape[0])})
    .pipe(alt.Chart)
    .encode(x="index", y="eigenvalue")
    .mark_point()
)

lineapy.save(scree_plot, "scree_plot")
scree_plot

<IPython.core.display.Javascript object>

In [30]:
components = pd.DataFrame(
    v, columns=[f"component_{i}" for i in range(v.shape[0])], index=wide.columns
).reset_index()

components_plot = (
    components.pipe(alt.Chart)
    .encode(x="component_0", y="component_1", text="region")
    .mark_text()
    .interactive()
)

lineapy.save(components_plot, "components_plot")
components_plot

<IPython.core.display.Javascript object>

In [33]:
region = "U.S."
auto_correlation = (
    df_long.query(f"region == '{region}'")
    .query("price_change == price_change")["price_change"]
    .pipe(acf)
)
acf_plot = (
    pd.DataFrame({"rho": auto_correlation, "lag": np.arange(auto_correlation.shape[0])})
    .pipe(alt.Chart, title=region)
    .encode(x="lag", y="rho")
    .mark_bar()
)
lineapy.save(acf_plot, "acf_plot2")
acf_plot

<IPython.core.display.Javascript object>

In [34]:
H = 13
CI = 80
width = 300
height = 250
region = "U.S."
cutoff_date = "2022-10-02"
plot_start_date = "2022-01-01"
plot_title = f"{region} (as of {cutoff_date})"

<IPython.core.display.Javascript object>

In [35]:
region_df = df_long.query(f"region == '{region}'")
train = region_df.query(f"week < '{cutoff_date}'")
m_aa = AutoARIMA()
m_aa.fit(train["log_price"].values)

AutoARIMA

<IPython.core.display.Javascript object>

In [36]:
raw_forecast = m_aa.predict(h=H, level=(CI,))
raw_forecast_exp = {key: np.exp(value) for key, value in raw_forecast.items()}
forecast = pd.DataFrame(raw_forecast_exp).assign(
    week=pd.date_range(train["week"].max(), periods=H, freq="W")
    + pd.Timedelta("7 days")
)
forecast = pd.concat(
    [
        forecast,
        train.tail(1)
        .rename(columns={"price": "mean"})
        .assign(**{f"lo-{CI}": lambda x: x["mean"], f"hi-{CI}": lambda x: x["mean"]}),
    ]
)
forecast.head()

Unnamed: 0,mean,lo-80,hi-80,week,region,log_price,price_change
0,3.860496,3.785185,3.937306,2022-10-09,,,
1,3.877999,3.741205,4.019795,2022-10-16,,,
2,3.899743,3.705606,4.104052,2022-10-23,,,
3,3.917323,3.665836,4.186063,2022-10-30,,,
4,3.928728,3.622444,4.260909,2022-11-06,,,


<IPython.core.display.Javascript object>

In [37]:
uncertainty_plot = (
    forecast.pipe(alt.Chart, height=height, width=width)
    .encode(
        x="week",
        y=alt.Y(f"lo-{CI}", title="Price"),
        y2=alt.Y2(f"hi-{CI}", title="Price"),
    )
    .mark_area(opacity=0.2)
)

history_plot = (
    region_df.query(f"week >= '{plot_start_date}'")
    .pipe(alt.Chart, title=plot_title)
    .encode(x=alt.X("week", title="Week"), y=alt.Y("price", title="Price"))
    .mark_line()
)

forecast_plot = forecast.pipe(alt.Chart).encode(x="week", y="mean").mark_line()

cutoff_plot = (
    train.tail(1).pipe(alt.Chart).encode(x="week").mark_rule(strokeDash=[10, 2])
)

full_plot = uncertainty_plot + history_plot + forecast_plot + cutoff_plot
lineapy.save(full_plot, "gas_price_forecast")

LineaArtifact(name='gas_price_forecast', _version=0)

<IPython.core.display.Javascript object>

In [38]:
full_plot

<IPython.core.display.Javascript object>

In [39]:
forecast_region = lineapy.get_function(
    ["gas_price_forecast"],
    input_parameters=[
        "region",
        "cutoff_date",
        "H",
        "width",
        "height",
        "plot_start_date",
    ],
    reuse_pre_computed_artifacts=["weekly_gas_price_data_long"],
)

<IPython.core.display.Javascript object>

In [40]:
result = forecast_region(
    region="California", cutoff_date="2022-06-07", H=15, width=300, height=250
)
result["gas_price_forecast"]

<IPython.core.display.Javascript object>

In [41]:
plots = []
for region in all_regions:
    result = forecast_region(
        region=region, cutoff_date=cutoff_date, height=200, width=200
    )
    plots.append(result["gas_price_forecast"])

<IPython.core.display.Javascript object>

In [42]:
chart = alt.vconcat()
for i, plot in enumerate(plots):
    if i % 4 == 0:
        row = alt.hconcat()
        chart &= row
    row |= plot
chart

<IPython.core.display.Javascript object>

In [43]:
lineapy.save(chart, "all_forecasts_plot")

LineaArtifact(name='all_forecasts_plot', _version=0)

<IPython.core.display.Javascript object>

In [44]:
lineapy.to_pipeline(
    ["gas_price_forecast", "weekly_gas_price_data", "weekly_gas_price_data_long"],
    dependencies={
        "gas_price_forecast": {"weekly_gas_price_data_long": {"weekly_gas_price_data"}}
    },
    pipeline_name="gas_price_forecast",
    output_dir="pipeline",
    framework="AIRFLOW",
    input_parameters=["region", "cutoff_date"],
)

Generated module file: pipeline\gas_price_forecast_module.py                   
Generated requirements file: pipeline\gas_price_forecast_requirements.txt      
Generated DAG file: pipeline\gas_price_forecast_dag.py                         
Generated Docker file: pipeline\gas_price_forecast_Dockerfile                  


WindowsPath('pipeline')

<IPython.core.display.Javascript object>

In [51]:
!type pipeline/gas_price_forecast_dag.py

The syntax of the command is incorrect.


<IPython.core.display.Javascript object>

In [None]:
!type pipeline/gas_price_forecast_module.py

In [None]:
!type pipeline/gas_price_forecast_requirements.txt