In [1]:
import pandas as pd
df = pd.read_csv("https://download.bls.gov/pub/time.series/ap/ap.data.3.Food", sep="\t", low_memory=False)
df

Unnamed: 0,series_id,year,period,value,footnote_codes
0,APU0000701111,1980,M01,0.203,
1,APU0000701111,1980,M02,0.205,
2,APU0000701111,1980,M03,0.211,
3,APU0000701111,1980,M04,0.206,
4,APU0000701111,1980,M05,0.207,
...,...,...,...,...,...
141577,APU0400FS1101,2022,M10,5.082,
141578,APU0400FS1101,2022,M11,5.070,
141579,APU0400FS1101,2022,M12,5.296,
141580,APU0400FS1101,2023,M01,5.219,


In [2]:
df.columns

Index(['series_id        ', 'year', 'period', '       value',
       'footnote_codes'],
      dtype='object')

In [3]:
df.columns = list(map(str.strip, df.columns))
df.columns

Index(['series_id', 'year', 'period', 'value', 'footnote_codes'], dtype='object')

In [4]:
df["footnote_codes"].isnull().sum()

141582

In [5]:
df=df.drop(["footnote_codes"],axis=1)

In [6]:
df["series_id"].unique()

array(['APU0000701111    ', 'APU0000701311    ', 'APU0000701312    ',
       'APU0000701321    ', 'APU0000701322    ', 'APU0000702111    ',
       'APU0000702112    ', 'APU0000702211    ', 'APU0000702212    ',
       'APU0000702213    ', 'APU0000702221    ', 'APU0000702411    ',
       'APU0000702421    ', 'APU0000702611    ', 'APU0000703111    ',
       'APU0000703112    ', 'APU0000703113    ', 'APU0000703211    ',
       'APU0000703212    ', 'APU0000703213    ', 'APU0000703311    ',
       'APU0000703312    ', 'APU0000703411    ', 'APU0000703421    ',
       'APU0000703422    ', 'APU0000703423    ', 'APU0000703425    ',
       'APU0000703431    ', 'APU0000703432    ', 'APU0000703511    ',
       'APU0000703512    ', 'APU0000703611    ', 'APU0000703612    ',
       'APU0000703613    ', 'APU0000704111    ', 'APU0000704211    ',
       'APU0000704212    ', 'APU0000704311    ', 'APU0000704312    ',
       'APU0000704313    ', 'APU0000704314    ', 'APU0000704321    ',
       'APU000070441

In [7]:
df['series_id'] = df['series_id'].str.strip()

In [8]:
df_bread = df[df["series_id"] == "APU0000702111"]
df_bread = df_bread.drop(["series_id"],axis=1).reset_index(drop=True)
df_bread

Unnamed: 0,year,period,value
0,1980,M01,0.501
1,1980,M02,0.507
2,1980,M03,0.502
3,1980,M04,0.507
4,1980,M05,0.504
...,...,...,...
513,2022,M10,1.814
514,2022,M11,1.847
515,2022,M12,1.873
516,2023,M01,1.888


In [9]:
df_flour = df[df["series_id"] == "APU0000701111"]
df_flour = df_flour.drop(["series_id"],axis=1).reset_index(drop=True)
df_flour

Unnamed: 0,year,period,value
0,1980,M01,0.203
1,1980,M02,0.205
2,1980,M03,0.211
3,1980,M04,0.206
4,1980,M05,0.207
...,...,...,...
512,2022,M10,0.534
513,2022,M11,0.524
514,2022,M12,0.522
515,2023,M01,0.540


In [10]:
df["period"].unique()

array(['M01', 'M02', 'M03', 'M04', 'M05', 'M06', 'M07', 'M08', 'M09',
       'M10', 'M11', 'M12'], dtype=object)

In [11]:
def time_indexed_series(df):
  df["year_period"] = df["year"].astype(str) + "-" + df["period"].str[1:]
  df=df.set_index(pd.to_datetime(df["year_period"], format="%Y-%m"))
  return df["value"]

In [12]:
def delayed_df(series, k):
    delayed = pd.concat([series.shift(i) for i in range(k+1)], axis=1)
    delayed.columns = [f"t-{i}" for i in range(k, 0, -1)] + ["t"]
    delayed=delayed.dropna()
    return delayed

In [13]:
# create time-indexed series
bread_series = time_indexed_series(df_bread)
flour_series = time_indexed_series(df_flour)

# create delayed dataframes
k=3 #  prior 3 months 
delayed_df_bread = delayed_df(bread_series, k)
delayed_df_flour = delayed_df(flour_series, k)

#create predictor dataset (k-prior average prices)
df_predictor = pd.concat([delayed_df_flour.add_suffix('_flour'), delayed_df_bread.add_suffix('_bread')], axis=1).dropna()

# create label (whether the price increased from the prior value)
label=(df_predictor["t_bread"] > df_predictor[f"t-{k}_bread"]).astype(int)

In [14]:
X = df_predictor.drop(["t_bread","t_flour"],axis=1)
y = label

In [15]:
y.value_counts()

0    330
1    184
dtype: int64

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# split dataset
train_size = int(0.80 * len(X))
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:] 

# fit log reg model
model = LogisticRegression(penalty=None, solver="newton-cg") # grid search was used
model.fit(X_train, y_train)

# predict labs in test set (prior 100 months)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred) 
print("Accuracy: {:.2f}%".format(accuracy * 100))

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 81.55%
Confusion Matrix:
[[52  9]
 [10 32]]


In [17]:
last_month = df_predictor.tail(1).drop(["t-3_flour","t-3_bread"],axis=1)
last_month.columns=X.columns
last_month_label = model.predict(last_month)
print("The price of bread", "will increase" if last_month_label[0] else "will not raise", "in the following month.")

The price of bread will not raise in the following month.
