In [199]:
import pandas as pd
import numpy as np
import time
import requests
from sklearn import linear_model

from fredapi import Fred
fred = Fred(api_key='e6fa5d42379be298148d2be7a29b73ed') #connect to the FRED API using my registered API key
from pybea.client import BureauEconomicAnalysisClient
bea_api = '7D0F897D-CA14-43B8-99BE-51D85C0CAD10'
bea_client = BureauEconomicAnalysisClient(api_key= bea_api) #connect to the BEA API using my registered API key

# 1. Retrieves historical US JOLTS data set from FRED 

In [175]:
jolts_data = fred.get_series('JTSJOL', observation_start='2018-01-01', observation_end='2023-12-31')
jolts_df = pd.DataFrame(jolts_data).rename(columns={0:'JOLTS'})
jolts_df

Unnamed: 0,JOLTS
2018-01-01,6621.0
2018-02-01,6552.0
2018-03-01,6818.0
2018-04-01,6877.0
2018-05-01,7016.0
...,...
2022-09-01,10854.0
2022-10-01,10471.0
2022-11-01,10746.0
2022-12-01,11234.0


# 2. Retrieves related and publicly available data from BEA and other sources

In [148]:
# retrieve unemployment data
unemployment_data = fred.get_series('UNRATE', observation_start='2018-01-01', observation_end='2023-12-31')
unemployment_df = pd.DataFrame(unemployment_data).rename(columns={0:'unemployment'})

# retrieve GDP data
gdp_data = fred.get_series('GDP', observation_start='2018-01-01', observation_end='2023-12-31')
gdp_df = pd.DataFrame(gdp_data).rename(columns={0:'GDP'})

# retrieve initial claims data
initial_claims_data = fred.get_series('ICSA', observation_start='2018-01-01', observation_end='2023-12-31')
initial_claims_df = pd.DataFrame(initial_claims_data).rename(columns={0:'initial_claims'})
initial_claims_df = initial_claims_df.resample('M').sum() #resample to monthly
initial_claims_df.index = initial_claims_df.index + pd.DateOffset(days=1)

# retrieve consumer sentiment data
consumer_sentiment_data = fred.get_series('UMCSENT', observation_start='2018-01-01', observation_end='2023-12-31')
consumer_sentiment_df = pd.DataFrame(consumer_sentiment_data).rename(columns={0:'consumer_sentiment'})

# retrieve labor force data
labor_force_data = fred.get_series('CIVPART', observation_start='2018-01-01', observation_end='2023-12-31')
labor_force_df = pd.DataFrame(labor_force_data).rename(columns={0:'labor_force'})
# retrieve personal income data 
personal_income = bea_client.national_income_and_product_accounts(
    table_name='T20600', 
    frequency='M', 
    year=['2018', '2019', '2020', '2021', '2022', '2023'])
personal_income_df = pd.DataFrame(personal_income['BEAAPI']['Results']['Data'])
personal_income_df = personal_income_df[\
    personal_income_df['LineDescription'] == 'Personal income']\
    .drop(columns=['TableName', 'SeriesCode', 'LineNumber', 'LineDescription', 'METRIC_NAME', 'CL_UNIT', 'UNIT_MULT', 'NoteRef'])
personal_income_df = personal_income_df.rename(columns={'DataValue': 'PersonalIncome'})
personal_income_df['TimePeriod'] = pd.to_datetime(personal_income_df['TimePeriod'].str[:4] + '-' + personal_income_df['TimePeriod'].str[5:])
personal_income_df.set_index('TimePeriod', inplace=True)

# retrieve wage and salary data
wage_and_salary = bea_client.national_income_and_product_accounts(
    table_name='T20700B',
    frequency='M',
    year=['2018', '2019', '2020', '2021', '2022', '2023'])
wage_and_salary_df = pd.DataFrame(wage_and_salary['BEAAPI']['Results']['Data'])
wage_and_salary_df = wage_and_salary_df[\
    wage_and_salary_df['LineDescription'] == 'Wages and salaries']\
    .drop(columns=['TableName', 'SeriesCode', 'LineNumber', 'LineDescription', 'METRIC_NAME', 'CL_UNIT', 'UNIT_MULT', 'NoteRef'])
wage_and_salary_df = wage_and_salary_df.rename(columns={'DataValue': 'WageAndSalary'})
wage_and_salary_df['TimePeriod'] = pd.to_datetime(wage_and_salary_df['TimePeriod'].str[:4] + '-' + wage_and_salary_df['TimePeriod'].str[5:])
wage_and_salary_df.set_index('TimePeriod', inplace=True)

# retrieve consumption expenditure data
consumption_expenditure = bea_client.national_income_and_product_accounts(
    table_name='T20803',
    frequency='M',
    year=['2018', '2019', '2020', '2021', '2022', '2023'])
consumption_expenditure_df = pd.DataFrame(consumption_expenditure['BEAAPI']['Results']['Data'])
consumption_expenditure_df = consumption_expenditure_df[\
    consumption_expenditure_df['LineDescription'] == 'Personal consumption expenditures (PCE)']\
    .drop(columns=['TableName', 'SeriesCode', 'LineNumber', 'LineDescription', 'METRIC_NAME', 'CL_UNIT', 'UNIT_MULT', 'NoteRef'])
consumption_expenditure_df = consumption_expenditure_df.rename(columns={'DataValue': 'ConsumptionExpenditure'})
consumption_expenditure_df['TimePeriod'] = pd.to_datetime(consumption_expenditure_df['TimePeriod'].str[:4] + '-' + consumption_expenditure_df['TimePeriod'].str[5:])
consumption_expenditure_df.set_index('TimePeriod', inplace=True)

# retrieve price index data
price_index = bea_client.national_income_and_product_accounts_detail(
    table_name='U90100',
    frequency='M',
    year=['2018', '2019', '2020', '2021', '2022', '2023']
)
price_index_df = pd.DataFrame(price_index['BEAAPI']['Results']['Data'])
price_index_df = price_index_df[\
    price_index_df['LineDescription'] == 'Equals: CPI (percent change)']\
    .drop(columns=['TableName', 'SeriesCode', 'LineNumber', 'LineDescription', 'METRIC_NAME', 'CL_UNIT', 'UNIT_MULT', 'NoteRef'])
price_index_df = price_index_df.rename(columns={'DataValue': 'PriceIndex'})
price_index_df['TimePeriod'] = pd.to_datetime(price_index_df['TimePeriod'].str[:4] + '-' + price_index_df['TimePeriod'].str[5:])
price_index_df.set_index('TimePeriod', inplace=True)

https://apps.bea.gov/api/data/?userid=7D0F897D-CA14-43B8-99BE-51D85C0CAD10&method=GetData&datasetname=NIPA&year=2018%2C2019%2C2020%2C2021%2C2022%2C2023&resultformat=JSON&frequency=M&tablename=T20600
https://apps.bea.gov/api/data/?userid=7D0F897D-CA14-43B8-99BE-51D85C0CAD10&method=GetData&datasetname=NIPA&year=2018%2C2019%2C2020%2C2021%2C2022%2C2023&resultformat=JSON&frequency=M&tablename=T20700B
https://apps.bea.gov/api/data/?userid=7D0F897D-CA14-43B8-99BE-51D85C0CAD10&method=GetData&datasetname=NIPA&year=2018%2C2019%2C2020%2C2021%2C2022%2C2023&resultformat=JSON&frequency=M&tablename=T20803
https://apps.bea.gov/api/data/?userid=7D0F897D-CA14-43B8-99BE-51D85C0CAD10&method=GetData&datasetname=NIUnderlyingDetail&year=2018%2C2019%2C2020%2C2021%2C2022%2C2023&resultformat=JSON&frequency=M&tablename=U90100


# 3. Use Regression Model to predict JOLTS for Feb 2023

In [190]:
df_total = pd.concat([personal_income_df, wage_and_salary_df, consumption_expenditure_df, price_index_df, unemployment_df, gdp_df, initial_claims_df, consumer_sentiment_df, labor_force_df, jolts_df], axis=1)
df_total['GDP'] = df_total['GDP'].fillna(method='ffill') # fill the Nan values with the previous value in the column GDP
df_total['initial_claims'] = df_total['initial_claims'].fillna(method='bfill') # fill the Nan values with the next value in the column initial_claims
df_total = df_total.applymap(lambda x: x.replace(',', '') if isinstance(x, str) else x)
df_total = df_total.astype(float) # convert all columns to float
df_prediction = df_total.iloc[61].drop('JOLTS')
df_total = df_total.head(61)

df_total

Unnamed: 0,PersonalIncome,WageAndSalary,ConsumptionExpenditure,PriceIndex,unemployment,GDP,initial_claims,consumer_sentiment,labor_force,JOLTS
2018-01-01,17294840.0,8750137.0,114.908,0.4,4.0,20155.486,926000.0,95.7,62.7,6621.0
2018-02-01,17354081.0,8759097.0,114.964,0.3,4.1,20155.486,926000.0,99.7,63.0,6552.0
2018-03-01,17415962.0,8778478.0,115.364,0.0,4.0,20155.486,834000.0,101.4,62.9,6818.0
2018-04-01,17472708.0,8811071.0,115.615,0.3,4.0,20470.197,1016000.0,98.8,62.9,6877.0
2018-05-01,17547574.0,8836826.0,115.965,0.2,3.8,20470.197,819000.0,98.0,62.9,7016.0
...,...,...,...,...,...,...,...,...,...,...
2022-09-01,22080374.0,11450576.0,128.769,0.4,3.5,25723.941,962000.0,58.6,62.3,10854.0
2022-10-01,22283009.0,11506774.0,129.111,0.5,3.7,26137.992,825000.0,59.9,62.2,10471.0
2022-11-01,22375915.0,11553505.0,128.577,0.2,3.6,26137.992,1095000.0,56.8,62.2,10746.0
2022-12-01,22446071.0,11587745.0,128.328,0.1,3.5,26137.992,916000.0,59.7,62.3,11234.0


In [191]:
df_prediction

PersonalIncome            2.264311e+07
WageAndSalary             1.172933e+07
ConsumptionExpenditure    1.300510e+02
PriceIndex                4.000000e-01
unemployment              3.600000e+00
GDP                       2.613799e+04
initial_claims            7.670000e+05
consumer_sentiment        6.700000e+01
labor_force               6.250000e+01
Name: 2023-02-01 00:00:00, dtype: float64

In [195]:
# create a linear regression model to predict the JOLTS for February 2023
X = df_total[['PersonalIncome', 'WageAndSalary', 'ConsumptionExpenditure', 'PriceIndex', 'unemployment', 'GDP', 'initial_claims', 'consumer_sentiment', 'labor_force']]
y = df_total['JOLTS']

regr = linear_model.LinearRegression()
regr.fit(X, y)

predicted_jolts_for_feb = float(regr.predict([df_prediction]))


In [198]:
predicted_jolts_for_feb

11001.249982960275

In [204]:
print("Coefficients: ", regr.coef_)
print("Intercept: ", regr.intercept_)
print("R2: ", regr.score(X, y))

Coefficients:  [-2.10040034e-04 -9.23141980e-04  1.26277853e+02  7.08837915e+02
 -2.20792265e+02  9.11397745e-01  6.34435788e-05 -1.18940279e+01
 -7.86640642e+02]
Intercept:  36764.97549336901
R2:  0.8968836810662149
