In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, StratifiedKFold
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.feature_selection import RFECV
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L1, L2
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime
from xgboost import XGBRegressor
from sklearn.base import TransformerMixin, BaseEstimator
import requests
import pickle
%matplotlib inline

In [2]:
series_list = ['DCOILBRENTEU', 'DPCCRV1Q225SBEA', 'DSPIC96', 'EXPGS', 'GDP', 'IMPGS', 'INDPRO', 'WM2NS', 'M2V', 'NASDAQCOM', 'CORESTICKM159SFRBATL', 'DEXUSUK', 'RECPROUSM156N', 'UNRATE', 'DEXJPUS', 'T10Y2Y']
method_map = {
    'FFTR': 'linear',
    'UNRATE': 'ffill',
    'DCOILBRENTEU': 'linear',
    'DPCCRV1Q225SBEA': 'ffill',
    'DSPIC96': 'ffill',
    'gdp_gap': 'ffill',
    'GDP': 'ffill',
    'EXPGS': 'ffill',
    'IMPGS': 'ffill',
    'INDPRO': 'ffill',
    'WM2NS': 'linear',
    'M2V': 'ffill',
    'NASDAQCOM': 'linear',
    'CORESTICKM159SFRBATL': 'ffill',
    'DEXUSUK': 'linear',
    'RECPROUSM156N': 'ffill',
    'DEXJPUS': 'linear',
    'T10Y2Y': 'linear'
        
}
def make_dataframe(series_list, method_map):
    base_url = 'https://api.stlouisfed.org/fred/series/observations?series_id=DFEDTAR&api_key=e702d160a3e348cf149d4a5c3faf0d0c&file_type=json&observation_start=1987-08-11'
    base_request = requests.get(base_url)
    base_json = json = base_request.json()
    base_df = pd.json_normalize(base_json['observations'])
    baseu_url = 'https://api.stlouisfed.org/fred/series/observations?series_id=DFEDTARU&api_key=e702d160a3e348cf149d4a5c3faf0d0c&file_type=json&observation_start=1987-08-11'
    #if you would like to use the lower bound of the FFTR instead replace the U right ^ with an L
    baseu_request = requests.get(baseu_url)
    baseu_json = baseu_request.json()
    baseu_df = pd.json_normalize(baseu_json['observations'])
    merge_df = pd.concat([base_df, baseu_df], ignore_index=True)
    merge_df['value'] = merge_df['value'].astype(float)
    rgdp_url = 'https://api.stlouisfed.org/fred/series/observations?series_id=GDPC1&api_key=e702d160a3e348cf149d4a5c3faf0d0c&file_type=json&observation_start=1987-08-11'
    rgdp_request = requests.get(rgdp_url)
    rgdp_json = rgdp_request.json()
    rgdp_df = pd.json_normalize(rgdp_json['observations'])
    rgdp_df['rgdp'] = rgdp_df['value']
    rgdp_df = rgdp_df.drop(columns = 'value')
    gdppot_url = 'https://api.stlouisfed.org/fred/series/observations?series_id=GDPPOT&api_key=e702d160a3e348cf149d4a5c3faf0d0c&file_type=json&observation_start=1987-08-11'
    gdppot_request = requests.get(gdppot_url)
    gdppot_json = gdppot_request.json()
    gdppot_df = pd.json_normalize(gdppot_json['observations'])
    gdppot_df['gdppot'] = gdppot_df['value']
    gdppot_df = gdppot_df.drop(columns = 'value')
    merge_df = pd.merge_ordered(merge_df, rgdp_df, fill_method="none", left_by="date")
    merge_df = pd.merge_ordered(merge_df, gdppot_df, fill_method="none", left_by="date")
    merge_df['date'] = pd.to_datetime(merge_df['date'])
    merge_df['rgdp'] = merge_df['rgdp'].replace('.', np.nan)
    merge_df['gdppot'] = merge_df['gdppot'].replace('.', np.nan)
    merge_df['rgdp'] = merge_df['rgdp'].astype(float)
    merge_df['gdppot'] = merge_df['gdppot'].astype(float)
    merge_df['gdp_gap'] = 100*((merge_df['rgdp'] - merge_df['gdppot'])/merge_df['rgdp'])
    merge_df = merge_df.drop(columns = ['realtime_start', 'realtime_end', 'gdppot', 'rgdp'])
    for series in series_list:
        url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series}&api_key=e702d160a3e348cf149d4a5c3faf0d0c&file_type=json&observation_start=1987-08-11'
        req = requests.get(url)
        json = req.json()
        df = pd.json_normalize(json['observations'])
        df = df.drop(columns = ['realtime_start', 'realtime_end'])
        df['date'] = pd.to_datetime(df['date'])
        df = df.rename(columns={'value': f'{series}'})
        df[series] = df[series].replace('.', np.nan)
        df[series] = df[series].astype(float)
        merge_df = pd.merge_ordered(merge_df, df, fill_method="none", left_by="date")
    merge_df = merge_df.set_index('date')
    merge_df = merge_df.rename(columns={'value': 'FFTR'})
    merge_df['1_month_ahead'] = merge_df['FFTR'].shift(periods=30)
    merge_df['2_month_ahead'] = merge_df['FFTR'].shift(periods=60)
    merge_df['3_month_ahead'] = merge_df['FFTR'].shift(periods=90)
    merge_df['1_month_back'] = merge_df['FFTR'].shift(periods=-30)
    merge_df['2_month_back'] = merge_df['FFTR'].shift(periods=-60)
    merge_df['3_month_back'] = merge_df['FFTR'].shift(periods=-90)
    merge_df['1_month_back'] = merge_df['1_month_back'].astype(float)
    merge_df['2_month_back'] = merge_df['2_month_back'].astype(float)
    merge_df['3_month_back'] = merge_df['3_month_back'].astype(float)
    for column, method in method_map.items():
            if method == 'ffill':
                merge_df[column] = merge_df[column].fillna(method='ffill')
            elif method == 'linear':
                merge_df[column] = merge_df[column].replace('.', np.nan)
                merge_df[column] = merge_df[column].astype(float)
                merge_df[column] = merge_df[column].interpolate(method='linear')
    merge_df = merge_df.iloc[90:]
    merge_df = merge_df.drop(merge_df.tail(90).index)
    return merge_df

In [3]:
merge_df = make_dataframe(series_list = series_list, method_map = method_map)
merge_df.head()

Unnamed: 0_level_0,FFTR,gdp_gap,DCOILBRENTEU,DPCCRV1Q225SBEA,DSPIC96,EXPGS,GDP,IMPGS,INDPRO,WM2NS,...,RECPROUSM156N,UNRATE,DEXJPUS,T10Y2Y,1_month_ahead,2_month_ahead,3_month_ahead,1_month_back,2_month_back,3_month_back
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1987-11-09,6.8125,-0.294146,17.5,4.1,6648.1,392.107,5007.994,537.511,60.0022,2826.4,...,0.06,5.8,134.3,1.14,7.3125,7.25,6.625,6.8125,6.8125,6.625
1987-11-10,6.8125,-0.294146,17.75,4.1,6648.1,392.107,5007.994,537.511,60.0022,2826.2,...,0.06,5.8,134.55,1.18,7.3125,7.25,6.625,6.8125,6.8125,6.625
1987-11-11,6.8125,-0.294146,17.8,4.1,6648.1,392.107,5007.994,537.511,60.0022,2826.0,...,0.06,5.8,135.5,1.155,7.3125,7.25,6.625,6.8125,6.8125,6.625
1987-11-12,6.8125,-0.294146,17.85,4.1,6648.1,392.107,5007.994,537.511,60.0022,2825.8,...,0.06,5.8,136.45,1.13,7.3125,7.25,6.625,6.8125,6.8125,6.625
1987-11-13,6.8125,-0.294146,17.8,4.1,6648.1,392.107,5007.994,537.511,60.0022,2825.6,...,0.06,5.8,135.85,1.09,7.3125,7.25,6.625,6.8125,6.8125,6.5


In [4]:
merge_df.to_csv('../data/merge_df_final.csv', index=True)

In [5]:
X = merge_df.drop(columns = ['1_month_ahead', '2_month_ahead', '3_month_ahead'])
y_1 = merge_df['1_month_ahead']
y_2 = merge_df['2_month_ahead']
y_3 = merge_df['3_month_ahead']

In [6]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y_1, random_state=42, test_size=0.25)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y_2, random_state=42, test_size=0.25)
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X, y_3, random_state=42, test_size=0.25)

In [7]:
rfr_pipeline = Pipeline([
    ('rfr', RandomForestRegressor(criterion = 'squared_error', n_estimators = 1200, max_features = 'sqrt', max_depth = 15))
])


In [8]:
rfr_pipeline.fit(X_train_3, y_train_3) 

In [9]:
pickle.dump(rfr_pipeline, open( "../data/rfr_pipe.pkl", "wb" ) )

In [10]:
prediction = rfr_pipeline.predict(X_test_3)
mse = mean_squared_error(y_test_3, prediction)
rmse = mse**.5
print(mse)
print(rmse)

0.0016722945122774383
0.04089369770853986


In [11]:
MAE = mean_absolute_error(y_test_3, 
                          prediction)
print(MAE)

0.011935571891030297


In [12]:
r2_score(y_test_3, prediction)

0.9997545648931832

In [13]:
MAPE = mean_absolute_percentage_error(y_test_3, prediction)
print(MAPE)

0.005992957141490387


In [14]:
rfr_pipeline.fit(X_train_1, y_train_1) 

In [15]:
prediction = rfr_pipeline.predict(X_test_1)
mse = mean_squared_error(y_test_1, prediction)
rmse = mse**.5
print(mse)
print(rmse)

0.001359901124003782
0.03687683722885928


In [16]:
MAE = mean_absolute_error(y_test_1, 
                          prediction)
print(MAE)

0.010320830010552277


In [17]:
r2_score(y_test_1, prediction)

0.9997994483097729

In [18]:
MAPE = mean_absolute_percentage_error(y_test_1, prediction)
print(MAPE)

0.006005904778635872


In [26]:
xgb_pipeline = Pipeline([
    ('xgb', XGBRegressor(objective='reg:squarederror', n_estimators = 800, max_depth = 10, learning_rate = .1))
])

In [27]:
type(X_train_3['IMPGS'][1])

numpy.float64

In [28]:
xgb_pipeline.fit(X_train_3, y_train_3) 

In [29]:
pickle.dump(xgb_pipeline, open( "../data/xgb_pipe.pkl", "wb" ) )

In [30]:
prediction = xgb_pipeline.predict(X_test_3)
mse = mean_squared_error(y_test_3, prediction)
rmse = mse**.5
print(mse)
print(rmse)

0.002252126806469628
0.047456578115890616


In [31]:
MAE = mean_absolute_error(y_test_3, 
                          prediction)
print(MAE)

0.011249114403413533


In [32]:
r2_score(y_test_3, prediction)

0.9996694655281991

In [33]:
MAPE = mean_absolute_percentage_error(y_test_3, prediction)
print(MAPE)

0.00503192112752252
