In [124]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, StratifiedKFold
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import RFECV
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L1, L2
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime
from xgboost import XGBRegressor
%matplotlib inline

In [2]:
merge_df = pd.read_csv('../data/merged_fftr.csv')
month_df = pd.read_csv('../data/monthly_data.csv')

In [3]:
merge_df = merge_df.set_index('DATE')
month_df = month_df.set_index('DATE')

In [4]:
merge_df.head()

Unnamed: 0_level_0,FFTR,UNRATE,Oil,PCE,DispInc,gdp_gap,GDP,EXPGS,IMPGS,ind_prod,m2,m2_velo,nasdaq,pci,gbp_dollar,pct_recession,yen_dollar,yield_curve
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1987-08-11,6.625,5.9,19.45,3.6,6574.0,-1.196214,4884.555,371.854,517.318,58.6889,2792.5,1.751,449.36,4.293051,1.5697,0.04,151.77,1.04
1987-08-12,6.625,5.9,19.5,3.6,6574.0,-1.196214,4884.555,371.854,517.318,58.6889,2792.5,1.751,449.23,4.293051,1.579,0.04,151.25,1.05
1987-08-13,6.625,5.9,19.4,3.6,6574.0,-1.196214,4884.555,371.854,517.318,58.6889,2792.5,1.751,451.55,4.293051,1.5797,0.04,151.45,0.99
1987-08-14,6.625,5.9,19.25,3.6,6574.0,-1.196214,4884.555,371.854,517.318,58.6889,2792.5,1.751,451.61,4.293051,1.589,0.04,150.0,0.96
1987-08-15,6.625,5.9,19.116667,3.6,6574.0,-1.196214,4884.555,371.854,517.318,58.6889,2792.5,1.751,451.623333,4.293051,1.5896,0.04,149.973333,0.953333


In [5]:
merge_df['1_month_ahead'] = merge_df['FFTR'].shift(periods=30)
merge_df['2_month_ahead'] = merge_df['FFTR'].shift(periods=60)
merge_df['3_month_ahead'] = merge_df['FFTR'].shift(periods=90)
merge_df['1_month_back'] = merge_df['FFTR'].shift(periods=-30)
merge_df['2_month_back'] = merge_df['FFTR'].shift(periods=-60)
merge_df['3_month_back'] = merge_df['FFTR'].shift(periods=-90)

In [6]:
month_df['1_month_ahead'] = month_df['FFTR'].shift(periods=1)
month_df['2_month_ahead'] = month_df['FFTR'].shift(periods=2)
month_df['3_month_ahead'] = month_df['FFTR'].shift(periods=3)
month_df['1_month_back'] = month_df['FFTR'].shift(periods=-1)
month_df['2_month_back'] = month_df['FFTR'].shift(periods=-2)
month_df['3_month_back'] = month_df['FFTR'].shift(periods=-3)

In [7]:
merge_df.head()

Unnamed: 0_level_0,FFTR,UNRATE,Oil,PCE,DispInc,gdp_gap,GDP,EXPGS,IMPGS,ind_prod,...,gbp_dollar,pct_recession,yen_dollar,yield_curve,1_month_ahead,2_month_ahead,3_month_ahead,1_month_back,2_month_back,3_month_back
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1987-08-11,6.625,5.9,19.45,3.6,6574.0,-1.196214,4884.555,371.854,517.318,58.6889,...,1.5697,0.04,151.77,1.04,,,,7.25,7.3125,6.8125
1987-08-12,6.625,5.9,19.5,3.6,6574.0,-1.196214,4884.555,371.854,517.318,58.6889,...,1.579,0.04,151.25,1.05,,,,7.25,7.3125,6.8125
1987-08-13,6.625,5.9,19.4,3.6,6574.0,-1.196214,4884.555,371.854,517.318,58.6889,...,1.5797,0.04,151.45,0.99,,,,7.25,7.3125,6.8125
1987-08-14,6.625,5.9,19.25,3.6,6574.0,-1.196214,4884.555,371.854,517.318,58.6889,...,1.589,0.04,150.0,0.96,,,,7.25,7.3125,6.8125
1987-08-15,6.625,5.9,19.116667,3.6,6574.0,-1.196214,4884.555,371.854,517.318,58.6889,...,1.5896,0.04,149.973333,0.953333,,,,7.25,7.3125,6.8125


In [8]:
merge_df = merge_df.iloc[90:]
merge_df = merge_df.drop(merge_df.tail(90).index)

In [9]:
month_df = month_df.iloc[3:]
month_df = month_df.drop(month_df.tail(3).index)

In [10]:
month_df.head()

Unnamed: 0_level_0,FFTR,UNRATE,Oil,PCE,DispInc,gdp_gap,GDP,EXPGS,IMPGS,ind_prod,...,gbp_dollar,pct_recession,yen_dollar,yield_curve,1_month_ahead,2_month_ahead,3_month_ahead,1_month_back,2_month_back,3_month_back
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1987-11-01,7.3125,5.8,18.686667,4.1,6648.1,-0.293283,5007.994,392.107,537.511,60.0022,...,1.733167,0.02,137.466667,1.273333,7.3125,6.75,6.625,6.8125,6.8125,6.625
1987-12-01,6.8125,5.7,17.65,4.1,6711.5,-0.293283,5007.994,392.107,537.511,60.3336,...,1.818,0.06,133.0,1.24,7.3125,7.3125,6.75,6.8125,6.625,6.5
1988-01-01,6.8125,5.7,17.6875,3.9,6730.2,-0.559415,5073.372,418.727,542.714,60.3186,...,1.883825,0.36,121.6125,1.06,6.8125,7.3125,7.3125,6.625,6.5,6.75
1988-02-01,6.625,5.7,16.1,3.9,6773.5,-0.559415,5073.372,418.727,542.714,60.6209,...,1.758,0.0,129.38,1.04,6.8125,6.8125,7.3125,6.5,6.75,6.75
1988-03-01,6.5,5.7,14.18,3.9,6804.3,-0.559415,5073.372,418.727,542.714,60.7591,...,1.7775,0.0,128.27,1.04,6.625,6.8125,6.8125,6.75,6.75,7.25


In [11]:
X = merge_df.drop(columns = ['1_month_ahead', '2_month_ahead', '3_month_ahead'])
y_1 = merge_df['1_month_ahead']
y_2 = merge_df['2_month_ahead']
y_3 = merge_df['3_month_ahead']
X_month = month_df.drop(columns = ['1_month_ahead', '2_month_ahead', '3_month_ahead'])
y_month_1 = month_df['1_month_ahead']
y_month_2 = month_df['2_month_ahead']
y_month_3 = month_df['3_month_ahead']

In [12]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y_1, random_state=42, test_size=0.25)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y_2, random_state=42, test_size=0.25)
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X, y_3, random_state=42, test_size=0.25)
X_train_month_1, X_test_month_1, y_train_month_1, y_test_month_1 = train_test_split(X_month, y_month_1, random_state=42, test_size=0.25)
X_train_month_2, X_test_month_2, y_train_month_2, y_test_month_2 = train_test_split(X_month, y_month_2, random_state=42, test_size=0.25)
X_train_month_3, X_test_month_3, y_train_month_3, y_test_month_3 = train_test_split(X_month, y_month_3, random_state=42, test_size=0.25)

In [13]:
grid = { 
    'n_estimators': [200,300,400,500],
    'max_features': ['sqrt','log2'],
    'max_depth' : [2,3,4,5,6,7]
}
print(datetime.now())
rfr_CV = GridSearchCV(estimator=RandomForestRegressor(criterion='squared_error'), param_grid=grid, cv= 5)
rfr_CV.fit(X_train_month_1, y_train_month_1)
print(datetime.now())

2024-02-28 12:47:20.007226
2024-02-28 12:48:01.478104


In [14]:
rfr_CV.best_params_

{'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 200}

In [15]:
best_rfr_model = rfr_CV.best_estimator_

In [16]:
prediction = best_rfr_model.predict(X_test_month_1)
mse = mean_squared_error(y_test_month_1, prediction)
rmse = mse**.5
print(mse)
print(rmse)

0.07050018791396731
0.2655187148092716


In [17]:
grid = { 
    'n_estimators': [200,300,400,500],
    'max_features': ['sqrt','log2'],
    'max_depth' : [2,3,4,5,6,7]
}
print(datetime.now())
rfr_CV = GridSearchCV(estimator=RandomForestRegressor(criterion='squared_error'), param_grid=grid, cv= 5)
rfr_CV.fit(X_train_month_2, y_train_month_2)
print(datetime.now())

2024-02-28 12:48:01.517008
2024-02-28 12:48:42.949370


In [18]:
rfr_CV.best_params_

{'max_depth': 7, 'max_features': 'log2', 'n_estimators': 500}

In [19]:
best_rfr_model = rfr_CV.best_estimator_

In [20]:
prediction = best_rfr_model.predict(X_test_month_2)
mse = mean_squared_error(y_test_month_2, prediction)
rmse = mse**.5
print(mse)
print(rmse)

0.07296702823423433
0.27012409784066715


In [21]:
grid = { 
    'n_estimators': [200,300,400,500],
    'max_features': ['sqrt','log2'],
    'max_depth' : [2,3,4,5,6,7]
}
print(datetime.now())
rfr_CV = GridSearchCV(estimator=RandomForestRegressor(criterion='squared_error'), param_grid=grid, cv= 5)
rfr_CV.fit(X_train_month_3, y_train_month_3)
print(datetime.now())

2024-02-28 12:48:42.976487
2024-02-28 12:49:24.238721


In [22]:
rfr_CV.best_params_

{'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 200}

In [23]:
best_rfr_model = rfr_CV.best_estimator_

In [24]:
prediction = best_rfr_model.predict(X_test_month_3)
mse = mean_squared_error(y_test_month_3, prediction)
rmse = mse**.5
print(mse)
print(rmse)

0.08450488728153822
0.29069724333322844


In [28]:
grid = { 
    'n_estimators': [200,300,400,500],
    'max_features': ['sqrt','log2'],
    'max_depth' : [4,5,6,7]
}
print(datetime.now())
rfr_CV = GridSearchCV(estimator=RandomForestRegressor(criterion='squared_error'), param_grid=grid, cv= 5)
rfr_CV.fit(X_train_1, y_train_1)
print(datetime.now())

2024-02-28 13:03:56.559587
2024-02-28 13:08:09.241654


In [29]:
best_rfr_model = rfr_CV.best_estimator_

In [30]:
prediction = best_rfr_model.predict(X_test_1)
mse = mean_squared_error(y_test_1, prediction)
rmse = mse**.5
print(mse)
print(rmse)

0.009243389781367786
0.09614254927641448


In [31]:
grid = { 
    'n_estimators': [200,300,400,500],
    'max_features': ['sqrt','log2'],
    'max_depth' : [4,5,6,7]
}
print(datetime.now())
rfr_CV = GridSearchCV(estimator=RandomForestRegressor(criterion='squared_error'), param_grid=grid, cv= 5)
rfr_CV.fit(X_train_2, y_train_2)
print(datetime.now())

2024-02-28 13:08:09.406851
2024-02-28 13:12:14.028154


In [32]:
best_rfr_model = rfr_CV.best_estimator_

In [33]:
prediction = best_rfr_model.predict(X_test_2)
mse = mean_squared_error(y_test_2, prediction)
rmse = mse**.5
print(mse)
print(rmse)

0.011227203271568465
0.10595849787331106


In [72]:
grid = { 
    'n_estimators': [1000, 1200],
    'max_features': ['sqrt','log2'],
    'max_depth' : [13, 14]
}
print(datetime.now())
rfr_CV = GridSearchCV(estimator=RandomForestRegressor(criterion='squared_error'), param_grid=grid, cv= 5)
rfr_CV.fit(X_train_3, y_train_3)
print(datetime.now())

2024-02-28 13:47:46.060874
2024-02-28 13:52:37.797106


In [73]:
best_rfr_model = rfr_CV.best_estimator_

In [74]:
prediction = best_rfr_model.predict(X_test_3)
mse = mean_squared_error(y_test_3, prediction)
rmse = mse**.5
print(mse)
print(rmse)

0.002092547645564173
0.045744372829498635


In [75]:
rfr_CV.best_params_

{'max_depth': 14, 'max_features': 'sqrt', 'n_estimators': 1000}

In [65]:
X_train_3.head()

Unnamed: 0_level_0,FFTR,UNRATE,Oil,PCE,DispInc,gdp_gap,GDP,EXPGS,IMPGS,ind_prod,...,m2_velo,nasdaq,pci,gbp_dollar,pct_recession,yen_dollar,yield_curve,1_month_back,2_month_back,3_month_back
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1998-10-26,5.0,4.419355,12.74,1.3,9337.5,2.026079,9293.991,966.337,1145.0,86.5967,...,2.14,1724.98,2.535056,1.683,0.0,118.84,0.5,4.75,4.75,4.75
2009-08-01,0.25,9.6,71.02,1.4,12194.8,-5.287064,14448.882,1594.725,2018.722,86.6617,...,1.711,1988.536667,1.757047,1.6794,0.76,94.77,2.42,0.25,0.25,0.25
2023-01-05,4.5,3.425806,76.73,5.0,16601.9,-0.060974,26813.601,3064.804,3890.491,102.5478,...,1.273,10305.24,6.487554,1.1902,0.08,133.57,-0.74,4.75,4.75,5.0
1997-10-02,5.5,4.696774,20.08,1.3,8834.8,1.414139,8765.907,968.33,1085.344,82.7657,...,2.184,1702.41,2.618901,1.6135,0.0,121.8,0.28,5.5,5.5,5.5
2003-03-07,1.25,5.919355,34.47,1.2,10696.5,-2.560957,11174.129,1004.201,1529.463,91.0059,...,1.915,1305.29,2.619135,1.602,0.6,116.47,2.24,1.25,1.25,1.25


In [60]:
np.sqrt(np.nanmean(np.square(((y_test_3 - prediction) / y_test_3))))*100

9.017900167172133

In [117]:
grid = {'max_depth': [5,6,7], 'learning_rate': [.1, .2],
        'n_estimators': [600, 700]}
xgb_cv = GridSearchCV(estimator = XGBRegressor(objective='reg:squarederror'), scoring='neg_mean_absolute_error',
                  param_grid = grid, cv = 5)
xgb_cv.fit(X_train_3, y_train_3);

In [118]:
xgb_cv.best_estimator_

In [119]:
best_xgb_model = xgb_cv.best_estimator_
best_xgb_model.fit(X_train_3, y_train_3);
y_pred = best_xgb_model.predict(X_test_3)

In [120]:
MAE = mean_absolute_error(y_test_3, 
                          y_pred)

In [121]:
print(MAE)

0.014218367022679809


In [122]:
RMSE = np.sqrt(mean_squared_error(y_test_3,
                                  y_pred))
print(RMSE)

0.0520928570713274


In [125]:
r2_score(y_test_3, y_pred)

0.999599891105971

In [None]:
method_map = {
    'column1': 'ffill',
    'column2': 'ffill',
    'column3': 'linear'
}