In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, StratifiedKFold
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.feature_selection import RFECV
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L1, L2
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime
from xgboost import XGBRegressor
from sklearn.base import TransformerMixin, BaseEstimator
%matplotlib inline

In [2]:
merge_df = pd.read_csv('../data/merged_fftr.csv')
month_df = pd.read_csv('../data/monthly_data.csv')

In [3]:
merge_df = merge_df.set_index('DATE')
month_df = month_df.set_index('DATE')

In [4]:
merge_df.head()

Unnamed: 0_level_0,FFTR,UNRATE,Oil,PCE,DispInc,gdp_gap,GDP,EXPGS,IMPGS,ind_prod,...,gbp_dollar,pct_recession,yen_dollar,yield_curve,1_month_ahead,2_month_ahead,3_month_ahead,1_month_back,2_month_back,3_month_back
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1987-11-09,6.8125,5.773333,17.5,4.1,6648.1,-0.293283,5007.994,392.107,537.511,60.0022,...,1.793,0.02,134.3,1.14,7.3125,7.25,6.625,6.8125,6.8125,6.625
1987-11-10,6.8125,5.77,17.75,4.1,6648.1,-0.293283,5007.994,392.107,537.511,60.0022,...,1.7865,0.02,134.55,1.18,7.3125,7.25,6.625,6.8125,6.8125,6.625
1987-11-11,6.8125,5.766667,17.8,4.1,6648.1,-0.293283,5007.994,392.107,537.511,60.0022,...,1.772,0.02,135.5,1.155,7.3125,7.25,6.625,6.8125,6.8125,6.625
1987-11-12,6.8125,5.763333,17.85,4.1,6648.1,-0.293283,5007.994,392.107,537.511,60.0022,...,1.7575,0.02,136.45,1.13,7.3125,7.25,6.625,6.8125,6.8125,6.625
1987-11-13,6.8125,5.76,17.8,4.1,6648.1,-0.293283,5007.994,392.107,537.511,60.0022,...,1.768,0.02,135.85,1.09,7.3125,7.25,6.625,6.8125,6.8125,6.5


In [5]:
X = merge_df.drop(columns = ['1_month_ahead', '2_month_ahead', '3_month_ahead'])
y_1 = merge_df['1_month_ahead']
y_2 = merge_df['2_month_ahead']
y_3 = merge_df['3_month_ahead']
X_month = month_df.drop(columns = ['1_month_ahead', '2_month_ahead', '3_month_ahead'])
y_month_1 = month_df['1_month_ahead']
y_month_2 = month_df['2_month_ahead']
y_month_3 = month_df['3_month_ahead']

In [6]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y_1, random_state=42, test_size=0.25)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y_2, random_state=42, test_size=0.25)
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X, y_3, random_state=42, test_size=0.25)
X_train_month_1, X_test_month_1, y_train_month_1, y_test_month_1 = train_test_split(X_month, y_month_1, random_state=42, test_size=0.25)
X_train_month_2, X_test_month_2, y_train_month_2, y_test_month_2 = train_test_split(X_month, y_month_2, random_state=42, test_size=0.25)
X_train_month_3, X_test_month_3, y_train_month_3, y_test_month_3 = train_test_split(X_month, y_month_3, random_state=42, test_size=0.25)

In [7]:
class DataFrameInterpolator(BaseEstimator, TransformerMixin):
    def __init__(self, method_map=None):
        self.method_map = method_map

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame")
        
        if self.method_map is None:
            raise ValueError("You must provide a method map for interpolation.")
        
        X_interpolated = X.copy()
        
        for column, method in self.method_map.items():
            if method == 'ffill':
                X_interpolated[column] = X_interpolated[column].fillna(method='ffill')
            elif method == 'linear':
                X_interpolated[column] = X_interpolated[column].interpolate(method='linear')
            else:
                raise ValueError("Invalid interpolation method. Choose 'ffill' or 'linear'.")
        
        return X_interpolated

# Example usage:
# Create a method map specifying interpolation methods for each column
method_map = {
    'FFTR': 'linear',
    'UNRATE': 'ffill',
    'Oil': 'linear',
    'PCE': 'ffill',
    'DispInc': 'ffill',
    'gdp_gap': 'ffill',
    'GDP': 'ffill',
    'EXPGS': 'ffill',
    'IMPGS': 'ffill',
    'ind_prod': 'ffill',
    'm2': 'linear',
    'm2_velo': 'ffill',
    'nasdaq': 'linear',
    'pci': 'ffill',
    'gbp_dollar': 'linear',
    'pct_recession': 'ffill',
    'yen_dollar': 'linear',
    'yield_curve': 'linear'
    #'1_month_ahead': 'linear',
    #'2_month_ahead': 'linear',
    #'3_month_ahead': 'linear',
    #'1_month_back': 'linear',
    #'2_month_back': 'linear',
    #'3_month_back': 'linear'
        
}

# Create an instance of the DataFrameInterpolator class with the method map
interpolator = DataFrameInterpolator(method_map=method_map)

In [8]:
rfr_pipeline = Pipeline([
    ('interpolate', DataFrameInterpolator(method_map = method_map)),
    ('rfr', RandomForestRegressor(criterion = 'squared_error', n_estimators = 1200, max_features = 'sqrt', max_depth = 15))
])


In [9]:
rfr_pipeline.fit(X_train_3, y_train_3) 

In [10]:
prediction = rfr_pipeline.predict(X_test_3)
mse = mean_squared_error(y_test_3, prediction)
rmse = mse**.5
print(mse)
print(rmse)

0.0020214548135610525
0.04496059178392843


In [11]:
MAE = mean_absolute_error(y_test_3, 
                          prediction)
print(MAE)

0.01200566543949517


In [12]:
r2_score(y_test_3, prediction)

0.999701952221845

In [13]:
MAPE = mean_absolute_percentage_error(y_test_3, prediction)
print(MAPE)

0.008331802518663394


In [14]:
rfr_pipeline.fit(X_train_1, y_train_1) 

In [15]:
prediction = rfr_pipeline.predict(X_test_1)
mse = mean_squared_error(y_test_1, prediction)
rmse = mse**.5
print(mse)
print(rmse)

0.0011716077300994389
0.034228755894706996


In [16]:
MAE = mean_absolute_error(y_test_1, 
                          prediction)
print(MAE)

0.00981640341473193


In [19]:
r2_score(y_test_1, prediction)

0.999826198526176

In [17]:
MAPE = mean_absolute_percentage_error(y_test_1, prediction)
print(MAPE)

0.004975023931786693
