# Packages

In [70]:
import pandas as pd 
import numpy as np

from sklearn import decomposition, datasets
from sklearn.preprocessing import StandardScaler

# 1. Data

## 1. a) data from FRED-MD
"_We use the vintage as of January 2016. Our sample extends from January 1960 to December 2015 (672 observations), and
only variables with all observations in the sample period are used (122 variables)._"

In [124]:
# Import the FRED-MD data, vintage of January 2016.
# NOTE(review): this is an absolute, machine-specific path -- point FRED_MD_CSV
# at your own copy of the file before running the notebook.
FRED_MD_CSV = r'C:\Users\jeann\Downloads\2016-01.csv'
SAMPLE_START = '1960-01-01'  # first month kept (inclusive)
SAMPLE_END = '2015-12-01'    # last month kept (inclusive)

database = pd.read_csv(FRED_MD_CSV, sep=',')

# Set the dates correctly.
# Row 0 is discarded before the dates are parsed (presumably the FRED-MD
# "Transform:" codes row -- TODO confirm). Nothing in this cell applies those
# codes, so the series are kept as they appear in the file.
database = database.drop([0])
database["sasdate"] = pd.to_datetime(database["sasdate"])
database = database.rename(columns={"sasdate": "Date"}).set_index("Date", drop=True)

# Keep only dates between Jan 1960 and Dec 2015 (both included).
# A single inclusive mask also discards rows with a missing date (NaT), which
# two one-sided drops would keep because comparisons with NaT are always False.
in_sample = (database.index >= SAMPLE_START) & (database.index <= SAMPLE_END)
database = database.loc[in_sample]
database

Unnamed: 0_level_0,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,IPDCONGD,...,DSERRG3M086SBEA,CES0600000008,CES2000000008,CES3000000008,UMCSENTx,MZMSL,DTCOLNVHFNM,DTCTHFNM,INVEST,VXOCLSx
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1960-01-01,2391.2,2252.5,18.910,2.654060e+05,18762.29853,25.0099,25.2801,24.0998,34.2255,24.5120,...,12.502,2.22,2.57,2.13,,286.7,7362.00,14063.00,75.4515,
1960-02-01,2392.8,2253.6,18.993,2.692650e+05,18831.78085,24.7873,25.1368,24.0170,33.8357,24.1747,...,12.522,2.23,2.59,2.14,100.0,287.0,7396.00,14144.00,73.3363,
1960-03-01,2397.2,2255.5,19.262,2.617733e+05,18811.03986,24.5648,25.0508,23.9895,33.8747,23.6968,...,12.529,2.24,2.69,2.14,,287.8,7456.00,14239.00,71.5296,
1960-04-01,2405.6,2263.6,19.560,2.643641e+05,19304.67539,24.3701,25.1082,24.0170,34.1086,23.6687,...,12.555,2.24,2.61,2.14,,288.3,7582.00,14413.00,71.4718,
1960-05-01,2411.7,2269.4,19.166,2.561738e+05,19016.37565,24.3422,25.2514,24.1825,34.3035,23.8936,...,12.590,2.24,2.64,2.14,93.3,289.1,7673.00,14559.00,71.6858,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-08-01,14100.5,11661.9,114.403,1.214198e+06,447133.00000,107.5730,105.3324,105.1660,107.0618,120.8002,...,113.196,22.03,25.25,20.00,91.9,13530.5,338838.14,896818.51,3036.4756,19.8997
2015-09-01,14136.3,11690.1,114.682,1.229777e+06,446855.00000,107.5507,105.1969,104.8814,107.0055,121.2474,...,113.381,21.99,25.02,20.07,87.2,13569.7,343472.98,905935.02,3027.7410,24.4603
2015-10-01,14189.6,11737.4,114.654,1.228417e+06,446929.00000,107.3797,105.2874,104.5894,106.7995,122.2536,...,113.512,22.13,25.36,20.05,90.0,13617.7,343664.35,906420.20,3043.2053,16.5926
2015-11-01,14226.0,11767.2,114.994,,448580.00000,106.4080,104.6108,103.8916,106.1729,122.3621,...,113.750,22.14,25.43,20.06,91.3,13697.6,,,3067.2225,16.0358


In [125]:
# Keep only the series observed at every date of the sample:
# any column containing at least one missing value is dropped.
database = database.dropna(axis=1, how="any")

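**Problem:** using the January 2016 vintage and keeping only the columns with no missing observations over the sample period, I find 113 variables (not the 122 quoted above). Using the February 2016 vintage instead, I find 127 variables to keep. Part of the gap may come from series that are not yet published for the last months of the sample in the January vintage (e.g. `CMRMTSPLx` and `DTCOLNVHFNM` are already missing for November 2015 in the table above): such columns are dropped even if the rest of their history is available.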

## 1. b) principal components

"_In addition, we include as potential predictors the four principal component factors computed from this set of variables._"

In [126]:
# Work on the original series only: if this cell is re-run after pc1..pc4 have
# been appended to `database`, those columns must not be fed back into the PCA.
X = database.drop(columns=["pc1", "pc2", "pc3", "pc4"], errors="ignore")

# Standardize every column (mean zero, standard deviation one).
# Note: StandardScaler only centres and rescales; it does not remove outliers.
std_slc = StandardScaler()
X_std = std_slc.fit_transform(X)

print(X_std.shape)

# 4 principal component factors.
# svd_solver="full" requests the exact SVD: depending on the scikit-learn
# version, the default 'auto' policy may pick the randomized solver for a
# matrix of this size, which is not reproducible without a fixed random_state.
pca = decomposition.PCA(n_components=4, svd_solver="full")

# Scores of each date on the four principal components.
X_std_pca = pca.fit_transform(X_std)

print(X_std_pca.shape)
print(X_std_pca)

(672, 113)
(672, 4)
[[-10.62133964  -6.78077911   0.47357959   0.1433929 ]
 [-10.71868904  -6.00917193   1.33829026   0.57302953]
 [-10.16783183  -5.30521052   3.76581964   0.91432604]
 ...
 [ 14.82304223  -3.96166708   0.12766715   2.73993954]
 [ 14.72002176  -3.7321304   -0.13056689   2.03511122]
 [ 14.44502079  -3.76997696   0.03391446   2.01154912]]


In [127]:
# Append the four principal-component score series to the database
# as the columns pc1, pc2, pc3 and pc4.
pc = X_std_pca.T  # transposed scores: pc[k] is the series of component k + 1
database = database.assign(**{f"pc{k + 1}": pc[k] for k in range(4)})

In [128]:
# Display the frame, which now includes the pc1..pc4 columns.
database

Unnamed: 0_level_0,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,IPDCONGD,IPNCONGD,IPBUSEQ,IPMAT,IPDMAT,...,CUSR0000SA0L5,CES0600000008,CES2000000008,CES3000000008,MZMSL,INVEST,pc1,pc2,pc3,pc4
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1960-01-01,18762.29853,25.0099,25.2801,24.0998,34.2255,24.5120,39.0678,9.3444,24.1864,14.8444,...,29.900,2.22,2.57,2.13,286.7,75.4515,-10.621340,-6.780779,0.473580,0.143393
1960-02-01,18831.78085,24.7873,25.1368,24.0170,33.8357,24.1747,38.6678,9.4050,23.8730,14.5985,...,30.000,2.23,2.59,2.14,287.0,73.3363,-10.718689,-6.009172,1.338290,0.573030
1960-03-01,18811.03986,24.5648,25.0508,23.9895,33.8747,23.6968,39.0234,9.4252,23.4812,14.1375,...,30.000,2.24,2.69,2.14,287.8,71.5296,-10.167832,-5.305211,3.765820,0.914326
1960-04-01,19304.67539,24.3701,25.1082,24.0170,34.1086,23.6687,39.4234,9.3242,22.9327,13.6304,...,30.100,2.24,2.61,2.14,288.3,71.4718,-10.360978,-4.891762,3.440928,0.853505
1960-05-01,19016.37565,24.3422,25.2514,24.1825,34.3035,23.8936,39.5567,9.3545,22.7237,13.4460,...,30.100,2.24,2.64,2.14,289.1,71.6858,-10.307163,-4.827300,3.632587,0.496258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-08-01,447133.00000,107.5730,105.3324,105.1660,107.0618,120.8002,103.6921,107.4097,110.1133,107.8878,...,227.855,22.03,25.25,20.00,13530.5,3036.4756,14.832774,-3.979578,0.005228,2.672024
2015-09-01,446855.00000,107.5507,105.1969,104.8814,107.0055,121.2474,103.5103,106.8087,110.2453,107.2997,...,227.443,21.99,25.02,20.07,13569.7,3027.7410,14.676713,-3.771285,0.106118,2.504429
2015-10-01,446929.00000,107.3797,105.2874,104.5894,106.7995,122.2536,103.0031,106.3283,109.7203,108.1047,...,227.802,22.13,25.36,20.05,13617.7,3043.2053,14.823042,-3.961667,0.127667,2.739940
2015-11-01,448580.00000,106.4080,104.6108,103.8916,106.1729,122.3621,102.1941,105.3161,108.3548,107.4341,...,227.804,22.14,25.43,20.06,13697.6,3067.2225,14.720022,-3.732130,-0.130567,2.035111


"_We consider four lags of all variables, as well as four autoregressive terms. Hence, the analysis contemplates 508 potential predictors._"

That is, to predict inflation at date $t$ we use inflation from $t-1$ to $t-4$ (4 predictors) as well as four lags ($t-1$ to $t-4$) of all other variables (the 122 original ones plus the four principal components). In total this should give $4 + 4 \times (122 + 4) = 508$ predictors, but we do not have 122 variables (only 113 or 127, depending on the vintage).