In [1]:
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import figure
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


In [2]:
# Settings:
pd.set_option('display.width', 190)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('max_colwidth', 200)
pd.options.display.float_format = '{:.4f}'.format
plt.style.use('default')
np.set_printoptions(threshold = 30, edgeitems = 30, precision = 2, suppress = False)


In [3]:
df_path = "../merged_data/BCI_GDP.csv"
features = []
for i in range(14):
    features.extend([f'BCI_{i}', f'BCIp_{i}', f'BCIg_{i}'])
model_names = ["Linear Regression"]
get_models = [lambda: linear_model.LinearRegression()]


In [4]:
# Read the data and do a little bit of wrangling:
df = pd.read_csv(df_path)
df.Date = pd.to_datetime(df.Date)
df = df.set_index("Date", drop=True)
df = df.drop(columns="Unnamed: 0")
df.head()

Unnamed: 0_level_0,GDP,BCI_0,BCIp_0,BCIg_0,BCI_1,BCIp_1,BCIg_1,BCI_2,BCIp_2,BCIg_2,BCI_3,BCIp_3,BCIg_3,BCI_4,BCIp_4,BCIg_4,BCI_5,BCIp_5,BCIg_5,BCI_6,BCIp_6,BCIg_6,BCI_7,BCIp_7,BCIg_7,BCI_8,BCIp_8,BCIg_8,BCI_9,BCIp_9,BCIg_9,BCI_10,BCIp_10,BCIg_10,BCI_11,BCIp_11,BCIg_11,BCI_12,BCIp_12,BCIg_12,BCI_13,BCIp_13,BCIg_13
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
1967-01-31,844.17,100.0,100.0,0.0,100.0,99.5,0.0,99.6,93.1,0.0,99.8,96.4,0.0,99.9,98.7,0.0,99.7,94.6,0.0,99.9,98.4,0.0,99.9,97.6,0.0,99.2,85.8,0.0,98.5,75.1,0.0,98.9,82.2,0.0,99.8,96.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1967-04-30,848.983,100.4,100.0,0.0,100.5,100.0,0.0,100.1,93.4,0.0,99.8,88.3,0.0,100.2,95.6,0.0,100.6,100.0,0.0,101.4,100.0,0.0,101.8,100.0,0.0,102.0,100.0,0.0,102.3,100.0,0.0,103.1,100.0,0.0,103.2,100.0,0.0,103.2,99.8,0.0,0.0,0.0,0.0
1967-07-31,865.233,103.4,100.0,0.0,103.4,99.9,0.0,103.4,100.0,0.0,103.6,100.0,0.0,103.5,98.0,0.0,104.0,100.0,0.0,104.3,100.0,0.0,104.9,100.0,0.0,104.9,100.0,0.0,104.7,96.7,0.0,104.9,99.7,0.0,104.5,93.0,0.0,104.5,93.1,0.0,0.0,0.0,0.0
1967-10-31,881.439,104.6,94.3,0.0,104.2,87.9,0.0,104.1,87.5,0.0,104.7,97.2,0.0,106.1,100.0,0.0,106.7,100.0,0.0,107.3,100.0,0.0,107.4,100.0,0.0,106.6,87.4,0.0,106.2,81.2,0.0,106.3,82.3,0.0,106.4,83.7,0.0,105.6,71.7,0.0,0.0,0.0,0.0
1968-01-31,909.387,105.0,62.7,12.3,104.9,60.5,11.3,104.2,49.2,10.1,104.2,49.3,9.2,104.3,51.9,8.7,103.6,40.9,7.9,104.1,48.7,7.7,104.1,48.5,7.5,104.7,57.4,7.5,105.9,76.7,8.5,107.2,96.9,9.8,108.5,100.0,11.7,108.3,97.3,13.1,13.1,13.1,13.1


In [5]:
# Split into training and test sets and hold out the test set until the end, so that it remains "unseen".
lag_of_y = 2 # This is the lag we introduce to the target variable so that we assess the indicator's 
              # ability to predict the target variable this many steps into the future.
              # With GDP_m, a lag of 2 data points corresponds to half a year.
        
X_train, y_train = df.iloc[:-lag_of_y, df.columns != "GDP"], df.iloc[lag_of_y:, df.columns == "GDP"]

In [6]:
# for feature in features:
#     plt.figure()
#     X_train[feature].hist(bins = 50)
#     plt.xlabel(feature,fontsize=15)
#     plt.ylabel("Frequency",fontsize=15)
#     plt.show()

In [7]:
# Do a time series cross-validation on the test set by splitting it to k folds and doing a "rolling"
# validation against a validation fold, then averaging out the metrics.
splits = 4 # This is the number of splits/folds in the rolling validation.
tscv = TimeSeriesSplit(n_splits=splits)

for train_index, test_index in tscv.split(X_train): # Rolling cross-validation happens inside this loop.
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43] TEST: [44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 ... 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
 74 75 76 77 78 79 80 81 82 83 84 85 86] TEST: [ 87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104
 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
 123 124 125 126 127 128 129]
TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29 ... 100 101 102 103 104
 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
 123 124 125 126 127 128 129] TEST: [130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
 148 149

## Validation

In [8]:
R2s = dict()
for model_name, get_model in zip(model_names, get_models):
    print(model_name)
    R2s[model_name] = 0
    for train_index, test_index in tscv.split(X_train): # Rolling cross-validation happens inside this loop.
        X_train_fold, X_validation_fold = X_train.iloc[train_index[:-lag_of_y], X_train.columns != "GDP"], \
            X_train.iloc[test_index[:-lag_of_y], X_train.columns != "GDP"]
        y_train_fold, y_validation_fold = y_train.iloc[train_index[lag_of_y:], y_train.columns == "GDP"], \
            y_train.iloc[test_index[lag_of_y:], y_train.columns == "GDP"]
            
        scalers = dict()
        for feature in features:
            scalers[feature] = StandardScaler()
            scalers[feature].fit(X_train_fold[[feature]])
            X_train_fold[feature] = scalers[feature].transform(X_train_fold[[feature]])
            X_validation_fold[feature] = scalers[feature].transform(X_validation_fold[[feature]])
            
        model = get_model()
        model.fit(X_train_fold[features], y_train_fold["GDP"])
        predictions = model.predict(X_validation_fold[features])
        R2 = r2_score(y_validation_fold, predictions)
        R2s[model_name] += R2
        print(R2)
        
    R2s[model_name] /= splits

Linear Regression
-26.06639163330296
-18.916153293791005
-5.0659637392593835
-76033.30885363856


In [9]:
for model_name in model_names:
    print(model_name)
    print(f"R2 score: {R2s[model_name]}")

Linear Regression
R2 score: -19020.83934057623
