In [21]:
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import figure
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


In [22]:
# Settings:
pd.set_option('display.width', 190)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('max_colwidth', 200)
pd.options.display.float_format = '{:.4f}'.format
plt.style.use('default')
np.set_printoptions(threshold = 30, edgeitems = 30, precision = 2, suppress = False)


In [23]:
df_path = "../merged_data/BCI_GDP.csv"
features = []
for i in range(14):
    features.extend([f'BCI_{i}', f'BCIp_{i}', f'BCIg_{i}'])
model_names = ["Linear Regression"]
get_models = [lambda: linear_model.LinearRegression()]


In [24]:
# Read the data and do a little bit of wrangling:
df = pd.read_csv(df_path)
df.Date = pd.to_datetime(df.Date)
df = df.sort_values(ascending=True, by="Date")
df = df.set_index("Date")

In [25]:
# Split into training and test sets and hold out the test set until the end, so that it remains "unseen".
lag_of_y = 2 # This is the lag we introduce to the target variable so that we assess the indicator's 
              # ability to predict the target variable this many steps into the future.
              # With GDP_m, a lag of 2 data points corresponds to half a year.
        
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:-lag_of_y, df.columns != "GDP"], \
    df.iloc[lag_of_y:, df.columns == "GDP"], test_size=0.1, shuffle=False)

In [26]:
# for feature in features:
#     plt.figure()
#     X_train[feature].hist(bins = 50)
#     plt.xlabel(feature,fontsize=15)
#     plt.ylabel("Frequency",fontsize=15)
#     plt.show()

In [27]:
# Do a time series cross-validation on the test set by splitting it to k folds and doing a "rolling"
# validation against a validation fold, then averaging out the metrics.
splits = 4 # This is the number of splits/folds in the rolling validation.
tscv = TimeSeriesSplit(n_splits=splits)

for train_index, test_index in tscv.split(X_train): # Rolling cross-validation happens inside this loop.
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41] TEST: [42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
 66 67 68 69 70 71 72 73 74 75 76 77 78 79]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 ... 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
 67 68 69 70 71 72 73 74 75 76 77 78 79] TEST: [ 80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97
  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
 116 117]
TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29 ...  88  89  90  91  92
  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110
 111 112 113 114 115 116 117] TEST: [118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
 136 137 138 139 140 141 142 143 144 145 146 147 

## Validation

In [30]:
R2s = dict()
for model_name, get_model in zip(model_names, get_models):
    print(model_name)
    R2s[model_name] = 0
    for train_index, test_index in tscv.split(X_train): # Rolling cross-validation happens inside this loop.
        X_train_fold, X_validation_fold = X_train.iloc[train_index[:-lag_of_y], X_train.columns != "GDP"], \
            X_train.iloc[test_index[:-lag_of_y], X_train.columns != "GDP"]
        y_train_fold, y_validation_fold = y_train.iloc[train_index[lag_of_y:], y_train.columns == "GDP"], \
            y_train.iloc[test_index[lag_of_y:], y_train.columns == "GDP"]
            
        scalers = dict()
        for feature in features:
            scalers[feature] = StandardScaler()
            scalers[feature].fit(X_train_fold[[feature]])
            X_train_fold[feature] = scalers[feature].transform(X_train_fold[[feature]])
            X_validation_fold[feature] = scalers[feature].transform(X_validation_fold[[feature]])
            
        model = get_model()
        model.fit(X_train_fold[features], y_train_fold["GDP"])
        predictions = model.predict(X_validation_fold[features])
        R2 = r2_score(y_validation_fold, predictions)
        R2s[model_name] += R2
        print(R2)
        
    R2s[model_name] /= splits

Linear Regression
-22.756340891763607
-32.07207676224437
-6.503929371993402
-626.9909515366481


In [32]:
for model_name in model_names:
    print(model_name)
    print(f"R2 score: {R2s[model_name]}")

Linear Regression
R2 score: -172.08082464066237
