# 1. Import library

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import (GridSearchCV, TimeSeriesSplit, train_test_split)
from sklearn.metrics import (mean_squared_error, mean_absolute_error)

# 2. Import file

In [2]:
all_data  = pd.read_csv('all_data.csv', index_col='Date', parse_dates=True)

In [3]:
all_data

Unnamed: 0_level_0,Mkt-RF,SMB,HML,RMW,CMA,RF,cdd_18-France,cdd_18-Germany,cdd_18-India,cdd_18-Indonesia,...,ret_ETN,ret_GOOGL,ret_HD,ret_HIG,ret_MS,ret_MSFT,ret_PLD,ret_TT,ret_UNH,ret_V
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-09-30,-0.0307,-0.0280,0.0056,0.0180,-0.0060,0.0000,7.58800,1.4840,313.40,233.8,...,,,,,,,,,,
2015-10-31,0.0775,-0.0205,-0.0046,0.0086,0.0055,0.0000,0.59810,0.0000,280.10,263.1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2015-11-30,0.0056,0.0329,-0.0042,-0.0271,-0.0103,0.0000,0.01212,0.0000,169.60,256.0,...,0.144820,0.194997,0.159235,0.001562,0.093981,0.236248,0.098972,0.155603,-0.028446,0.136262
2015-12-31,-0.0217,-0.0297,-0.0261,0.0045,0.0003,0.0001,0.00000,0.0000,88.83,250.7,...,-0.105227,0.019873,-0.007803,-0.047765,-0.072595,0.020791,0.013632,-0.052647,0.048249,-0.018479
2016-01-31,-0.0577,-0.0348,0.0209,0.0280,0.0307,0.0001,0.00000,0.0000,72.22,260.8,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-08-31,-0.0377,0.0149,0.0030,-0.0478,0.0130,0.0019,137.80000,90.4000,314.80,233.5,...,0.090445,-0.006819,0.058453,-0.011266,0.131055,0.020227,0.058309,0.186340,0.011098,0.011031
2022-09-30,-0.0935,-0.0097,0.0006,-0.0151,-0.0084,0.0019,35.78000,8.6410,292.60,231.5,...,-0.024005,-0.116152,-0.043270,-0.036853,-0.072870,-0.109267,-0.178815,-0.055997,-0.024465,-0.105984
2022-10-31,0.0783,0.0186,0.0805,0.0307,0.0652,0.0023,7.65100,0.1096,240.10,232.9,...,0.125300,-0.011918,0.073168,0.169035,0.050081,-0.003306,0.090059,0.102341,0.099220,0.166113
2022-11-30,0.0460,-0.0267,0.0138,0.0601,0.0311,0.0029,0.06032,0.0000,132.40,229.1,...,0.094764,0.068564,0.100699,0.060633,0.132652,0.102223,0.063567,0.117710,-0.013312,0.049841


# 3. Data cleaning

In [4]:
all_data.fillna( method ='ffill', inplace = True)

In [5]:
all_data.isna().sum().sum() # nan from calculate return at first row

21

In [6]:
all_data.dropna(inplace=True)

In [7]:
all_data.isna().sum().sum()

0

# 4. Determine features

In [8]:
features = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 
                   
                   'cdd_18-France','cdd_18-Germany', 'cdd_18-India', 'cdd_18-Indonesia', 'cdd_18-Italy', 'cdd_18-Japan', 'cdd_18-Korea', 'cdd_18-China', 
                   
                   'cdd_21-France', 'cdd_21-Germany', 'cdd_21-India', 'cdd_21-Indonesia', 'cdd_21-Italy', 'cdd_21-Japan', 'cdd_21-Korea', 'cdd_21-China', 
                   
                   'irradiance-France', 'irradiance-Germany', 'irradiance-India', 'irradiance-Indonesia', 'irradiance-Italy', 'irradiance-Japan', 'irradiance-Korea',
                   'irradiance-China', 
                   
                   'hdd_16-France', 'hdd_16-Germany', 'hdd_16-India', 'hdd_16-Indonesia', 'hdd_16-Italy', 'hdd_16-Japan', 'hdd_16-Korea', 'hdd_16-China', 
                   
                   'hdd_18-France', 'hdd_18-Germany', 'hdd_18-India', 'hdd_18-Indonesia', 'hdd_18-Italy', 'hdd_18-Japan', 'hdd_18-Korea', 'hdd_18-China', 
                   
                   'heat-France', 'heat-Germany', 'heat-India', 'heat-Indonesia', 'heat-Italy', 'heat-Japan', 'heat-Korea', 'heat-China', 
                   
                   'humidity-France', 'humidity-Germany', 'humidity-India', 'humidity-Indonesia', 'humidity-Italy', 'humidity-Japan', 'humidity-Korea', 
                   'humidity-China', 
                   
                   'temperature-France', 'temperature-Germany', 'temperature-India', 'temperature-Indonesia', 'temperature-Italy', 'temperature-Japan', 
                   'temperature-Korea', 'temperature-China', 
                   
                   'precipitation-France', 'precipitation-Germany', 'precipitation-India', 'precipitation-Indonesia', 'precipitation-Italy', 'precipitation-Japan', 
                   'precipitation-Korea', 'precipitation-China']

# 5. FamaFrench 5 factors and climate 5 factors with XGBRegressor

## example with AAPL

In [9]:
X = all_data[features]
y = all_data['ret_AAPL'] - all_data['RF']    # you can change only ticker from stock list
#stock_list = ['AAPL',  'APTV', 'BAC', 'BALL',  'BEP', 'BKNG', 'BMRN', 'COST', 'CVS', 'DE', 'ETN', 'GOOGL', 'HD', 'HIG',  'MS', 'MSFT', 'PLD',  'TT', 'UNH', 'V']

In [10]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [11]:
%%time
params={'gamma':[0, 0.01, 0.05, 0.1, 0.5, 1, 2, 3], 
        'colsample_bylevel':[0.9], 
        'colsample_bytree':[0.8], 
        'colsample_bynode':[0.5], 
        'max_depth':np.arange(4, 10 ,2), 
        'n_estimators':[100,200],
        'subsample':[0.5, 0.6, 0.7, 0.8, 0.9, 1], 
        'min_child_weight':[1, 2, 3, 4, 5], 
        'learning_rate':[0.1, 0.2, 0.3, 0.4, 0.5]}

xgb_reg = GridSearchCV(XGBRegressor(booster = 'gbtree'), params, cv=5, n_jobs=-1)

grid_result = xgb_reg.fit(X, y)
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  0.17419607642052107
Best Params:  {'colsample_bylevel': 0.9, 'colsample_bynode': 0.5, 'colsample_bytree': 0.8, 'gamma': 0.05, 'learning_rate': 0.4, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'subsample': 1}
CPU times: user 8min 5s, sys: 1.75 s, total: 8min 7s
Wall time: 11min 35s


In [12]:
y_pred = xgb_reg.predict(X)

In [13]:
print('mean_squared_error =\t\t', mean_squared_error(y, y_pred))
print('mean_absolute_error =\t\t', mean_absolute_error(y, y_pred))

mean_squared_error =		 0.00447785925818042
mean_absolute_error =		 0.05299870320662732
