# Import

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import json

# Configs

In [2]:
settings_path = '../configs/settings.json'
# Parse the project settings once; later cells read config['RAW_DATA_DIR'].
with open(settings_path, mode='r') as settings_file:
    config = json.loads(settings_file.read())

# Find the weights of the target index: $\text{index} = \sum_{i \in \text{stocks}} w_i \cdot \text{wap}_i$

In [3]:
%%time
data = pd.read_csv(f"../{config['RAW_DATA_DIR']}/train.csv")

CPU times: user 8.26 s, sys: 683 ms, total: 8.94 s
Wall time: 9.57 s


### Allocate data

In [4]:
%%time
num_stocks = data["stock_id"].nunique()
num_dates = data["date_id"].nunique()
num_updates = data["seconds_in_bucket"].nunique()

print(f"# stocks         : {num_stocks}")
print(f"# dates          : {num_dates}")
print(f"# updates per day: {num_updates}")

stock_returns = np.zeros((num_stocks, num_dates, num_updates))
index_returns = np.zeros((num_stocks, num_dates, num_updates))

for (stock_id, date_id), frame in data.groupby(["stock_id", "date_id"]):
    frame["stock_return"] = ((frame["wap"] / frame["wap"].shift(6)).shift(-6) - 1) * 10_000
    frame["index_return"] = frame["stock_return"] - frame["target"]

    stock_returns[stock_id, date_id] = frame["stock_return"].values
    index_returns[stock_id, date_id] = frame["index_return"].values

index_return = np.mean(index_returns, axis=0)

# stocks         : 200
# dates          : 481
# updates per day: 55
CPU times: user 1min 43s, sys: 141 ms, total: 1min 43s
Wall time: 1min 43s


## Model

In [5]:
# Ordinary least-squares model with default settings; its coefficients are read
# back later as the per-stock index weights.
l_reg = LinearRegression()

## Data

In [6]:
# Flatten the (date, update) axes into one sample axis: y holds one index return
# per sample, X holds one column per stock.
y = index_return.ravel()
X = stock_returns.reshape(num_stocks, -1).transpose()

# Keep only samples where the target and every stock return are present.
mask = ~np.isnan(y) & ~np.isnan(X).any(axis=1)
X = X[mask]
y = y[mask]

## Fit and quantify quality

In [7]:
# Fit on the cleaned samples, then report in-sample diagnostics.
l_reg.fit(X, y)

banner = " Fit ".center(80, ">")
print(banner)
print("Intercept:", l_reg.intercept_)
y_fitted = l_reg.predict(X)
print("R2:", r2_score(y, y_fitted))
print("Sum of Coef:", l_reg.coef_.sum())

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Fit >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Intercept: 3.6043735647528496e-06
R2: 0.99999999573038
Sum of Coef: 0.9999999372522721


A coefficient sum of 1 means no other assets are in the index.


An $R^2$ of 1 means the stock returns fully explain the index return.


## Check coefficients

In [9]:
# Show the fitted coefficients rounded to 3 decimals.
print("Coef: ", np.round(l_reg.coef_, 3))

Coef:  [0.004 0.001 0.002 0.006 0.004 0.004 0.002 0.006 0.006 0.002 0.002 0.008
 0.006 0.002 0.008 0.006 0.002 0.006 0.004 0.002 0.004 0.001 0.006 0.004
 0.002 0.002 0.004 0.002 0.004 0.004 0.001 0.001 0.002 0.002 0.006 0.004
 0.004 0.004 0.006 0.002 0.002 0.04  0.002 0.002 0.004 0.04  0.002 0.001
 0.006 0.004 0.004 0.006 0.001 0.004 0.004 0.002 0.006 0.004 0.006 0.004
 0.006 0.004 0.002 0.001 0.002 0.004 0.002 0.008 0.004 0.004 0.002 0.004
 0.006 0.002 0.004 0.004 0.002 0.004 0.004 0.004 0.001 0.002 0.002 0.008
 0.02  0.004 0.006 0.002 0.02  0.002 0.002 0.006 0.004 0.002 0.001 0.02
 0.006 0.001 0.002 0.004 0.001 0.002 0.006 0.006 0.004 0.006 0.001 0.002
 0.004 0.006 0.006 0.001 0.04  0.006 0.002 0.004 0.002 0.002 0.006 0.002
 0.002 0.004 0.006 0.006 0.002 0.002 0.008 0.006 0.004 0.002 0.006 0.002
 0.004 0.006 0.002 0.004 0.001 0.004 0.002 0.004 0.008 0.006 0.008 0.002
 0.004 0.002 0.001 0.004 0.004 0.004 0.006 0.008 0.004 0.001 0.001 0.002
 0.006 0.004 0.001 0.002 0.006 0.004 0.006 0.

In [10]:
# Use the ndarray reductions instead of the builtins: they run vectorized and
# propagate NaN, whereas builtin max/min iterate element by element and give an
# order-dependent answer if a NaN is present.
print(f"Max coef: {l_reg.coef_.max()}")
print(f"Min coef: {l_reg.coef_.min()}")

Max coef: 0.040000410425067634
Min coef: 0.0009992094870383728


In [11]:
# Index weights = fitted coefficients rounded to 3 decimals.
index_stock_weights = np.round(l_reg.coef_, decimals=3)

## Store

In [12]:
# Serialize the rounded weights to index_stock_weights.pkl under RAW_DATA_DIR.
with open(f"../{config['RAW_DATA_DIR']}/index_stock_weights.pkl", mode='wb') as weights_file:
    pickle.dump(index_stock_weights, weights_file, protocol=pickle.HIGHEST_PROTOCOL)