# Import

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import json

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; cells below that list several bare expressions rely on this setting.
InteractiveShell.ast_node_interactivity = "all"

# Configs

In [3]:
# Load the project settings; `config` supplies the directory names used to
# build the data and model paths in later cells.
settings_path = '../configs/settings.json'
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(settings_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

# Read raw data

In [4]:
%%time
data = pd.read_csv(f"../{config['RAW_DATA_DIR']}/train.csv")

CPU times: user 7.98 s, sys: 579 ms, total: 8.56 s
Wall time: 9.25 s


# EDA

In [5]:
# Show the column labels of the raw training frame.
data.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')

## Day numbering

In [6]:
# Smallest id, largest id and number of distinct day identifiers in the data.
data['date_id'].min()
data['date_id'].max()
data['date_id'].nunique()

0

480

481

## Integrity columns

In [7]:
# Count the distinct values of the two identifier columns.
data['time_id'].nunique()
data['row_id'].nunique()

26455

5237980

In [8]:
# (rows, columns) of the raw frame, for comparison with the distinct row_id count.
data.shape

(5237980, 17)

## Find weights of the target index: $\text{target\_index} = \sum_{\text{stock\_id}} \text{weight}_{\text{stock\_id}} \cdot \text{wap}_{\text{stock\_id}}$

### Allocate data

In [9]:
%%time
num_stocks = data["stock_id"].nunique()
num_dates = data["date_id"].nunique()
num_updates = data["seconds_in_bucket"].nunique()

print(f"# stocks         : {num_stocks}")
print(f"# dates          : {num_dates}")
print(f"# updates per day: {num_updates}")

stock_returns = np.zeros((num_stocks, num_dates, num_updates))
index_returns = np.zeros((num_stocks, num_dates, num_updates))

for (stock_id, date_id), frame in data.groupby(["stock_id", "date_id"]):
    frame["stock_return"] = ((frame["wap"] / frame["wap"].shift(6)).shift(-6) - 1) * 10_000
    frame["index_return"] = frame["stock_return"] - frame["target"]

    stock_returns[stock_id, date_id] = frame["stock_return"].values
    index_returns[stock_id, date_id] = frame["index_return"].values

index_return = np.mean(index_returns, axis=0)

# stocks         : 200
# dates          : 481
# updates per day: 55
CPU times: user 1min 48s, sys: 21.9 ms, total: 1min 48s
Wall time: 1min 48s


### Model

In [10]:
# Linear regression estimator with default arguments; it is fitted further below.
l_reg = LinearRegression()

### Data

In [11]:
# Design matrix: one row per (date, update) slot, one column per stock.
features = stock_returns.reshape((num_stocks, -1)).T
# Regression target: the index return flattened in the same (date, update) order.
labels = index_return.reshape(-1)

# Keep only the slots where the label and every stock's return are present.
row_has_nan = np.isnan(features).any(axis=1) | np.isnan(labels)
mask = ~row_has_nan
X, y = features[mask], labels[mask]

### Fit and quantify quality

In [12]:
# Fit the regression of the index return on the individual stock returns.
l_reg.fit(X, y)

# In-sample diagnostics: intercept, goodness of fit, and the total weight.
fit_summary = (
    ("Intercept:", l_reg.intercept_),
    ("R2:", r2_score(y, l_reg.predict(X))),
    ("Sum of Coef:", l_reg.coef_.sum()),
)

print(" Fit ".center(80, ">"))
for label, value in fit_summary:
    print(label, value)

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Fit >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Intercept: 3.6043735647528496e-06
R2: 0.99999999573038
Sum of Coef: 0.9999999372522721


The coefficients sum to 1, which means no other assets are in the index.


An $R^2$ of 1 means the stock returns fully explain the index return.


### Check coefs

In [13]:
# Print the fitted coefficients rounded to 3 decimals for readability.
print("Coef: ", l_reg.coef_.round(3))

Coef:  [0.004 0.001 0.002 0.006 0.004 0.004 0.002 0.006 0.006 0.002 0.002 0.008
 0.006 0.002 0.008 0.006 0.002 0.006 0.004 0.002 0.004 0.001 0.006 0.004
 0.002 0.002 0.004 0.002 0.004 0.004 0.001 0.001 0.002 0.002 0.006 0.004
 0.004 0.004 0.006 0.002 0.002 0.04  0.002 0.002 0.004 0.04  0.002 0.001
 0.006 0.004 0.004 0.006 0.001 0.004 0.004 0.002 0.006 0.004 0.006 0.004
 0.006 0.004 0.002 0.001 0.002 0.004 0.002 0.008 0.004 0.004 0.002 0.004
 0.006 0.002 0.004 0.004 0.002 0.004 0.004 0.004 0.001 0.002 0.002 0.008
 0.02  0.004 0.006 0.002 0.02  0.002 0.002 0.006 0.004 0.002 0.001 0.02
 0.006 0.001 0.002 0.004 0.001 0.002 0.006 0.006 0.004 0.006 0.001 0.002
 0.004 0.006 0.006 0.001 0.04  0.006 0.002 0.004 0.002 0.002 0.006 0.002
 0.002 0.004 0.006 0.006 0.002 0.002 0.008 0.006 0.004 0.002 0.006 0.002
 0.004 0.006 0.002 0.004 0.001 0.004 0.002 0.004 0.008 0.006 0.008 0.002
 0.004 0.002 0.001 0.004 0.004 0.004 0.006 0.008 0.004 0.001 0.001 0.002
 0.006 0.004 0.001 0.002 0.006 0.004 0.006 0.

In [14]:
# Largest and smallest fitted coefficient, using the array's own vectorized
# reductions instead of iterating it with the Python builtins.
print(f"Max coef: {l_reg.coef_.max()}")
print(f"Min coef: {l_reg.coef_.min()}")

Max coef: 0.040000410425067634
Min coef: 0.0009992094870383728


In [15]:
# Keep the fitted coefficients, rounded to 3 decimals, as the per-stock index weights.
# NOTE(review): after rounding the weights need not sum exactly to 1 — confirm this is acceptable.
index_stock_weights = l_reg.coef_.round(3)

### Store

In [16]:
# Map each weight's 0-based position in `index_stock_weights` to its value.
# Values are converted to built-in floats so that serializing the mapping does
# not depend on the numpy scalar type of the array.
stock_weight_dict = {
    stock_idx: float(weight)
    for stock_idx, weight in enumerate(index_stock_weights)
}

In [20]:
# Persist the weight mapping as JSON in the configured model directory.
weights_json_path = f"../{config['MODEL_DIR']}/index_stock_weights.json"
with open(weights_json_path, 'w') as weights_file:
    json.dump(stock_weight_dict, weights_file)

# NaN analysis

In [21]:
# Count missing values per column, then express each count as a percentage of
# the total number of rows.
nuls_by_cols = data.isna().sum()
row_count = len(data)
mis_val_percent = nuls_by_cols * 100 / row_count
mis_val_percent

stock_id                    0.000000
date_id                     0.000000
seconds_in_bucket           0.000000
imbalance_size              0.004200
imbalance_buy_sell_flag     0.000000
reference_price             0.004200
matched_size                0.004200
far_price                  55.256836
near_price                 54.547364
bid_price                   0.004200
bid_size                    0.000000
ask_price                   0.004200
ask_size                    0.000000
wap                         0.004200
target                      0.001680
time_id                     0.000000
row_id                      0.000000
dtype: float64

Target has missing values but they are extremely rare.

The base features with a significant share of missing values are "near_price" and "far_price".