# HKU QIDS 2023 Quantitative Investment Competition: Model

## Init Config

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from qids_package.qids import *
import warnings
from submit import submit

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings (for example any
# raised for the `normalize=True` argument passed to Ridge in later cells) —
# confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
# Global experiment settings shared by the cells below.
seed = 257248  # random_state handed to the random-forest model

# stock_num drives the per-stock slicing loops; timeslot_num is not referenced
# by any cell visible in this notebook.
stock_num, timeslot_num = 54, 50

day_num_total = 1000
# Per-stock slice length used on the training frame: two rows fewer than
# day_num_total (presumably rows lost when building the return label —
# TODO confirm).
day_num = day_num_total - 2

# Per-stock slice length used on the test frame.
test_day_num = 700

In [3]:
def std(train, valid, test=None):
    """Standardize the splits using statistics learned from ``train`` only.

    One ``StandardScaler`` is fitted on ``train`` and then applied to every
    split that was supplied.

    Returns:
        A ``(train, valid, test)`` tuple of transformed data; the third item
        is ``None`` when ``test`` was not given.
    """
    fitted = StandardScaler().fit(train)
    scaled_train = fitted.transform(train)
    scaled_valid = fitted.transform(valid)
    scaled_test = fitted.transform(test) if test is not None else None
    return scaled_train, scaled_valid, scaled_test

In [4]:
def calc_corr(df1, df2):
    """Return the correlation coefficient between ``df1`` and ``df2``.

    ``np.corrcoef`` builds the correlation matrix of the two inputs; the
    off-diagonal entry of its first row is the cross-correlation.
    """
    corr_matrix = np.corrcoef(df1, df2)
    return corr_matrix[0, 1]

In [5]:
def evaluate(model, train, test, train_y, real_y):
    """Fit ``model``, print its train and test correlations, and predict.

    Two lines are printed: first the correlation between ``train_y`` and the
    fitted values on ``train``, then the correlation between ``real_y`` and
    the predictions on ``test``.

    Returns:
        The model's predictions for ``test``.
    """
    model.fit(train, train_y)
    fitted_values = model.predict(train)
    test_pred = model.predict(test)
    # In-sample score first, out-of-sample score second.
    # (A `[:37692]` truncation was left as a note on the second score.)
    for actual, predicted in ((train_y, fitted_values), (real_y, test_pred)):
        print(calc_corr(actual, predicted))
    return test_pred

In [7]:
def evaluate2(model, train, test, train_y, real_y):
    """Fit ``model`` on the training data and return its predictions for ``test``.

    Silent counterpart of ``evaluate``: nothing is printed and no score is
    computed.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        train: Training features.
        test: Features to predict on.
        train_y: Training targets.
        real_y: Unused; accepted so the signature matches ``evaluate``.

    Returns:
        Whatever ``model.predict(test)`` returns.
    """
    model.fit(train, train_y)
    # No in-sample prediction is made here: its result was never used, and
    # this function is called inside tight per-stock loops.
    return model.predict(test)

## Load Data

In [15]:
# Directory holding the competition CSV files (only read from in this cell).
write_path = "../data/"

# Alternative inputs kept for reference:
#   train.csv / test.csv        instead of the *_github.csv pair
#   real_return_reorder.csv     instead of real_return.csv
train_path = f"{write_path}train_github.csv"
test_path = f"{write_path}test_github.csv"
real_return_path = f"{write_path}real_return.csv"

# Read the three frames in the order train, test, real returns.
train, test, real_return = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, real_return_path)
)

In [16]:
# Separate the regression target from the features and discard the
# identifier columns so they are not fed to the models.
id_columns = ["date_time", "stock_id", "day"]

train_y = train["return"]
train = train.drop(columns=["return", *id_columns])
test = test.drop(columns=id_columns)

# Returns against which test-set predictions are scored.
real_y = real_return["return"]

In [17]:
def _fit_predict_per_stock(train_x, test_x, alpha):
    """Fit one Ridge model per stock and return the stacked test predictions.

    Rows are assumed to be grouped by stock: ``day_num`` consecutive rows per
    stock in ``train_x`` and ``test_day_num`` in ``test_x`` — TODO confirm
    against the CSV layout.
    """
    per_stock = []
    for stock in range(stock_num):
        train_rows = slice(stock * day_num, (stock + 1) * day_num)
        test_rows = slice(stock * test_day_num, (stock + 1) * test_day_num)
        per_stock.append(
            evaluate2(
                # NOTE(review): `normalize` was deprecated in scikit-learn 1.0
                # and removed in 1.2; this call needs an older release.
                Ridge(alpha=alpha, normalize=True),
                train_x.iloc[train_rows, :],
                test_x.iloc[test_rows, :],
                train_y.iloc[train_rows],
                # evaluate2 ignores this argument; passed as in the original call.
                real_y.iloc[stock * test_day_num:(stock + 1) * test_day_num - 2],
            )
        )
    return np.concatenate(per_stock, axis=0)


# Greedy backward feature elimination: in each round, try removing every
# column in turn, keep the single removal that raises the test correlation
# the most, and stop once no removal beats the current best score.
# NOTE(review): candidates are scored against `real_y`, i.e. the test-set
# returns, so the selected feature set is tuned on the evaluation data.
abandon_all = []
last_cor = 0.05496612532000942  # score to beat (hard-coded; presumably from an earlier run — TODO confirm)
last_abandon = ["all"]  # non-empty sentinel so the loop body runs at least once
while last_abandon:
    abandon = {}  # candidate column -> score obtained without it
    # Candidate order: columns sorted by their correlation with the last
    # column of `train`.
    names = train.corr()[train.columns[-1]].sort_values().index
    for name in names:
        # Drop the candidate once per trial instead of once per stock.
        train_without = train.drop(columns=[name])
        test_without = test.drop(columns=[name])
        for i in [0.012]:
            result = _fit_predict_per_stock(train_without, test_without, i)
            cor = calc_corr(result, real_y)
            if cor > last_cor:
                abandon[name] = cor
    if not abandon:
        break
    # Remove only the single most helpful column; the first candidate wins ties.
    abandon_cols = [max(abandon, key=abandon.get)]
    print(abandon_cols)
    train_kept = train.drop(columns=abandon_cols)
    test_kept = test.drop(columns=abandon_cols)
    # NOTE(review): with a single alpha this refit appears to reproduce the
    # score already stored in `abandon`; it is kept for the printed log line.
    for i in [0.012]:
        result = _fit_predict_per_stock(train_kept, test_kept, i)
        cor = calc_corr(result, real_y)
        print(i, cor)
    if cor < last_cor:
        break
    abandon_all.append(abandon_cols[0])
    last_cor = cor
    last_abandon = abandon
    train = train_kept
    test = test_kept

['pe_ttm']
0.012 0.07473797739612126


In [14]:
# Per-stock Ridge fit on the current feature set: one model per stock, with
# predictions stacked in stock order and scored against `real_y`.
# NOTE(review): the output recorded for this cell is a ValueError — the
# predictions had 37800 rows (54 * 700) while `real_y` had 37692 (54 * 698).
# The loaded test frame and the real-return file must cover the same rows
# before this score is meaningful; confirm which test CSV was loaded.
for i in [0.012]:
    per_stock_preds = []
    for stock in range(stock_num):
        train_rows = slice(stock * day_num, (stock + 1) * day_num)
        test_rows = slice(stock * test_day_num, (stock + 1) * test_day_num)
        model = Ridge(alpha=i, normalize=True)
        pred = evaluate2(
            model,
            train.iloc[train_rows, :],
            test.iloc[test_rows, :],
            train_y.iloc[train_rows],
            # evaluate2 ignores this argument; passed as in the original call.
            real_y.iloc[stock * test_day_num:(stock + 1) * test_day_num - 2],
        )
        per_stock_preds.append(pred)
    # Stack once at the end instead of re-concatenating after every stock.
    result = np.concatenate(per_stock_preds, axis=0)
    print(i, calc_corr(result, real_y))

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 37800 and the array at index 1 has size 37692

In [328]:
# Pass the stacked per-stock predictions to the project's `submit` helper.
# NOTE(review): `result` is whatever the most recently executed cell left
# behind — confirm it is the intended prediction vector before submitting.
submit(result)

#### Temp: scratch variant of the feature-elimination loop (drops every improving column at once)

In [None]:
def _fit_predict_per_stock(train_x, test_x, alpha):
    """Fit one Ridge model per stock and return the stacked test predictions.

    Rows are assumed to be grouped by stock: ``day_num`` consecutive rows per
    stock in ``train_x`` and ``test_day_num`` in ``test_x`` — TODO confirm
    against the CSV layout.
    """
    per_stock = []
    for stock in range(stock_num):
        train_rows = slice(stock * day_num, (stock + 1) * day_num)
        test_rows = slice(stock * test_day_num, (stock + 1) * test_day_num)
        per_stock.append(
            evaluate2(
                # NOTE(review): `normalize` was deprecated in scikit-learn 1.0
                # and removed in 1.2; this call needs an older release.
                Ridge(alpha=alpha, normalize=True),
                train_x.iloc[train_rows, :],
                test_x.iloc[test_rows, :],
                train_y.iloc[train_rows],
                # evaluate2 ignores this argument; passed as in the original call.
                real_y.iloc[stock * test_day_num:(stock + 1) * test_day_num - 2],
            )
        )
    return np.concatenate(per_stock, axis=0)


# Batch variant of the backward elimination: in each round every column whose
# individual removal beats the current score is dropped together.
# NOTE(review): candidates are scored against `real_y`, i.e. the test-set
# returns, so the selected feature set is tuned on the evaluation data.
abandon_all = []
last_cor = 0.05496612532000942  # score to beat (hard-coded; presumably from an earlier run — TODO confirm)
last_abandon = ["all"]  # non-empty sentinel so the loop body runs at least once
while last_abandon:
    abandon = []  # [column, score-without-it] pairs that beat last_cor
    # Candidate order: columns sorted by their correlation with the last
    # column of `train`.
    names = train.corr()[train.columns[-1]].sort_values().index
    for name in names:
        # Drop the candidate once per trial instead of once per stock.
        train_without = train.drop(columns=[name])
        test_without = test.drop(columns=[name])
        for i in [0.012]:
            result = _fit_predict_per_stock(train_without, test_without, i)
            cor = calc_corr(result, real_y)
            if cor > last_cor:
                abandon.append([name, cor])
    abandon_cols = [col[0] for col in abandon]
    print(abandon_cols)
    train_kept = train.drop(columns=abandon_cols)
    test_kept = test.drop(columns=abandon_cols)
    for i in [0.012]:
        result = _fit_predict_per_stock(train_kept, test_kept, i)
        cor = calc_corr(result, real_y)
        print(i, cor)
    if cor < last_cor:
        # Dropping the whole batch made things worse: keep the previous columns.
        break
    # Record every column removed this round (the original called
    # `append()` with no argument, which raises TypeError).
    abandon_all.extend(abandon_cols)
    last_cor = cor
    last_abandon = abandon
    train = train_kept
    test = test_kept

## Model

### Linear Regression

In [124]:
# Ordinary least squares on the whole training frame (all stocks pooled);
# `evaluate` prints the train and test correlations.
model = LinearRegression()
pred = evaluate(
    model=model,
    train=train,
    test=test,
    train_y=train_y,
    real_y=real_y,
)

0.08185975427789274
0.04034488821467451


### Ridge Regression

In [131]:
# Ridge regression with a very large L2 penalty, fitted on the whole training
# frame (all stocks pooled); `evaluate` prints the train and test correlations.
model = Ridge(alpha=1e8)
pred = evaluate(
    model=model,
    train=train,
    test=test,
    train_y=train_y,
    real_y=real_y,
)

0.07108389523027633
0.0597535859373132


### Lasso Regression

In [130]:
# Lasso regression (L1 penalty, alpha=1) fitted on the whole training frame
# (all stocks pooled); `evaluate` prints the train and test correlations.
model = Lasso(alpha=1)
pred = evaluate(
    model=model,
    train=train,
    test=test,
    train_y=train_y,
    real_y=real_y,
)

0.07082556523046826
0.059702617533116835


In [129]:
# Per-stock Ridge regression: one model per stock, with predictions stacked
# in stock order and scored against `real_y`.
# Alpha grids tried previously:
#   [1e-1,1e-2,1e-3,1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10]
#   [1e1,1e2,1e3,1e4,1e5,1e6,1e7,1e8,1e9,1e10]
for i in [0.012]:
    per_stock_preds = []
    for stock in range(stock_num):
        train_rows = slice(stock * day_num, (stock + 1) * day_num)
        test_rows = slice(stock * test_day_num, (stock + 1) * test_day_num)
        # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and
        # removed in 1.2; this call needs an older release.
        model = Ridge(alpha=i, normalize=True)
        pred = evaluate2(
            model,
            train.iloc[train_rows, :],
            test.iloc[test_rows, :],
            train_y.iloc[train_rows],
            # evaluate2 ignores this argument; passed as in the original call.
            real_y.iloc[stock * test_day_num:(stock + 1) * test_day_num - 2],
        )
        per_stock_preds.append(pred)
    # Stack once at the end instead of re-concatenating after every stock.
    result = np.concatenate(per_stock_preds, axis=0)
    print(i, calc_corr(result, real_y))

0.012 0.07225595088513154


### Random Forest

In [32]:
from sklearn.ensemble import RandomForestRegressor

# Shallow random forest (depth-3 trees) fitted on the whole training frame
# (all stocks pooled); seeded so repeated runs build the same forest.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=3,
    min_samples_split=50,
    random_state=seed,
)
pred = evaluate(
    model=model,
    train=train,
    test=test,
    train_y=train_y,
    real_y=real_y,
)

0.09721523729185862
0.002919757303658706


## Submission

In [88]:
# Pass the latest predictions to the project's `submit` helper.
# NOTE(review): `pred` is whatever the most recently executed model cell
# produced — confirm it is the intended model before submitting.
submit(pred)