In [3]:
import numpy as np
import pandas as pd
import joblib

from importlib import reload
from glob import glob
import os
import sys
import shutil
import model_pedestal as pedestal
from lightgbm import LGBMRegressor
import optuna

# Put the parent directory on the module search path so the project-level
# modules imported below can be resolved from this notebook's location.
sys.path.append("..")
import kiwi_operators as kwo
import config
import data_environment as env

# Re-execute the project modules so that edits made on disk are picked up
# without restarting the kernel. Note that model_pedestal (imported above as
# `pedestal`) is not reloaded here. The last reload() is the cell's final
# expression, so its return value (the module object) is shown as cell output.
reload(kwo)
reload(config)
reload(env)

<module 'data_environment' from '../data_environment.py'>

In [18]:
class ModelLgb(object):
    """Thin wrapper around an ``LGBMRegressor`` with fixed hyper-parameters.

    Training data is obtained from ``pedestal.prep_feat_target``; every column
    of that frame other than ``NON_FEATURE_COLS`` is used as a model input and
    the ``"target"`` column is the regression label.
    """

    # Columns of the prepared frame that are never fed to the model.
    NON_FEATURE_COLS = ("time", "investment", "target")

    def __init__(self):
        """Build the (unfitted) regressor."""
        # os.cpu_count() returns None when the core count cannot be
        # determined; fall back to a single core and always request at least
        # one worker. Otherwise use three quarters of the available cores.
        n_workers = max(1, (os.cpu_count() or 1) * 3 // 4)
        self.model = LGBMRegressor(
            n_estimators=100,
            max_depth=5,
            num_leaves=24,
            learning_rate=0.05,
            colsample_bytree=0.7,
            subsample=0.7,
            subsample_freq=5,
            reg_alpha=0.01,
            n_jobs=n_workers,
        )
        # Feature column names in the order used for fitting; set by train().
        self.feat_cols = None

    def train(self, train_date_beg: int, train_date_end: int, verbose: bool = False):
        """Fit the model on data prepared for the given date range.

        Rows containing a NaN in any column are dropped before fitting.

        Args:
            train_date_beg: Range start, forwarded to
                ``pedestal.prep_feat_target`` (the notebook passes YYYYMMDD
                integers).
            train_date_end: Range end, forwarded to
                ``pedestal.prep_feat_target``.
            verbose: Forwarded unchanged to ``LGBMRegressor.fit``.

        Raises:
            ValueError: If no rows are left after dropping NaNs, or if one of
                ``NON_FEATURE_COLS`` is missing from the prepared frame.
        """
        train_data = pedestal.prep_feat_target(train_date_beg, train_date_end).dropna()
        if len(train_data) == 0:
            # Fail with a clear message instead of an opaque LightGBM error.
            raise ValueError(
                f"no training rows left after dropna() for range "
                f"{train_date_beg}-{train_date_end}"
            )

        # list.remove raises ValueError when an expected column is absent,
        # which surfaces schema problems immediately.
        feat_cols = list(train_data.columns)
        for non_feature in self.NON_FEATURE_COLS:
            feat_cols.remove(non_feature)
        # Remember the column order so callers can build matching inputs
        # for predict().
        self.feat_cols = feat_cols

        x_train = np.array(train_data[feat_cols])
        y_train = np.array(train_data["target"])

        # NOTE(review): newer LightGBM releases deprecate/remove the `verbose`
        # keyword of fit() in favour of the log_evaluation callback; confirm
        # against the installed version before upgrading.
        self.model.fit(x_train, y_train, verbose=verbose)

    def save_model(self, file_path: str = f"{config.dir_model}/xgb.pkl"):
        """Serialize the underlying regressor to ``file_path`` with joblib.

        The default path is computed once, when the class is defined, so a
        later ``reload(config)`` does not change it. The default file name is
        ``xgb.pkl`` even though the model is a LightGBM regressor; it is kept
        as-is so existing artifacts and callers keep working.
        """
        joblib.dump(self.model, file_path)

    def get_model(self):
        """Return the wrapped ``LGBMRegressor`` instance."""
        return self.model

    def predict(self, x_data: np.ndarray):
        """Return model predictions for ``x_data``.

        ``x_data`` is passed straight to the regressor, so its columns must be
        in the same order as the features used in ``train`` (see
        ``self.feat_cols``).
        """
        return self.model.predict(x_data)


In [19]:
# Validation run: fit on 20160101-20180101, score on 20180101-20190101.
# NOTE(review): train_date_end and valid_date_beg are both 20180101; whether
# that day ends up in both sets depends on how prep_feat_target treats its
# bounds -- TODO confirm.
train_date_beg = 20160101
train_date_end = 20180101

model = ModelLgb()
model.train(train_date_beg, train_date_end, verbose=True)

valid_date_beg = 20180101
valid_date_end = 20190101

valid_data = pedestal.prep_feat_target(valid_date_beg, valid_date_end)

# Every column except the bookkeeping/label columns is a model input.
# This relies on the frame having the same column order as the training frame.
feat_cols = list(valid_data.columns)
for non_feature in ("time", "investment", "target"):
    feat_cols.remove(non_feature)

# .ffill() is the supported spelling of fillna(method="ffill"); the `method`
# argument of fillna is deprecated in pandas >= 2.1. The result is identical.
# NOTE(review): training drops NaN rows, whereas here features AND the target
# are forward-filled from the preceding row; confirm this is intended, and
# note that leading NaNs are left in place by a forward fill.
x_valid = np.array(valid_data[feat_cols].ffill())
p_valid = model.predict(x_valid)
y_valid = np.array(valid_data["target"].ffill())

# Pearson correlation between predictions and (filled) targets.
print(np.corrcoef(p_valid, y_valid)[0, 1])

0.040479586243139916


In [20]:
# Test run: refit on 20160101-20190101, score on 20190101-20200101.
# NOTE(review): train_date_end and test_date_beg are both 20190101; whether
# that day ends up in both sets depends on how prep_feat_target treats its
# bounds -- TODO confirm.
train_date_beg = 20160101
train_date_end = 20190101

model = ModelLgb()
model.train(train_date_beg, train_date_end, verbose=True)

test_date_beg = 20190101
test_date_end = 20200101

test_data = pedestal.prep_feat_target(test_date_beg, test_date_end)

# Every column except the bookkeeping/label columns is a model input.
# This relies on the frame having the same column order as the training frame.
feat_cols = list(test_data.columns)
for non_feature in ("time", "investment", "target"):
    feat_cols.remove(non_feature)

# .ffill() is the supported spelling of fillna(method="ffill"); the `method`
# argument of fillna is deprecated in pandas >= 2.1. The result is identical.
# NOTE(review): training drops NaN rows, whereas here features AND the target
# are forward-filled from the preceding row; confirm this is intended, and
# note that leading NaNs are left in place by a forward fill.
x_test = np.array(test_data[feat_cols].ffill())
p_test = model.predict(x_test)
y_test = np.array(test_data["target"].ffill())

# Pearson correlation between predictions and (filled) targets.
print(np.corrcoef(p_test, y_test)[0, 1])

0.03685222913088071
