<a href="https://colab.research.google.com/github/microsoft/qlib/blob/main/examples/workflow_by_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#  Copyright (c) Microsoft Corporation.
#  Licensed under the MIT License.

In [None]:
import sys, site
from pathlib import Path


try:
    import qlib
except ImportError:
    # install qlib
    ! pip install pyqlib
    # reload
    site.main()

# Prefer the helper script from a local qlib checkout (../scripts/get_data.py
# relative to the notebook's working directory).
scripts_dir = Path.cwd().parent.joinpath("scripts")
if not scripts_dir.joinpath("get_data.py").exists():
    # No local copy: download get_data.py into a scratch directory instead.
    scripts_dir = Path("~/tmp/qlib_code/scripts").expanduser().resolve()
    scripts_dir.mkdir(parents=True, exist_ok=True)
    import requests

    with requests.get(
        "https://raw.githubusercontent.com/microsoft/qlib/main/scripts/get_data.py",
        timeout=60,  # do not hang the notebook forever on a stalled connection
    ) as resp:
        # Abort on HTTP errors *before* writing anything: otherwise an error page
        # would be saved as get_data.py and the exists() check above would skip
        # the download on every later run.
        resp.raise_for_status()
        scripts_dir.joinpath("get_data.py").write_bytes(resp.content)

In [1]:

import qlib
import pandas as pd
from qlib.config import REG_CN
from qlib.contrib.model.gbdt import LGBModel
from qlib.contrib.data.handler import Alpha158
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
from qlib.contrib.evaluate import (
    backtest as normal_backtest,
    risk_analysis,
)
from qlib.utils import exists_qlib_data, init_instance_by_config
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.utils import flatten_dict


In [2]:
# Locate the Qlib dataset and initialise qlib against it.
# NOTE: the default stock data can be downloaded with:
#   python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data
# in which case the provider would be:
#   provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
# This notebook points at a different directory instead.
provider_uri = "~/.qlib/qlib_data/my_data/"
if not exists_qlib_data(provider_uri):
    print(f"Qlib data is not found in {provider_uri}")
    # get_data.py lives in scripts_dir (resolved in the setup cell), so that
    # directory has to be on sys.path before the import below can succeed.
    sys.path.append(str(scripts_dir))
    from get_data import GetData
    # NOTE(review): this fallback requests the REG_CN dataset into provider_uri;
    # confirm that is the intended content for the "my_data" directory.
    GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
qlib.init(provider_uri=provider_uri, region=REG_CN)

[56783:MainThread](2021-03-12 16:57:09,094) INFO - qlib.Initialization - [config.py:276] - default_conf: client.
[56783:MainThread](2021-03-12 16:57:09,101) INFO - qlib.Initialization - [__init__.py:46] - qlib successfully initialized based on client settings.
[56783:MainThread](2021-03-12 16:57:09,102) INFO - qlib.Initialization - [__init__.py:47] - data_path=/home/kenneth/.qlib/qlib_data/my_data


In [3]:
# Instrument universe; passed as "instruments" to the data handler config.
market = "all"
# Benchmark instrument; passed as "benchmark" to the backtest config.
benchmark = "BTCUSDT-Spot"

# train model

In [4]:
# --- train model: data handler settings ---
# Keyword arguments handed to the Alpha158 handler through the dataset config.
# The fit window ends on the same day as the validation segment (2020-06-30).
data_handler_config = {
    "start_time": "2017-07-15",
    "end_time": "2021-01-15",
    "fit_start_time": "2017-07-15",
    "fit_end_time": "2020-06-30",
    "instruments": market,
    "freq": "30m",
}

# Full task description: each entry names a class, the module it lives in and
# its constructor kwargs, in the form consumed by init_instance_by_config.
task = {
    # LightGBM regressor and its hyper-parameters.
    "model": {
        "class": "LGBModel",
        "module_path": "qlib.contrib.model.gbdt",
        "kwargs": {
            "loss": "mse",
            "colsample_bytree": 0.8879,
            "learning_rate": 0.0421,
            "subsample": 0.8789,
            "lambda_l1": 205.6999,
            "lambda_l2": 580.9768,
            "max_depth": 8,
            "num_leaves": 210,
            "num_threads": 20,
        },
    },
    # Dataset wrapping the Alpha158 handler, split into three date segments.
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha158",
                "module_path": "qlib.contrib.data.handler",
                "kwargs": data_handler_config,
            },
            "segments": {
                "train": ("2017-07-15", "2020-01-01"),
                "valid": ("2020-01-02", "2020-06-30"),
                "test": ("2020-07-07", "2021-01-15"),
            },
        },
    },
}

# model initialization: build the model and the dataset from their config entries
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])

# start exp to train model
with R.start(experiment_name="train_model"):
    # Log the flattened task config as the parameters of this run.
    R.log_params(**flatten_dict(task))
    model.fit(dataset)
    # Store the fitted model under the name "trained_model" so later cells can reload it.
    R.save_objects(trained_model=model)
    # Keep the recorder id; the backtest cell uses it to look this run up again.
    rid = R.get_recorder().id


[56783:MainThread](2021-03-12 16:58:56,646) INFO - qlib.timer - [log.py:77] - Time cost: 107.014s | Loading data Done
[56783:MainThread](2021-03-12 16:59:02,439) INFO - qlib.timer - [log.py:77] - Time cost: 4.919s | DropnaLabel Done
[56783:MainThread](2021-03-12 17:03:55,651) INFO - qlib.timer - [log.py:77] - Time cost: 293.208s | CSZScoreNorm Done
[56783:MainThread](2021-03-12 17:03:55,656) INFO - qlib.timer - [log.py:77] - Time cost: 299.005s | fit & process data Done
[56783:MainThread](2021-03-12 17:03:55,658) INFO - qlib.timer - [log.py:77] - Time cost: 406.026s | Init data Done
[56783:MainThread](2021-03-12 17:03:55,661) INFO - qlib.workflow - [expm.py:248] - No tracking URI is provided. Use the default tracking URI.
[56783:MainThread](2021-03-12 17:03:55,664) INFO - qlib.workflow - [expm.py:284] - <mlflow.tracking.client.MlflowClient object at 0x7f89efe0bfd0>
[56783:MainThread](2021-03-12 17:03:55,669) INFO - qlib.workflow - [exp.py:182] - Experiment 1 starts running ...
[56783:M

Training until validation scores don't improve for 50 rounds
[20]	train's l2: 0.968527	valid's l2: 0.98483
[40]	train's l2: 0.966916	valid's l2: 0.984551
[60]	train's l2: 0.965782	valid's l2: 0.984482
[80]	train's l2: 0.964763	valid's l2: 0.984447
[100]	train's l2: 0.963888	valid's l2: 0.98442
[120]	train's l2: 0.963068	valid's l2: 0.984405
[140]	train's l2: 0.962195	valid's l2: 0.984433
[160]	train's l2: 0.961414	valid's l2: 0.984438
Early stopping, best iteration is:
[121]	train's l2: 0.963028	valid's l2: 0.984403


# prediction, backtest & analysis

In [None]:
# --- prediction, backtest & analysis: configuration ---
# Settings handed to PortAnaRecord: which strategy to instantiate and the
# parameters of the backtest it runs.
port_analysis_config = {
    "strategy": {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.strategy",
        "kwargs": {
            "topk": 50,
            "n_drop": 5,
        },
    },
    "backtest": {
        "verbose": False,
        "limit_threshold": 0.095,
        "account": 100000000,
        "benchmark": benchmark,
        "deal_price": "close",
        "open_cost": 0.0005,
        "close_cost": 0.0015,
        "min_cost": 5,
        "freq": "30m",  # same bar frequency as the data handler config
    },
}


# backtest and analysis
with R.start(experiment_name="backtest_analysis"):
    # Look up the training run by the id saved in `rid` and reload the model
    # that was stored there under the name "trained_model".
    recorder = R.get_recorder(rid, experiment_name="train_model")
    model = recorder.load_object("trained_model")

    # prediction
    # `recorder` is rebound here to whatever R.get_recorder() returns without
    # arguments (presumably the recorder of this running experiment - confirm);
    # its id is kept in `ba_rid` for the analysis cells.
    recorder = R.get_recorder()
    ba_rid = recorder.id
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # backtest & analysis
    par = PortAnaRecord(recorder, port_analysis_config)
    par.generate()


[56783:MainThread](2021-03-12 17:06:02,040) INFO - qlib.workflow - [expm.py:248] - No tracking URI is provided. Use the default tracking URI.
[56783:MainThread](2021-03-12 17:06:02,043) INFO - qlib.workflow - [expm.py:284] - <mlflow.tracking.client.MlflowClient object at 0x7f89cbfc1c70>
[56783:MainThread](2021-03-12 17:06:02,048) INFO - qlib.workflow - [exp.py:182] - Experiment 2 starts running ...
[56783:MainThread](2021-03-12 17:06:02,065) INFO - qlib.workflow - [recorder.py:270] - Recorder 868f6822e80541e3b5ac575c0aba8c94 starts running under Experiment 2 ...
[56783:MainThread](2021-03-12 17:06:13,757) INFO - qlib.workflow - [record_temp.py:125] - Signal record 'pred.pkl' has been saved as the artifact of the Experiment 2


'The following are prediction results of the LGBModel model.'
                                   score
datetime   instrument                   
2020-07-07 ADABTC-SPOT         -0.000390
           ADAETH-SPOT          0.063423
           ADAUSDT-FUTURESUSDT -0.005572
           ADAUSDT-SPOT        -0.002352
           AIONETH-SPOT         0.061694


[56783:MainThread](2021-03-12 17:06:14,197) INFO - qlib.backtest caller - [__init__.py:149] - Create new exchange
[56783:MainThread](2021-03-12 17:06:31,130) INFO - qlib.backtest caller - [__init__.py:204] - Create new executor 
[56783:MainThread](2021-03-13 00:49:57,484) INFO - qlib.workflow - [record_temp.py:264] - Portfolio analysis record 'port_analysis.pkl' has been saved as the artifact of the Experiment 2


# analyze graphs

In [None]:
from qlib.contrib.report import analysis_model, analysis_position
from qlib.data import D
# Reload the artifacts of the backtest run identified by `ba_rid`.
recorder = R.get_recorder(ba_rid, experiment_name="backtest_analysis")
pred_df = recorder.load_object("pred.pkl")
# Values of the 'datetime' index level of the predictions
# (not referenced again in the visible cells).
pred_df_dates = pred_df.index.get_level_values(level='datetime')
report_normal_df = recorder.load_object("portfolio_analysis/report_normal.pkl")
positions = recorder.load_object("portfolio_analysis/positions_normal.pkl")
analysis_df = recorder.load_object("portfolio_analysis/port_analysis.pkl")

## analysis position

### report

In [None]:
# Plot the backtest report loaded from "portfolio_analysis/report_normal.pkl".
analysis_position.report_graph(report_normal_df)

### risk analysis

In [None]:
# Plot the risk analysis from the stored port_analysis and report_normal artifacts.
analysis_position.risk_analysis_graph(analysis_df, report_normal_df)

## analysis model

In [None]:
# Fetch the label columns of the "test" segment and rename them to a single
# 'label' column (this assignment requires exactly one label column).
label_df = dataset.prepare("test", col_set="label")
label_df.columns = ['label']

### score IC

In [None]:
# Put labels and predictions side by side, then restrict and reorder the rows
# to the label index so every row corresponds to an entry of label_df.
pred_label = pd.concat([label_df, pred_df], axis=1, sort=True).reindex(label_df.index)
analysis_position.score_ic_graph(pred_label)

### model performance

In [None]:
# Plot model performance from the combined label/prediction frame.
analysis_model.model_performance_graph(pred_label)