<a href="https://colab.research.google.com/github/microsoft/qlib/blob/main/examples/workflow_by_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#  Copyright (c) Microsoft Corporation.
#  Licensed under the MIT License.

In [1]:
import sys, site
from pathlib import Path

################################# NOTE #################################
#  Please be aware that if colab installs the latest numpy and pyqlib  #
#  in this cell, users should RESTART the runtime in order to run the  #
#  following cells successfully.                                       #
########################################################################

try:
    import qlib
except ImportError:
    # install qlib
    ! pip install --upgrade numpy
    ! pip install pyqlib
    # reload
    site.main()

scripts_dir = Path.cwd().parent.joinpath("scripts")
if not scripts_dir.joinpath("get_data.py").exists():
    # download get_data.py script
    scripts_dir = Path("~/tmp/qlib_code/scripts").expanduser().resolve()
    scripts_dir.mkdir(parents=True, exist_ok=True)
    import requests
    with requests.get("https://raw.githubusercontent.com/microsoft/qlib/main/scripts/get_data.py") as resp:
        with open(scripts_dir.joinpath("get_data.py"), "wb") as fp:
            fp.write(resp.content)

In [2]:

import qlib
import pandas as pd
from qlib.config import REG_CN
# from qlib.contrib.model.gbdt import LGBModel
# from qlib.contrib.data.handler import Alpha158
from qlib.contrib.evaluate import (
    backtest as normal_backtest,
    risk_analysis,
)
from qlib.utils import exists_qlib_data, init_instance_by_config
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.utils import flatten_dict


In [3]:
# use default data
# NOTE: need to download data from remote: python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data
provider_uri = "/home/qlib_test/qlib_data/cn_data"  # target_dir
if not exists_qlib_data(provider_uri):
    print(f"Qlib data is not found in {provider_uri}")
    sys.path.append(str(scripts_dir))
    from get_data import GetData
    GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
qlib.init(provider_uri=provider_uri, region=REG_CN)

[24:MainThread](2021-10-09 10:12:41,903) INFO - qlib.Initialization - [config.py:386] - default_conf: client.
[24:MainThread](2021-10-09 10:12:41,911) INFO - qlib.Initialization - [__init__.py:56] - qlib successfully initialized based on client settings.
[24:MainThread](2021-10-09 10:12:41,912) INFO - qlib.Initialization - [__init__.py:58] - data_path={'__DEFAULT_FREQ': PosixPath('/home/qlib_test/qlib_data/cn_data')}


In [4]:
market = "csi300"
benchmark = "SH000300"

# train model

In [21]:
###################################
# train model
###################################
data_handler_config = {
    "start_time": "2008-01-01",
    "end_time": "2020-08-01",
    "fit_start_time": "2008-01-01",
    "fit_end_time": "2014-12-31",
    "instruments": market,
}

task = {
    "model": {
        "class": "LGBModel",
        "module_path": "qlib.contrib.model.gbdt",
        "kwargs": {
            "loss": "mse",
            "colsample_bytree": 0.8879,
            "learning_rate": 0.0421,
            "subsample": 0.8789,
            "lambda_l1": 205.6999,
            "lambda_l2": 580.9768,
            "max_depth": 8,
            "num_leaves": 210,
            "num_threads": 20,
        },
    },
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha158",
                "module_path": "qlib.contrib.data.handler",
                "kwargs": data_handler_config,
            },
            "segments": {
                "train": ("2008-01-01", "2014-12-31"),
                "valid": ("2015-01-01", "2016-12-31"),
                "test": ("2017-01-01", "2020-08-01"),
            },
        },
    },
}

# model initiaiton
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])

# start exp to train model
with R.start(experiment_name="train_model"):
    R.log_params(**flatten_dict(task))
    model.fit(dataset)
    R.save_objects(trained_model=model,local_path='/home/qlib_test/examples/results/pred.pkl')
    rid = R.get_recorder().id


  if idx.is_monotonic_increasing and not (isinstance(idx, pd.MultiIndex) and not idx.is_lexsorted()):
[24:MainThread](2021-10-09 10:47:10,640) INFO - qlib.timer - [log.py:113] - Time cost: 333.995s | Loading data Done
[24:MainThread](2021-10-09 10:47:11,942) INFO - qlib.timer - [log.py:113] - Time cost: 0.243s | DropnaLabel Done
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
[24:MainThread](2021-10-09 10:47:18,591) INFO - qlib.timer - [log.py:113] - Time cost: 6.647s | CSZScoreNorm Done
[24:MainThread](2021-10-09 10:47:18,606) INFO - qlib.timer - [log.py:113] - Time cost: 7.964s | fit & process data Done
[24:MainThread](2021-10-09 10:47:18,607) INFO - qlib.timer - [log.py:113] - Time cost: 341.961s | Init data Done
[24:MainThread](2021-10-09 10:47:18,

Training until validation scores don't improve for 50 rounds
[20]	train's l2: 0.99032	valid's l2: 0.99407
[40]	train's l2: 0.98654	valid's l2: 0.993434
[60]	train's l2: 0.983952	valid's l2: 0.993192
[80]	train's l2: 0.981825	valid's l2: 0.993024
[100]	train's l2: 0.97992	valid's l2: 0.992874
[120]	train's l2: 0.978043	valid's l2: 0.992838
[140]	train's l2: 0.976317	valid's l2: 0.992838
[160]	train's l2: 0.974702	valid's l2: 0.992849
[180]	train's l2: 0.97313	valid's l2: 0.992885
[200]	train's l2: 0.971546	valid's l2: 0.992959
Early stopping, best iteration is:
[168]	train's l2: 0.974048	valid's l2: 0.992791


# prediction, backtest & analysis

In [22]:
model.predict

<bound method LGBModel.predict of <qlib.contrib.model.gbdt.LGBModel object at 0x7f934466f050>>

In [30]:
data_handler_config = {
    "start_time": "2020-07-01",
    "end_time": "2020-08-01",
    "fit_start_time": "2020-07-01",
    "fit_end_time": "2020-08-01",
    "instruments": market,
}
task = {
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha158",
                "module_path": "qlib.contrib.data.handler",
                "kwargs": data_handler_config,
            },
            "segments": {
                # "train": ("2008-01-01", "2014-12-31"),
                # "valid": ("2015-01-01", "2016-12-31"),
                "test": ("2020-07-01", "2020-08-01"),
            },
        },
    },
}

dataset_test = init_instance_by_config(task["dataset"])

In [102]:
data_handler_config = {
    "start_time": "2020-01-01",
    "end_time": "2021-09-25",
    "fit_start_time": "2020-01-01",
    "fit_end_time": "2021-09-25",
    "instruments": market,
}
task = {
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha158",
                "module_path": "qlib.contrib.data.handler",
                "kwargs": data_handler_config,
            },
            "segments": {
                # "train": ("2008-01-01", "2014-12-31"),
                # "valid": ("2015-01-01", "2016-12-31"),
                "test": ("2020-01-01", "2021-09-25"),
            },
        },
    },
}

dataset_recent = init_instance_by_config(task["dataset"])

  if idx.is_monotonic_increasing and not (isinstance(idx, pd.MultiIndex) and not idx.is_lexsorted()):
[24:MainThread](2021-10-09 11:39:39,726) INFO - qlib.timer - [log.py:113] - Time cost: 150.287s | Loading data Done
[24:MainThread](2021-10-09 11:39:39,795) INFO - qlib.timer - [log.py:113] - Time cost: 0.020s | DropnaLabel Done
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
[24:MainThread](2021-10-09 11:39:40,539) INFO - qlib.timer - [log.py:113] - Time cost: 0.743s | CSZScoreNorm Done
[24:MainThread](2021-10-09 11:39:40,541) INFO - qlib.timer - [log.py:113] - Time cost: 0.813s | fit & process data Done
[24:MainThread](2021-10-09 11:39:40,542) INFO - qlib.timer - [log.py:113] - Time cost: 151.103s | Init data Done


In [103]:
with R.start(experiment_name="test_model"):
    # recorder = R.get_recorder(recorder_id=rid, experiment_name="train_model")
    # model = recorder.load_object("trained_model")
    pred3 = model.predict(dataset_recent)

[24:MainThread](2021-10-09 11:39:40,551) INFO - qlib.workflow - [expm.py:270] - No tracking URI is provided. Use the default tracking URI.
[24:MainThread](2021-10-09 11:39:40,553) INFO - qlib.workflow - [expm.py:306] - <mlflow.tracking.client.MlflowClient object at 0x7f91aad5b250>
[24:MainThread](2021-10-09 11:39:40,559) INFO - qlib.workflow - [exp.py:249] - Experiment 5 starts running ...
[24:MainThread](2021-10-09 11:39:40,570) INFO - qlib.workflow - [recorder.py:284] - Recorder 635a1e5eecf5448383840e0faf1200fe starts running under Experiment 5 ...


In [112]:
df = pred2[-10000:]
df

datetime    instrument
2021-04-22  SH600584      0.002707
            SH600585     -0.033995
            SH600588      0.074878
            SH600600     -0.228211
            SH600606     -0.018376
                            ...   
2021-06-11  SZ300595     -0.242030
            SZ300601     -0.005275
            SZ300628     -0.018449
            SZ300676     -0.069561
            SZ300677      0.094834
Length: 10000, dtype: float64

In [113]:
df = df.sort_values(ascending=False)
df

datetime    instrument
2021-06-03  SH600085      0.315253
2021-05-21  SZ300601      0.296993
2021-05-12  SZ000069      0.277659
2021-05-20  SH600196      0.268223
2021-05-18  SH603156      0.260227
                            ...   
2021-06-11  SH603659     -0.314696
2021-06-04  SH603259     -0.315026
            SZ002371     -0.315322
2021-05-27  SZ300015     -0.323797
2021-06-01  SH601919     -0.327262
Length: 10000, dtype: float64

In [107]:
df.sort_values(ascending=False)

datetime    instrument
2020-07-15  SH601881      0.418699
2020-07-16  SH601881      0.383938
            SZ000783      0.373411
2020-07-15  SZ300033      0.372229
2020-07-16  SH601899      0.364158
                            ...   
2020-02-12  SZ300413     -0.365272
2020-02-18  SH600118     -0.367154
2021-01-25  SH600031     -0.369869
2021-02-19  SZ002044     -0.382109
2021-02-01  SH603392     -0.387270
Length: 104215, dtype: float64

In [114]:
###################################
# prediction, backtest & analysis
###################################
port_analysis_config = {
    "executor": {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    },
    "strategy": {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.model_strategy",
        "kwargs": {
            "model": model,
            "dataset": dataset,
            "topk": 50,
            "n_drop": 5,
        },
    },
    "backtest": {
        "start_time": "2017-01-01",
        "end_time": "2020-08-01",
        "account": 100000000,
        "benchmark": benchmark,
        "exchange_kwargs": {
            "freq": "day",
            "limit_threshold": 0.095,
            "deal_price": "close",
            "open_cost": 0.0005,
            "close_cost": 0.0015,
            "min_cost": 5,
        },
    },
}

# backtest and analysis
with R.start(experiment_name="backtest_analysis"):
    recorder = R.get_recorder(recorder_id=rid, experiment_name="train_model")
    model = recorder.load_object("trained_model")

    # prediction
    recorder = R.get_recorder()
    ba_rid = recorder.id
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # backtest & analysis
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()

[24:MainThread](2021-10-09 12:25:32,222) INFO - qlib.workflow - [expm.py:270] - No tracking URI is provided. Use the default tracking URI.
[24:MainThread](2021-10-09 12:25:32,224) INFO - qlib.workflow - [expm.py:306] - <mlflow.tracking.client.MlflowClient object at 0x7f91aa7b10d0>
[24:MainThread](2021-10-09 12:25:32,230) INFO - qlib.workflow - [exp.py:249] - Experiment 3 starts running ...
[24:MainThread](2021-10-09 12:25:32,242) INFO - qlib.workflow - [recorder.py:284] - Recorder a0167c5a0ec34f319d1d80399837b3ba starts running under Experiment 3 ...


LoadObjectError: No such file or directory: '/home/qlib_test/examples/mlruns/2/e7b50a7f5d3b4e4a9a548d2e8ac54be1/artifacts/trained_model'

# analyze graphs

In [7]:
from qlib.contrib.report import analysis_model, analysis_position
from qlib.data import D
recorder = R.get_recorder(recorder_id=ba_rid, experiment_name="backtest_analysis")
print(recorder)
pred_df = recorder.load_object("pred.pkl")
pred_df_dates = pred_df.index.get_level_values(level='datetime')
report_normal_df = recorder.load_object("portfolio_analysis/report_normal_1day.pkl")
positions = recorder.load_object("portfolio_analysis/positions_normal_1day.pkl")
analysis_df = recorder.load_object("portfolio_analysis/port_analysis_1day.pkl")

{'class': 'Recorder', 'id': '3d7490f78e644c0d8189b3aa3de0e995', 'name': 'mlflow_recorder', 'experiment_id': '3', 'start_time': '2021-10-04 06:22:46', 'end_time': '2021-10-04 06:23:52', 'status': 'FINISHED'}


## analysis position

### report

In [None]:
analysis_position.report_graph(report_normal_df)

### risk analysis

In [None]:
analysis_position.risk_analysis_graph(analysis_df, report_normal_df)

## analysis model

In [10]:
label_df = dataset.prepare("test", col_set="label")
label_df.columns = ['label']

### score IC

In [None]:
pred_label = pd.concat([label_df, pred_df], axis=1, sort=True).reindex(label_df.index)
analysis_position.score_ic_graph(pred_label)

### model performance

In [None]:
analysis_model.model_performance_graph(pred_label)