In [1]:
import numpy as np
import pandas as pd
import jpx_tokyo_market_prediction
from sklearn.tree import DecisionTreeRegressor
from tqdm.notebook import tqdm
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split

import optuna
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

In [2]:
%%time
# Loading Stock Prices
path = "../input/jpx-tokyo-stock-exchange-prediction/"
df_prices = pd.read_csv(f"{path}train_files/stock_prices.csv")
df_prices = df_prices[~df_prices["Target"].isnull()]
prices = pd.read_csv(f"{path}supplemental_files/stock_prices.csv")#最新的辅助文件
df_prices = pd.concat([df_prices, prices])
df_prices = df_prices[df_prices.Date>="2021-10-01"]
df_prices.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 317958 entries, 2244531 to 229957
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   RowId             317958 non-null  object 
 1   Date              317958 non-null  object 
 2   SecuritiesCode    317958 non-null  int64  
 3   Open              316967 non-null  float64
 4   High              316967 non-null  float64
 5   Low               316967 non-null  float64
 6   Close             316967 non-null  float64
 7   Volume            317958 non-null  int64  
 8   AdjustmentFactor  317958 non-null  float64
 9   ExpectedDividend  2109 non-null    float64
 10  SupervisionFlag   317958 non-null  bool   
 11  Target            317952 non-null  float64
dtypes: bool(1), float64(7), int64(2), object(2)
memory usage: 29.4+ MB
CPU times: user 4.84 s, sys: 365 ms, total: 5.21 s
Wall time: 7.05 s


In [3]:
print(prices.shape,df_prices.shape)

(229958, 12) (317958, 12)


In [4]:
def fill_nans(prices):
    prices.set_index(["SecuritiesCode", "Date"], inplace=True)
    prices.ExpectedDividend.fillna(0,inplace=True)
    prices.ffill(inplace=True)
    prices.fillna(0,inplace=True)
    prices.reset_index(inplace=True)
    return prices

In [5]:
%%time
df_prices = fill_nans(df_prices)
prices = fill_nans(prices)
# pd.options.display.float_format = '{:,.6g}'.format
df_prices.describe()

CPU times: user 316 ms, sys: 37 µs, total: 316 ms
Wall time: 319 ms


Unnamed: 0,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,Target
count,317958.0,317958.0,317958.0,317958.0,317958.0,317958.0,317958.0,317958.0,317958.0
mean,5883.169098,2678.654362,2712.015742,2641.133593,2675.499613,634836.0,0.999983,0.209583,-0.000368
std,2389.5796,4201.170015,4253.857654,4140.124817,4194.782076,2591023.0,0.016752,5.066329,0.023925
min,1301.0,34.0,35.0,33.0,34.0,0.0,0.25,0.0,-0.524904
25%,3916.0,1024.0,1037.0,1010.25,1023.0,32600.0,1.0,0.0,-0.012048
50%,6201.0,1817.0,1839.0,1793.0,1815.0,98900.0,1.0,0.0,-0.000367
75%,7937.0,2983.0,3015.0,2942.0,2980.0,381200.0,1.0,0.0,0.010778
max,9997.0,82000.0,82060.0,79100.0,80030.0,313148100.0,10.0,1080.0,0.597907


In [6]:
# Utilities 
def calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)#通过定义均匀间隔创建数值序列。
    weights_mean = weights.mean()
    df = df.sort_values(by='Rank')
    purchase = (df['Target'][:portfolio_size]  * weights).sum() / weights_mean
    short    = (df['Target'][-portfolio_size:] * weights[::-1]).sum() / weights_mean
    return purchase - short

def calc_spread_return_sharpe(df, portfolio_size=200, toprank_weight_ratio=2):
    grp = df.groupby('Date')
    min_size = grp["Target"].count().min()#交易最少的某一天
    if min_size<2*portfolio_size:
        portfolio_size=min_size//2
        if portfolio_size<1:
            return 0, None
    buf = grp.apply(calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio, buf

def add_rank(df, col_name="pred"):
    df["Rank"] = df.groupby("Date")[col_name].rank(ascending=False, method="first") - 1 
    df["Rank"] = df["Rank"].astype("int")
    return df

In [7]:
## By Yuike - https://www.kaggle.com/code/ikeppyo/examples-of-higher-scores-than-perfect-predictions
# This function adjusts the predictions so that the daily spread return approaches a certain value.
def adjuster(df):
    def calc_pred(df, x, y, z):
        return df['Target'].where(df['Target'].abs() < x, df['Target'] * y + np.sign(df['Target']) * z)

    def objective(trial, df):
        x = trial.suggest_uniform('x', 0, 0.2)
        y = trial.suggest_uniform('y', 0, 0.05)
        z = trial.suggest_uniform('z', 0, 1e-3)
        df["Rank"] = calc_pred(df, x, y, z).rank(ascending=False, method="first") - 1 
        return calc_spread_return_per_day(df, 200, 2)

    def predictor_per_day(df):
        study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SD))#5187
        study.optimize(lambda trial: abs(objective(trial, df) - 3), 3)
        return calc_pred(df, *study.best_params.values())
    return df.groupby("Date").apply(predictor_per_day).reset_index(level=0, drop=True)

def _predictor_base(feature_df):
    return model.predict(feature_df[feats])

def _predictor_with_adjuster(feature_df):
    df_pred = feature_df.copy()
    df_pred["Target"] = model.predict(feature_df[feats])
    return adjuster(df_pred).values.T

def get_model_type(train_x,target,m_type='lgb'): 
    train_X,valid_x,train_y,valid_y = train_test_split(train_x,target,test_size=0.2,random_state=5)  
    print(train_X.shape,train_y.shape)
    print(valid_x.shape,valid_y.shape)
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.5, objective='rmse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2019,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.8,
                                )
        model.fit(train_X, train_y, 
              eval_set=[(train_X, train_y),(valid_x, valid_y)], 
            #   categorical_feature=cat_feats, 
              early_stopping_rounds=100, verbose=100)      
    elif m_type == 'xgb':
        model = xgb.XGBRegressor(
                                max_depth=6 , learning_rate=0.05, n_estimators=2000, 
                                objective='reg:squarederror', tree_method = 'hist',subsample=0.9, 
                                colsample_bytree=0.9, min_child_samples=5,eval_metric = 'rmse',
                                random_state=2019,
                                reg_lambda = 0.5
                                )

        model.fit(train_X, train_y, 
              eval_set=[(train_X, train_y),(valid_x, valid_y)], 
              early_stopping_rounds=100, verbose=100)  
    return model

In [8]:
# np.random.seed(0)
# feats = ['Open','High','Low','Close']
# max_score = 0
# max_depth = 0
# #     model = DecisionTreeRegressor( max_depth=md ) # Controlling the overfit with max_depth parameter
# #     model.fit(df_prices[feats],df_prices["Target"])
# model_name='lgb'
# model=get_model_type(df_prices[feats],df_prices["Target"],model_name)
# predictor = _predictor_base
# prices["pred"] = predictor(prices)
# score, buf = calc_spread_return_sharpe(add_rank(prices))
# print(f'model_name={model_name} : Sharpe Ratio Score base -> {max_score}')

In [9]:
np.random.seed(0)
feats = ['Open','High','Low','Close']
max_score = 0
max_depth = 0
for md in tqdm(range(3,40)):
    model = DecisionTreeRegressor( max_depth=md ) # Controlling the overfit with max_depth parameter
    model.fit(df_prices[feats],df_prices["Target"])
    predictor = _predictor_base
    prices["pred"] = predictor(prices)
    score, buf = calc_spread_return_sharpe(add_rank(prices))
    if score>max_score:
        max_score = score
        max_depth = md
        
model = DecisionTreeRegressor( max_depth=max_depth )
model.fit(df_prices[feats],df_prices["Target"])
print(f'Max_deph={max_depth} : Sharpe Ratio Score base -> {max_score}')

  0%|          | 0/37 [00:00<?, ?it/s]

Max_deph=39 : Sharpe Ratio Score base -> 3.7184151763589814


In [10]:
# # Controlling the Sharpe Ratio Score (≃3)
# predictor = _predictor_with_adjuster
# err = 1
# maxSD = 3683
# for SD in tqdm(range(maxSD,4000)):
#     prices["pred"] = predictor(prices)
#     score, buf = calc_spread_return_sharpe(add_rank(prices))
#     if abs(score-3)<=err and score<3:
#         err=abs(score-3)
#         maxSD = SD
#         print(f'{maxSD} Sharpe Ratio Score with adjuster -> {score}')
        
# SD = maxSD

In [11]:
%%time
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

for prices, options, financials, trades, secondary_prices, sample_prediction in iter_test:
    prices = fill_nans(prices)
    prices.loc[:,"pred"] = predictor(prices)
    prices = add_rank(prices)
    rank = prices.set_index('SecuritiesCode')['Rank'].to_dict()
    sample_prediction['Rank'] = sample_prediction['SecuritiesCode'].map(rank)
    env.predict(sample_prediction)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
CPU times: user 173 ms, sys: 5.01 ms, total: 178 ms
Wall time: 277 ms
