In [2]:
import numpy as np
import pandas as pd
import math
import os
from scipy import stats
import lightgbm as lgb
#import jpx_tokyo_market_prediction
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.model_selection import cross_val_score, KFold, TimeSeriesSplit, GroupKFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
import joblib
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [3]:
def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

SEED=42
seed_everything(SEED)

In [4]:
train = pd.read_csv("./train_files/stock_prices.csv",parse_dates=["Date"])
train=train.drop(columns=['RowId','ExpectedDividend','AdjustmentFactor','SupervisionFlag']).dropna().reset_index(drop=True)

In [5]:
def add_features(feats):
    feats["return_1month"] = feats["Close"].pct_change(20)
    feats["return_2month"] = feats["Close"].pct_change(40)
    feats["return_3month"] = feats["Close"].pct_change(60)
    feats["volatility_1month"] = (
        np.log(feats["Close"]).diff().rolling(20).std()
    )
    feats["volatility_2month"] = (
        np.log(feats["Close"]).diff().rolling(40).std()
    )
    feats["volatility_3month"] = (
        np.log(feats["Close"]).diff().rolling(60).std()
    )
    feats["MA_gap_1month"] = feats["Close"] / (
        feats["Close"].rolling(20).mean()
    )
    feats["MA_gap_2month"] = feats["Close"] / (
        feats["Close"].rolling(40).mean()
    )
    feats["MA_gap_3month"] = feats["Close"] / (
        feats["Close"].rolling(60).mean()
    )
    
    return feats

In [6]:
train = add_features(train)

In [9]:
def feval_rmse(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'rmse', mean_squared_error(y_true, y_pred), False

def feval_pearsonr(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'pearsonr', stats.pearsonr(y_true, y_pred)[0], True

def calc_spread_return_per_day(df, portfolio_size=200, toprank_weight_ratio=2):
    assert df['Rank'].min() == 0
    assert df['Rank'].max() == len(df['Rank']) - 1
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
    short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
    return purchase - short

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size=200, toprank_weight_ratio=2):
    buf = df.groupby('Date').apply(calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio#, buf

def add_rank(df):
    df["Rank"] = df.groupby("Date")["Target"].rank(ascending=False, method="first") - 1 
    df["Rank"] = df["Rank"].astype("int")
    return df

def fill_nan_inf(df):
    df = df.fillna(0)
    df = df.replace([np.inf, -np.inf], 0)
    return df

def feval_rmse(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'rmse', mean_squared_error(y_true, y_pred), False

def feval_pearsonr(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'pearsonr', stats.pearsonr(y_true, y_pred)[0], True

def calc_spread_return_per_day(df, portfolio_size=200, toprank_weight_ratio=2):
    assert df['Rank'].min() == 0
    assert df['Rank'].max() == len(df['Rank']) - 1
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
    short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
    return purchase - short

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size=200, toprank_weight_ratio=2):
    buf = df.groupby('Date').apply(calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio#, buf

def add_rank(df):
    df["Rank"] = df.groupby("Date")["Target"].rank(ascending=False, method="first") - 1 
    df["Rank"] = df["Rank"].astype("int")
    return df

def fill_nan_inf(df):
    df = df.fillna(0)
    df = df.replace([np.inf, -np.inf], 0)
    return df

def check_score(df,preds,Securities_filter=[]):
    tmp_preds=df[['Date','SecuritiesCode']].copy()
    tmp_preds['Target']=preds
    
    #Rank Filter. Calculate median for this date and assign this value to the list of Securities to filter.
    tmp_preds['target_mean']=tmp_preds.groupby("Date")["Target"].transform('median')
    tmp_preds.loc[tmp_preds['SecuritiesCode'].isin(Securities_filter),'Target']=tmp_preds['target_mean']
    
    tmp_preds = add_rank(tmp_preds)
    df['Rank']=tmp_preds['Rank']
    score=round(calc_spread_return_sharpe(df, portfolio_size= 200, toprank_weight_ratio= 2),5)
    score_mean=round(df.groupby('Date').apply(calc_spread_return_per_day, 200, 2).mean(),5)
    score_std=round(df.groupby('Date').apply(calc_spread_return_per_day, 200, 2).std(),5)
    print(f'Competition_Score:{score}, rank_score_mean:{score_mean}, rank_score_std:{score_std}')

In [10]:
list_spred_h=list((train.groupby('SecuritiesCode')['Target'].max()-train.groupby('SecuritiesCode')['Target'].min()).sort_values()[:1000].index)
list_spred_l=list((train.groupby('SecuritiesCode')['Target'].max()-train.groupby('SecuritiesCode')['Target'].min()).sort_values()[1000:].index)

In [11]:
# Training just with Securities with hight target_spread and validated with Securities with low target_spread.

features =['High','Low','Open','Close','Volume', 'return_1month', 'return_2month', 'return_3month', 'volatility_1month', 'volatility_2month', 'volatility_3month',
       'MA_gap_1month', 'MA_gap_2month', 'MA_gap_3month']
# features =['High','Low','Open','Close','Volume',]
train=fill_nan_inf(train)

params_lgb = {'learning_rate': 0.005,'metric':'None','objective': 'regression','boosting': 'gbdt','verbosity': 0,'n_jobs': -1,'force_col_wise':True}  

tr_dataset = lgb.Dataset(train[train['SecuritiesCode'].isin(list_spred_h)][features],train[train['SecuritiesCode'].isin(list_spred_h)]["Target"],feature_name = features )
vl_dataset = lgb.Dataset(train[train['SecuritiesCode'].isin(list_spred_l)][features], train[train['SecuritiesCode'].isin(list_spred_l)]["Target"],feature_name = features)

model = lgb.train(params = params_lgb, 
                train_set = tr_dataset, 
                valid_sets = [tr_dataset, vl_dataset], 
                num_boost_round = 3000, 
                feval=feval_pearsonr,
                callbacks=[ lgb.early_stopping(stopping_rounds=300, verbose=True), lgb.log_evaluation(period=100)])    


Training until validation scores don't improve for 300 rounds
[100]	training's pearsonr: 0.0564282	valid_1's pearsonr: 0.0108009
[200]	training's pearsonr: 0.0680563	valid_1's pearsonr: 0.0134107
[300]	training's pearsonr: 0.0761529	valid_1's pearsonr: 0.0142165
[400]	training's pearsonr: 0.082453	valid_1's pearsonr: 0.0146069
[500]	training's pearsonr: 0.0883774	valid_1's pearsonr: 0.0147149
[600]	training's pearsonr: 0.0938508	valid_1's pearsonr: 0.0148599
[700]	training's pearsonr: 0.0986576	valid_1's pearsonr: 0.014839
[800]	training's pearsonr: 0.103034	valid_1's pearsonr: 0.014711
[900]	training's pearsonr: 0.106989	valid_1's pearsonr: 0.0146882
Early stopping, best iteration is:
[606]	training's pearsonr: 0.0942029	valid_1's pearsonr: 0.0148712


In [13]:
test = pd.read_csv("./supplemental_files/stock_prices.csv",parse_dates=["Date"])
test=test.drop(columns=['RowId','ExpectedDividend','AdjustmentFactor','SupervisionFlag'])
test = add_features(test)
test=fill_nan_inf(test)
preds=model.predict(test[features])
print(math.sqrt(mean_squared_error(preds,test.Target)))

check_score(test,preds)
check_score(test,preds,list_spred_h)
check_score(test,preds,list_spred_l)

0.023902313714663583
Competition_Score:0.25684, rank_score_mean:0.18588, rank_score_std:0.72371
Competition_Score:0.22799, rank_score_mean:0.18293, rank_score_std:0.80235
Competition_Score:0.194, rank_score_mean:0.10493, rank_score_std:0.54088
