### This is a simple LGB baseline. You can work for feature engineering.
### The seed is 42, which will bring good luck!


In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import time
import datetime
from numba import jit
from lightgbm import LGBMRegressor
from multiprocessing import Pool
from sklearn.model_selection import TimeSeriesSplit

import pickle
import gc

from tqdm import tqdm

n_fold = 10
group_gap = 31
seed = 42


TRAIN_MARKET_PATH = '/kaggle/input/hku-qids-2023-quantitative-investment-competition/first_round_train_market_data.csv'
TRAIN_FUNADMENTAL_PATH = '/kaggle/input/hku-qids-2023-quantitative-investment-competition/first_round_train_fundamental_data.csv'
TRAIN_RETURN_PATH = '/kaggle/input/hku-qids-2023-quantitative-investment-competition/first_round_train_return_data.csv'


TEST_MARKET_PATH = '/kaggle/input/hku-qids-2023-quantitative-investment-competition/qids_package/first_round_test_market_data.csv'
TEST_FUNADMENTAL_PATH = '/kaggle/input/hku-qids-2023-quantitative-investment-competition/qids_package/first_round_test_fundamental_data.csv'


pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 350)

In [2]:
df_train_market = pd.read_csv(TRAIN_MARKET_PATH)
df_train_return = pd.read_csv(TRAIN_RETURN_PATH)
df_train_fundamental = pd.read_csv(TRAIN_FUNADMENTAL_PATH)

df_test_market = pd.read_csv(TEST_MARKET_PATH)
df_test_fundamental = pd.read_csv(TEST_FUNADMENTAL_PATH)


In [3]:
def split_time(x):
    df1 = x['date_time'].str.split('d', expand=True)
    df1.columns=['code','s']
    code = df1['code']
    df1 = df1['s'].str.split('p', expand=True)
    df1.columns=['day','time_step']
    df2 = x['date_time'].str.rsplit('p', expand=True)
    df2.columns=['day_s','s']
    df1['day_s'] = df2['day_s']
    df1['code'] = code
    x = pd.concat([x,df1],axis=1)
    
    return x

df_train_market = split_time(df_train_market)
df = pd.merge(df_train_fundamental,df_train_market, left_on='date_time',right_on='day_s')  
df = pd.merge(df,df_train_return, left_on='day_s',right_on='date_time')  

df_test_market = split_time(df_test_market)
test = pd.merge(df_test_fundamental,df_test_market, left_on='date_time',right_on='day_s')  


In [4]:
df = df.drop_duplicates(subset='day_s', keep='last').reset_index(drop=True)
test = test.drop_duplicates(subset='day_s', keep='last').reset_index(drop=True)


In [5]:
def correlation(a, train_data):
    
    b = train_data.get_label()
    
    a = np.ravel(a)
    b = np.ravel(b)

    len_data = len(a)
    mean_a = np.sum(a) / len_data
    mean_b = np.sum(b) / len_data
    var_a = np.sum(np.square(a - mean_a)) / len_data
    var_b = np.sum(np.square(b - mean_b)) / len_data

    cov = np.sum((a * b))/len_data - mean_a*mean_b
    corr = cov / np.sqrt(var_a * var_b)

    return 'corr', corr, True

# For CV score calculation
def corr_score(pred, valid):
    len_data = len(pred)
    mean_pred = np.sum(pred) / len_data
    mean_valid = np.sum(valid) / len_data
    var_pred = np.sum(np.square(pred - mean_pred)) / len_data
    var_valid = np.sum(np.square(valid - mean_valid)) / len_data

    cov = np.sum((pred * valid))/len_data - mean_pred*mean_valid
    corr = cov / np.sqrt(var_pred * var_valid)

    return corr

# For CV score calculation
def wcorr_score(pred, valid, weight):
    len_data = len(pred)
    sum_w = np.sum(weight)
    mean_pred = np.sum(pred * weight) / sum_w
    mean_valid = np.sum(valid * weight) / sum_w
    var_pred = np.sum(weight * np.square(pred - mean_pred)) / sum_w
    var_valid = np.sum(weight * np.square(valid - mean_valid)) / sum_w

    cov = np.sum((pred * valid * weight)) / sum_w - mean_pred*mean_valid
    corr = cov / np.sqrt(var_pred * var_valid)

    return corr

In [6]:
df

Unnamed: 0,date_time_x,turnoverRatio,transactionAmount,pe_ttm,pe,pb,ps,pcf,date_time_y,open,close,high,low,volume,money,day,time_step,day_s,code,date_time,return
0,s0d1,3.6794,17229.0,34.4425,32.3029,4.9425,3.8180,-578.7700,s0d1p50,24.3731,24.3852,24.3852,24.3731,170476.0,4.157520e+06,1,50,s0d1,s0,s0d1,-0.026877
1,s1d1,2.5150,3706.0,28.9934,27.2726,5.0552,3.0484,23.8260,s1d1p50,16.1557,16.1314,16.2771,16.1071,70944.0,1.146780e+06,1,50,s1d1,s1,s1d1,-0.052674
2,s2d1,1.2858,5136.0,42.9352,41.9279,4.8083,4.1392,-58.2185,s2d1p50,9.0307,9.0307,9.0307,9.0307,84204.0,7.603632e+05,1,50,s2d1,s2,s2d1,-0.002691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53889,s51d998,1.1501,5179.0,14.5922,14.5922,1.4130,0.7009,19.5205,s51d998p50,3.7142,3.7142,3.7142,3.7142,189327.0,7.036010e+05,998,50,s51d998,s51,s51d998,-0.052286
53890,s52d998,0.5684,7558.0,28.9922,28.9922,5.7855,3.7150,-582.4621,s52d998p50,28.8642,28.8642,28.8642,28.8642,59609.0,1.720926e+06,998,50,s52d998,s52,s52d998,-0.015559
53891,s53d998,1.2933,9681.0,10.6513,10.6513,1.9956,0.4073,134.5467,s53d998p50,9.9410,9.9410,9.9410,9.9410,185080.0,1.838895e+06,998,50,s53d998,s53,s53d998,-0.003662


In [7]:
def train_and_evaluate(train,test):
    # Hyperparammeters (just basic)
    params = {
      'objective': 'rmse',  
      'boosting_type': 'gbdt',
      'n_jobs': -1,
      'verbose': -1
    }
    
    # Split features and target
    x = train[[i for i in df.columns if i not in ['date_time_x', 'date_time_y', 'day', 'time_step', 'day_s', 'code', 'date_time','return']]]
    y = train['return']
    
    x_test = test[[i for i in df.columns if i not in ['date_time_x', 'date_time_y', 'day', 'time_step', 'day_s', 'code', 'date_time','return']]]

    oof_predictions = np.zeros(x.shape[0])
    test_predictions = np.zeros(x_test.shape[0])
    scores = []

    # Create a KFold object
    gkf = TimeSeriesSplit(n_splits=n_fold,gap=group_gap)
    for fold, (trn_ind, val_ind) in enumerate(gkf.split(train['day'].values)):
    
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]

        train_dataset = lgb.Dataset(x_train, y_train)
        val_dataset = lgb.Dataset(x_val, y_val)
        model = lgb.train(params = params, 
                          train_set = train_dataset, 
                          valid_sets = [train_dataset, val_dataset], 
                          num_boost_round = 200, 
                          early_stopping_rounds = 20, 
                          verbose_eval = False,
                          feval = correlation)
        # Add predictions to the out of folds array
        
        oof_predictions[val_ind] = model.predict(x_val)
        
        rmspe_score = corr_score(y_val,oof_predictions[val_ind])
        print(f'Our out of folds corr_score is {rmspe_score}')
        scores.append(rmspe_score)
        test_predictions += model.predict(x_test) 
        
    rmspe_score = corr_score(y, oof_predictions)
    print(scores)
    print(f'Our out of folds corr score is {rmspe_score}')
    
    # Return test predictions
    return test_predictions

In [8]:
test_predictions = train_and_evaluate(df,test)


Training fold 1
Our out of folds corr_score is 0.04469048981302667
Training fold 2




Our out of folds corr_score is 0.0016948699620375541
Training fold 3
Our out of folds corr_score is 0.03304920588876939
Training fold 4
Our out of folds corr_score is 0.06361372392927257
Training fold 5
Our out of folds corr_score is 0.059660214497047954
Training fold 6
Our out of folds corr_score is -0.020799893035058553
Training fold 7
Our out of folds corr_score is 0.05544207293633959
Training fold 8
Our out of folds corr_score is 0.0758929277054789
Training fold 9
Our out of folds corr_score is 0.06857213133435174
Training fold 10
Our out of folds corr_score is -0.019069953010508692
[0.04469048981302667, 0.0016948699620375541, 0.03304920588876939, 0.06361372392927257, 0.059660214497047954, -0.020799893035058553, 0.05544207293633959, 0.0758929277054789, 0.06857213133435174, -0.019069953010508692]
Our out of folds corr score is -0.01887318116066339


In [9]:
# Save test predictions
test['return'] = test_predictions

prediction = test[['date_time_x','return']]
prediction.columns=['date_time','return']
prediction.to_csv('submission.csv',index = False)