In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import lightgbm as lgb
import time
import datetime

# from lightgbm import LGBMRegressor
from multiprocessing import Pool
from sklearn.model_selection import TimeSeriesSplit

import pickle
import gc

import tqdm

# Run-level settings. None of the cells visible here read n_fold or group_gap.
n_fold = 10      # presumably the number of cross-validation folds — TODO confirm
group_gap = 31   # presumably a gap (in days) between train/validation folds — TODO confirm
seed = 42        # NOTE: this name is rebound to 257248 in a later cell

In [2]:
# Locations of the first-round training CSVs.
ROOT_PATH = "../../data/"
TRAIN_MARKET_PATH = ROOT_PATH + 'first_round_train_market_data.csv'
# TRAIN_MARKET_PATH = f'{ROOT_PATH}train.csv'
TRAIN_FUNADMENTAL_PATH = ROOT_PATH + 'first_round_train_fundamental_data.csv'
TRAIN_RETURN_PATH = ROOT_PATH + 'first_round_train_return_data.csv'

# Locations of the first-round test CSVs (kept in a different directory).
TEST_ROOT_PATH = "../qids_package/"
TEST_MARKET_PATH = TEST_ROOT_PATH + 'first_round_test_market_data.csv'
# TEST_MARKET_PATH = f'{ROOT_PATH}test.csv'
TEST_FUNADMENTAL_PATH = TEST_ROOT_PATH + 'first_round_test_fundamental_data.csv'


# Keep displayed frames short but allow many columns.
pd.options.display.max_rows = 6
pd.options.display.max_columns = 350

# Load the raw training tables (market, return, fundamental — read in that order).
df_train_market, df_train_return, df_train_fundamental = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_MARKET_PATH, TRAIN_RETURN_PATH, TRAIN_FUNADMENTAL_PATH)
)

# Load the raw test tables (market, then fundamental).
df_test_market, df_test_fundamental = (
    pd.read_csv(csv_path)
    for csv_path in (TEST_MARKET_PATH, TEST_FUNADMENTAL_PATH)
)

# Helper used before merging the train/test tables: split the composite date_time key into its parts
def split_time(x):
    """
    Append the pieces of the composite ``date_time`` key as new string columns.

    ``date_time`` is split once on 'd' and once on 'p', so each value must
    contain exactly one 'd' and one 'p' (shape ``<code>d<day>p<time_step>``).

    Returns a new frame: the columns of ``x`` followed by
    ``day``, ``time_step``, ``day_s`` (everything before the 'p') and
    ``code`` (everything before the 'd').
    """
    stamp = x['date_time']

    # "<code>" | "<day>p<time_step>"
    around_d = stamp.str.split('d', expand=True)
    around_d.columns = ['code', 's']

    # "<day>" | "<time_step>"
    day_and_step = around_d['s'].str.split('p', expand=True)
    day_and_step.columns = ['day', 'time_step']

    # "<code>d<day>" | "<time_step>"
    around_p = stamp.str.rsplit('p', expand=True)
    around_p.columns = ['day_s', 's']

    extra = day_and_step.assign(day_s=around_p['day_s'], code=around_d['code'])
    return pd.concat([x, extra], axis=1)

# Train: split the market key, inner-join fundamental -> market -> return on the
# per-day key, then keep only the last market row of every day_s value.
df_train_market = split_time(df_train_market)
df = (
    df_train_fundamental
    .merge(df_train_market, left_on='date_time', right_on='day_s')
    .merge(df_train_return, left_on='day_s', right_on='date_time')
    .drop_duplicates(subset='day_s', keep='last')
    .reset_index(drop=True)
)

# Test: same steps, without the return table.
df_test_market = split_time(df_test_market)
test = (
    df_test_fundamental
    .merge(df_test_market, left_on='date_time', right_on='day_s')
    .drop_duplicates(subset='day_s', keep='last')
    .reset_index(drop=True)
)

def growth(data, features, group):
    """
    Add a ``<feature>_growth`` column for every name in ``features``.

    Each new column is the per-``group`` percentage change of the feature
    (``GroupBy.pct_change``). The columns are written into ``data`` itself,
    and that same object is returned.
    """
    by_group = data.groupby(group)

    for col in features:
        pct = by_group[col].pct_change()
        data[f'{col}_growth'] = pct

    return data

def lag_feature_with_group(data, features, n, group):
    """
    Add lagged copies of ``features``, shifted within each ``group``.

    For every lag 1..n and every feature a column ``<feature>_<lag>`` is
    written into ``data`` (lag-major order). ``data`` itself is modified
    and returned.
    """
    by_group = data.groupby(group)

    for lag in range(1, n + 1):
        for col in features:
            shifted = by_group[col].shift(lag)
            data[f'{col}_{lag}'] = shifted

    return data

def sma(data, features, n, group):
    """
    Add simple-moving-average columns, computed within each ``group``.

    Parameters
    ----------
    data : DataFrame
        Frame to extend. It is modified in place and also returned.
    features : iterable of str
        Columns to average.
    n : iterable of int
        Window lengths; one ``<feature>_sma<window>`` column is added per
        (window, feature) pair. The first ``window - 1`` rows of every group
        are NaN (default ``min_periods`` of ``rolling``).
    group : str or list of str
        Column(s) to group by.

    Returns
    -------
    DataFrame
        The same ``data`` object with the new columns.
    """
    grouped = data.groupby(group)

    for window in n:
        for feature in features:
            # transform() returns the rolling mean on data's own index, so each
            # value lands on the row it was computed for. The previous
            # groupby.rolling(...).reset_index(drop=True) produced values in
            # group-sorted order with a fresh 0..n-1 index, which misaligned
            # them whenever rows of different groups were interleaved or the
            # index was not a default RangeIndex.
            data[f'{feature}_sma{window}'] = grouped[feature].transform(
                lambda s, w=window: s.rolling(w).mean()
            )

    return data

# Columns that receive lag / moving-average / growth features, and the SMA windows.
features = ['pe_ttm', 'pe', 'pb', 'ps', 'pcf']
sma_periods = [10,25,50]

# Every helper writes its columns into the frame it receives and returns that
# same frame, so df_lag, df_sma and df_growth are all the same object as df.
df_lag = lag_feature_with_group(data=df, features=features, n=2, group='code')
df_sma = sma(data=df_lag, features=features, n=sma_periods, group='code')
df_growth = growth(data=df_sma, features=features, group='code')
# fig, ax = plt.subplots(figsize=(7,15))
# sns.heatmap(df_lag.corr(numeric_only=True)[['return']].sort_values(by='return', ascending=False),annot=True);

# Same three steps for the test frame.
test = lag_feature_with_group(data=test, features=features, n=2, group='code')
test = sma(data=test, features=features, n=sma_periods, group='code')
test = growth(data=test, features=features, group='code')

# df = df.dropna()
# test = test.dropna()

FileNotFoundError: [Errno 2] No such file or directory: '../../data/first_round_train_market_data.csv'

In [3]:
import pandas as pd
import numpy as np
import copy
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Dataset dimensions used by the cells below.
seed = 257248
stock_num = 54
train_day_num_total = 1000
train_day_num = train_day_num_total - 2
test_day_num = 700
timeslot_num = 50


def calc_log(df):
    """Natural log of ``df`` with every value <= 1e-8 replaced by 1e-8 first."""
    return np.log(np.where(df > 1e-8, df, 1e-8))


def calc_mean(df):
    """Mean along axis 0."""
    return df.mean(axis=0)


def calc_max(df):
    """Maximum along axis 0."""
    return df.max(axis=0)


def calc_min(df):
    """Minimum along axis 0."""
    return df.min(axis=0)


def calc_std(df):
    """``df.std()`` with its default arguments."""
    return df.std()


def calc_var(df):
    """``df.var()`` with its default arguments."""
    return df.var()


def calc_add(df1, df2):
    """Element-wise ``df1 + df2``."""
    return df1 + df2


def calc_diff(df1, df2):
    """Element-wise ``df1 - df2``."""
    return df1 - df2


def calc_prod(df1, df2):
    """Element-wise ``df1 * df2``."""
    return df1 * df2


def calc_div(df1, df2):
    """Element-wise ``df1 / df2``."""
    return df1 / df2

def preprocess(fun, mar, ret=None, na_days=(999, 1000)):
    """Attach per-day market summaries (and optionally returns) to the fundamentals.

    The stock id and day (and, for ``mar``, the intraday time slot) are parsed
    out of each frame's ``date_time`` string and stored as integer columns
    ``stock_id`` / ``day`` / ``time``. These columns are written into the
    frames passed in, so the caller's objects gain them too.

    Market rows are summarised per (stock_id, day) and joined to the
    fundamentals on those keys; returns are joined on the same keys. Earlier
    versions sliced ``mar`` by position using the global stock/day/slot counts
    and joined by row position, which silently put features and returns on the
    wrong rows whenever a stock/day had an unexpected number of rows.

    Parameters
    ----------
    fun : DataFrame
        Per-stock-per-day rows with a ``date_time`` column shaped ``s<id>d<day>``.
    mar : DataFrame
        Intraday rows with ``date_time`` shaped ``s<id>d<day>p<time>`` and the
        columns ``open``, ``close``, ``high``, ``low``, ``volume``, ``money``.
    ret : DataFrame, optional
        Rows with ``date_time`` shaped ``s<id>d<day>`` and a ``return`` column.
    na_days : iterable of int, default (999, 1000)
        Days removed from ``fun`` and ``mar`` and handed back separately.
        Presumably the last two training days, for which no return is
        available — TODO confirm.

    Returns
    -------
    list
        ``[combined, fun, mar, na_fun, na_mar]`` plus ``ret`` as a sixth item
        when ``ret`` was given. ``fun``/``mar``/``ret`` are the sorted frames;
        ``na_fun``/``na_mar`` hold the ``na_days`` rows; ``combined`` is
        ``fun`` + ``return`` (if any) + the market summary columns.
    """
    keys = ["stock_id", "day"]
    na_days = list(na_days)

    # --- fundamentals: parse ids, sort, split off the na_days rows
    fun["stock_id"] = fun["date_time"].apply(lambda x: x.split("d")[0][1:]).astype("int")
    fun["day"] = fun["date_time"].apply(lambda x: x.split("d")[1]).astype("int")
    fun = fun.sort_values(by=keys)
    na_fun = fun.loc[fun["day"].isin(na_days)]
    fun = fun.drop(na_fun.index, axis=0).reset_index(drop=True)
    na_fun = na_fun.reset_index(drop=True)

    # --- market: parse ids, sort, split off the na_days rows
    mar["stock_id"] = mar["date_time"].apply(lambda x: x.split("d")[0][1:]).astype("int")
    mar["day"] = mar["date_time"].apply(lambda x: x.split("d")[1].split("p")[0]).astype("int")
    mar["time"] = mar["date_time"].apply(lambda x: x.split("p")[1]).astype("int")
    mar = mar.sort_values(by=keys + ["time"]).reset_index(drop=True)
    na_mar = mar.loc[mar["day"].isin(na_days)]
    mar = mar.drop(na_mar.index, axis=0).reset_index(drop=True)
    na_mar = na_mar.reset_index(drop=True)

    combined = fun.copy()
    if ret is not None:
        ret["stock_id"] = ret["date_time"].apply(lambda x: x.split("d")[0][1:]).astype("int")
        ret["day"] = ret["date_time"].apply(lambda x: x.split("d")[1]).astype("int")
        ret = ret.sort_values(by=keys).reset_index(drop=True)
        # Look the return up by (stock_id, day) instead of by row position, so
        # extra or missing rows in ret cannot shift targets onto other rows.
        return_by_key = (
            ret.drop_duplicates(subset=keys, keep="last")
            .set_index(keys)["return"]
        )
        combined["return"] = return_by_key.reindex(
            pd.MultiIndex.from_frame(combined[keys])
        ).to_numpy()

    # --- per-(stock, day) summary of the intraday rows
    intraday = mar.assign(
        _open_to_close=(mar["close"] - mar["open"]) / mar["open"],
        _high_to_low=(mar["high"] - mar["low"]) / mar["open"],
    )
    mar_summary = (
        intraday.groupby(keys, sort=True)
        .agg(
            open_mean=("open", "mean"),
            close_mean=("close", "mean"),
            high_mean=("high", "mean"),
            low_mean=("low", "mean"),
            volume_mean=("volume", "mean"),
            money_mean=("money", "mean"),
            high_max=("high", "max"),
            volume_max=("volume", "max"),
            money_max=("money", "max"),
            low_min=("low", "min"),
            volume_min=("volume", "min"),
            money_min=("money", "min"),
            volume_std=("volume", "std"),
            money_std=("money", "std"),
            volume_var=("volume", "var"),
            money_var=("money", "var"),
            # max over the day of (close - open) / open
            price_diff=("_open_to_close", "max"),
            # max over the day of (high - low) / open
            price_diff_max=("_high_to_low", "max"),
        )
        .reset_index()
    )
    # Left join keeps every fundamentals row, in order; days without market
    # rows get NaN summaries.
    combined = combined.merge(mar_summary, on=keys, how="left")

    return [combined, fun, mar, na_fun, na_mar, ret] if ret is not None else [combined, fun, mar, na_fun, na_mar]

# Drop the join helper columns and zero-fill missing values.
train = (
    df
    .drop(columns=["date_time_x", "date_time_y", "day_s", "code"])
    .fillna(0)
)
test = (
    test
    .rename(columns={"date_time_x": "date_time"})
    .drop(columns=["date_time_y", "day_s", "code"])
    .fillna(0)
)

# The feature frames stand in for the fundamentals; market and return tables
# are passed as loaded.
(
    train_combined,
    train_fun,
    train_mar,
    train_na_fun,
    train_na_mar,
    train_ret,
) = preprocess(train, df_train_market, df_train_return)
(
    test_combined,
    test_fun,
    test_mar,
    test_na_fun,
    test_na_mar,
) = preprocess(test, df_test_market)

In [4]:
def reorder(df):
    """
    Return ``df`` with its columns rearranged.

    ``date_time``, ``stock_id`` and ``day`` come first, the remaining columns
    keep their relative order, and ``return`` (only if present) goes last.
    """
    leading = ['date_time', 'stock_id', 'day']
    # dict.fromkeys keeps first-seen order and lists each remaining name once.
    middle = list(dict.fromkeys(
        name for name in df.columns
        if name not in leading and name != 'return'
    ))
    trailing = ['return'] if 'return' in df.columns else []
    return df[leading + middle + trailing]

# Put the key columns first (and 'return' last), then write both frames to CSV
# without the index.
train, test = reorder(train_combined), reorder(test_combined)

train.to_csv(path_or_buf="../../data/train_github.csv", index=False)
test.to_csv(path_or_buf="../../data/test_github.csv", index=False)