# QRT Challenge Data 2021

## Summary

The aim of the 2021 QRT Challenge Data is to determine the link between two types of assets: liquid and illiquid. We provide the returns of 100 illiquid assets, and the aim is to predict, for the same day, the sign of the return of 100 liquid assets.

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## Loading the data

In [2]:
# Read the three challenge CSV files; column 0 of each file becomes the index.
X_train, Y_train, X_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../ChallengeDataQRT2021Data/X_train_itDkypA.csv',
        '../ChallengeDataQRT2021Data/y_train_3LeeT2g.csv',
        '../ChallengeDataQRT2021Data/X_test_Beg4ey3.csv',
    )
)

In [3]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0_level_0,ID_DAY,RET_216,RET_238,RET_45,RET_295,RET_230,RET_120,RET_188,RET_260,RET_15,...,RET_122,RET_194,RET_72,RET_293,RET_281,RET_193,RET_95,RET_162,RET_297,ID_TARGET
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3316,0.004024,0.009237,0.004967,,0.01704,0.013885,0.041885,0.015207,-0.003143,...,0.007596,0.01501,0.014733,-0.000476,0.006539,-0.010233,0.001251,-0.003102,-0.094847,139
1,3316,0.004024,0.009237,0.004967,,0.01704,0.013885,0.041885,0.015207,-0.003143,...,0.007596,0.01501,0.014733,-0.000476,0.006539,-0.010233,0.001251,-0.003102,-0.094847,129
2,3316,0.004024,0.009237,0.004967,,0.01704,0.013885,0.041885,0.015207,-0.003143,...,0.007596,0.01501,0.014733,-0.000476,0.006539,-0.010233,0.001251,-0.003102,-0.094847,136
3,3316,0.004024,0.009237,0.004967,,0.01704,0.013885,0.041885,0.015207,-0.003143,...,0.007596,0.01501,0.014733,-0.000476,0.006539,-0.010233,0.001251,-0.003102,-0.094847,161
4,3316,0.004024,0.009237,0.004967,,0.01704,0.013885,0.041885,0.015207,-0.003143,...,0.007596,0.01501,0.014733,-0.000476,0.006539,-0.010233,0.001251,-0.003102,-0.094847,217


## Reshaping the data

We transform the data so that each line corresponds to a specific day

In [4]:
# Integer positions of the columns whose name contains 'RET' (the input return columns).
idx_ret_features = np.flatnonzero(X_train.columns.str.contains('RET'))
# The corresponding column labels.
init_ret_features = X_train.columns[idx_ret_features]
# One 'RET_<id>' label per distinct ID_TARGET, in order of first appearance.
target_ret_features = ['RET_' + str(target_id) for target_id in X_train['ID_TARGET'].unique()]

In [5]:
# Build one record per day holding both the input returns and the target returns.
# A single groupby pass replaces a full-table boolean scan per day; sort=False keeps
# the days in order of first appearance, as ID_DAY.unique() did.
returns_by_day = {}
day_groups = X_train.groupby('ID_DAY', sort=False)
for day, day_rows in tqdm(day_groups, total=day_groups.ngroups):
    # Input returns are read from the first row of the day only.
    feature_rets = day_rows.iloc[0, idx_ret_features]
    # Look the targets up by row label so Y_train is matched to X_train explicitly
    # (assumes both frames share the same ID index -- TODO confirm).
    target_rets = Y_train.loc[day_rows.index, 'RET_TARGET']
    target_rets.index = ['RET_' + str(target_id) for target_id in day_rows['ID_TARGET']]
    returns_by_day[day] = pd.concat([feature_rets, target_rets])
# Days become rows, assets become columns; assets absent on a day are left as NaN.
returns = pd.DataFrame(returns_by_day).T.astype(float)

HBox(children=(IntProgress(value=0, max=2748), HTML(value='')))




In [6]:
# Preview the first five days of the reshaped (day x asset) returns table.
returns.head(n=5)

Unnamed: 0,RET_0,RET_1,RET_102,RET_105,RET_106,RET_108,RET_109,RET_110,RET_114,RET_115,...,RET_88,RET_9,RET_90,RET_91,RET_93,RET_95,RET_96,RET_97,RET_98,RET_99
3316,-0.016501,0.018693,0.005316,,0.001352,0.027374,-0.01089,0.057911,0.014155,0.021104,...,0.027113,0.008602,0.01825,0.038581,0.027588,0.001251,0.037917,-0.002689,0.00214,0.023026
3355,0.000674,0.005759,0.007938,,-0.005017,-0.007413,-0.00598,0.0119,-0.011755,0.032401,...,0.017933,0.001658,0.061274,-0.018643,0.000372,-0.010232,-0.013496,0.013819,-0.036501,0.017736
1662,-0.000919,0.01043,0.007901,,0.006959,0.005593,-0.066666,0.049822,0.021599,0.019816,...,0.016645,-0.016289,0.02769,-0.026479,0.006664,0.023721,0.013057,-0.008237,0.014655,0.011614
3405,-0.004989,0.004558,0.004325,,-0.00772,0.002788,-0.038432,-0.021308,0.000595,0.001613,...,-0.039065,0.021897,-0.031456,0.029652,0.016171,-0.024835,-0.020391,0.00679,0.015796,-0.015763
1602,0.002468,-0.024584,0.008947,,-0.004735,-0.031061,0.012366,0.045117,-0.015403,-0.015327,...,-0.02808,0.01653,0.019826,0.014627,0.010884,-0.025205,0.006157,-0.006526,0.002125,0.036186


## Determine the best models

For each target (liquid) asset, we determine the best logit model.
We first test the most significant feature, and then add the remaining features sequentially. At the end, we keep the model with the best AIC.

In [7]:
from sklearn.covariance import oas
# Shrinkage covariance estimate of the returns, with missing values replaced by 0.
features = returns.columns
# oas returns (shrunk covariance, shrinkage coefficient); only the covariance is kept.
shrunk_cov, _ = oas(returns.fillna(0))
cov = pd.DataFrame(shrunk_cov, index=features, columns=features)

In [8]:
# Pairwise Pearson correlation between all return columns (inputs and targets).
returns_corr = returns.corr(method='pearson')
returns_corr.head(n=5)

Unnamed: 0,RET_0,RET_1,RET_102,RET_105,RET_106,RET_108,RET_109,RET_110,RET_114,RET_115,...,RET_88,RET_9,RET_90,RET_91,RET_93,RET_95,RET_96,RET_97,RET_98,RET_99
RET_0,1.0,0.178785,0.112833,0.146518,0.124386,0.08975,0.209515,0.099405,0.176252,0.172117,...,0.220032,-0.011212,0.18845,0.071751,0.198287,0.192124,0.061262,0.181465,0.098042,0.141503
RET_1,0.178785,1.0,0.102085,0.204024,0.112454,0.115006,0.185288,0.17657,0.217383,0.255642,...,0.287438,-0.050225,0.241026,0.099254,0.169832,0.231416,0.087668,0.173145,0.084043,0.155121
RET_102,0.112833,0.102085,1.0,0.18749,0.404844,0.202437,0.074562,0.193401,0.10124,0.159988,...,0.172473,0.009289,0.152705,0.043701,0.11582,0.202409,0.034653,0.175302,0.445446,0.226669
RET_105,0.146518,0.204024,0.18749,1.0,0.181432,0.146691,0.135362,0.250015,0.180981,0.225471,...,0.216492,-0.003092,0.23141,0.179422,0.226487,0.214873,0.135527,0.23741,0.200838,0.246471
RET_106,0.124386,0.112454,0.404844,0.181432,1.0,0.142693,0.093292,0.176528,0.117702,0.1718,...,0.170236,-0.018825,0.14098,0.020384,0.164396,0.180597,0.012277,0.209689,0.403044,0.194298


In [20]:
# Scoring different kernels and features
# cross_val_score was used without being imported, so this cell failed on a fresh kernel.
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
SVR_kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# target_svr_scores[target][kernel][k-1] is the mean 10-fold CV score of an SVR
# trained on the k input features most correlated with that target.
target_svr_scores = {}
for id_target in tqdm(target_ret_features):
    # Rank the input features by their correlation with the target, highest first.
    target_corr = returns_corr.loc[init_ret_features, id_target]
    target_corr = pd.DataFrame(target_corr).sort_values(by = id_target, ascending = False)
    # Days on which the target itself is missing.
    na_days = returns.index[returns[id_target].isna()].tolist()
    target_scores = {kern: [] for kern in SVR_kernels}
    for param_number in range(1, idx_ret_features.shape[0]+1):
        # Also exclude the days on which the newly added feature is missing.
        added_feature = target_corr.index[param_number-1]
        na_days = na_days + returns.index[returns[added_feature].isna()].tolist()
        complete_days = ~returns.index.isin(na_days)
        X = returns.loc[complete_days, target_corr.index[0:param_number]]
        Y = returns.loc[complete_days, id_target]
        for kern in SVR_kernels:
            model = SVR(kernel=kern, C = 1e-6)
            score = cross_val_score(model, X, Y, cv = 10).mean()
            target_scores[kern].append(score)
    target_svr_scores[id_target] = target_scores
    # NOTE(review): this break stops after the first target, so only one entry is
    # ever scored. Remove it to score every target (expect a much longer run).
    break

HBox(children=(IntProgress(value=0), HTML(value='')))

































































































































































































































In [23]:
# Display the current value of `Y`. It is assigned inside loops in other cells,
# so what is shown depends on which cell ran last.
Y

3116    0.003270
2632    0.003631
2098   -0.000590
2724    0.013399
2750   -0.011426
          ...   
3900    0.008145
1934    0.010781
2406    0.002304
1350   -0.000300
2354   -0.003185
Name: RET_139, Length: 390, dtype: float64

In [31]:
 target_param_number = target_scores.index(min(target_scores))+1
    target_features = target_corr.index[0:target_param_number]
    na_days = list(returns[id_target][returns[id_target].isna()].index.values)
    for param_number in range(1,target_param_number+1):
        na_days = na_days + list(returns[target_corr.index[param_number-1]][returns[target_corr.index[param_number-1]].isna()].index.values)
    X = returns[target_features][~returns.index.isin(na_days)]
    Y = returns[id_target][~returns.index.isin(na_days)]
    Y = np.sign(Y)
    neg = Y[Y<0].index.values
    Y[neg] = 0
    target_model = sm.Logit(Y, X)
    target_reg = target_model.fit()
    target_regs[id_target] = [target_param_number, target_features, target_reg]+

IndentationError: unexpected indent (<ipython-input-31-9f716e93a3ae>, line 2)

In [10]:
# Display the whole `target_regs` dict. Each value is a list of
# [number of features, feature labels, fitted model]; the output can be very long.
target_regs

{'RET_139': [97,
  Index(['RET_18', 'RET_74', 'RET_156', 'RET_296', 'RET_223', 'RET_193',
         'RET_83', 'RET_172', 'RET_245', 'RET_63', 'RET_201', 'RET_72',
         'RET_122', 'RET_84', 'RET_49', 'RET_62', 'RET_168', 'RET_121', 'RET_66',
         'RET_194', 'RET_30', 'RET_87', 'RET_15', 'RET_264', 'RET_222', 'RET_26',
         'RET_110', 'RET_45', 'RET_238', 'RET_182', 'RET_118', 'RET_256',
         'RET_163', 'RET_216', 'RET_58', 'RET_262', 'RET_0', 'RET_224',
         'RET_197', 'RET_138', 'RET_203', 'RET_276', 'RET_105', 'RET_41',
         'RET_150', 'RET_99', 'RET_242', 'RET_5', 'RET_263', 'RET_187',
         'RET_295', 'RET_268', 'RET_31', 'RET_116', 'RET_285', 'RET_260',
         'RET_115', 'RET_259', 'RET_55', 'RET_270', 'RET_95', 'RET_123',
         'RET_261', 'RET_240', 'RET_159', 'RET_297', 'RET_286', 'RET_35',
         'RET_108', 'RET_250', 'RET_266', 'RET_213', 'RET_162', 'RET_229',
         'RET_120', 'RET_59', 'RET_230', 'RET_88', 'RET_56', 'RET_265',
         'RET_

## Prediction on test data

We thus simply make the predictions on the test data set using the models.

If there are missing values, we fill them in with scikit-learn's iterative imputer.

In [11]:
# IterativeImputer is experimental: this import must come first to enable it.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Imputer with default settings, fitted on the training table.
# NOTE(review): every column of X_train is passed, including ID_DAY and ID_TARGET.
imp_mean = IterativeImputer()
imp_mean.fit(X_train)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=None,
                 sample_posterior=False, tol=0.001, verbose=0)

In [12]:
# Impute the whole test table once instead of once per row. The original wrapped each
# row in np.matrix, which NumPy no longer recommends and recent scikit-learn rejects.
X_test_filled = pd.DataFrame(
    imp_mean.transform(X_test), index=X_test.index, columns=X_test.columns
)

pred_by_id = {}
NA_count = 0  # number of rows that fell back to the sign of the training mean
# Take ID_TARGET from the raw table so it is never an imputed value.
rows_with_target = zip(X_test_filled.iterrows(), X_test['ID_TARGET'])
# Passing total lets the progress bar show real progress.
for (idx, row_fill), raw_target in tqdm(rows_with_target, total=len(X_test)):
    id_target = 'RET_' + str(int(raw_target))
    target_param_number, target_features, target_reg = target_regs[id_target]
    X = row_fill[target_features]
    if X.isna().any():
        # A feature is still missing after imputation: fall back to the sign of
        # the target's mean return over the training days.
        res = np.sign(returns[id_target].mean())
        NA_count += 1
    else:
        # Predicted probability above 0.5 -> +1, below -> -1 (exactly 0.5 -> 0).
        res = np.sign(target_reg.predict(X.to_numpy().reshape(1, -1)) - 0.5)[0]
    pred_by_id[idx] = res
pred = pd.Series(pred_by_id, name="RET_TARGET")

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [13]:
# Number of test rows that used the fallback prediction because a required
# feature was still NaN after imputation.
NA_count

0

## Save the result before submission

In [14]:
# Name the series 'RET_TARGET' and its index 'ID', cast the signs to integers,
# then write the result to CSV with a header row.
pred = pred.rename("RET_TARGET").rename_axis('ID').astype(int)
pred.to_csv('../ChallengeDataQRT2021Data/logit.csv', header=True)