In [56]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import os

%matplotlib inline

In [57]:
from Preprocessing.preprocessing import *

In [58]:
from sklearn.pipeline import Pipeline

1. Invert OMR, OTR, OCR
2. Fill last NaNs (0 for know)
3. Create number_trades_same_day
4. Add type column
5. Drop columns

In [59]:
X_TRAIN_PATH = 'Data/AMF_train_X_XCZw8r3.csv'
X_TEST_PATH = 'Data/AMF_test_X_uDs0jHH.csv' # won't be loaded here
Y_TRAIN_PATH = 'Data/AMF_train_Y_omYQJZL.csv'

In [60]:
df_x_train = pd.read_csv(X_TRAIN_PATH, index_col=0, sep=',')
df_y_train = pd.read_csv(Y_TRAIN_PATH, sep=',')

In [61]:
multi_loc = ['min_dt_TV1_TV2', 'mean_dt_TV1_TV2', 'med_dt_TV1_TV2',
            'min_dt_TV1_TV3', 'mean_dt_TV1_TV3', 'med_dt_TV1_TV3',
            'min_dt_TV1_TV4', 'mean_dt_TV1_TV4', 'med_dt_TV1_TV4']

events_dropped = ['min_time_two_events', 'mean_time_two_events'] # keep 10, 75, 90
cancel_dropped = ['10_p_lifetime_cancel', '25_p_lifetime_cancel', '75_p_lifetime_cancel',
                'mean_lifetime_cancel', 'max_lifetime_cancel']
TV1_dropped = ['min_dt_TV1', 'med_dt_TV1']
corr_dropped = events_dropped+cancel_dropped+TV1_dropped

dropped_strings = ['Day', 'Share']

to_invert = ['OTR', 'OMR', 'OCR']

pipe = Pipeline(steps=[('count shares same day', CountSharesSameDay()),
                       ('Add type', AddType()),
                       ('inverter', Inverter(cols = to_invert)),
                       ('drop useless cols', DropCols(cols = multi_loc + corr_dropped + dropped_strings)),
                       ('mean imput', MeanImput()),
                      ],
               verbose = True)

In [62]:
res = pipe.fit_transform(df_x_train, df_y_train)

[Pipeline]  (step 1 of 5) Processing count shares same day, total=   0.1s
[Pipeline] .......... (step 2 of 5) Processing Add type, total=   0.0s
[Pipeline] .......... (step 3 of 5) Processing inverter, total=   0.1s
[Pipeline] . (step 4 of 5) Processing drop useless cols, total=   0.0s
[Pipeline] ........ (step 5 of 5) Processing mean imput, total=   0.0s


In [63]:
res.head()

Unnamed: 0,Trader,OTR,OCR,OMR,10_p_time_two_events,med_time_two_events,25_p_time_two_events,75_p_time_two_events,90_p_time_two_events,max_time_two_events,min_lifetime_cancel,med_lifetime_cancel,90_p_lifetime_cancel,NbTradeVenueMic,MaxNbTradesBySecond,MeanNbTradesBySecond,mean_dt_TV1,NbSecondWithAtLeatOneTrade,Nber_shares_same_day,type
0,Trader_10,0.44,0.12,0.08,0.0,419.6885,10.722543,984.32056,23151.838,25139.598,682.15326,984.32056,23151.838,1,7,2.75,9639.216031,4,5,NON HFT
1,Trader_10,0.589404,0.039735,0.046358,7.4e-05,0.003374,0.000204,8.768699,2671.4883,31278.357,11.866026,177.93991,21433.684,1,17,5.933333,493.559196,15,5,NON HFT
2,Trader_10,0.674419,0.021142,0.008457,7.1e-05,0.000599,0.000129,5.725427,448.8836,30799.467,2.761036,187.99548,1953.6235,1,20,5.063492,163.954785,63,4,NON HFT
3,Trader_10,0.586207,0.068966,0.034483,0.0,6.152666,0.000945,62.444176,19187.719,19187.719,286.01932,286.01932,19187.719,1,8,4.25,1232.562133,4,3,NON HFT
4,Trader_10,0.658879,0.037383,0.0,7.1e-05,0.001364,0.000146,2.22542,273.45676,23164.514,2.798452,1345.9528,23164.514,1,19,3.710526,248.073927,38,1,NON HFT


In [64]:
# Name of the folder containing all the preprocessed data
preprocessed_data_path = 'Processed_data/'

if(not os.path.exists(preprocessed_data_path)):
    os.mkdir(preprocessed_data_path)


pre_training1 = Pipeline(steps=[('split test/train', SplitTrainTest_Gen(train_size = 0.8, tol = 0.20, folder=preprocessed_data_path, suffix='evenly_splitted_traders2'))],
                        verbose = True)

In [65]:
X_train, y_train = pre_training1.fit_transform(res, df_y_train)

Train set size: 0.82
y_train proportions MIX 0.49
y_train proportions HFT 0.29
y_train proportions NON HFT 0.22

Test set size: 0.18
y_test proportions MIX 0.49
y_test proportions HFT 0.36
y_test proportions NON HFT 0.16
[Pipeline] .. (step 1 of 1) Processing split test/train, total=   0.1s


In [66]:
pre_training2 = Pipeline(steps=[('split test/train', SplitTrainTest_Default(train_size = 0.7, folder=preprocessed_data_path, suffix='evenly_splitted')),
                                ],
                        verbose = True)

In [67]:
X_train, y_train = pre_training2.fit_transform(res)

Train set size: 0.7
y_train proportions HFT 0.3
y_train proportions NON HFT 0.21
y_train proportions MIX 0.49

Test set size: 0.3
y_test proportions HFT 0.3
y_test proportions NON HFT 0.21
y_test proportions MIX 0.49
[Pipeline] .. (step 1 of 1) Processing split test/train, total=   0.3s


## Pre-Processing test data:

In [68]:
df_x_test = pd.read_csv(X_TEST_PATH, index_col=0, sep=',')

In [69]:
res = pipe.fit_transform(df_x_test)


[Pipeline]  (step 1 of 5) Processing count shares same day, total=   0.1s
[Pipeline] .......... (step 2 of 5) Processing Add type, total=   0.0s
[Pipeline] .......... (step 3 of 5) Processing inverter, total=   0.1s
[Pipeline] . (step 4 of 5) Processing drop useless cols, total=   0.0s
[Pipeline] ........ (step 5 of 5) Processing mean imput, total=   0.0s


In [70]:
#res = res.drop(columns = 'Trader')
res.head()

Unnamed: 0,Trader,OTR,OCR,OMR,10_p_time_two_events,med_time_two_events,25_p_time_two_events,75_p_time_two_events,90_p_time_two_events,max_time_two_events,min_lifetime_cancel,med_lifetime_cancel,90_p_lifetime_cancel,NbTradeVenueMic,MaxNbTradesBySecond,MeanNbTradesBySecond,mean_dt_TV1,NbSecondWithAtLeatOneTrade,Nber_shares_same_day
0,Adelaide,0.897059,0.014706,0.0,6.1e-05,0.00387,0.000196,0.340629,4.720414,751.59094,283.955,283.955,283.955,1,19,3.8125,514.483186,16,1
1,Adelaide,0.6,0.2,0.0,0.00197,0.005786,0.00197,0.008854,270.50568,270.50568,270.5223,270.5223,270.5223,1,3,3.0,0.004,1,1
2,Adelaide,0.918919,0.027027,0.0,0.0,0.000485,7e-05,2.182102,250.33145,272.0694,272.0694,272.0694,272.0694,1,13,5.666667,124.1945,6,1
3,Adelaide,0.444444,0.222222,0.0,0.0,0.004708,0.001081,230.00269,389.48483,389.48483,230.00269,230.00269,389.48483,1,3,2.0,0.261333,2,1
4,Adelaide,0.789474,0.052632,0.0,6.3e-05,0.000152,6.5e-05,0.339079,129.88925,549.25635,237.76761,237.76761,237.76761,1,6,3.0,4621.266333,5,1


In [71]:
res.to_csv('Processed_data/X_test_preprocessed.csv', index=False)

## Pre-Processing two-stage training

In [73]:
# we add specific folders
preprocessed_data_path = 'Processed_data/'
two_stage_folder = preprocessed_data_path+'two_stage/'
stage_1_folder = two_stage_folder+'stage_1/'
stage_2_folder = two_stage_folder+'stage_2/'

if(not os.path.exists(preprocessed_data_path)):
    os.mkdir(preprocessed_data_path)

if(not os.path.exists(two_stage_folder)):
    os.mkdir(two_stage_folder)
    
if(not os.path.exists(stage_1_folder)):
    os.mkdir(stage_1_folder)
    
if(not os.path.exists(stage_2_folder)):
    os.mkdir(stage_2_folder)

##### 1. Split 'X' into 'X_stage1' and 'X_stage2'

In [74]:
pipe_split_stage_12 = Pipeline(steps=[('count shares same day', CountSharesSameDay()),
                       ('Add type', AddType()),
                       ('inverter', Inverter(cols = to_invert)),
                       ('drop useless cols', DropCols(cols = multi_loc + corr_dropped + dropped_strings)),
                       ('mean imput', MeanImput()),
                       ('split stage 1 and stage 2', SplitTrainTest_Gen(train_size = 0.6, tol = 0.05, folder=two_stage_folder,
                                                                    suffix='stage12_split', drop_traders=False, drop_type=False)),
                      ],
               verbose = True)

In [75]:
X_stage1, y_stage1 = pipe_split_stage_12.fit_transform(df_x_train, df_y_train)

[Pipeline]  (step 1 of 6) Processing count shares same day, total=   0.1s
[Pipeline] .......... (step 2 of 6) Processing Add type, total=   0.0s
[Pipeline] .......... (step 3 of 6) Processing inverter, total=   0.1s
[Pipeline] . (step 4 of 6) Processing drop useless cols, total=   0.0s
[Pipeline] ........ (step 5 of 6) Processing mean imput, total=   0.0s
Train set size: 0.61
y_train proportions MIX 0.47
y_train proportions HFT 0.3
y_train proportions NON HFT 0.23

Test set size: 0.39
y_test proportions MIX 0.51
y_test proportions HFT 0.3
y_test proportions NON HFT 0.18
[Pipeline]  (step 6 of 6) Processing split stage 1 and stage 2, total=   0.1s


##### 2. Split 'X_stage1' into training and test sets

In [76]:
pipe_stage1 = Pipeline(steps=[('drop mix traders', DropMixTraders()),
                              ('stage 1 eval and test split ', SplitTrainTest_Gen(train_size = 0.7, tol = 0.05,
                                                                                  folder=stage_1_folder, suffix='stage1')),         
                        ],
                      verbose = True)

In [77]:
X_train_s1, y_train_s1 = pipe_stage2.fit_transform(X_stage1, y_stage1)

Train set size: 0.69
y_train proportions MIX 0.49
y_train proportions HFT 0.32
y_train proportions NON HFT 0.19

Test set size: 0.31
y_test proportions MIX 0.43
y_test proportions NON HFT 0.32
y_test proportions HFT 0.25
[Pipeline]  (step 1 of 1) Processing stage 2 eval and test split , total=   0.2s


##### 3. Split 'X_stage2' into training and test sets

In [78]:
# one must first retrieve X_stage2 and y_stage2
X_stage2_filename = "X_test_stage12_split.pkl"
y_stage2_filename = "y_test_stage12_split.pkl"

X_stage2 = pd.read_pickle(two_stage_folder+X_stage2_filename)
y_stage2 = pd.read_pickle(two_stage_folder+y_stage2_filename)

In [79]:
pipe_stage2 = Pipeline(steps=[('stage 2 eval and test split ', SplitTrainTest_Gen(train_size = 0.7, tol = 0.05,
                                                                                  folder=stage_2_folder, suffix='stage2',
                                                                                 drop_traders=False, drop_type=False,)),         
                        ],
                      verbose = True)

In [53]:
X_train_s2, y_train_s2 = pipe_stage2.fit_transform(X_stage2, y_stage2)

Train set size: 0.67
y_train proportions MIX 0.4
y_train proportions HFT 0.35
y_train proportions NON HFT 0.25

Test set size: 0.33
y_test proportions MIX 0.52
y_test proportions HFT 0.28
y_test proportions NON HFT 0.2
[Pipeline]  (step 1 of 1) Processing stage 2 eval and test split , total=   0.1s


##### 4. Quick checks

In [54]:
X_train_s1.head()

Unnamed: 0,OTR,OCR,OMR,10_p_time_two_events,med_time_two_events,25_p_time_two_events,75_p_time_two_events,90_p_time_two_events,max_time_two_events,min_lifetime_cancel,med_lifetime_cancel,90_p_lifetime_cancel,NbTradeVenueMic,MaxNbTradesBySecond,MeanNbTradesBySecond,mean_dt_TV1,NbSecondWithAtLeatOneTrade,Nber_shares_same_day
77,0.270742,0.027948,0.0,0.0,5.2e-05,0.0,5298.3413,18278.512,49783.777,65.953735,16796.088,39275.45,1,111,2.108844,189.024496,147,109
78,0.341849,0.03528,0.0,0.0,0.000348,0.0,1027.4409,17696.729,33397.78,63.845245,8531.339,25469.168,1,25,1.836601,170.563856,153,96
79,0.124611,0.018692,0.0,0.0,0.0,0.0,892.6599,5991.261,44633.32,91.31745,22268.738,44633.32,1,3,1.081081,811.300773,37,90
80,0.179104,0.014925,0.0,0.0,0.0,0.0,21549.754,21579.168,21580.172,17357.777,17357.777,17357.777,1,2,1.2,2707.063998,10,82
81,0.13783,0.002933,0.0,0.0,0.0,0.0,4332.2144,8319.635,26379.031,21637.67,21637.67,21637.67,1,14,1.46875,888.818923,32,85


In [55]:
X_train_s2.head()

Unnamed: 0,Trader,OTR,OCR,OMR,10_p_time_two_events,med_time_two_events,25_p_time_two_events,75_p_time_two_events,90_p_time_two_events,max_time_two_events,min_lifetime_cancel,med_lifetime_cancel,90_p_lifetime_cancel,NbTradeVenueMic,MaxNbTradesBySecond,MeanNbTradesBySecond,mean_dt_TV1,NbSecondWithAtLeatOneTrade,Nber_shares_same_day,type
3965,Trader_110,0.333333,0.111111,0.0,0.0,0.0,0.0,1415.4518,43617.375,43617.375,43617.375,43617.375,43617.375,1,1,1.0,9952.215491,3,2,NON HFT
3966,Trader_110,0.3,0.1,0.0,0.0,0.0,0.0,26605.14,26605.14,26605.14,26605.14,26605.14,26605.14,1,3,3.0,1082.427774,1,3,NON HFT
3967,Trader_110,0.2,0.2,0.0,1487.8102,1487.8102,1487.8102,29413.115,29413.115,29413.115,29413.115,29413.115,29413.115,1,1,1.0,1082.427774,1,3,NON HFT
3968,Trader_110,0.333333,0.166667,0.0,10378.027,21577.21,10378.027,28757.3,28757.3,28757.3,28757.3,28757.3,28757.3,1,1,1.0,17615.029433,2,1,NON HFT
3969,Trader_110,0.25,0.25,0.0,0.0,0.0,0.0,25907.355,25907.355,25907.355,25907.355,25907.355,25907.355,1,1,1.0,1082.427774,1,3,NON HFT


In [45]:
y_train_s1.value_counts()

2    13820
0     8334
Name: type, dtype: int64

In [46]:
y_train_s2.value_counts()

1    12332
2    10372
0     5636
Name: type, dtype: int64