# Import

In [1]:
import argparse
import os

import numpy as np
import pandas as pd
import tensorflow as tf
# Needed for PCA
from sklearn import decomposition

  from ._conv import register_converters as _register_converters


In [2]:
# Load the engineered feature table (read relative to the working directory).
all_features = pd.read_csv('all_features_new_64.csv')

# After PCA or AutoEncoder, the features no longer contain 'ask_price_1' and 'bid_price_1',
# so the label is computed and stored here first, even though it plays no part in
# the feature-selection step itself.

# Forward-fill gaps with the last observed value. DataFrame.ffill() is the
# supported spelling; fillna(method='ffill') is deprecated in pandas 2.x.
data = all_features.ffill()
data['mid_price'] = (data['ask_price_1'] + data['bid_price_1']) / 2
# Change from this row's mid price to the next row's mid price.
data['d_price'] = data['mid_price'].diff().shift(-1)
# Label: +1 if the next mid price is higher, -1 if lower, 0 if unchanged.
data['label'] = 1*(data['d_price']>0) - 1*(data['d_price']<0)
# Drops the leading rows that still hold NaN features (the first 6, per the original
# note) and the last row, whose 'd_price' is NaN because it has no successor.
data = data.dropna()
data = data.drop(['mid_price', 'd_price'], axis=1)

# Renumber rows 0..n-1 without keeping the old index as a column.
data = data.reset_index(drop=True)

In [3]:
# Sanity check: report (rows, columns), then preview the leading rows.
print(tuple(data.shape))
data.head(n=5)

(581023, 65)


Unnamed: 0,ask_price_1,ask_vol_1,bid_price_1,bid_vol_1,ask_price_2,ask_vol_2,bid_price_2,bid_vol_2,ask_price_3,ask_vol_3,...,rank_bid_vol_4,rank_ask_vol_4,rank_bid_vol_5,rank_ask_vol_5,corr_vol_1,corr_vol_2,corr_vol_3,corr_vol_4,corr_vol_5,label
0,275200,166,275100,300,275300,1000,275000,100,275400,373,...,1.0,1.0,0.714286,1.0,-0.353553,-1.0,-1.0,-1.0,1.0,0
1,275200,166,275100,300,275300,1000,275000,100,275400,373,...,1.0,1.0,0.75,1.0,-0.377964,-1.0,-1.0,-1.0,1.0,0
2,275200,166,275100,300,275300,1000,275000,100,275400,373,...,1.0,1.0,1.0,1.0,-0.395285,-1.0,-1.0,-1.0,1.0,0
3,275200,166,275100,300,275300,1000,275000,300,275400,373,...,1.0,1.0,1.0,1.0,-0.408248,-1.0,-1.0,-1.0,1.0,0
4,275200,100,275100,300,275300,1000,275000,300,275400,373,...,1.0,1.0,1.0,1.0,-0.22821,1.0,-1.0,1.0,1.0,1


In [4]:
# Chronological split (no shuffling): the leading `train_weight` share of the rows
# is the training set, the next `nrow` rows are the validation set, and whatever
# remains is the test set. The last column of `data` is excluded from every
# feature matrix.
# NOTE(review): the preview output lists an 'Unnamed: 0' column among the features -
# presumably a row index saved with the CSV; confirm it should be used as a feature.
train_weight = 0.8
nrow = 3000

split = int(len(data) * train_weight)
df_train = data.iloc[:split, :-1]
df_valid = data.iloc[split:split + nrow, :-1]
df_test = data.iloc[split + nrow:, :-1]

# Plain NumPy arrays for the numerical steps that follow.
x_train, x_valid, x_test = df_train.values, df_valid.values, df_test.values
x_all = data.iloc[:, :-1].values

In [5]:
#normalization (to make sure the autoencoder is converging)
x_max = np.max(x_train,axis=0)
x_min = np.min(x_train,axis=0)
x_train = (x_train - x_min) / (x_max - x_min)
x_valid = (x_valid - x_min) / (x_max - x_min)
x_test = (x_test - x_min) / (x_max - x_min)
x_all = (x_all - x_min) / (x_max - x_min)

In [6]:
# Show the (rows, features) shape of each split, then of the full matrix.
print(*(arr.shape for arr in (x_train, x_valid, x_test, x_all)))

(464818, 64) (3000, 64) (113205, 64) (581023, 64)


# Main Function - Performing PCA and Extracting Features

In [8]:
if __name__ == '__main__':

    # Directory that receives the exported CSV. Feel free to change it to your own;
    # keep the trailing separator, because the file name is appended by plain
    # string concatenation below.
    new_features_resultpath = '/Users/meihuaren/personal/OR_2018fall/Courses/E4720 Deep Learning/project_coding/Team E_code/'
    # The path above is machine-specific. Check it before fitting so that a missing
    # directory does not surface as a failed to_csv() at the very end; fall back to
    # the current working directory instead.
    if not os.path.isdir(new_features_resultpath):
        new_features_resultpath = os.path.join(os.getcwd(), '')
        print('Result directory not found, writing to ' + new_features_resultpath)

    #=====================================
    # PCA
    print ('Performing PCA')
    # A float n_components asks scikit-learn to keep as many components as are
    # needed to explain that fraction of the variance.
    pca = decomposition.PCA(n_components=0.9) # 0.95 or 0.9
    pca.fit(x_train) # use train data for feature selection in order to avoid look ahead bias
    print('PCA Codes')
    # Project every row (train, validation and test) onto the fitted components.
    pca_codes = pca.transform(x_all)
    print(pca_codes)
    pca_codes_df = pd.DataFrame(pca_codes)
    # Re-attach the last column of `data` next to the codes. Resetting the index
    # makes the join purely positional, whatever index `data` carries.
    labels = data.iloc[:, -1].reset_index(drop=True)
    features64_new_pca = pd.concat([pca_codes_df, labels], axis=1)
    filename = new_features_resultpath + 'features64_new_pca.csv'
    features64_new_pca.to_csv(filename, index=False)

    # Optional check (disabled): map a few codes back to the original feature space.
    # print('Re-Constructing')
    # # transform data into its original space
    # pca_reconstructed = pca.inverse_transform(pca_codes[:20])
    # #print(pca_reconstructed)

Performing PCA
PCA Codes
[[ 1.54546976 -0.35170054  0.40688107 ... -0.14447155  0.32958309
  -0.56119337]
 [ 1.54567115 -0.36167127  0.35556218 ... -0.1491005   0.32460311
  -0.56323576]
 [ 1.54924064 -0.41707433  0.31277762 ... -0.1385274   0.3231948
  -0.56224606]
 ...
 [-1.99492394  1.01697829  0.51508533 ...  0.39980994 -0.10465375
   0.3867314 ]
 [-1.99332415  1.12626926  0.57587923 ...  0.45153467 -0.06226303
   0.38756204]
 [-1.99260899  1.10210904  0.6048412  ...  0.48211453 -0.04522636
   0.38859466]]
