# Import

In [1]:
import tensorflow as tf
import argparse

# Needed for SVM
from sklearn.svm import LinearSVC

import numpy as np
import pandas as pd

In [2]:
all_features = pd.read_csv('all_features_new_64.csv')

# After PCA or AutoEncoder, the features no longer contain 'ask_price_1' and 'bid_price_1',
# so the label is computed and stored here first, even though it is not used by the
# feature selection itself.
# NOTE(review): the frame keeps the CSV's 'Unnamed: 0' column, so it is treated as a feature by
# every later `iloc[:, :-1]` selection -- confirm this is intended.

# Forward-fill gaps; DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
data = all_features.ffill()
data['mid_price'] = (data['ask_price_1'] + data['bid_price_1']) / 2
# Next-row mid-price change: diff() gives the change into each row, shift(-1) moves it one row up.
data['d_price'] = data['mid_price'].diff().shift(-1)
# Label is the sign of the next mid-price move: +1 up, -1 down, 0 unchanged.
data['label'] = 1*(data['d_price']>0) - 1*(data['d_price']<0)
data = data.dropna() # drop the first 6 rows (with some nan features) and the last row (with nan 'd_price')
data = data.drop(['mid_price', 'd_price'], axis=1)

# Renumber the rows 0..N-1 without keeping the old index as a column.
data = data.reset_index(drop=True)

In [3]:
# Sanity check: report the (rows, columns) of the labelled frame and preview its first rows.
print(data.shape)
data.head()

(581023, 65)


Unnamed: 0,ask_price_1,ask_vol_1,bid_price_1,bid_vol_1,ask_price_2,ask_vol_2,bid_price_2,bid_vol_2,ask_price_3,ask_vol_3,...,rank_bid_vol_4,rank_ask_vol_4,rank_bid_vol_5,rank_ask_vol_5,corr_vol_1,corr_vol_2,corr_vol_3,corr_vol_4,corr_vol_5,label
0,275200,166,275100,300,275300,1000,275000,100,275400,373,...,1.0,1.0,0.714286,1.0,-0.353553,-1.0,-1.0,-1.0,1.0,0
1,275200,166,275100,300,275300,1000,275000,100,275400,373,...,1.0,1.0,0.75,1.0,-0.377964,-1.0,-1.0,-1.0,1.0,0
2,275200,166,275100,300,275300,1000,275000,100,275400,373,...,1.0,1.0,1.0,1.0,-0.395285,-1.0,-1.0,-1.0,1.0,0
3,275200,166,275100,300,275300,1000,275000,300,275400,373,...,1.0,1.0,1.0,1.0,-0.408248,-1.0,-1.0,-1.0,1.0,0
4,275200,100,275100,300,275300,1000,275000,300,275400,373,...,1.0,1.0,1.0,1.0,-0.22821,1.0,-1.0,1.0,1.0,1


In [4]:
# Split the rows in file order: the first 60% form the training set and the remainder is
# cut in half into a validation set followed by a test set.
train_weight = 0.6
split = int(data.shape[0] * train_weight)
df_train = data.iloc[:split]
df_holdout = data.iloc[split:]

nrow = len(df_holdout) // 2
df_valid = df_holdout.iloc[:nrow]
df_test = df_holdout.iloc[nrow:]


def _nonzero_label_arrays(frame):
    """Return (features, labels, keep_mask) for `frame`, keeping only rows whose label is non-zero.

    Features are all columns except the last; the label is the last column.
    `keep_mask` is the boolean mask (over the rows of `frame`) that was applied.
    """
    features = frame.iloc[:, :-1].values
    labels = frame.iloc[:, -1].values
    keep = (labels != 0)
    return features[keep], labels[keep], keep


x_train, y_train, train_index = _nonzero_label_arrays(df_train)
x_valid, y_valid, valid_index = _nonzero_label_arrays(df_valid)
x_test, y_test, test_index = _nonzero_label_arrays(df_test)

# The full data set keeps every row, including those with label 0.
x_all = data.iloc[:, :-1].values
y_all = data.iloc[:, -1].values

In [5]:
# List the distinct label values left in the training split after the zero labels were filtered out.
np.unique(y_train)

array([-1,  1], dtype=int32)

In [6]:
# Min-max normalization (to make sure the autoencoder is converging).
# The minimum and the range come from the training rows only and are reused for every other split.
x_max = np.max(x_train,axis=0)
x_min = np.min(x_train,axis=0)
# A column that is constant over the training rows has a zero range; dividing by it would produce
# NaN/inf values that then flow into the random feature map and the SVM. Use a range of 1 for
# such columns so they are only shifted by x_min (the training rows become exactly 0).
x_range = np.where(x_max > x_min, x_max - x_min, 1.0)
x_train = (x_train - x_min) / x_range
x_valid = (x_valid - x_min) / x_range
x_test = (x_test - x_min) / x_range
x_all = (x_all - x_min) / x_range

In [7]:
# Show the shapes of the filtered train/validation/test feature matrices and of the unfiltered full matrix.
print(x_train.shape, x_valid.shape, x_test.shape, x_all.shape)

(2311, 64) (425, 64) (481, 64) (581023, 64)


# Main Function - Performing SVM and Extracting Features

In [8]:
if __name__ == '__main__':

    # Directory the generated feature file is written to; feel free to change it to your own.
    #new_features_resultpath = '/Users/meihuaren/personal/OR_2018fall/Courses/E4720 Deep Learning/project_coding/Team E_code/'
    new_features_resultpath = 'F:/Columbia OR/IEORE4720 Deep Learning/Course Project/Data/'
    #=====================================

    ### Random Feature Mapping

    n = x_train.shape[1]
    m = int(n*np.log2(n)) # choose the dimensionality of the random feature map as n*logn

    np.random.seed(0) # set seed for replication purposes
    G = np.random.normal(size=(m,n)) # m*n matrix of independent standard normal draws

    def _angular_feature_map(x):
        """Random feature mapping for the angular kernel: sign(x @ G.T) scaled by 1/sqrt(m)."""
        return 1 / np.sqrt(m) * np.sign(x @ G.T)

    X_train = _angular_feature_map(x_train)
    X_valid = _angular_feature_map(x_valid)
    X_test = _angular_feature_map(x_test)  # not used for model selection below

    ### SVM
    print ('Performing SVM')

    # Sweep the regularization strength C over 1e-10 ... 1e9 and keep every fitted model.
    C_array = 10.0**(np.arange(-10,10))
    train_acc_array = []
    valid_acc_array = []
    clf_array = []

    for c in C_array:

        # The L1 penalty drives some weights to exactly zero, which is what makes the
        # non-zero-weight feature selection below possible.
        clf = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, C=c, max_iter=1000)
        clf.fit(X_train, y_train)
        clf_array.append(clf)

        train_acc = clf.score(X_train, y_train)
        train_acc_array.append(train_acc)
        # Score on the validation split. The previous code scored on the test split here,
        # so C was tuned on the test data and X_valid was never used.
        valid_acc = clf.score(X_valid, y_valid)
        valid_acc_array.append(valid_acc)

    # Keep the model with the highest validation accuracy (first one on ties).
    clf = clf_array[np.argmax(valid_acc_array)]

    # select features with non-zero weights
    selected_columns = (clf.coef_!=0).reshape(-1)

    # construct final features: map every row (label 0 included) and keep the selected columns
    X_all = _angular_feature_map(x_all)
    svm_features = X_all[:,selected_columns]
    svm_features_df = pd.DataFrame(svm_features)
    # Append the label (last column of `data`) and write everything to CSV.
    features64_new_svm = pd.concat([svm_features_df,data.iloc[:,-1]],axis = 1)
    filename = new_features_resultpath + 'features64_new_svm.csv'
    features64_new_svm.to_csv(filename, index=False)
    

Performing SVM


