In [1]:
import nbimporter
import helper_methods as hm
import preprocessing as pp
import numpy as np
import time
from sklearn import svm
from sklearn.metrics import accuracy_score, matthews_corrcoef
import matplotlib.pyplot as plt
import cvxopt
from numpy import linalg
import cvxopt.solvers

Importing Jupyter notebook from helper_methods.ipynb
Importing Jupyter notebook from preprocessing.ipynb


### SVM Classifier Implementation
16384
.125

In [2]:
def linear_kernel(x1, x2):
    return np.dot(x1, x2)

def polynomial_kernel(x, y, p=3):
    return (1 + np.dot(x, y)) ** p

def gaussian_kernel(x, y, gamma = 0.25):
    return np.exp(-linalg.norm(x-y)**2 * gamma)
#     return np.exp(-linalg.norm(x-y)**2 / (2 * (sigma ** 2)))

In [3]:
class SVM(object):

    def __init__(self, kernel=linear_kernel, C=None):
        self.kernel = kernel
        self.C = C
        if self.C is not None: self.C = float(self.C)

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # Gram matrix
        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i,j] = self.kernel(X[i], X[j])

        P = cvxopt.matrix(np.outer(y,y) * K)
        q = cvxopt.matrix(np.ones(n_samples) * -1)
        A = cvxopt.matrix(y, (1,n_samples))
        b = cvxopt.matrix(0.0)

        if self.C is None:
            G = cvxopt.matrix(np.diag(np.ones(n_samples) * -1))
            h = cvxopt.matrix(np.zeros(n_samples))
        else:
            tmp1 = np.diag(np.ones(n_samples) * -1)
            tmp2 = np.identity(n_samples)
            G = cvxopt.matrix(np.vstack((tmp1, tmp2)))
            tmp1 = np.zeros(n_samples)
            tmp2 = np.ones(n_samples) * self.C
            h = cvxopt.matrix(np.hstack((tmp1, tmp2)))

        # solve QP problem
        solution = cvxopt.solvers.qp(P, q, G, h, A, b)

        # Lagrange multipliers
        a = np.ravel(solution['x'])

        # Support vectors have non zero lagrange multipliers
        sv = a > 1e-5
        ind = np.arange(len(a))[sv]
        self.a = a[sv]
        self.sv = X[sv]
        self.sv_y = y[sv]
        print("%d support vectors out of %d points" % (len(self.a), n_samples))

        # Intercept
        self.b = 0
        for n in range(len(self.a)):
            self.b += self.sv_y[n]
            self.b -= np.sum(self.a * self.sv_y * K[ind[n],sv])
        self.b /= len(self.a)

        # Weight vector
        if self.kernel == linear_kernel:
            self.w = np.zeros(n_features)
            for n in range(len(self.a)):
                self.w += self.a[n] * self.sv_y[n] * self.sv[n]
        else:
            self.w = None

    def project(self, X):
        if self.w is not None:
            return np.dot(X, self.w) + self.b
        else:
            y_predict = np.zeros(len(X))
            for i in range(len(X)):
                s = 0
                for a, sv_y, sv in zip(self.a, self.sv_y, self.sv):
                    s += a * sv_y * self.kernel(X[i], sv)
                y_predict[i] = s
            return y_predict + self.b

    def predict(self, X):
        return np.sign(self.project(X))

### Finding Optimal Parameters
https://stats.stackexchange.com/questions/43943/which-search-range-for-determining-svm-optimal-c-and-gamma-parameters

In [4]:
def print_parameters_accuracy(accuracies):
    print('#Features \t C \t Gamma \t Accuracy')
    for i in range(len(accuracies)):
        print(accuracies[i][0], '\t\t', accuracies[i][1], '\t\t', accuracies[i][2], '\t\t', accuracies[i][3])
    print()

In [5]:
def find_optimal_values(max_features, kernel = 'rbf', num_splits = 10, symbol_name = 'AAPL', use_implementation = True):
    accuracies = list()
    for num_features in range(1, max_features + 1, 1):
        print('Features:', num_features)
        
        X_train, X_test, Y_train, Y_test = hm.prepare_data(num_features, symbol_name, is_binary_ouput=True)
        X_train, X_test, Y_train, Y_test = X_train.values, X_test.values, Y_train.values, Y_test.values
        
        gamma = 0.25
        
#         for C in [1, 10, 100, 500, 1000, 5000, 10000]:
        for C in [2 ** x for x in range(-4, 8)]:
#             for gamma in [1, 0.1, 0.05, 0.01, 0.005, 0.001]:
#         for gamma in [2 ** x for x in range(-15, 3)]:
            print('(C, gamma) ------------------------> (' + str(C) + ', ' + str(gamma) + ')')
#             skl_svm_tscv = svm.SVC(kernel=kernel, C=C, gamma=gamma)
            skl_svm_tscv = SVM(gaussian_kernel, C = C)
            if use_implementation:
                accuracy = hm.timeSeriesCV(X_train, Y_train, num_splits, skl_svm_tscv, is_classification=True)
            else:
                accuracy = hm.rolling_cross_validation(X_train, Y_train, num_splits, skl_svm_tscv, is_classification=True)
            accuracies.append([num_features, C, gamma, accuracy])
    
    print_parameters_accuracy(accuracies)
    
    # Sorting the accuracies
    accuracies.sort(reverse=True, key=lambda x: x[len(accuracies) - 1])
    print_parameters_accuracy(accuracies)
    
    return accuracies[0][0], accuracies[0][1], accuracies[0][2]

In [12]:
def get_data_ready(symbol_name):
#     num_features, C, gamma = find_optimal_values(max_features=2, kernel = 'rbf', num_splits=10, symbol_name = symbol_name)
    
    X_train, X_test, Y_train, Y_test = hm.prepare_data(2)
    X_train, X_test, Y_train, Y_test = X_train.values, X_test.values, Y_train.values, Y_test.values
    return X_train, X_test, Y_train, Y_test, 1.0, 0.25    

### 1. SkLearn SVM Classifier

In [7]:
def sklearn_SVM_forecast(X_train, X_test, Y_train, Y_test, gamma, C = 1.0, kernel = 'rbf'):
    print('SKLEARN INBUILT SVM')
    clf = svm.SVC(kernel = kernel, C = C, gamma=gamma)
    clf.fit(X_train, Y_train)
    print('In-Built SVM (Accuracy) score -- kernel =', kernel, '--', clf.score(X_test, Y_test), '\n')

### 2. Predicting using Implementation

In [14]:
def implemented_SVM_forecast(X_train, X_test, Y_train, Y_test, gamma, C = 1.0, kernel = 'rbf'):
    my_SVM = SVM(linear_kernel, C = C)
    my_SVM.fit(X_train, Y_train)
    
    print('IMPLEMENTATION') 
    Y_pred = my_SVM.predict(X_test)
    print('Accuracy Score --', accuracy_score(Y_test, Y_pred))
    print('Matthews Correlation Coefficient --', matthews_corrcoef(Y_test, Y_pred))

### Running SVM

In [9]:
def forecast(X_train, X_test, Y_train, Y_test, gamma, C = 1.0):
    print('C:', C, '\t gamma:', gamma)
    sklearn_SVM_forecast(X_train, X_test, Y_train, Y_test, gamma, C, kernel = 'rbf')
    implemented_SVM_forecast(X_train, X_test, Y_train, Y_test, gamma, C, kernel = 'rbf')
    
# X_train, X_test, Y_train, Y_test = hm.prepare_data(2)
# X_train, X_test, Y_train, Y_test = X_train.values, X_test.values, Y_train.values, Y_test.values
# forecast(X_train, X_test, Y_train, Y_test, 'rbf')

In [10]:
def run_SVM(symbol_name):
    X_train, X_test, Y_train, Y_test, C, gamma = get_data_ready(symbol_name)
    forecast(X_train, X_test, Y_train, Y_test, gamma, C)

In [15]:
run_SVM(symbol_name = 'INX')

C: 1.0 	 gamma: 0.25
SKLEARN INBUILT SVM
In-Built SVM (Accuracy) score -- kernel = rbf -- 0.4938837920489297 

     pcost       dcost       gap    pres   dres
 0: -3.9228e+03 -7.9438e+03  2e+04  1e+00  3e-15
 1: -3.2345e+03 -5.3787e+03  2e+03  1e-13  3e-15
 2: -3.8491e+03 -3.9475e+03  1e+02  5e-14  3e-15
 3: -3.9203e+03 -3.9240e+03  4e+00  4e-14  3e-15
 4: -3.9210e+03 -3.9216e+03  6e-01  7e-14  3e-15
 5: -3.9211e+03 -3.9216e+03  5e-01  7e-14  2e-15
 6: -3.9211e+03 -3.9216e+03  5e-01  1e-13  2e-15
 7: -3.9212e+03 -3.9215e+03  4e-01  7e-14  2e-15
 8: -3.9212e+03 -3.9215e+03  4e-01  2e-13  2e-15
 9: -3.9212e+03 -3.9215e+03  4e-01  1e-14  2e-15
10: -3.9212e+03 -3.9215e+03  4e-01  1e-13  2e-15
11: -3.9213e+03 -3.9214e+03  1e-01  1e-13  3e-15
12: -3.9213e+03 -3.9214e+03  9e-02  2e-14  2e-15
13: -3.9213e+03 -3.9214e+03  9e-02  7e-14  2e-15
14: -3.9213e+03 -3.9213e+03  5e-03  5e-14  3e-15
15: -3.9213e+03 -3.9213e+03  1e-04  2e-13  3e-15
Optimal solution found.
3924 support vectors out of 3924 