In [12]:
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import sys
import os
%matplotlib inline

### Scikit-feature tutorial

In [37]:
import scipy.io

dirname = '.\..\data'
path = os.path.join(dirname, 'COIL20.mat')
mat = scipy.io.loadmat(path)

X = mat['X']
n_samples, n_features = X.shape
print n_samples, n_features

y = mat['Y'][:, 0]
n_labels = y.shape
print n_labels

1440 1024
(1440,)


In [38]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
print X_train.shape, X_test.shape

(1152, 1024) (288, 1024)


In [39]:
from skfeature.function.similarity_based import fisher_score

score = fisher_score.fisher_score(X_train, y_train)

print score[:5]
print score.shape

[ 13.96904931   0.5376816    0.19923194   0.07443112   0.12497083]
(1024,)


In [40]:
idx_fisher = fisher_score.feature_ranking(score)
print idx_fisher[:5]
print idx_fisher[-5:]

[1023 1022   31    0   30]
[ 66  65  34  97 897]


In [41]:
score.argsort()[:5]

array([897,  97,  34,  65,  66])

In [57]:
num_fea = 40

selected_features_train = X_train[:, idx_fisher[:num_fea]]
selected_features_test = X_test[:, idx_fisher[:num_fea]]

In [58]:
from sklearn import svm

clf = svm.LinearSVC()

clf.fit(selected_features_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [59]:
y_predict = clf.predict(selected_features_test)

print y_predict[:5]

[ 8 15  1 10 11]


In [60]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, y_predict)

print acc

0.8125


### Quadratic programming approach

In [34]:
import numpy as np
import pandas as pd
from cvxpy import *
from scipy.stats.stats import pearsonr


def create_opt_problem(X, y, sim, rel):
    """
    % Function generates matrix Q and vector b
    % which represent feature similarities and feature relevances
    %
    % Input:
    % X - [m, n] - design matrix
    % y - [m, 1] - target vector
    % sim - string - indicator of the way to compute feature similarities,
    % support values are 'correl' and 'mi'
    % rel - string - indicator of the way to compute feature significance,
    % support values are 'correl', 'mi' and 'signif'
    %
    % Output:
    % Q - [n ,n] - matrix of features similarities
    % b - [n, 1] - vector of feature relevances
    """
    
    if len(y.shape) == 1:
        y_mat = y[:, np.newaxis]
    else:
        y_mat = y[:]
        
    df = pd.DataFrame(np.hstack([X, y_mat]))
    cor = np.array(df.corr())
    
    if sim == 'correl':
        Q = cor[:-1, :-1]
    else:
        print "Wrong similarity measure"
        
    if rel == 'correl':
        b = cor[:-1, [-1]]
    else:
        print "Wrong relevance measure"

    return Q, b


def solve_opt_problem(Q, b):
    """
     Function solves the quadratic optimization problem stated to select
     significance and noncollinear features

     Input:
     Q - [n, n] - matrix of features similarities
     b - [n, 1] - vector of feature relevances

     Output:
     x - [n, 1] - solution of the quadratic optimization problem
    """

    n = Q.shape[0]
    x = Variable(n)

    objective = Minimize(quad_form(x, Q) - b.T*x)
    constraints = [x >= 0, norm(x, 1) <= 1]
    prob = Problem(objective, constraints)

    prob.solve()

    if prob.status == 'optimal':
        return np.array(x.value).flatten()
    

def quadratic_programming(X, y, sim='correl', rel='correl'):
    Q, b = create_opt_problem(X, y, sim, rel)
    qp_score = solve_opt_problem(Q, b)
    return qp_score

In [36]:
qp_score = quadratic_programming(X_train, y_train)

print qp_score.max(), qp_score.min()
print qp_score[:5]

0.119064183334 -1.82899441684e-15
[ -1.81609940e-15   2.28707362e-15   6.35014454e-15   1.40868755e-14
   2.08324057e-14]


In [50]:
idx_qp = qp_score.argsort()[::-1]
print idx_qp[:5]
print idx_qp[-5:]

[1001    9  480  174  269]
[1023   31 1022    0   30]
[1023 1022   31    0   30]
[ 66  65  34  97 897]


In [62]:
num_fea = 40

selected_features_train = X_train[:, idx_qp[:num_fea]]
selected_features_test = X_test[:, idx_qp[:num_fea]]

clf.fit(selected_features_train, y_train)
y_predict = clf.predict(selected_features_test)

acc = accuracy_score(y_test, y_predict)
print acc

0.822916666667
