In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# opened through ordinary filesystem paths under that directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import cProfile
import time

In [None]:
# Read the training split only: the feature ranking is derived from training data alone.
X_tr = pd.read_csv('/content/drive/My Drive/SRP/Public_URFall_Training.csv')

# Turn the 'class' column into integer category codes, then remove it from the feature table.
Y_train = X_tr['class'].astype('category').cat.codes
X_tr = X_tr.drop(columns=['class'])

# Fit the standardiser on the training features and apply it to those same features.
scaler = StandardScaler()
X_train = scaler.fit(X_tr).transform(X_tr)

# Array forms of the features and labels, as consumed by the Fisher-ratio cell.
trrgx = X_train
trrgy = Y_train.to_numpy()

In [None]:
#Run Fisher Ratio in order to get ranking of features; outputs top 26 features
def fisher_ratio_M(X, y):
    """
    Compute the Fisher ratio (between-class scatter / within-class scatter)
    of every feature.

    Parameters
    ----------
    X : array-like of shape (nSamples, nFeatures)
        Training data; each row is one training sample. Anything that
        ``np.asarray`` can turn into a 2-D float array is accepted.
    y : array-like of shape (nSamples,) or (nSamples, 1)
        Class label of each training sample. For 2-D input the first
        column is used.

    Returns
    -------
    FR : numpy 1d array of size (nFeatures,)
        Fisher ratio of each feature, in the original feature order
        (not sorted). A feature with zero between-class AND zero
        within-class scatter (e.g. a constant column) gets 0.0 instead of
        NaN so that it ranks last rather than polluting an argsort-based
        ranking. A feature with zero within-class but non-zero
        between-class scatter gets ``inf``.

    Notes
    -----
    Per-class variances use ``ddof=1``, so a class with a single sample
    yields NaN variances (and therefore NaN ratios).
    """
    # Work on array views; nothing below mutates the inputs, so no copy is needed.
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y)
    if labels.ndim > 1:
        labels = labels[:, 0]

    classlabels = np.unique(labels)
    nclasses = len(classlabels)
    nfeatures = features.shape[1]

    nk = np.zeros((nclasses, 1))          # samples per class
    mk = np.zeros((nclasses, nfeatures))  # per-class feature means
    vk = np.zeros((nclasses, nfeatures))  # per-class feature variances
    for i, iclass in enumerate(classlabels):
        fk = features[labels == iclass]
        nk[i] = fk.shape[0]
        mk[i, :] = np.mean(fk, 0)
        vk[i, :] = np.var(fk, 0, ddof=1)

    # Overall mean and total sample count, used to weight each class by its prior.
    m = np.mean(features, 0)
    n = np.sum(nk)

    diff = mk - m
    SB = np.sum(diff * diff * nk / n, axis=0)  # between-class scatter
    SW = np.sum(vk * nk / n, axis=0)           # within-class scatter

    # SW can be exactly zero (constant feature); silence the resulting
    # divide warnings and resolve the 0/0 case explicitly.
    with np.errstate(divide='ignore', invalid='ignore'):
        FR = SB / SW
    FR[(SW == 0) & (SB == 0)] = 0.0
    return FR


# Driver: ranks the real training features by Fisher ratio, times the call,
# and profiles fisher_ratio_M on a small synthetic problem.
if __name__ == '__main__':
    # Synthetic data, used only by the optional Octave comparison and the profiler run.
    ntrg = 100
    nf = 5
    nc = 3
    trgx = np.random.randn(ntrg, nf)

    # Simulate nc contiguous, near-equal class blocks as a float column vector.
    # Integer arithmetic assigns every sample a label in [0, nc) for any
    # ntrg/nc, with no out-of-range indexing and no unlabeled leftover samples.
    trgy = (np.arange(ntrg) * nc // ntrg).astype(float).reshape(-1, 1)

    # https://stackoverflow.com/questions/43394393/oct2py-only-returning-the-first-output-argument
    # NOTE(review): Oct2Py is not imported in the visible cells; enabling this
    # flag requires importing it first (and an Octave fisherRatioM on the path).
    runOct2Py = 0
    if runOct2Py:
        start = time.perf_counter()
        oc = Oct2Py()
        trg = np.concatenate((trgx, trgy), axis=1)
        output, output2 = oc.fisherRatioM(trg, True, nout=2)
        print(f'output = {output - 1}')
        print(f'output2 = {output2}')
        end = time.perf_counter()
        print(f'time taken for octave = {end - start}s')

    # Number of best-ranked features to report.
    n_top = 26

    # trrgx / trrgy are the scaled training features and labels built in the
    # data-loading cell.
    start = time.perf_counter()
    fr = fisher_ratio_M(trrgx, trrgy)
    fr_sorted_vals = np.sort(fr)  # all ratios, ascending
    # argsort is ascending: reverse for best-first, then keep the leading
    # n_top entries (slicing the tail would select the worst features).
    idx = np.argsort(fr)[::-1][:n_top]
    print(f'idx = {idx}')
    print(f'fr_sorted_vals = {fr_sorted_vals}')
    end = time.perf_counter()
    print(f'time taken for python = {end - start}s')

    # Profile the function on the synthetic data defined above.
    cProfile.run("fisher_ratio_M(trgx,trgy)")


idx = [ 0  6  3 12 18  9 15 24 21 35 36 38 41 33 44 30 17 27 26 20 23  2 50 47
  8  5 11 32 29 42 39 14 45 48 34 37 22 16 28 25 19 49 31 10 46 13  7  1
  4 43 40]
fr_sorted_vals = [0.01099428 0.0145889  0.0175195  0.018089   0.01962025 0.02502919
 0.02510935 0.0263206  0.02865916 0.03618974 0.03768013 0.03807544
 0.03865755 0.04999698 0.06103043 0.06279274 0.0711717  0.08892501
 0.10390791 0.39265724 0.43521005 0.46584307 0.56041697 0.56234336
 0.67692852 0.7038909  0.74392716 0.79947924 0.82003932 0.8905791
 0.93066107 0.99803092 1.02823417 1.11692689 1.13554503 1.27522986
 1.34279144 1.36425424 1.36662369 1.43511273 1.4502609  1.65576792
 1.82491419 1.98747163 2.42122397 2.55192463 2.62312811 2.62819895
 2.63341844 2.64765355 2.71097572]
time taken for python = 0.005539894104003906s
         183 function calls in 0.001 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <__array_f

In [None]:
# Feature indices copied by hand from the Fisher-ratio cell's printed output.
idx = [14, 45, 39, 42, 48, 34, 29, 32, 11, 37, 5, 8, 22, 16, 28, 25, 47, 50, 19, 2, 23, 20, 49, 31, 26, 27]
# Names of all features in column order: (y, x, c) triplets numbered 1 through 17.
existing = [f'{axis}{k}' for k in range(1, 18) for axis in ('y', 'x', 'c')]
# Look up the name behind every selected index and print the resulting list.
new = [existing[i] for i in idx]
print(new)



26
['c5', 'y16', 'y14', 'y15', 'y17', 'x12', 'c10', 'c11', 'c4', 'x13', 'c2', 'c3', 'x8', 'x6', 'x10', 'x9', 'c16', 'c17', 'x7', 'c1', 'c8', 'c7', 'x17', 'x11', 'c9', 'y10']
