# Feature Selection

### Load data

In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
file = open('/Users/linarkou/Documents/ML-course/hw2/data/arcene_train.data')
X_train = np.fromfile(file, dtype=np.int32, sep=' ')
file.close()

file = open('/Users/linarkou/Documents/ML-course/hw2/data/arcene_train.labels')
y_train = np.fromfile(file, dtype=np.int32, sep=' ')
file.close()
 
file = open('/Users/linarkou/Documents/ML-course/hw2/data/arcene_valid.data')
X_valid = np.fromfile(file, dtype=np.int32, sep=' ')
file.close()

file = open('/Users/linarkou/Documents/ML-course/hw2/data/arcene_valid.labels')
y_valid = np.fromfile(file, dtype=np.int32, sep=' ')
file.close()

In [4]:
y_train = y_train.reshape(-1)
X_train = X_train.reshape(y_train.shape[0], -1)
y_valid = y_valid.reshape(-1)
X_valid = X_valid.reshape(y_valid.shape[0], -1)

In [5]:
print(X_train)
X_train.shape

[[  0  71   0 ...   0   0 524]
 [  0  41  82 ...   0 284 423]
 [  0   0   1 ...   0  34 508]
 ...
 [  2  15  48 ...   0   0 453]
 [  8   0  38 ...   0 189 403]
 [  0   0   0 ...   0  10 365]]


(100, 10000)

In [6]:
y_train.shape

(100,)

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def predict(X_train, y_train, X_valid, y_valid):
    clf = KNeighborsClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    score = accuracy_score(y_valid, y_pred)
    return score

# Filters

## 1. Spearman Correlation

In [8]:
from scipy.stats import spearmanr

def spearmanCorrelationFilter(X, y, amount):
    indices = []
    featuresAmount = X.shape[1]
    correlations = []
    for i in range(featuresAmount):
        correlations.append(abs(spearmanr(X[:, i], y)[0]))
    return (np.argsort(correlations))[:amount]

## 2. Pearson Correlation

In [9]:
from scipy.stats import pearsonr

def pearsonCorrelationFilter(X, y, amount):
    indices = []
    featuresAmount = X.shape[1]
    correlations = []
    for i in range(featuresAmount):
        correlations.append(abs(pearsonr(X[:, i], y)[0]))
    return (np.argsort(correlations))[:amount]

## 3. Intraclass Distance Filter

#### Group by classes

In [10]:
classes = {}
for i, class_value in enumerate(y_train):
    if class_value in classes:
        classes[class_value].append(i)
    else:
        classes[class_value] = [i]

In [11]:
from sklearn import preprocessing

def intraclassDistanceFilter(X, y, amount):
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    distances = []
    features_amount = X.shape[1]
    for f_i in range(features_amount):
        observations = X[:, f_i]
        dist = 0
        for cl in classes:
            dist_cl = 0
            for i in classes[cl]:
                for j in classes[cl]:
                    dist_cl += abs(observations[i] - observations[j])
            dist += dist_cl / (len(classes[cl])**2 - len(classes[cl]))
        distances.append(dist / len(classes))
    return (np.argsort(distances))[:amount]

## 4. Interclass Distance Filter

In [12]:
from sklearn import preprocessing

def interclassDistanceFilter(X, y, amount):
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    distances = []
    features_amount = X.shape[1]
    for f_i in range(features_amount):
        observations = X[:, f_i]
        dist = 0
        for i in classes[-1]:
            for j in classes[1]:
                dist += abs(observations[i] - observations[j])
        distances.append(dist / (len(classes[1]) * len (classes[-1])))
    return (np.argsort(distances))[-amount:]

# Wrappers

## 1. Forward wrapper

In [13]:
# stop if no improvements in (features_amount * limit) iterations
def forward_wrapper(X_train, y_train, X_valid, y_valid, amount):
    features_amount = X_train.shape[1]
    remaining_indices = list(range(features_amount))
    curr_indices = list()
    best_score = 0
    while len(curr_indices) != amount and len(remaining_indices) > 0:
        index = remaining_indices[np.random.randint(len(remaining_indices))]
        curr_indices.append(index)
        remaining_indices.remove(index)
        score = predict(X_train[:, curr_indices], y_train, X_valid[:, curr_indices], y_valid)
        if score > best_score:
            best_score = score
        else:
            curr_indices.remove(index)
    return curr_indices, best_score

## 2. Backward wrapper

In [28]:
from random import shuffle
# stop if no improvements in (features_amount * limit) iterations
def backward_wrapper(X_train, y_train, X_valid, y_valid, amount):
    features_amount = X_train.shape[1]
    indices = list(range(features_amount))
    shuffle(indices)
    best_score = predict(X_train, y_train, X_valid, y_valid)
    while len(indices) != amount:
        index = indices[0]
        indices.remove(index)
        score = predict(X_train[:, indices], y_train, X_valid[:, indices], y_valid)
        if score >= best_score:
            best_score = score    
        else:
            indices.append(index)
    return indices, best_score

# Calculation

In [15]:
row_names = pd.Index(["features amount", "score"], name="")

In [16]:
max_amount = 50

In [17]:
spearman_indices = spearmanCorrelationFilter(X_train, y_train, max_amount)

In [18]:
pearson_indices = pearsonCorrelationFilter(X_train, y_train, max_amount)

In [23]:
intracl_dist_indices = intraclassDistanceFilter(X_train, y_train, max_amount)

In [24]:
intercl_dist_indices = interclassDistanceFilter(X_train, y_train, max_amount)

In [25]:
forward_wrapper_indices, _ = forward_wrapper(X_train, y_train, X_valid, y_valid, max_amount)

In [29]:
backward_wrapper_indices, _ = backward_wrapper(X_train, y_train, X_valid, y_valid, max_amount)

# Comparison

In [30]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

row_names = pd.Index(["features amount", "score"], name="")

col_names = pd.Index([
    "original",
    "Pearson Correlation",
    "Spearman Correlation",
    "Intraclass Distance",
    "Interclass Distance",
    "Forward Wrapper",
    "Backward Wrapper"
])
filteredFeatureIndices = [
    range(X_train.shape[1]), 
    pearson_indices,
    spearman_indices,
    intracl_dist_indices,
    intercl_dist_indices,
    forward_wrapper_indices,
    backward_wrapper_indices
]

results = pd.DataFrame(index=row_names, columns=col_names)
for i, indices in enumerate(filteredFeatureIndices):
    clf = KNeighborsClassifier()
    clf.fit(X_train[:, indices], y_train)
    y_pred = clf.predict(X_valid[:, indices])
    score = accuracy_score(y_valid, y_pred)
    results[col_names[i]] = [len(indices),  score]
display(results)

Unnamed: 0,original,Pearson Correlation,Spearman Correlation,Intraclass Distance,Interclass Distance,Forward Wrapper,Backward Wrapper
,,,,,,,
features amount,10000.0,50.0,50.0,50.0,50.0,27.0,50.0
score,0.82,0.67,0.57,0.44,0.74,0.94,0.91
