# Feature Selection

### Load data

In [1]:
import numpy as np
import pandas as pd

In [16]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
file = open('/Users/linarkou/Documents/ML-course/hw2/data/arcene_train.data')
X_train = np.fromfile(file, dtype=np.int32, sep=' ')
file.close()

file = open('/Users/linarkou/Documents/ML-course/hw2/data/arcene_train.labels')
y_train = np.fromfile(file, dtype=np.int32, sep=' ')
file.close()
 
file = open('/Users/linarkou/Documents/ML-course/hw2/data/arcene_valid.data')
X_valid = np.fromfile(file, dtype=np.int32, sep=' ')
file.close()

file = open('/Users/linarkou/Documents/ML-course/hw2/data/arcene_valid.labels')
y_valid = np.fromfile(file, dtype=np.int32, sep=' ')
file.close()

In [3]:
y_train = y_train.reshape(-1)
X_train = X_train.reshape(y_train.shape[0], -1)
y_valid = y_valid.reshape(-1)
X_valid = X_valid.reshape(y_valid.shape[0], -1)

In [4]:
print(X_train)
X_train.shape

[[  0  71   0 ...   0   0 524]
 [  0  41  82 ...   0 284 423]
 [  0   0   1 ...   0  34 508]
 ...
 [  2  15  48 ...   0   0 453]
 [  8   0  38 ...   0 189 403]
 [  0   0   0 ...   0  10 365]]


(100, 10000)

In [5]:
y_train.shape

(100,)

## 1. Spearman Correlation

In [6]:
from scipy.stats import spearmanr

def correlationFilter(X, y, p_value=0.01):
    indices = []
    featuresAmount = X.shape[1]
    for i in range(featuresAmount):
        _, cur_p_value = spearmanr(X[:, i], y)
        if cur_p_value > p_value:
            indices.append(i)
    return indices

## 2. Intraclass Distance Filter

#### Group by classes

In [7]:
classes = {}
for i, class_value in enumerate(y_train):
    if class_value in classes:
        classes[class_value].append(i)
    else:
        classes[class_value] = [i]

In [47]:
from sklearn import preprocessing

def intraclassDistanceFilter(X, y, distance=1):
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    distances = []
    features_amount = X.shape[1]
    for f_i in range(features_amount):
        observations = X[:, f_i]        
        dist = 0
        for cl in classes:
            dist_cl = 0
            for i in classes[cl]:
                for j in classes[cl]:
                    dist_cl += abs(observations[i] - observations[j])
            dist += dist_cl / (len(classes[cl])**2 - len(classes[cl]))
        distances.append(dist / len(classes))
    return [i for i, d in enumerate(distances) if d <= distance]


## 3. Interclass Distance Filter

In [48]:
from sklearn import preprocessing

def interclassDistanceFilter(X, y, distance=1):
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    distances = []
    features_amount = X.shape[1]
    for f_i in range(features_amount):
        observations = X[:, f_i]
        dist = 0
        for i in classes[-1]:
            for j in classes[1]:
                dist += abs(observations[i] - observations[j])
        distances.append(dist / (len(classes[1]) * len (classes[-1])))
    return [i for i, d in enumerate(distances) if d >= distance]

# Comparison

In [41]:
row_names = pd.Index(["features amount", "score"], name="")

In [42]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def predict(X_train, y_train, X_valid, y_valid):
    clf = KNeighborsClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    score = accuracy_score(y_valid, y_pred)
    return score

### Spearman correlation

In [19]:
p_values = [0.1, 0.05, 0.01, 0.005, 0.001]

best_score = 0
spearman_indices = []
best_p_value = 0
spearman_df = pd.DataFrame(index=row_names, columns=pd.Index(data=p_values, name="p value"))
for p_value in p_values:
    indices = correlationFilter(X_train, y_train, p_value)
    score = predict(X_train[:, indices], y_train, X_valid[:, indices], y_valid)
    spearman_df[p_value] = [len(indices), score]
    if score > best_score:
        spearman_indices = indices
        best_score = score
        best_p_value = p_value
    
display(spearman_df)

p value,0.1,0.05,0.01,0.005,0.001
,,,,,
features amount,6592.0,7408.0,8620.0,8989.0,9489.0
score,0.78,0.79,0.83,0.8,0.81


### Intraclass Distance

In [43]:
intracl_params = np.arange(0.9, 1.1, 0.01)

best_score = 0
intracl_dist_indices = []
best_intracl_param = 0
intracl_dist_df = pd.DataFrame(index=row_names, 
                               columns=pd.Index(data=intracl_params, name="High border of mean dist inside class"))
for param in intracl_params:
    indices = intraclassDistanceFilter(X_train, y_train, param)
    score = predict(X_train[:, indices], y_train, X_valid[:, indices], y_valid)
    intracl_dist_df[param] = [len(indices), score]
    if score >= best_score:
        intracl_dist_indices = indices
        best_score = score
        best_intracl_param = param

In [46]:
print(intracl_dist_df)

High border of mean dist inside class     0.90    0.91     0.92     0.93  \
                                                                           
features amount                        5726.00  5568.0  5407.00  5256.00   
score                                     0.81     0.8     0.81     0.81   

High border of mean dist inside class     0.94     0.95     0.96     0.97  \
                                                                            
features amount                        5107.00  4942.00  4795.00  4627.00   
score                                     0.82     0.83     0.83     0.84   

High border of mean dist inside class     0.98     0.99   ...        1.01  \
                                                          ...               
features amount                        4475.00  4300.00   ...     3878.00   
score                                     0.83     0.81   ...        0.81   

High border of mean dist inside class     1.02     1.03     1.04     1.05  \


### Interclass Distance

In [28]:
intercl_params = np.arange(0.9, 1.2, 0.01)

best_score = 0
intercl_dist_indices = []
best_intercl_param = 0
intercl_dist_df = pd.DataFrame(index=row_names, 
                               columns=pd.Index(data=intercl_params, name="Low border of mean dist between classes"))
for param in intercl_params:
    indices = interclassDistanceFilter(X_train, y_train, param)
    score = predict(X_train[:, indices], y_train, X_valid[:, indices], y_valid)
    intercl_dist_df[param] = [len(indices), score]
    if score > best_score:
        intercl_dist_indices = indices
        best_score = score
        best_intercl_param = param

In [44]:
print(intercl_dist_df.loc['score', :])

Low border of mean dist between classes     0.90     0.91     0.92     0.93  \
                                                                              
features amount                          6121.00  6009.00  5889.00  5772.00   
score                                       0.81     0.83     0.83     0.83   

Low border of mean dist between classes     0.94     0.95     0.96     0.97  \
                                                                              
features amount                          5662.00  5542.00  5382.00  5232.00   
score                                       0.83     0.83     0.83     0.82   

Low border of mean dist between classes     0.98     0.99   ...        1.01  \
                                                            ...               
features amount                          5061.00  4888.00   ...     4537.00   
score                                       0.82     0.82   ...        0.79   

Low border of mean dist between classes     1.02 

In [106]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

row_names = pd.Index(["features amount", "score"], name="")

col_names = pd.Index([
    "original", 
    "Equality", 
    "Diff Values", 
    "Spearman Correlation",
    "Intraclass Distance",
    "Interclass Distance",
    "Spearman + Intraclass",
    "Spearman + Interclass"
])
filteredFeatureIndices = [
    range(X_train.shape[1]), 
    equalityThresholdIndices, 
    differentValuesIndices, 
    correlationFilterIndices,
    intraclassDistanceIndices,
    interclassDistanceIndices,
    np.intersect1d(correlationFilterIndices, intraclassDistanceIndices),
    np.intersect1d(correlationFilterIndices, interclassDistanceIndices)
]

results = pd.DataFrame(index=row_names, columns=col_names)
for i, indices in enumerate(filteredFeatureIndices):
    clf = KNeighborsClassifier()
    clf.fit(X_train[:, indices], y_train)
    y_pred = clf.predict(X_valid[:, indices])
    score = accuracy_score(y_valid, y_pred)
    results[col_names[i]] = [len(indices),  score]
display(results)

Unnamed: 0,original,Equality,Diff Values,Spearman Correlation,Intraclass Distance,Interclass Distance,Spearman + Intraclass,Spearman + Interclass
,,,,,,,,
features amount,10000.0,9920.0,8755.0,8620.0,6922.0,3617.0,5899.0,2893.0
score,0.82,0.82,0.82,0.83,0.8,0.82,0.79,0.83
