# Feature Selection

### Load data

In [5]:
import numpy as np
import pandas as pd

In [6]:
import warnings
warnings.filterwarnings('ignore')

In [7]:
file = open('/Users/linarkou/Documents/ML-course/hw2/data/arcene_train.data')
X_train = np.fromfile(file, dtype=np.int32, sep=' ')
file.close()

file = open('/Users/linarkou/Documents/ML-course/hw2/data/arcene_train.labels')
y_train = np.fromfile(file, dtype=np.int32, sep=' ')
file.close()
 
file = open('/Users/linarkou/Documents/ML-course/hw2/data/arcene_valid.data')
X_valid = np.fromfile(file, dtype=np.int32, sep=' ')
file.close()

file = open('/Users/linarkou/Documents/ML-course/hw2/data/arcene_valid.labels')
y_valid = np.fromfile(file, dtype=np.int32, sep=' ')
file.close()

In [8]:
y_train = y_train.reshape(-1)
X_train = X_train.reshape(y_train.shape[0], -1)
y_valid = y_valid.reshape(-1)
X_valid = X_valid.reshape(y_valid.shape[0], -1)

In [9]:
print(X_train)
X_train.shape

[[  0  71   0 ...   0   0 524]
 [  0  41  82 ...   0 284 423]
 [  0   0   1 ...   0  34 508]
 ...
 [  2  15  48 ...   0   0 453]
 [  8   0  38 ...   0 189 403]
 [  0   0   0 ...   0  10 365]]


(100, 10000)

In [10]:
y_train.shape

(100,)

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def predict(X_train, y_train, X_valid, y_valid):
    clf = KNeighborsClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    score = accuracy_score(y_valid, y_pred)
    return score

## 1. Spearman Correlation

In [17]:
from scipy.stats import spearmanr

def correlationFilter(X, y, p_value=0.01):
    indices = []
    featuresAmount = X.shape[1]
    for i in range(featuresAmount):
        _, cur_p_value = spearmanr(X[:, i], y)
        if cur_p_value > p_value:
            indices.append(i)
    return indices

## 2. Intraclass Distance Filter

#### Group by classes

In [18]:
classes = {}
for i, class_value in enumerate(y_train):
    if class_value in classes:
        classes[class_value].append(i)
    else:
        classes[class_value] = [i]

In [19]:
from sklearn import preprocessing

def intraclassDistanceFilter(X, y, distance=1):
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    distances = []
    features_amount = X.shape[1]
    for f_i in range(features_amount):
        observations = X[:, f_i]        
        dist = 0
        for cl in classes:
            dist_cl = 0
            for i in classes[cl]:
                for j in classes[cl]:
                    dist_cl += abs(observations[i] - observations[j])
            dist += dist_cl / (len(classes[cl])**2 - len(classes[cl]))
        distances.append(dist / len(classes))
    return [i for i, d in enumerate(distances) if d <= distance]


## 3. Interclass Distance Filter

In [20]:
from sklearn import preprocessing

def interclassDistanceFilter(X, y, distance=1):
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    distances = []
    features_amount = X.shape[1]
    for f_i in range(features_amount):
        observations = X[:, f_i]
        dist = 0
        for i in classes[-1]:
            for j in classes[1]:
                dist += abs(observations[i] - observations[j])
        distances.append(dist / (len(classes[1]) * len (classes[-1])))
    return [i for i, d in enumerate(distances) if d >= distance]

## 4. Pearson Correlation

In [21]:
from scipy.stats import pearsonr

def pearsonCorrelationFilter(X, y, p_value=0.01):
    indices = []
    featuresAmount = X.shape[1]
    for i in range(featuresAmount):
        _, cur_p_value = pearsonr(X[:, i], y)
        if cur_p_value > p_value:
            indices.append(i)
    return indices

# Comparison

In [22]:
row_names = pd.Index(["features amount", "score"], name="")

### Spearman correlation

In [23]:
p_values = [0.1, 0.05, 0.01, 0.005, 0.001]

best_score = 0
spearman_indices = []
best_p_value = 0
spearman_df = pd.DataFrame(index=row_names, columns=pd.Index(data=p_values, name="p value"))
for p_value in p_values:
    indices = correlationFilter(X_train, y_train, p_value)
    score = predict(X_train[:, indices], y_train, X_valid[:, indices], y_valid)
    spearman_df[p_value] = [len(indices), score]
    if score > best_score:
        spearman_indices = indices
        best_score = score
        best_p_value = p_value
    
display(spearman_df)

p value,0.1,0.05,0.01,0.005,0.001
,,,,,
features amount,6592.0,7408.0,8620.0,8989.0,9489.0
score,0.78,0.79,0.83,0.8,0.81


### Intraclass Distance

In [24]:
intracl_params = np.arange(0.9, 1.1, 0.01)

best_score = 0
intracl_dist_indices = []
best_intracl_param = 0
intracl_dist_df = pd.DataFrame(index=row_names, 
                               columns=pd.Index(data=intracl_params, name="High border of mean dist inside class"))
for param in intracl_params:
    indices = intraclassDistanceFilter(X_train, y_train, param)
    score = predict(X_train[:, indices], y_train, X_valid[:, indices], y_valid)
    intracl_dist_df[param] = [len(indices), score]
    if score > best_score:
        intracl_dist_indices = indices
        best_score = score
        best_intracl_param = param

In [25]:
best_intracl_param

1.0500000000000003

In [26]:
intracl_dist_df.loc['score']

High border of mean dist inside class
0.90    0.74
0.91    0.75
0.92    0.76
0.93    0.75
0.94    0.77
0.95    0.74
0.96    0.73
0.97    0.76
0.98    0.77
0.99    0.77
1.00    0.79
1.01    0.79
1.02    0.77
1.03    0.75
1.04    0.76
1.05    0.80
1.06    0.80
1.07    0.75
1.08    0.75
1.09    0.77
1.10    0.78
Name: score, dtype: float64

### Interclass Distance

In [27]:
intercl_params = np.arange(0.9, 1.2, 0.01)

best_score = 0
intercl_dist_indices = []
best_intercl_param = 0
intercl_dist_df = pd.DataFrame(index=row_names, 
                               columns=pd.Index(data=intercl_params, name="Low border of mean dist between classes"))
for param in intercl_params:
    indices = interclassDistanceFilter(X_train, y_train, param)
    score = predict(X_train[:, indices], y_train, X_valid[:, indices], y_valid)
    intercl_dist_df[param] = [len(indices), score]
    if score >= best_score:
        intercl_dist_indices = indices
        best_score = score
        best_intercl_param = param

In [28]:
best_intercl_param

0.9600000000000001

In [29]:
intercl_dist_df.loc['score']

Low border of mean dist between classes
0.90    0.81
0.91    0.83
0.92    0.83
0.93    0.83
0.94    0.83
0.95    0.83
0.96    0.83
0.97    0.82
0.98    0.82
0.99    0.82
1.00    0.80
1.01    0.79
1.02    0.81
1.03    0.82
1.04    0.82
1.05    0.82
1.06    0.82
1.07    0.82
1.08    0.81
1.09    0.81
1.10    0.82
1.11    0.81
1.12    0.81
1.13    0.80
1.14    0.81
1.15    0.82
1.16    0.80
1.17    0.78
1.18    0.75
1.19    0.77
Name: score, dtype: float64

In [31]:
intercl_dist_indices_less_amount = interclassDistanceFilter(X_train, y_train, distance=1.07)

### Pearson Correlation

In [15]:
p_values = [0.1, 0.05, 0.01, 0.005, 0.001]

best_score = 0
pearson_indices = []
best_p_value = 0
pearson_df = pd.DataFrame(index=row_names, columns=pd.Index(data=p_values, name="p value"))
for p_value in p_values:
    indices = pearsonCorrelationFilter(X_train, y_train, p_value)
    score = predict(X_train[:, indices], y_train, X_valid[:, indices], y_valid)
    pearson_df[p_value] = [len(indices), score]
    if score > best_score:
        pearson_indices = indices
        best_score = score
        best_p_value = p_value
    
display(pearson_df)

p value,0.1,0.05,0.01,0.005,0.001
,,,,,
features amount,6527.0,7342.0,8696.0,9122.0,9646.0
score,0.78,0.78,0.81,0.82,0.8


In [34]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

row_names = pd.Index(["features amount", "score"], name="")

col_names = pd.Index([
    "original",
    "Pearson Correlation",
    "Spearman Correlation",
    "Intraclass Distance",
    "Interclass Distance",
    "Interclass Distance (less f.amount)",
    "Spearman + Intraclass",
    "Spearman + Interclass",
    "Spearman + Interclass (less f.amount)",
    "Intraclass + Interclass",
    "Intraclass + Interclass (less f.amount)",
    "Spearman + Intraclass + Interclass",
    "Spearman + Intraclass + Interclass (less f.amount)"
])
filteredFeatureIndices = [
    range(X_train.shape[1]), 
    pearson_indices,
    spearman_indices,
    intracl_dist_indices,
    intercl_dist_indices,
    intercl_dist_indices_less_amount,
    np.intersect1d(spearman_indices, intracl_dist_indices),
    np.intersect1d(spearman_indices, intercl_dist_indices),
    np.intersect1d(spearman_indices, intercl_dist_indices_less_amount),
    np.intersect1d(intracl_dist_indices, intercl_dist_indices),
    np.intersect1d(intracl_dist_indices, intercl_dist_indices_less_amount),
    np.intersect1d(spearman_indices, np.intersect1d(intracl_dist_indices, intercl_dist_indices)),
    np.intersect1d(spearman_indices, np.intersect1d(intracl_dist_indices, intercl_dist_indices_less_amount))
]

results = pd.DataFrame(index=row_names, columns=col_names)
for i, indices in enumerate(filteredFeatureIndices):
    clf = KNeighborsClassifier()
    clf.fit(X_train[:, indices], y_train)
    y_pred = clf.predict(X_valid[:, indices])
    score = accuracy_score(y_valid, y_pred)
    results[col_names[i]] = [len(indices),  score]
display(results)

Unnamed: 0,original,Pearson Correlation,Spearman Correlation,Intraclass Distance,Interclass Distance,Interclass Distance (less f.amount),Spearman + Intraclass,Spearman + Interclass,Spearman + Interclass (less f.amount),Intraclass + Interclass,Intraclass + Interclass (less f.amount),Spearman + Intraclass + Interclass,Spearman + Intraclass + Interclass (less f.amount)
,,,,,,,,,,,,,
features amount,10000.0,9122.0,8620.0,6922.0,5382.0,3438.0,5899.0,4397.0,2741.0,2304.0,602.0,1676.0,262.0
score,0.82,0.82,0.83,0.8,0.83,0.82,0.79,0.86,0.83,0.8,0.78,0.79,0.84


# Wrappers

## Forward wrapper

In [54]:
# stop if no improvements in (features_amount * limit) iterations
def forward_wrapper(X_train, y_train, X_valid, y_valid, limit=0.1):
    features_amount = X_train.shape[1]
    remaining_indices = list(range(features_amount))
    curr_indices = list()
    best_score = 0
    steps_without_improvements = 0
    steps_limit = features_amount * limit
    while steps_without_improvements < steps_limit:
        index = remaining_indices[np.random.randint(len(remaining_indices))]
        curr_indices.append(index)
        remaining_indices.remove(index)
        score = predict(X_train[:, curr_indices], y_train, X_valid[:, curr_indices], y_valid)
        if score > best_score:
            best_score = score
            steps_without_improvements = 0
        else:
            steps_without_improvements += 1
            curr_indices.remove(index)
    return curr_indices, best_score

In [100]:
forward_wrapper_indices, forward_wrapper_score = forward_wrapper(X_train, y_train, X_valid, y_valid)

In [101]:
forward_wrapper_score

0.96

In [102]:
len(feed_forward_wrapper_indices)

18

In [103]:
forward_wrapper_indices

[8721,
 5792,
 7475,
 3190,
 1020,
 4080,
 5377,
 6189,
 2247,
 4407,
 682,
 5907,
 4657,
 5543,
 4016,
 615,
 1333,
 6957,
 1644,
 1411,
 1043]

## Backward wrapper

In [92]:
# stop if no improvements in (features_amount * limit) iterations
def backward_wrapper(X_train, y_train, X_valid, y_valid, limit=0.05):
    features_amount = X_train.shape[1]
    indices = list(range(features_amount))
    best_score = predict(X_train, y_train, X_valid, y_valid)
    steps_without_improvements = 0
    steps_limit = features_amount * limit
    while steps_without_improvements < steps_limit:
        i = np.random.randint(len(indices))
        index = indices[i]
        indices.remove(index)
        score = predict(X_train[:, indices], y_train, X_valid[:, indices], y_valid)
        if score >= best_score:
            best_score = score    
            steps_without_improvements = 0
        else:
            steps_without_improvements += 1
            indices.append(index)
    return indices, best_score

In [93]:
backward_wrapper_indices, backward_wrapper_score = backward_wrapper(X_train, y_train, X_valid, y_valid)

In [94]:
backward_wrapper_score

0.93

In [95]:
len(backward_wrapper_indices)

46