# Excersice 4

Feature selection:

1. You need to load the UCI Arrhythmia dataset
https://archive.ics.uci.edu/ml/datasets/arrhythmia
* the problem is a multi-class classification

2. For comparing the result you need to use kNN (k=3), and compare based on the accuracy with the original dataset (with the all features).

2. Run three different types of feature selection methods (Univariate Statistics, model based, and Iterative Feature Selection).
* The original data set has 279 features.
* You need to run the methods for number_of_features= 20, 50, 100, 150, 200
* All the results should show in the same figure (At the end we have just one figure)

In [2]:
# seting the environment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [3]:
# loading dataframe "arrhythmia.data"
# checking dataframe
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data', header=None, prefix='v')
data.head()

Unnamed: 0,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v270,v271,v272,v273,v274,v275,v276,v277,v278,v279
0,75,0,190,80,91,193,371,174,121,-16,...,0.0,9.0,-0.9,0.0,0.0,0.9,2.9,23.3,49.4,8
1,56,1,165,64,81,174,401,149,39,25,...,0.0,8.5,0.0,0.0,0.0,0.2,2.1,20.4,38.8,6
2,54,0,172,95,138,163,386,185,102,96,...,0.0,9.5,-2.4,0.0,0.0,0.3,3.4,12.3,49.0,10
3,55,0,175,94,100,202,380,179,143,28,...,0.0,12.2,-2.2,0.0,0.0,0.4,2.6,34.6,61.6,1
4,75,0,190,80,88,181,360,177,103,-16,...,0.0,13.1,-3.6,0.0,0.0,-0.1,3.9,25.4,62.8,7


In [4]:
# checking dataframe shape
data.shape

(452, 280)

In [5]:
# there are some "?"" character in dataframe and we replaced them with "nan"
data = data.replace('?', np.nan)
# fill nan data with median of columns
data.fillna(data.median(), inplace=True) 
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [6]:
# selecting objects from dataframe
data.select_dtypes(include=['object']).columns.tolist()

['v10', 'v11', 'v12', 'v13', 'v14']

In [7]:
from sklearn.svm import LinearSVC,SVR
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel, RFE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(xtrain, ytrain) # first of all, fit all data with all features
print('All features:')
print('train: ', accuracy_score(ytrain, knn.predict(xtrain)), '  test: ',accuracy_score(ytest, knn.predict(xtest)))

# number_of_features= 20, 50, 100, 150, 200
n_feature = [20, 50, 100, 150, 200]
col = pd.MultiIndex.from_tuples([('Univariate Statistics','train'),('Univariate Statistics', 'test'),
                                ('model base','train'),('model base', 'test'),
                                ('Iterative feature selection','train'),('Iterative feature selection', 'test')])

result = pd.DataFrame(np.nan, n_feature, col) # creating an empty multi index dataframe

for k in n_feature:    
    # select features with univariate statistics
    # in parameter of SelectKBest we could use diffrent statistic but i choose f_classif
    xnew = SelectKBest(f_classif, k=k).fit_transform(X, y)
    x_train, x_test, y_train, y_test = train_test_split(xnew, y, test_size=0.2)
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(x_train, y_train)
    
    # set train and test accuracies in result data frame
    result.loc[result.index==k, ('Univariate Statistics', 'train')] = accuracy_score(y_train, knn.predict(x_train))
    result.loc[result.index==k, ('Univariate Statistics', 'test')] = accuracy_score(y_test, knn.predict(x_test))
    
for k in n_feature:    # select features base on model
    
    svm = LinearSVC(C=0.01, penalty='l1', dual=False).fit(X, y) # using and svm model for select feature
    
    # using SelectFromModel from sklearn base on svm model, if we want to select specific number of feature we should
    # set threshold=-np.inf and this method return new dataframe with selected feature for us
    xnew = SelectFromModel(svm, threshold=-np.inf, max_features=k, prefit=True).transform(X)
    knn = KNeighborsClassifier(n_neighbors=3)
    x_train, x_test, y_train, y_test = train_test_split(xnew, y, test_size=0.2)
    knn.fit(x_train, y_train)
    
    # set accuracies in result data frame
    result.loc[result.index==k, ('model base', 'train')] = accuracy_score(y_train, knn.predict(x_train))
    result.loc[result.index==k, ('model base', 'test')] = accuracy_score(y_test, knn.predict(x_test))
    
for k in n_feature:    # select features base on Iterative feature selection
    selector = RFE(RandomForestClassifier(n_estimators=10, random_state=15),
    n_features_to_select=k)
    selector = selector.fit(X, y)
    xnew = selector.transform(X)
    knn = KNeighborsClassifier(n_neighbors=3)
    x_train, x_test, y_train, y_test = train_test_split(xnew, y, test_size=0.2)
    knn.fit(x_train, y_train)
    result.loc[result.index==k, ('Iterative feature selection', 'train')] = accuracy_score(y_train, knn.predict(x_train))
    result.loc[result.index==k, ('Iterative feature selection', 'test')] = accuracy_score(y_test, knn.predict(x_test))
    
result    

All features:
train:  0.6842105263157895   test:  0.6593406593406593


Unnamed: 0_level_0,Univariate Statistics,Univariate Statistics,model base,model base,Iterative feature selection,Iterative feature selection
Unnamed: 0_level_1,train,test,train,test,train,test
20,0.703601,0.637363,0.692521,0.703297,0.761773,0.593407
50,0.750693,0.648352,0.745152,0.56044,0.747922,0.593407
100,0.711911,0.659341,0.692521,0.637363,0.731302,0.648352
150,0.722992,0.626374,0.692521,0.637363,0.67867,0.527473
200,0.689751,0.538462,0.742382,0.648352,0.714681,0.516484
