In [None]:
## Loading packages
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sklearn

from sklearn import linear_model
from sklearn import preprocessing
from sklearn import datasets
from sklearn import metrics
from sklearn import model_selection
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm

In [None]:
## Loading data and scaling it

dataRaw = sklearn.datasets.load_wine()

X = dataRaw['data']
Y = dataRaw['target']

print(X.shape)
print(Y.shape)

# Getting feature names of the data
featureNames = dataRaw['feature_names']
print(featureNames)

# Shuffling array 
X, Y = shuffle(X, Y, random_state=1)


# Scaling data
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Splitting data to training and test set

x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size = 0.3)




In [None]:
# Defining forward selection algorithm 

def forward_selector(maxFeatureN, train_y, test_y, train_X, test_X, random_seed, cv_k, originalData):
    
    np.random.seed(random_seed)
    
    returnList = {"maxFeatureN": maxFeatureN}
    
    candFeatures = list(range(train_X.shape[1]))
    selection = []
    
    for i in range(maxFeatureN):
        accuracyScores = []
        
        for c in candFeatures: 
            S = list(selection)
            S.append(c)

            train_X_S = train_X[:,S]

            # fitting SVM classifier
            svm_classifier = sklearn.svm.LinearSVC()

            accuraccy = np.mean(model_selection.cross_val_score(svm_classifier, 
                                                                train_X_S, 
                                                                train_y, 
                                                                cv=cv_k, 
                                                                scoring='accuracy'))

            accuracyScores.append(accuraccy)
    
        bestIndex = np.argmax(accuracyScores)
        
        #Update current best selection
        selection.append(candFeatures[bestIndex])
        del candFeatures[bestIndex]
        
    
    
    train_X_S = train_X[:, selection]
    #print(train_X_S.shape)
    test_X_S = test_X[:, selection]
    #print(test_X_S.shape)


    svmClassifier = sklearn.svm.LinearSVC()
    svmClassifier.fit(train_X_S, train_y)
    
    train_y_hat = svmClassifier.predict(train_X_S)
    test_y_hat = svmClassifier.predict(test_X_S)
    
    
    returnList["trainAcc"] = sklearn.metrics.accuracy_score(train_y, train_y_hat)
    returnList["trainConfM"] = sklearn.metrics.confusion_matrix(train_y, train_y_hat)
    returnList["testAcc"] = sklearn.metrics.accuracy_score(test_y, test_y_hat)
    returnList["testConfM"] = sklearn.metrics.confusion_matrix(test_y, test_y_hat)
    returnList["featuresSelectedNumeric"] = [selection]
    returnList["featuresSelectedText"] = [originalData['feature_names'][i] for i in selection]    
    
    return returnList
    


#print(forward_selector(13, y_train, y_test, x_train, x_test, 1, 10, dataRaw))




In [None]:
#Test code

"""
svmClassifier = sklearn.svm.LinearSVC()
svmClassifier.fit(x_train[:,[6, 0, 12, 10]], y_train)
    
train_y_hat = svmClassifier.predict(x_train[:,[6, 0, 12, 10]])
test_y_hat = svmClassifier.predict(x_test[:,[6, 0, 12, 10]])


print(sklearn.metrics.accuracy_score(y_train, train_y_hat))
print(sklearn.metrics.confusion_matrix(y_train, train_y_hat))
"""

In [None]:
## For data presented in the table 4.1

accResults = []

for ii in (range (1,14,1)):
    
    result = forward_selector(ii, y_train, y_test, x_train, x_test, 1, 10, dataRaw)
    accResults.append(result["trainAcc"])
    print("For cardinality", ii, "Best features are: ", result["featuresSelectedNumeric"])
    print("For cardinality", ii, "Best features are: ", result["featuresSelectedText"])
    print("Giving performance in train set of ", result["trainAcc"])
    print("Giving performance in test set of ", result["testAcc"])
    print("\n")

    
## Finding best stes for different stopping rules

best_untill_decline_cardinality = None
for i in range(len(accResults)):
    if i == len(accResults)-1:
        break
    elif accResults[i] < accResults[i+1]:
        next
    elif accResults[i] >= accResults[i+1]:
        best_untill_decline = i+1
        break
        
        
    
print("Best set up to a point where model performance on train data starts to decrease is at cardinality: ", 
      best_untill_decline)


print("Best overall set is: ", (np.argmax(accResults)+1) )

In [None]:
# Defining backward selection algorithm 

def backward_selector(maxFeatureN, train_y, test_y, train_X, test_X, random_seed, cv_k, originalData):
    
    np.random.seed(random_seed)
    
    returnList = {"maxFeatureN": maxFeatureN}
    
    candFeatures = list(range(train_X.shape[1]))
    selection = []
    
    for i in range(train_X.shape[1] - maxFeatureN):
        accuracyScores = []
        
        for a, c in enumerate(candFeatures): 
            S = list(candFeatures)
            del S[a]

            train_X_S = train_X[:,S]

            # fitting SVM classifier
            svm_classifier = sklearn.svm.LinearSVC()

            accuraccy = np.mean(model_selection.cross_val_score(svm_classifier, 
                                                                train_X_S, 
                                                                train_y, 
                                                                cv=cv_k, 
                                                                scoring='accuracy'))

            accuracyScores.append(accuraccy)
    
        bestIndex = np.argmax(accuracyScores)
        del candFeatures[bestIndex]
        
    
    
    
    
    
    
    train_X_S = train_X[:, candFeatures]
    #print(train_X_S.shape)
    test_X_S = test_X[:, candFeatures]
    #print(test_X_S.shape)


    svmClassifier = sklearn.svm.LinearSVC()
    svmClassifier.fit(train_X_S, train_y)
    
    train_y_hat = svmClassifier.predict(train_X_S)
    test_y_hat = svmClassifier.predict(test_X_S)
    
    
    returnList["trainAcc"] = sklearn.metrics.accuracy_score(train_y, train_y_hat)
    returnList["trainConfM"] = sklearn.metrics.confusion_matrix(train_y, train_y_hat)
    returnList["testAcc"] = sklearn.metrics.accuracy_score(test_y, test_y_hat)
    returnList["testConfM"] = sklearn.metrics.confusion_matrix(test_y, test_y_hat)
    returnList["featuresSelectedNumeric"] = [candFeatures]
    returnList["featuresSelectedText"] = [originalData['feature_names'][i] for i in candFeatures]
    
    
    return returnList
    


#print(backward_selector(11, y_train, y_test, x_train, x_test, 1, 10, dataRaw))


In [52]:
## For data presented in the table 4.2

accResults = []

for ii in (range (1,14,1)):
    
    result = backward_selector(ii, y_train, y_test, x_train, x_test, 1, 10, dataRaw)
    accResults.append(result["trainAcc"])
    print("For cardinality", ii, "Best features are: ", result["featuresSelectedNumeric"])
    print("For cardinality", ii, "Best features are: ", result["featuresSelectedText"])
    print("Giving performance in train set of ", result["trainAcc"])
    print("Giving performance in test set of ", result["testAcc"])
    print("\n")

    
## Finding best stes for different stopping rules

best_untill_decline_cardinality = None
for i in range(-1,-13,-1):
    
    if accResults[i] <= accResults[i-1]:
        next
    elif accResults[i] > accResults[i-1]:
        best_untill_decline = 14 + i
        break
        
        
print(accResults)
    
print("Best set up to a point where model performance on train data starts to decrease is at cardinality: ", 
      best_untill_decline)


print("Best overall set is: ", (np.argmax(accResults)+1) )

(124, 1)
(54, 1)
For cardinality 1 Best features are:  [[6]]
For cardinality 1 Best features are:  ['flavanoids']
Giving performance in train set of  0.7661290322580645
Giving performance in test set of  0.8703703703703703


(124, 2)
(54, 2)
For cardinality 2 Best features are:  [[6, 12]]
For cardinality 2 Best features are:  ['flavanoids', 'proline']
Giving performance in train set of  0.9112903225806451
Giving performance in test set of  0.8703703703703703


(124, 3)
(54, 3)
For cardinality 3 Best features are:  [[6, 10, 12]]
For cardinality 3 Best features are:  ['flavanoids', 'hue', 'proline']
Giving performance in train set of  0.967741935483871
Giving performance in test set of  0.9074074074074074


(124, 4)
(54, 4)
For cardinality 4 Best features are:  [[0, 6, 10, 12]]
For cardinality 4 Best features are:  ['alcohol', 'flavanoids', 'hue', 'proline']
Giving performance in train set of  0.9758064516129032
Giving performance in test set of  0.9629629629629629


(124, 5)
(54, 5)
For

In [None]:
#Test code

"""
svmClassifier = sklearn.svm.LinearSVC()
svmClassifier.fit(x_train[:,[0, 2, 3, 6, 8, 9, 10, 12]], y_train)
    
train_y_hat = svmClassifier.predict(x_train[:,[0, 2, 3, 6, 8, 9, 10, 12]])
test_y_hat = svmClassifier.predict(x_test[:,[0, 2, 3, 6, 8, 9, 10, 12]])


print(sklearn.metrics.accuracy_score(y_train, train_y_hat))
print(sklearn.metrics.confusion_matrix(y_train, train_y_hat))
"""