# **Feature Selection + Nearest Neighbor**

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as seab
from itertools import chain, combinations
#from scipy.spatial import minkowski_distance

# - [x] Find "default Rate" : size(most common class)/size(dataset)
# - [x] Create Minkowski Distance function for two points
# - [x] Find nn of a single data point using only one feature
# - [] Find nn of every data point using one feature + max accuracy
# - [] Find nn of every data point using all features + max accuracy

### **[:+:] Reading the Data**

In [16]:
# Read the data from the text file into a pandas DataFrame.
# header=None: the file has no header row, so columns are labelled 0..N-1.
# Column 0 is treated as the class label throughout this notebook.
lilData = pd.read_csv('CS170_Small_Data__19.txt', sep="  ", engine='python', header=None)

# Preview just the 'label' column (and its indices); the slice 0:5 yields five rows.
print("The first 5 entries of the first column: data[colIdx][rowIdx]:")
print(lilData[0][0:5])

print("Matrix Dimensions:")
lil_rowXcol = lilData.shape
print(lil_rowXcol)

print("Data Head:")
print(lilData.head())

print("MetaData: ")
# A bare expression in the middle of a cell is never rendered, so the summary
# statistics have to be displayed explicitly.
display(lilData.describe())

print('Occurrence counts of classes:')
# Count the occurrences of each value in the class column.
occur = lilData.groupby([0]).size()
display(occur)

# Default rate = size(most common class) / size(dataset).
# Derived from the counts above so it stays correct if the data file changes.
lilDefaultRate = occur.max() / lil_rowXcol[0]
print("Default Rate:")
print(lilDefaultRate)



The first 4 entries of the first column: data[colIdx][rowIdx]:
0    2.0
1    1.0
2    2.0
3    2.0
4    2.0
Name: 0, dtype: float64
Matrix Dimensions:
(500, 7)
Data Head:
     0         1         2         3         4         5         6
0  2.0 -1.101866 -0.782026  0.552502  0.454685  1.132363  1.135458
1  1.0  0.928979 -0.169694  1.465293 -1.591929  0.144808  0.162709
2  2.0  1.123118 -1.384730 -0.903598  0.692522  0.669263 -0.142156
3  2.0  0.816617 -0.043628  1.026966  0.231013 -0.006551  2.316509
4  2.0 -1.159129 -1.341375  0.459997  0.631261 -1.479455  0.520158
MetaData: 
Occurrence counts of classes:


0
1.0     95
2.0    405
dtype: int64

Default Rate:
0.81


### **[:+:] Minkowski Distance Calculation**

In [17]:
def minkowski(data,rowIdx1,rowIdx2,ftSet,p):
    """Return the Minkowski distance of order ``p`` between two rows of ``data``.

    Only the features listed in ``ftSet`` contribute to the distance, so the
    dimensionality of the comparison equals the number of features passed in.

    Parameters
    ----------
    data : column-then-row indexable
        Values are read as ``data[feature][row]``.
    rowIdx1, rowIdx2 :
        Row labels of the two points being compared.
    ftSet : iterable
        Column labels of the features to include.
    p : number
        Order of the distance (p=1 is Manhattan, p=2 is Euclidean).

    Returns
    -------
    ``(sum over ft of |data[ft][rowIdx1] - data[ft][rowIdx2]| ** p) ** (1 / p)``.
    An empty ``ftSet`` gives ``0.0`` (for p > 0): with no features to tell
    them apart, the two points coincide.
    """
    sigma = 0
    for ft in ftSet:
        # abs() replaces the sqrt-of-square round trip used previously.
        sigma += abs(data[ft][rowIdx1] - data[ft][rowIdx2]) ** p
    # Take the root once, after the sum is complete; doing it inside the loop
    # left the result undefined whenever ftSet was empty.
    return sigma ** (1 / p)

# d= minkowski(lilData, rowIdx1=1, rowIdx2= 2, ftSet=(1,2,3,4,5,6), p=2)
# print(d)
# minkowski_distance(d[1],d[2],p=2)

## **[:+:] Forward Selection**

In [23]:
def generateCombos(data):
    """Enumerate every subset of the feature columns of ``data``.

    Feature columns are numbered 1 .. data.shape[1]-1 (column 0 is skipped).
    The list of feature numbers is printed, then a lazy iterator is returned
    that yields each subset as a tuple, ordered by subset size and starting
    with the empty tuple.
    """
    features = list(range(1, data.shape[1]))
    print(features)
    subsetSizes = range(len(features) + 1)
    return chain.from_iterable(combinations(features, size) for size in subsetSizes)



# takes a single combo of features, classifies each dp by its nearest neighbor, returns a list of classifications for every dp
def classifier(data, ftSet):
    """Label every data point with the class of its nearest neighbor.

    Each row is compared against every *other* row (leave-one-out) using the
    Euclidean distance (``minkowski`` with p=2) restricted to ``ftSet``.

    Parameters
    ----------
    data :
        Table exposing ``.shape`` and ``data[col][row]`` access; column 0 is
        read as the class label.
        NOTE(review): rows are addressed by the labels 0..rows-1, so this
        presumably expects a default integer index -- confirm for other data.
    ftSet : iterable
        Column labels of the features used for the distance.

    Returns
    -------
    list
        One predicted label per row, in row order. When there is no other row
        to compare against, the label of row 0 is used.
    """
    rows = data.shape[0]
    nearestNeighbors = []

    for dp in range(0, rows):
        nnIdx = 0
        # Start from +infinity so the first candidate always becomes the
        # current best; a finite sentinel such as 10000 silently breaks when
        # the features are on a scale where every distance exceeds it.
        nearest = float("inf")
        for neighborIdx in range(0, rows):
            if dp == neighborIdx:
                continue  # a point must not be its own neighbor
            dist = minkowski(data, dp, neighborIdx, ftSet, p=2)
            if dist < nearest:
                nnIdx = neighborIdx
                nearest = dist
        nearestNeighbors.append(data[0][nnIdx])

    return nearestNeighbors


# Given a collection of feature combos, every dp is classified by its nearest neighbor under each combo; the accuracy of every combo is returned as a list
def accuracy(data, ftCombos):
    """Compute the nearest-neighbor accuracy of each feature combo.

    Parameters
    ----------
    data :
        Table accepted by ``classifier``; column 0 is read as the true label.
    ftCombos : iterable
        Each element is either a single column label (e.g. ``3``) or a
        collection of column labels (e.g. ``(1, 4, 5)``).

    Returns
    -------
    list of float
        Fraction of rows whose predicted label equals the true label, one
        entry per combo, in iteration order. A data set with no rows scores
        0.0 instead of dividing by zero.
    """
    ftAccuracies = []
    for combo in ftCombos:
        # A combo that is already a collection of features must be used as-is.
        # Wrapping it in a set ({combo}) would turn e.g. (1, 2) into a single
        # bogus "feature" and the column lookup would fail. Only a lone label
        # needs wrapping.
        if isinstance(combo, (tuple, list, set, frozenset)):
            ftSet = tuple(combo)
        else:
            ftSet = (combo,)

        classes = classifier(data, ftSet=ftSet)
        total = len(classes)
        correct = sum(1 for j in range(total) if classes[j] == data[0][j])
        ftAccuracies.append(correct / total if total else 0.0)
    return ftAccuracies



def firstFeature(data):
    """Score every feature column on its own and pick the best one.

    Each feature number from 1 to data.shape[1]-1 is passed to ``accuracy``
    as a one-element combo set. Because ``accuracy`` returns a list, every
    score is a one-element list.

    Returns a tuple ``(bestFt, bestAcc, ftAccuracies)``: the number of the
    first feature with the highest score, that score, and all scores in
    feature order.
    """
    ftAccuracies = [accuracy(data, ftCombos={ft}) for ft in range(1, data.shape[1])]
    # enumerate from 1 so the position is already the feature number;
    # max() keeps the earliest entry on ties.
    bestFt, bestAcc = max(enumerate(ftAccuracies, start=1), key=lambda scored: scored[1])
    return bestFt, bestAcc, ftAccuracies



# lilFtCombos= generateCombos(lilData)
# Score each feature of the small data set on its own. As the last expression
# of the cell, the returned (bestFt, bestAcc, ftAccuracies) tuple is displayed.
firstFeature(lilData)




(5, [0.856], [[0.724], [0.692], [0.68], [0.686], [0.856], [0.67]])

# Works Cited:


-   https://www.analyticsvidhya.com/blog/2020/02/4-types-of-distance-metrics-in-machine-learning/

-   https://www.geeksforgeeks.org/pandas-groupby-count-occurrences-in-column/
-   https://www.codingem.com/python-how-to-get-all-combinations-of-a-list/