# Feature Selection with Categorical Data

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_validate
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import make_scorer
from sklearn.neighbors import KNeighborsClassifier
import time

In [3]:
# Read the dataset, storing the `grit` and `bin` columns as pandas categoricals.
data = pd.read_csv(
    "./data/data.csv",
    dtype=dict.fromkeys(("grit", "bin"), "category"),
)
data.head()

Unnamed: 0,size,score,center,comparison,grit,bin
0,-0.7429,-2.3981,-0.4037,1.7989,ultra,B
1,0.5697,0.3728,2.9002,-1.5993,super,C
2,-1.59,-3.1321,-3.2852,0.3593,extra,B
3,0.7101,0.9942,3.1275,-1.6632,extra,A
4,-2.2128,-2.4286,-1.0112,0.2764,ultra,B


The DataFrame `data` contains five predictors: four are numeric and one, `grit`, is categorical.
We can use the [get_dummies()](https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html#pandas-get-dummies) function to convert a categorical vector into a numeric matrix of indicator (dummy) columns, one column per category.

In [6]:
# One indicator column per level of `grit`.
# The dtype is pinned explicitly: pandas >= 2.0 returns boolean columns by
# default, whereas earlier versions returned uint8. Pinning keeps the 0/1
# output of this cell the same across pandas versions.
dummy_grit = pd.get_dummies(data["grit"], dtype=np.uint8)
dummy_grit.head()

Unnamed: 0,extra,super,ultra,very
0,0,0,1,0
1,0,1,0,0
2,1,0,0,0
3,1,0,0,0
4,0,0,1,0


To build the predictor matrix `X`, we concatenate the four numeric predictors with the dummy columns that encode the categorical predictor, `grit`. The last column in `data` contains the response, `bin`, which becomes `y`.

In [10]:
# Predictors: every column of `data` except the last two, followed by the
# indicator columns held in `dummy_grit`.
X = pd.concat(
    [data[data.columns[:-2]], dummy_grit],
    axis="columns",
)
# Response: the `bin` column.
y = data.loc[:, "bin"]
X.head()

Unnamed: 0,size,score,center,comparison,extra,super,ultra,very
0,-0.7429,-2.3981,-0.4037,1.7989,0,0,1,0
1,0.5697,0.3728,2.9002,-1.5993,0,1,0,0
2,-1.59,-3.1321,-3.2852,0.3593,1,0,0,0
3,0.7101,0.9942,3.1275,-1.6632,1,0,0,0
4,-2.2128,-2.4286,-1.0112,0.2764,0,0,1,0


In [8]:
def score_func(y_true, y_pred):
    """Count the misclassified samples.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels.
    y_pred : array-like of shape (n_samples,)
        Predicted labels, in the same order as ``y_true``.

    Returns
    -------
    int
        Number of positions at which the two label sequences differ.
    """
    # Compare plain arrays so the count is purely positional: plain lists are
    # accepted, and two pandas Series with different indexes are not compared
    # (or rejected) by label.
    return (np.asarray(y_true) != np.asarray(y_pred)).sum()

# Wrap the error count as a scikit-learn scorer. Because a lower count is
# better, greater_is_better=False makes scikit-learn negate the value so that
# model selection can still maximise the score.
scorer = make_scorer(score_func=score_func, greater_is_better=False)

In [11]:
# 1-nearest-neighbour classifier scored with 5-fold cross-validation.
classifier = KNeighborsClassifier(n_neighbors=1)
# Seed the shuffle. With random_state left unset, each call to kf.split()
# draws a new permutation, so the selector would score candidate features on
# different folds and the selected subset would change from run to run.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Sequential feature selection driven by the misclassification-count scorer.
# With n_features_to_select='auto' and the default tol=None, half of the
# features are kept.
sfs = SequentialFeatureSelector(classifier, n_features_to_select='auto', cv=kf, scoring=scorer)
sfs.fit(X, y)

In [12]:
# Keep only the columns chosen by the fitted selector, addressed by their
# integer positions in X.
X_selected = X.iloc[:, sfs.get_support(indices=True)]
X_selected.head()

Unnamed: 0,score,comparison,super,ultra
0,-2.3981,1.7989,0,1
1,0.3728,-1.5993,1,0
2,-3.1321,0.3593,0,0
3,0.9942,-1.6632,0,0
4,-2.4286,0.2764,0,1


In [13]:
# Estimate the accuracy of 1-NN on the selected features with 7-fold CV.
# NOTE(review): X_selected was chosen by a selector fitted on this same data,
# so these accuracies are likely optimistic — confirm whether a held-out set
# or nested selection is wanted.
classifier = KNeighborsClassifier(n_neighbors=1)
# Seed the shuffle so the fold assignment, and therefore the reported
# accuracies, are reproducible across runs.
kf = KFold(n_splits=7, shuffle=True, random_state=0)

# perf_counter() is monotonic and high-resolution, which suits timing a short
# interval better than wall-clock time.time().
start_time = time.perf_counter()
cv_results = cross_validate(classifier, X_selected, y, cv=kf)
elapsed_time = time.perf_counter() - start_time

print("Accuracies:", cv_results['test_score'])
print("Mean accuracy:", cv_results['test_score'].mean())
print(f"Cross validation elapsed time: {elapsed_time:.4f} seconds")

Accuracies: [0.90909091 0.90909091 0.95454545 0.86363636 1.         0.95454545
 0.90909091]
Mean accuracy: 0.9285714285714287
Cross validation elapsed time: 0.0355 seconds
