In [1]:
from sklearn import decomposition
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import pandas as pd

In [40]:
data = pd.read_csv('./agaricus-lepiota.data')
dummies = pd.get_dummies(data) #Convert categorical variable into dummy/indicator variables.
x, y = pd.get_dummies(data), pd.get_dummies(data['edibility'])

In [41]:
print("X shape:", x.shape)
print("y shape:", y.shape)

skb = SelectKBest(chi2, k=10) # Select features according to the k highest scores. Chi-squared stats of non-negative features for classification tasks. 
x_new = skb.fit_transform(x, y) # Fit to data, then transform it. Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.


print("skb shape:", x_new.shape)

selected = [dummies.columns[i] for i in skb.get_support(indices=True)]
print("Selected features:", ", ".join(selected))

X shape: (8124, 119)
y shape: (8124, 2)
skb shape: (8124, 10)
Selected features: edibility_e, edibility_p, odor_f, odor_n, gill-size_n, gill-color_b, stalk-surface-above-ring_k, stalk-surface-below-ring_k, ring-type_l, spore-print-color_h


In [42]:
print("Original shape:", x.shape)
pca = decomposition.PCA(n_components=10) # looks for a combination of features that capture well the variance of the original features.
x_pca = pca.fit_transform(x) # Fit to data, then transform it. Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.

print("PCA shape:", x_pca.shape)
best_features = [pca.components_[i].argmax() for i in range(x_pca.shape[1])]
feature_names = [x.columns[best_features[i]] for i in range(x_pca.shape[1])]
print(len(feature_names))
print("Features in which gives max variance:", ", ".join(feature_names))

Original space: (8124, 119)
PCA space: 10
10
Features in which gives max variance: edibility_p, stalk-root_b, habitat_g, stalk-shape_t, odor_n, cap-shape_f, cap-surface_s, cap-color_n, cap-surface_f, spore-print-color_k


In [39]:
set(selected).intersection(set(feature_names))

{'edibility_p', 'odor_n'}