<a href="https://colab.research.google.com/github/Mulat-K/Machine-Learning-Mastery-with-Python/blob/main/FSFML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Feature Selection for Machine Learning**

1. Univariate Selection.
2. Recursive Feature Elimination.
3. Principal Component Analysis.
4. Feature Importance.

# **Univariate Selection**

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Read the Pima Indians diabetes CSV (the file has no header row), coerce every
# column to a numeric dtype, and discard rows that contain unparseable values.
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
dataframe = (
    pd.read_csv(filename, header=None, names=column_names)
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# The first eight columns are the inputs; the ninth ('class') is the output.
X = dataframe[column_names[:8]].to_numpy()
Y = dataframe[column_names[8]].to_numpy()

# Score each input column against the output with the chi-squared statistic
# and keep the four highest-scoring columns.
selector = SelectKBest(score_func=chi2, k=4)
fit = selector.fit(X, Y)
selected_features = fit.transform(X)

# Show the per-column scores, then a preview of the reduced feature matrix.
np.set_printoptions(precision=3)
print("Chi-squared scores:", fit.scores_, sep="\n")
print("\nTop 4 selected features (first 5 samples):", selected_features[:5], sep="\n")

Chi-squared scores:
[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]

Top 4 selected features (first 5 samples):
[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]


# **Recursive Feature Elimination.**

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Read the Pima Indians diabetes CSV (the file has no header row), coerce every
# column to a numeric dtype, and discard rows that contain unparseable values.
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
dataframe = (
    pd.read_csv(filename, header=None, names=column_names)
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# The first eight columns are the inputs; the ninth ('class') is the output.
X = dataframe[column_names[:8]].to_numpy()
Y = dataframe[column_names[8]].to_numpy()

# Recursive feature elimination wrapped around a logistic regression,
# configured to keep three features.
model = LogisticRegression(solver='liblinear')  # liblinear works well for small datasets
rfe = RFE(estimator=model, n_features_to_select=3)
fit = rfe.fit(X, Y)

# Report how many features were kept, which ones, and the full ranking.
print("Number of Selected Features: %d" % fit.n_features_)
for heading, values in (
    ("Selected Features Mask (True = selected):", fit.support_),
    ("Feature Ranking (1 = best):", fit.ranking_),
):
    print(heading)
    print(values)

Number of Selected Features: 3
Selected Features Mask (True = selected):
[ True False False False False  True  True False]
Feature Ranking (1 = best):
[1 2 3 5 6 1 1 4]


# **Principal Component Analysis.**

In [5]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Load dataset (the CSV has no header row, so column names are supplied here)
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(filename, names=column_names, header=None)

# Convert to numeric and clean missing data
dataframe = dataframe.apply(pd.to_numeric, errors='coerce')
dataframe.dropna(inplace=True)

# Split into input and output (PCA itself only uses X; Y is kept for parity
# with the other cells)
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values

# Apply PCA, keeping the first three principal components
pca = PCA(n_components=3)
fit = pca.fit(X)

# Set the array print precision in this cell so the output format does not
# depend on whether an earlier cell has already configured NumPy printing.
np.set_printoptions(precision=3)

# Output PCA results
print("Explained Variance Ratio:")
print(fit.explained_variance_ratio_)

print("\nPrincipal Components (each row = 1 component, each column = feature weight):")
print(fit.components_)

Explained Variance Ratio:
[0.889 0.062 0.026]

Principal Components (each row = 1 component, each column = feature weight):
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [ 2.265e-02  9.722e-01  1.419e-01 -5.786e-02 -9.463e-02  4.697e-02
   8.168e-04  1.402e-01]
 [ 2.246e-02 -1.434e-01  9.225e-01  3.070e-01 -2.098e-02  1.324e-01
   6.400e-04  1.255e-01]]


# **Feature Importance.**

In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

# Load dataset (the CSV has no header row, so column names are supplied here)
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(filename, names=column_names, header=None)

# Ensure numeric types and clean missing values
dataframe = dataframe.apply(pd.to_numeric, errors='coerce')
dataframe.dropna(inplace=True)

# Split into input and output
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values

# Train Extra Trees Classifier.
# The ensemble is randomized, so a fixed random_state is passed to make the
# importances below identical on every run; without it the numbers shift
# each time the cell is executed.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, Y)

# Display feature importances, one line per input column
print("Feature Importances:")
for name, score in zip(column_names[:8], model.feature_importances_):
    print(f"{name:>5}: {score:.4f}")

Feature Importances:
 preg: 0.1143
 plas: 0.2281
 pres: 0.0986
 skin: 0.0786
 test: 0.0752
 mass: 0.1425
 pedi: 0.1194
  age: 0.1432
