<a href="https://colab.research.google.com/github/JamshedAli18/Feature-Engineering-Techniques/blob/main/feature_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA



In [2]:
# Load dataset
wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = pd.Series(wine.target)


In [3]:
X

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [4]:
# 1. Filter Methods (Chi-Square)
# Ensure non-negative data for Chi-Square
X_chi2 = X.apply(lambda col: col + abs(col.min()), axis=0)
chi2_selector = SelectKBest(chi2, k=5)
X_chi2_selected = chi2_selector.fit_transform(X_chi2, y)



In [5]:
# Selected features using Chi-Square
chi2_selected_features = X.columns[chi2_selector.get_support()].tolist()

# 2. Wrapper Methods (Recursive Feature Elimination)
model = LogisticRegression(max_iter=1000)
rfe_selector = RFE(model, n_features_to_select=5, step=1)
X_rfe_selected = rfe_selector.fit_transform(X, y)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# Selected features using RFE
rfe_selected_features = X.columns[rfe_selector.support_].tolist()

# 3. Embedded Methods (Random Forest Feature Importance)
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
importances = pd.Series(rf.feature_importances_, index=X.columns)
top_features_rf = importances.nlargest(5).index.tolist()



In [7]:
# 4. Principal Component Analysis (PCA)
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)



In [8]:
# Output results
print("Selected Features using Chi-Square:", chi2_selected_features)
print("Selected Features using RFE:", rfe_selected_features)
print("Top Features using Random Forest:", top_features_rf)
print("PCA Reduced Data Shape:", X_pca.shape)


Selected Features using Chi-Square: ['malic_acid', 'magnesium', 'flavanoids', 'color_intensity', 'proline']
Selected Features using RFE: ['alcohol', 'ash', 'flavanoids', 'color_intensity', 'od280/od315_of_diluted_wines']
Top Features using Random Forest: ['flavanoids', 'color_intensity', 'alcohol', 'proline', 'od280/od315_of_diluted_wines']
PCA Reduced Data Shape: (178, 5)
