In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
from scipy.optimize import minimize
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
import kagglehub


In [None]:
# Download the "uciml/iris" dataset through kagglehub and keep the returned value in `path`.
# NOTE(review): `path` is presumably the local location of the downloaded files — confirm
# against the kagglehub documentation.
path = kagglehub.dataset_download("uciml/iris")

Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/iris?dataset_version_number=2...


100%|██████████| 3.60k/3.60k [00:00<00:00, 7.02MB/s]

Extracting files...





In [None]:
# Load the Iris data from scikit-learn's bundled copy. The Kaggle download is
# not repeated here: it already ran in the previous cell and its `path` result
# is not used by this cell.
iris = load_iris()

# Features and target
X = iris.data  # Features
y = iris.target  # Target

# Split the data into training and test sets (70% training, 30% testing);
# the fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then reused on the test split, so no test-set statistics leak into it.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # Fit on training data and transform
X_test = scaler.transform(X_test)  # Transform test data using the same scaling



In [None]:
# Build a DataFrame from the Iris feature matrix, labelling the columns with
# the dataset's feature names and appending the class labels as `target`.
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    target=iris.target
)

# Preview the first 5 rows of the dataset
print(iris_df.head(n=5))


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


**Finding Q**: the matrix that represents feature redundancy

In [None]:
# Redundancy matrix Q: pairwise Pearson correlations between the feature columns.
# rowvar=False tells NumPy the variables are stored in columns, so no explicit
# transpose is needed.
correlation_matrix = np.corrcoef(X, rowvar=False)
print(correlation_matrix)


[[ 1.         -0.11756978  0.87175378  0.81794113]
 [-0.11756978  1.         -0.4284401  -0.36612593]
 [ 0.87175378 -0.4284401   1.          0.96286543]
 [ 0.81794113 -0.36612593  0.96286543  1.        ]]


**Finding F**: a vector representing feature relevance

In [None]:
# Relevance of each feature: the absolute Pearson correlation between that
# feature column and the target y.
n_features = X.shape[1]
correlations = [
    np.abs(np.corrcoef(X[:, col], y)[0, 1])
    for col in range(n_features)
]

F = np.array(correlations)
print("Relevance vector F:", F)

Relevance vector F: [0.78256123 0.42665756 0.9490347  0.95654733]



# The objective function

$f(x)$ in QPFS is defined as:

$$f(x) = \alpha \cdot x^T Q x + (1 - \alpha) \cdot F^T x$$

Where:

- $x$ is the feature weight vector (we want to optimize this).
- $Q$ is the correlation matrix (a redundancy measure).
- $F$ is the relevance vector (a relevance measure).
- $\alpha$ is a balance parameter between redundancy and relevance. It ranges between 0 and 1, where:
  - $\alpha = 1$ means we care only about minimizing redundancy.
  - $\alpha = 0$ means we care only about relevance.

In [None]:
# Define the alpha parameter (balance between relevance and redundancy):
# `alpha` weights the quadratic Q term of the objective and `1 - alpha`
# weights the linear F term.
alpha = 0.5  # You can experiment with this value

# Define the objective function
def objective(x, Q, F, alpha):
    """QPFS objective, to be minimized over the feature weights ``x``.

    Computes ``0.5 * (alpha * x^T Q x - (1 - alpha) * F^T x)``. The quadratic
    term penalizes putting weight on mutually redundant features; the linear
    term rewards putting weight on relevant features.

    Parameters
    ----------
    x : numpy.ndarray
        Weight assigned to each feature.
    Q : numpy.ndarray
        Square feature-redundancy matrix.
    F : numpy.ndarray
        Feature-relevance vector.
    alpha : float
        Weight of the redundancy term; ``1 - alpha`` weights the relevance term.

    Returns
    -------
    float
        Objective value; lower is better.
    """
    redundancy = np.dot(x.T, np.dot(Q, x))
    relevance = np.dot(F.T, x)
    # Relevance enters with a minus sign: the solver minimizes this value, so
    # adding F^T x would push weight toward the *least* relevant features.
    return 0.5 * (alpha * redundancy - (1 - alpha) * relevance)

# Equality constraint for the optimizer: the feature weights must sum to 1
def constraint(x):
    """Return the difference between the sum of the feature weights and 1.

    The value is 0 exactly when the weights sum to 1.
    """
    total_weight = np.sum(x)
    return total_weight - 1

# Bounds: every feature weight must lie between 0 and 1
n_features = X.shape[1]
bounds = [(0, 1)] * n_features

# Initial guess (equal weights to start)
x0 = np.ones(n_features) / n_features

# Equality constraint: the feature weights must sum to 1
constraints = [{'type': 'eq', 'fun': constraint}]

# Solve the optimization problem using scipy's minimize function.
# SLSQP is named explicitly: it is the solver scipy picks for a problem with
# constraints, and stating it keeps the choice visible and stable.
result = minimize(
    objective,
    x0,
    args=(correlation_matrix, F, alpha),
    method="SLSQP",
    bounds=bounds,
    constraints=constraints,
)

# Surface solver failures instead of silently reporting unconverged weights
if not result.success:
    print("Warning: optimization did not converge:", result.message)

# Get the optimized feature weights
optimized_weights = result.x

# Output the optimized weights
print("Optimized Feature Weights:", optimized_weights)

# Order the feature indices from highest to lowest weight
selected_features = np.argsort(optimized_weights)[::-1]
print("Selected Features (by importance):", selected_features)

Optimized Feature Weights: [8.55459675e-17 5.91424403e-01 4.08575597e-01 3.90513613e-17]
Selected Features (by importance): [1 2 0 3]


In [None]:
# Load Iris dataset
data = load_iris()
X, y = data.data, data.target
feature_names = data.feature_names

# Initialize a DataFrame to store results (one row per feature)
results_df = pd.DataFrame({"Feature": feature_names})

# Mutual Information. Seeded because the estimator adds small random noise to
# continuous features, so unseeded scores change from run to run.
mi_scores = mutual_info_classif(X, y, random_state=42)
results_df["Mutual Information"] = mi_scores

# Recursive Feature Elimination (RFE); ranking_ is 1 for the selected features
# and grows for features eliminated earlier, so LOWER is better.
rfe_model = RandomForestClassifier(random_state=42)
rfe = RFE(estimator=rfe_model, n_features_to_select=2)
rfe.fit(X, y)
results_df["RFE Ranking"] = rfe.ranking_

# Lasso Regularization (L1) on standardized features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
lasso = Lasso(alpha=0.1)
lasso.fit(X_scaled, y)
results_df["Lasso Coefficients"] = lasso.coef_

# Random Forest Feature Importance
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X, y)
results_df["Random Forest Importance"] = rf_model.feature_importances_

results_df["weight using QPFS"] = optimized_weights

# Average rank per feature (1 = best). The score columns are "higher is
# better" and are ranked descending; "RFE Ranking" is already "lower is
# better" and must be ranked ascending, otherwise its contribution is inverted.
# Lasso coefficients are reported above but, as before, not part of the average.
higher_is_better = ["Mutual Information", "Random Forest Importance", "weight using QPFS"]
score_ranks = results_df[higher_is_better].rank(ascending=False)
rfe_rank = results_df["RFE Ranking"].rank(ascending=True)
results_df["Average Rank"] = pd.concat([score_ranks, rfe_rank], axis=1).mean(axis=1)

# Sort features by their average rank
results_df = results_df.sort_values(by="Average Rank")

# Display results
print(results_df)


             Feature  Mutual Information  RFE Ranking  Lasso Coefficients  \
2  petal length (cm)            0.987414            1            0.263330   
1   sepal width (cm)            0.274995            3           -0.000000   
3   petal width (cm)            0.996180            1            0.427466   
0  sepal length (cm)            0.501222            2            0.000000   

   Random Forest Importance  weight using QPFS  Average Rank  
2                  0.436130       4.085756e-01         2.125  
1                  0.021678       5.914244e-01         2.500  
3                  0.436065       3.905136e-17         2.625  
0                  0.106128       8.554597e-17         2.750  
