<a href="https://colab.research.google.com/github/KshitijShinde/bml/blob/main/rfecv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score

# Fetch the bundled diabetes dataset and pull out features and raw target.
diabetes = load_diabetes()
X = diabetes.data
raw_target = diabetes.target

# LogisticRegression needs class labels, so binarise the numeric target:
# 1 when the value is strictly above the dataset median, 0 otherwise.
median_target = np.median(raw_target)
y = (raw_target > median_target).astype(int)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Base classifier; the high iteration cap gives the solver room to converge.
model = LogisticRegression(max_iter=10000)

# Recursive feature elimination with 5-fold cross-validation, dropping one
# feature per step. `fit` returns the fitted selector itself.
rfecv = RFECV(estimator=model, step=1, cv=5).fit(X_train, y_train)

# Report what the selector settled on.
print("Optimal number of features:", rfecv.n_features_)
print("Selected features:", rfecv.support_)
print("Feature ranking:", rfecv.ranking_)

# Keep only the columns flagged by the selector's boolean mask.
selected_mask = rfecv.support_
X_train_selected = X_train[:, selected_mask]
X_test_selected = X_test[:, selected_mask]

# Refit the base classifier on the reduced training set, then predict on the
# reduced held-out set (`fit` returns the estimator, so the calls chain).
y_pred = model.fit(X_train_selected, y_train).predict(X_test_selected)

# Held-out accuracy using only the selected features.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with selected features: {accuracy:.4f}')


Optimal number of features: 4
Selected features: [False False  True  True False False  True False  True False]
Feature ranking: [5 4 1 1 6 7 1 2 1 3]
Accuracy with selected features: 0.7970
