In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, RFECV
from sklearn.metrics import accuracy_score

In [2]:
diabetes_data = pd.read_csv('diabetes.csv')

In [3]:
missing_diabetes = diabetes_data.isnull().mean()
diabetes_cleaned = diabetes_data.loc[:, missing_diabetes < 0.3]

In [4]:
X_diabetes = diabetes_cleaned.drop(columns=['Outcome'])
y_diabetes = diabetes_cleaned['Outcome']

X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(X_diabetes, y_diabetes, test_size=0.3, random_state=42)


In [5]:
model_diabetes = LogisticRegression(max_iter=200)
model_diabetes.fit(X_train_diabetes, y_train_diabetes)
y_pred_diabetes = model_diabetes.predict(X_test_diabetes)
accuracy_diabetes = accuracy_score(y_test_diabetes, y_pred_diabetes)
print(f'Accuracy after Missing Value Filter (Diabetes): {accuracy_diabetes:.4f}')


Accuracy after Missing Value Filter (Diabetes): 0.7359


In [6]:
correlation_matrix_diabetes = diabetes_cleaned.corr()
high_corr_pairs_diabetes = correlation_matrix_diabetes[correlation_matrix_diabetes.abs() > 0.8].stack().index.tolist()
to_drop_diabetes = set()

for i, j in high_corr_pairs_diabetes:
    if i != j:
        to_drop_diabetes.add(j)

diabetes_cleaned_corr = diabetes_cleaned.drop(columns=to_drop_diabetes)

In [7]:
X_diabetes_corr = diabetes_cleaned_corr.drop(columns=['Outcome'])
y_diabetes_corr = diabetes_cleaned_corr['Outcome']

X_train_diabetes_corr, X_test_diabetes_corr, y_train_diabetes_corr, y_test_diabetes_corr = train_test_split(X_diabetes_corr, y_diabetes_corr, test_size=0.3, random_state=42)


In [8]:
model_diabetes_corr = LogisticRegression(max_iter=200)
model_diabetes_corr.fit(X_train_diabetes_corr, y_train_diabetes_corr)
y_pred_diabetes_corr = model_diabetes_corr.predict(X_test_diabetes_corr)
accuracy_diabetes_corr = accuracy_score(y_test_diabetes_corr, y_pred_diabetes_corr)
print(f'Accuracy after High Correlation Filter (Diabetes): {accuracy_diabetes_corr:.4f}')


Accuracy after High Correlation Filter (Diabetes): 0.7359


In [9]:
X_diabetes = diabetes_cleaned.drop(columns=['Outcome'])
selector_diabetes = VarianceThreshold(threshold=0.01)
X_diabetes_low_variance = selector_diabetes.fit_transform(X_diabetes)

In [10]:
mask_diabetes = selector_diabetes.get_support()


In [11]:
low_variance_features = X_diabetes.loc[:, mask_diabetes]

diabetes_low_variance = pd.concat([low_variance_features, y_diabetes.reset_index(drop=True)], axis=1)


In [12]:
X_low_variance = diabetes_low_variance.drop(columns=['Outcome'])
y_low_variance = diabetes_low_variance['Outcome']

# Split the data into training and testing sets
X_low_variance_train, X_low_variance_test, y_low_variance_train, y_low_variance_test = train_test_split(X_low_variance, y_low_variance, test_size=0.3, random_state=42)


In [13]:
model_diabetes_low_variance = LogisticRegression(max_iter=200)
model_diabetes_low_variance.fit(X_low_variance_train, y_low_variance_train)
y_pred_diabetes_low_variance = model_diabetes_low_variance.predict(X_low_variance_test)


In [14]:
accuracy_diabetes_low_variance = accuracy_score(y_low_variance_test, y_pred_diabetes_low_variance)
print(f'Accuracy after Low Variance Filter (Diabetes): {accuracy_diabetes_low_variance:.4f}')

Accuracy after Low Variance Filter (Diabetes): 0.7359


In [15]:
selector_rfecv_diabetes = RFECV(estimator=LogisticRegression(max_iter=200), step=1, cv=5)
selector_rfecv_diabetes.fit(X_diabetes, y_diabetes)

X_selected_diabetes = selector_rfecv_diabetes.transform(X_diabetes)
X_train_selected_diabetes, X_test_selected_diabetes, y_train_selected_diabetes, y_test_selected_diabetes = train_test_split(X_selected_diabetes, y_diabetes, test_size=0.3, random_state=42)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
model_forward_diabetes = LogisticRegression(max_iter=200)
model_forward_diabetes.fit(X_train_selected_diabetes, y_train_selected_diabetes)
y_pred_forward_diabetes = model_forward_diabetes.predict(X_test_selected_diabetes)
accuracy_forward_diabetes = accuracy_score(y_test_selected_diabetes, y_pred_forward_diabetes)
print(f'Accuracy after Forward Feature Selection (Diabetes): {accuracy_forward_diabetes:.4f}')


Accuracy after Forward Feature Selection (Diabetes): 0.7359


In [17]:
remaining_features = X_diabetes.columns.tolist()


In [18]:
from sklearn.tree import DecisionTreeClassifier

# Use a Decision Tree Classifier for backward elimination
model_backward_diabetes = DecisionTreeClassifier()

# Train on the initial dataset
model_backward_diabetes.fit(X_train_diabetes, y_train_diabetes)

# Get initial feature importances and indices
importances_diabetes = model_backward_diabetes.feature_importances_
indices_diabetes = np.argsort(importances_diabetes)

# Convert features to list to keep track during elimination
remaining_features = list(X_diabetes.columns)

print(f"Initial Number of Features: {len(remaining_features)}")

while len(remaining_features) > 5:  # Stop when there are at least 5 features left
    # Drop the least important feature
    least_important_feature_index = indices_diabetes[0]
    feature_to_drop = remaining_features[least_important_feature_index]
    
    # Remove the feature from dataset and from remaining features list
    X_diabetes = X_diabetes.drop(columns=[feature_to_drop])
    remaining_features.remove(feature_to_drop)
    
    # Update the model
    X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(
        X_diabetes, y_diabetes, test_size=0.3, random_state=42)
    
    model_backward_diabetes.fit(X_train_diabetes, y_train_diabetes)
    importances_diabetes = model_backward_diabetes.feature_importances_
    indices_diabetes = np.argsort(importances_diabetes)

# Print final number of features
print(f"Final Number of Features After Elimination: {len(remaining_features)}")

Initial Number of Features: 8
Final Number of Features After Elimination: 5


In [19]:
importances_diabetes_sorted_indices = np.argsort(importances_diabetes)[::-1]  # Sort importances

# Get the number of features to print (up to 5)
num_features_to_print = min(5, len(remaining_features))

print("Top Important Features (Diabetes):")
for i in range(num_features_to_print):
    if i < len(importances_diabetes):
        feature_name = remaining_features[importances_diabetes_sorted_indices[i]]
        importance_value = importances_diabetes[importances_diabetes_sorted_indices[i]]
        print(f"{i + 1}. Feature '{feature_name}': {importance_value:.4f}")

Top Important Features (Diabetes):
1. Feature 'Glucose': 0.3760
2. Feature 'BMI': 0.2082
3. Feature 'Age': 0.1709
4. Feature 'DiabetesPedigreeFunction': 0.1637
5. Feature 'BloodPressure': 0.0812
