In [1]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import warnings

In [2]:
# Read the prepared CSV and one-hot encode it in a single step
# (drop_first=True drops the first level of each encoded column; the
# resulting indicator columns are stored as int).
dataset1 = pd.read_csv("prep.csv", index_col=None)
df2 = pd.get_dummies(dataset1, dtype=int, drop_first=True)

# The 'classification_yes' column is the target; every other column is a predictor.
# NOTE(review): presumably this is the dummy generated from a `classification`
# column in prep.csv -- confirm against the raw data.
target_column = 'classification_yes'
dep_y = df2[target_column]
indep_x = df2.drop(columns=target_column)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    indep_x,
    dep_y,
    test_size=0.3,
    random_state=42,
)

In [3]:
# Candidate models, keyed by the label under which each one is reported.
# Two further candidates are currently disabled and kept here for reference:
#   'SVM': SVC(kernel='linear', random_state=42)
#   'Random': RandomForestClassifier(random_state=42)
classifiers = {}
classifiers['KNN'] = KNeighborsClassifier(n_neighbors=3)
classifiers['Decision'] = DecisionTreeClassifier(random_state=42)
classifiers['Logistic Regression'] = LogisticRegression(random_state=42)

In [4]:
# Sequential Backward Selection (SBS) benchmark.
#
# For every entry in `classifiers`:
#   1. run a backward SequentialFeatureSelector on the training split to keep
#      N_FEATURES_TO_SELECT columns,
#   2. refit the classifier on just those columns,
#   3. score it on the same columns of the test split.
# The accuracies are gathered in `results` and shown as `results_df`.
#
# Reads:  classifiers, X_train, X_test, y_train, y_test
# Writes: results, results_df (and leaves each classifier fitted in place)

# Number of features each selector is asked to keep
N_FEATURES_TO_SELECT = 2
# Column label / outer key under which the accuracies are stored
METHOD_LABEL = 'Sequential backward Selection'

# Dictionary to store results: {method label: {classifier name: accuracy}}
results = {METHOD_LABEL: {}}

# Silence warnings only while the selectors and models are being fitted.
# A bare warnings.filterwarnings("ignore") at cell level would stay active for
# the rest of the kernel session and hide warnings raised by every later cell;
# catch_warnings() restores the previous filters when the block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Perform feature selection and evaluation for each classifier
    for name, clf in classifiers.items():
        # Backward selection: start from all columns and remove them until
        # only N_FEATURES_TO_SELECT remain
        sfs = SequentialFeatureSelector(
            clf,
            n_features_to_select=N_FEATURES_TO_SELECT,
            direction='backward',
        )
        sfs.fit(X_train, y_train)

        # get_support() is a boolean mask over the training columns
        selected_features = X_train.columns[sfs.get_support()]
        print(f"\n{name} Selected Features:")
        print(selected_features)

        # Refit the classifier itself on the reduced feature set. This fits
        # `clf` in place, so the objects in `classifiers` end up trained on
        # the selected columns only.
        clf.fit(X_train[selected_features], y_train)

        # Mean accuracy on the held-out split, restricted to the same columns
        accuracy = clf.score(X_test[selected_features], y_test)

        # Store the accuracy in the results dictionary
        results[METHOD_LABEL][name] = accuracy

# Convert results dictionary to DataFrame (one row per classifier)
results_df = pd.DataFrame.from_dict(results)

# Display the results DataFrame
print("\nSummary of Model Accuracies:")
print(results_df)



KNN Selected Features:
Index(['al', 'hrmo'], dtype='object')

Decision Selected Features:
Index(['al', 'hrmo'], dtype='object')

Logistic Regression Selected Features:
Index(['al', 'hrmo'], dtype='object')

Summary of Model Accuracies:
                     Sequential backward Selection
KNN                                       0.933333
Decision                                  0.941667
Logistic Regression                       0.925000
