<a href="https://colab.research.google.com/github/IDE21/Feature-Engineering/blob/main/Feature%20Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Breast Cancer Wisconsin (Diagnostic) dataset, read straight from the UCI repository.
# The file is parsed without a header row, so the column names are supplied here:
# an ID, the diagnosis label, then 30 generically named feature columns.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
columns = ['ID', 'Diagnosis', *(f'Feature_{i}' for i in range(1, 31))]

# Build the frame in a single chain: read the file, discard the ID column,
# and encode the diagnosis label numerically ('M' -> 1, 'B' -> 0).
data = (
    pd.read_csv(url, header=None, names=columns)
    .drop(columns='ID')
    .assign(Diagnosis=lambda frame: frame['Diagnosis'].map({'M': 1, 'B': 0}))
)

In [None]:
# Separate the label column (y) from the predictor columns (X)
y = data['Diagnosis']
X = data.drop(columns='Diagnosis')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted scaler is applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Helper used to score every model / feature-set combination the same way
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    The model is fitted in place on ``(X_train, y_train)``, then asked to
    predict ``X_test``; each metric compares ``y_test`` with those predictions.

    Returns:
        A 5-tuple ``(accuracy, precision, recall, f1, confusion_matrix)``.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    # Scorers listed in the order of the returned tuple.
    scorers = (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
    return tuple(score(y_test, predicted) for score in scorers)

In [None]:
# Feature selection: build three alternative reduced views of the data.
# Both random-forest-based selectors are seeded; an unseeded forest can pick a
# different feature subset on every run, which makes the scores computed from
# these views irreproducible.
N_SELECTED = 10  # number of features each selection method keeps
RF_SEED = 42     # same seed value the train/test split uses

# 1. Chi-Squared (Filter Method)
# chi2 needs non-negative inputs, so the raw features are rescaled to the
# 0-to-1 range here instead of reusing the standardized arrays.
scaler_minmax = MinMaxScaler()
X_train_minmax = scaler_minmax.fit_transform(X_train)  # fit on the training split only
X_test_minmax = scaler_minmax.transform(X_test)

chi2_selector = SelectKBest(chi2, k=N_SELECTED)
X_train_chi2 = chi2_selector.fit_transform(X_train_minmax, y_train)
X_test_chi2 = chi2_selector.transform(X_test_minmax)

# 2. Recursive Feature Elimination (RFE)
# Wraps a random forest and drops one feature per iteration (step=1)
# until N_SELECTED features remain.
rfe_selector = RFE(
    estimator=RandomForestClassifier(random_state=RF_SEED),
    n_features_to_select=N_SELECTED,
    step=1,
)
X_train_rfe = rfe_selector.fit_transform(X_train_scaled, y_train)
X_test_rfe = rfe_selector.transform(X_test_scaled)

# 3. Random Forest (Embedded Method)
# Keep the columns with the highest impurity-based importances.
rf = RandomForestClassifier(random_state=RF_SEED)
rf.fit(X_train_scaled, y_train)
importances = rf.feature_importances_
indices = np.argsort(importances)[-N_SELECTED:]  # argsort is ascending, so the tail is the top N
X_train_rf = X_train_scaled[:, indices]
X_test_rf = X_test_scaled[:, indices]

In [None]:
# Classifiers to compare. The decision tree is seeded because scikit-learn
# permutes the candidate features at every split; without a seed its scores
# can differ from one run to the next.
models = {
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# Feature views to evaluate, keyed by the suffix used in the result labels.
# Insertion order matters: it fixes the row order of the results table.
feature_sets = {
    'Original': (X_train_scaled, X_test_scaled),
    'Chi2': (X_train_chi2, X_test_chi2),
    'RFE': (X_train_rfe, X_test_rfe),
    'RF': (X_train_rf, X_test_rf),
}

# Score every model on every feature view. Each entry maps
# '<model> - <feature set>' to the tuple returned by evaluate_model.
results = {}
for set_name, (train_features, test_features) in feature_sets.items():
    for name, model in models.items():
        results[f'{name} - {set_name}'] = evaluate_model(
            model, train_features, test_features, y_train, y_test
        )

In [None]:
# Summary table: `results` is laid out with one column per model/feature-set
# label and one row per metric, then transposed so each label becomes a row.
metrics_df = pd.DataFrame(
    results,
    index=['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Confusion Matrix'],
).transpose()
display(metrics_df)

Unnamed: 0,Accuracy,Precision,Recall,F1-Score,Confusion Matrix
Naive Bayes - Original,0.964912,0.97561,0.930233,0.952381,"[[70, 1], [3, 40]]"
KNN - Original,0.947368,0.930233,0.930233,0.930233,"[[68, 3], [3, 40]]"
Decision Tree - Original,0.947368,0.930233,0.930233,0.930233,"[[68, 3], [3, 40]]"
Naive Bayes - Chi2,0.973684,1.0,0.930233,0.963855,"[[71, 0], [3, 40]]"
KNN - Chi2,0.964912,0.953488,0.953488,0.953488,"[[69, 2], [2, 41]]"
Decision Tree - Chi2,0.929825,0.948718,0.860465,0.902439,"[[69, 2], [6, 37]]"
Naive Bayes - RFE,0.973684,1.0,0.930233,0.963855,"[[71, 0], [3, 40]]"
KNN - RFE,0.973684,0.97619,0.953488,0.964706,"[[70, 1], [2, 41]]"
Decision Tree - RFE,0.938596,0.928571,0.906977,0.917647,"[[68, 3], [4, 39]]"
Naive Bayes - RF,0.973684,1.0,0.930233,0.963855,"[[71, 0], [3, 40]]"
