In [5]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
import warnings

# Suppress LightGBM warnings
warnings.filterwarnings("ignore", category=UserWarning, message="Found whitespace in feature_names")

# Load your dataset
url = r"C:\Users\nagar\OneDrive\Documents\D13 Minor Project\Datasets\Modified.zip"
data = pd.read_csv(url)

# Splitting features (X) and target (y)
X = data.drop('target', axis=1)
y = data['target']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define KL Divergence function
def kl_divergence(p, q):
    epsilon = 1e-10  # small epsilon to avoid division by zero
    return np.sum(np.where(p != 0, p * np.log((p + epsilon) / (q + epsilon)), 0))

# Calculate KL Divergence for each feature
kl_div_scores = {}
num_bins = max(len(X_train[col].unique()) for col in X_train.columns)

for col in X_train.columns:
    feature_values = X_train[col]
    
    # Compute probability distributions for the feature and the target variable
    p_feature, _ = np.histogram(feature_values, bins=num_bins, density=True)
    p_target, _ = np.histogram(y_train, bins=num_bins, density=True)
    
    # Compute KL Divergence
    kl_div_scores[col] = kl_divergence(p_feature, p_target)

# Select top features based on KL Divergence scores
top_features_kl_div = sorted(kl_div_scores, key=kl_div_scores.get, reverse=True)[:11]
print("Top features based on KL Divergence:", top_features_kl_div)

# Define Fischer Index function
def fisher_index(p, q):
    with np.errstate(divide='ignore', invalid='ignore'):  
        return np.sum(np.where((p + q) != 0, (p - q) ** 2 / (p + q), 0))

# Calculate Fischer Index for each feature
fisher_idx_scores = {}

for col in X_train.columns:
    feature_values = X_train[col]
    
    # Compute probability distributions for the feature and the target variable
    p_feature_0, _ = np.histogram(feature_values[y_train == 0], bins=num_bins, density=True)
    p_feature_1, _ = np.histogram(feature_values[y_train == 1], bins=num_bins, density=True)
    
    # Compute Fischer Index
    fisher_idx_scores[col] = fisher_index(p_feature_0, p_feature_1)

# Calculate average score for each feature from KL Divergence method
average_scores = {}
for feature in top_features_kl_div:
    kl_score = kl_div_scores.get(feature, 0)
    fisher_score = fisher_idx_scores.get(feature, 0)
    average_scores[feature] = (kl_score + fisher_score) / 2

# Select top features based on Fischer Index
top_features_fischer = sorted(fisher_idx_scores, key=fisher_idx_scores.get, reverse=True)[:11]
print("Top features based on Fischer Index:", top_features_fischer)


# Combine Fischer Index and KL Divergence scores
combined_scores = {}

for col in X_train.columns:
    kl_score = kl_div_scores.get(col, 0)
    fisher_score = fisher_idx_scores.get(col, 0)
    combined_scores[col] = (kl_score + fisher_score) / 2

# Select top features based on average scores from both methods
top_features_combined = sorted(combined_scores, key=combined_scores.get, reverse=True)[:11]
print("Top features based on Combined Scores:", top_features_combined)



# Use selected features for training and testing
X_train_selected = X_train[top_features_combined]
X_test_selected = X_test[top_features_combined]

# Initialize individual models
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(n_estimators=100, random_state=42)
lgbm_model = LGBMClassifier(n_estimators=100, random_state=42)

# Fit individual models
rf_model.fit(X_train_selected, y_train)
xgb_model.fit(X_train_selected, y_train)
mlp_model.fit(X_train_selected, y_train)
lgbm_model.fit(X_train_selected, y_train)

# Initialize Stacking Classifier with base models and a meta-model
stacking_classifier = StackingClassifier(
    estimators=[
        ('mlp', mlp_model),
        ('rf', rf_model),
        ('xgb', xgb_model),
        ('lgbm', lgbm_model)
    ],
    final_estimator=LogisticRegression(max_iter=1000),  # Increased max_iter
    cv=5
)

# Fit the Stacking Classifier (which internally fits the base models)
stacking_classifier.fit(X_train_selected, y_train)

# Predictions
y_pred = stacking_classifier.predict(X_test_selected)

# Calculate accuracy
stacking_accuracy = accuracy_score(y_test, y_pred)
print("Stacking Classifier Accuracy:", stacking_accuracy)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Precision, Recall, F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)


Top features based on KL Divergence: ['ST slope', 'chest pain type', 'country', 'oldpeak', 'resting ecg', 'age', 'fasting blood sugar', 'max heart rate', 'sex', 'resting bp s', 'exercise angina']
Top features based on Fischer Index: ['ST slope', 'exercise angina', 'oldpeak', 'chest pain type', 'sex', 'fasting blood sugar', 'country', 'age', 'resting ecg', 'resting bp s', 'max heart rate']
Top features based on Combined Scores: ['ST slope', 'chest pain type', 'country', 'oldpeak', 'resting ecg', 'exercise angina', 'age', 'sex', 'fasting blood sugar', 'max heart rate', 'resting bp s']
[LightGBM] [Info] Number of positive: 498, number of negative: 454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 257
[LightGBM] [Info] Number of data points in the train set: 952, number of used featur

Stacking Classifier Accuracy: 0.9495798319327731
Confusion Matrix:
[[101   6]
 [  6 125]]
Precision: 0.9541984732824428
Recall: 0.9541984732824428
F1 Score: 0.9541984732824428


In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
import warnings

# Suppress LightGBM warnings
warnings.filterwarnings("ignore", category=UserWarning, message="Found whitespace in feature_names")

# Load your dataset
url = r"C:\Users\nagar\Downloads\Samanvitha\Saman.zip"
data = pd.read_csv(url)

# Excluding the 'fasting blood pressure' feature
data = data.drop('country', axis=1)

# Splitting features (X) and target (y)
X = data.drop('target', axis=1)
y = data['target']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define KL Divergence function
def kl_divergence(p, q):
    epsilon = 1e-10  # small epsilon to avoid division by zero
    return np.sum(np.where(p != 0, p * np.log((p + epsilon) / (q + epsilon)), 0))

# Calculate KL Divergence for each feature
kl_div_scores = {}
num_bins = max(len(X_train[col].unique()) for col in X_train.columns)

for col in X_train.columns:
    feature_values = X_train[col]
    
    # Compute probability distributions for the feature and the target variable
    p_feature, _ = np.histogram(feature_values, bins=num_bins, density=True)
    p_target, _ = np.histogram(y_train, bins=num_bins, density=True)
    
    # Compute KL Divergence
    kl_div_scores[col] = kl_divergence(p_feature, p_target)

# Select top features based on KL Divergence scores
top_features_kl_div = sorted(kl_div_scores, key=kl_div_scores.get, reverse=True)[:11]
print("Top features based on KL Divergence:", top_features_kl_div)

# Define Fischer Index function
def fisher_index(p, q):
    with np.errstate(divide='ignore', invalid='ignore'):  
        return np.sum(np.where((p + q) != 0, (p - q) ** 2 / (p + q), 0))

# Calculate Fischer Index for each feature
fisher_idx_scores = {}

for col in X_train.columns:
    feature_values = X_train[col]
    
    # Compute probability distributions for the feature and the target variable
    p_feature_0, _ = np.histogram(feature_values[y_train == 0], bins=num_bins, density=True)
    p_feature_1, _ = np.histogram(feature_values[y_train == 1], bins=num_bins, density=True)
    
    # Compute Fischer Index
    fisher_idx_scores[col] = fisher_index(p_feature_0, p_feature_1)

# Select top features based on Fischer Index scores
top_features_fisher_idx = sorted(fisher_idx_scores, key=fisher_idx_scores.get, reverse=True)[:11]
print("Top features based on Fischer Index:", top_features_fisher_idx)

# Calculate average score for each feature from both KL Divergence and Fischer Index methods
average_scores = {}
for feature in set(top_features_kl_div + top_features_fisher_idx):
    kl_score = kl_div_scores.get(feature, 0)
    fisher_score = fisher_idx_scores.get(feature, 0)
    average_scores[feature] = (kl_score + fisher_score) / 2

# Select top features based on average scores, ensuring less than 10 features if possible
top_features_average = sorted(average_scores, key=average_scores.get, reverse=True)[:11]
print("Top features based on Average Scores:", top_features_average)

# Use selected features for training and testing
X_train_selected = X_train[top_features_average]
X_test_selected = X_test[top_features_average]

# Initialize individual models
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(n_estimators=100, random_state=42)
lgbm_model = LGBMClassifier(n_estimators=100, random_state=42)

# Fit individual models
rf_model.fit(X_train_selected, y_train)
xgb_model.fit(X_train_selected, y_train)
mlp_model.fit(X_train_selected, y_train)
lgbm_model.fit(X_train_selected, y_train)

# Initialize Stacking Classifier with base models and a meta-model
stacking_classifier = StackingClassifier(
    estimators=[
        ('mlp', mlp_model),
        ('rf', rf_model),
        ('xgb', xgb_model),
        ('lgbm', lgbm_model)
    ],
    final_estimator=LogisticRegression(max_iter=1000),  # Increased max_iter
    cv=5
)

# Fit the Stacking Classifier (which internally fits the base models)
stacking_classifier.fit(X_train_selected, y_train)

# Predictions
y_pred = stacking_classifier.predict(X_test_selected)

# Calculate accuracy
stacking_accuracy = accuracy_score(y_test, y_pred)
print("Stacking Classifier Accuracy:", stacking_accuracy)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Precision, Recall, F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)


Top features based on KL Divergence: ['ST slope', 'chest pain type', 'oldpeak', 'resting ecg', 'age', 'fasting blood sugar', 'max heart rate', 'sex', 'resting bp s', 'exercise angina', 'cholesterol']
Top features based on Fischer Index: ['ST slope', 'exercise angina', 'oldpeak', 'chest pain type', 'sex', 'fasting blood sugar', 'age', 'resting ecg', 'resting bp s', 'max heart rate', 'cholesterol']
Top features based on Average Scores: ['ST slope', 'chest pain type', 'oldpeak', 'resting ecg', 'exercise angina', 'age', 'sex', 'fasting blood sugar', 'max heart rate', 'resting bp s', 'cholesterol']
[LightGBM] [Info] Number of positive: 498, number of negative: 454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000239 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 952, number of 

Stacking Classifier Accuracy: 0.957983193277311
Confusion Matrix:
[[100   7]
 [  3 128]]
Precision: 0.9481481481481482
Recall: 0.9770992366412213
F1 Score: 0.9624060150375939


In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support

# Load your dataset
url = r"C:\Users\nagar\OneDrive\Documents\D13 Minor Project\Datasets\Heart Disease Dataset(Comprehensive).zip"
data = pd.read_csv(url)

# Splitting features (X) and target (y)
X = data.drop('target', axis=1)
y = data['target']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize individual models
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(n_estimators=100, random_state=42)
lgbm_model = LGBMClassifier(n_estimators=100, random_state=42)

# Fit individual models
rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
mlp_model.fit(X_train, y_train)
lgbm_model.fit(X_train, y_train)

# Initialize Stacking Classifier with base models and a meta-model
stacking_classifier = StackingClassifier(
    estimators=[
        ('mlp', mlp_model),
        ('rf', rf_model),
        ('xgb', xgb_model),
        ('lgbm', lgbm_model)
    ],
    final_estimator=LogisticRegression(max_iter=1000),  # Increased max_iter
    cv=5
)

# Fit the Stacking Classifier (which internally fits the base models)
stacking_classifier.fit(X_train, y_train)

# Predictions
y_pred = stacking_classifier.predict(X_test)

# Calculate accuracy
stacking_accuracy = accuracy_score(y_test, y_pred)
print("Stacking Classifier Accuracy:", stacking_accuracy)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Precision, Recall, F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)


[LightGBM] [Info] Number of positive: 498, number of negative: 454
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 952, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.523109 -> initscore=0.092503
[LightGBM] [Info] Start training from score 0.092503
[LightGBM] [Info] Number of positive: 498, number of negative: 454
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 952, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.523109 -> initscore=0.092503
[LightGBM] [Info] Start training from score 0.092503
[LightGBM] [Info] Number of po

Stacking Classifier Accuracy: 0.957983193277311
Confusion Matrix:
[[100   7]
 [  3 128]]
Precision: 0.9481481481481482
Recall: 0.9770992366412213
F1 Score: 0.9624060150375939
