In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
)
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from xgboost import XGBClassifier

In [None]:
# Seeds used for the repeated train/test splits (and for the seeded models).
# They are pinned as literals so every run uses the same 50 values; a fresh set
# could be drawn with: random.sample(range(1, 100), 50)
random_states = [
    73, 62, 81, 4, 65, 10, 85, 3, 54, 9,
    21, 13, 69, 52, 16, 96, 50, 8, 44, 92,
    30, 56, 89, 86, 39, 70, 90, 17, 75, 49,
    43, 45, 58, 22, 15, 80, 97, 66, 68, 55,
    34, 63, 42, 12, 6, 29, 57, 25, 36, 41,
]
print(random_states)

# Loading and Preprocessing The Data

In [None]:
# Load the raw table. It must contain a "Label" column; every other column is
# treated as a numeric feature.
data = pd.read_excel("PATH TO DATA")

# Encode the class labels as integers (Case -> 0, Control -> 1).
label_map = {'Case': 0, 'Control': 1}
data["Label"] = data["Label"].map(label_map)

# `Series.map` turns any value that is not a key of `label_map` into NaN.
# Stop here with a clear message instead of failing later inside scikit-learn.
if data["Label"].isna().any():
    raise ValueError(
        f"'Label' contains values outside {list(label_map)}; fix the data or extend label_map."
    )
data["Label"] = data["Label"].astype(int)

# Select features/target by column name rather than by position, so the split
# does not depend on "Label" being the first column of the spreadsheet.
feature_frame = data.drop(columns="Label")
feature_names = feature_frame.columns

x_values = feature_frame.values
y_values = data["Label"].values

scaler = StandardScaler()
x_values = scaler.fit_transform(x_values)

# The target is a discrete class label, so the classification estimator of
# mutual information is the appropriate one (the regression variant treats the
# target as a continuous quantity).
# NOTE(review): the ranking is computed on the full data set, i.e. before any
# train/test split, so the feature selection itself has seen the later test rows.
mi_scores = mutual_info_classif(x_values, y_values, random_state=42)

# Feature names ordered from highest to lowest MI score (ties fall back to the
# feature name, also in descending order).
sorted_features = [f for _, f in sorted(zip(mi_scores, feature_names), reverse=True)]

In [None]:
# Show the MI-ranked feature names, a divider line, and the number of ranked features.
print(sorted_features, "====================", len(sorted_features), sep="\n")

In [None]:
# MI scores in descending order; position i corresponds to sorted_features[i].
sorted_mi_scores = sorted(mi_scores, reverse=True)
# How many features have a strictly positive mutual-information score.
valid_score_count = sum(1 for score in mi_scores if score > 0)

# Bar chart of the ranked scores, one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(sorted_features, sorted_mi_scores)
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Features')
ax.set_ylabel('Mutual Information Scores')
ax.set_title('Mutual Information Scores for Features (Sorted)')
fig.tight_layout()
plt.show()

# Plotting Feature Importance, Pair Plot and Correlation Matrix for All Related Features

In [None]:
# "Related" features: the head of the MI ranking, as many entries as there are
# features with a positive MI score.
All_Related_Features = sorted_features[0:valid_score_count]

print("Related features:", All_Related_Features)

In [None]:
important_features = All_Related_Features
# Narrow the frame to the label followed by the retained features, in ranking order.
data = data.loc[:, ['Label', *important_features]]

In [None]:
# Pair plot of every retained feature, coloured by class label.
pair_grid = sns.pairplot(data, hue='Label', diag_kind='kde', markers=['o', 's'], palette='husl')
# `plt.title` would only title the last subplot that was drawn; a figure-level
# suptitle labels the whole grid. y > 1 keeps it clear of the top row.
pair_grid.figure.suptitle('Pair Plot of Data', y=1.02)
plt.show()

# Pairwise correlation of the label and the retained features.
corr_matrix = data.corr()

# Heatmap of the correlation matrix, annotated with two-decimal values.
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

# Stacked-Model F1 Score vs. Number of Top-Ranked Related Features

In [None]:
# Sweep over the number of top-ranked features and record the stacked model's
# average test F1 for each size.
#
# Evaluation protocol for every (feature count, seed) pair:
#   1. split into train/test;
#   2. fit the scaler on the training part only and apply it to both parts;
#   3. build the meta-model's training inputs from out-of-fold base-model
#      predictions on the training part;
#   4. refit each base model on the whole training part and predict the test part;
#   5. let the meta-model combine those test predictions and score them.
# The meta-model therefore never sees a test label. Fitting it on `y_test` and
# scoring on the same rows reports an optimistically biased F1.
# Cost: each base model is fitted N_INNER_FOLDS + 1 times per split.
N_INNER_FOLDS = 5

y_values = data.iloc[:, 0].values  # "Label" is the first column of `data`

f1_scores = []

for num_features in range(1, len(All_Related_Features) + 1):
    selected_features = All_Related_Features[:num_features]

    # Raw feature values; scaling happens per split below.
    x_subset = data[selected_features].values

    split_f1_scores = []

    for random_state in random_states:
        base_models = [
            RandomForestClassifier(n_estimators=100, random_state=random_state),
            GradientBoostingClassifier(n_estimators=100, random_state=random_state),
            AdaBoostClassifier(n_estimators=100, random_state=random_state),
            DecisionTreeClassifier(random_state=random_state),
            LogisticRegression(max_iter=10000),
            SVC(probability=True),
            GaussianNB(),
            KNeighborsClassifier(),
            CatBoostClassifier(iterations=100, random_seed=random_state, verbose=False),
            XGBClassifier(random_state=random_state),
            LGBMClassifier(random_state=random_state),
        ]

        meta_model = LogisticRegression(max_iter=10000)

        x_train, x_test, y_train, y_test = train_test_split(
            x_subset, y_values, test_size=0.3, random_state=random_state
        )

        # Scaling statistics come from the training rows only.
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        inner_cv = StratifiedKFold(n_splits=N_INNER_FOLDS, shuffle=True, random_state=random_state)

        train_meta_features = []
        test_meta_features = []

        for base_model in base_models:
            # Out-of-fold predictions: each training row is predicted by a clone
            # of the model that did not see that row.
            train_meta_features.append(
                np.ravel(cross_val_predict(base_model, x_train, y_train, cv=inner_cv))
            )
            base_model.fit(x_train, y_train)
            test_meta_features.append(np.ravel(base_model.predict(x_test)))

        meta_model.fit(np.column_stack(train_meta_features), y_train)

        y_pred_stacked = meta_model.predict(np.column_stack(test_meta_features))

        split_f1_scores.append(f1_score(y_test, y_pred_stacked))

    f1_scores.append(np.mean(split_f1_scores))

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(All_Related_Features) + 1), f1_scores, marker='o')
plt.xlabel('Number of Features')
plt.ylabel('Average F1 Score')
plt.title('F1 Score vs. Number of Features')
plt.xticks(range(1, len(All_Related_Features) + 1))
plt.grid(True)
plt.show()

# Model Training and Testing using All related variables

In [None]:
# Stacked ensemble on all related features, averaged over the repeated splits.
#
# Per split: the scaler is fitted on the training rows only, the meta-model is
# trained on out-of-fold base-model predictions of the training rows, and the
# test rows are used exclusively for scoring. Fitting the meta-model on `y_test`
# and scoring on the same rows would leak the test labels into the model.
# Cost: each base model is fitted N_INNER_FOLDS + 1 times per split.
# NOTE(review): the notebook encodes Case -> 0 / Control -> 1, so precision,
# recall ("Sn") and F1 below treat label 1 as the positive class — confirm that
# this is the intended positive class.
N_INNER_FOLDS = 5

x_values = data.iloc[:, 1:].values  # raw features; scaled per split below
y_values = data.iloc[:, 0].values

top_results = {}

for random_state in random_states:
    base_models = [
        RandomForestClassifier(n_estimators=100, random_state=random_state),
        GradientBoostingClassifier(n_estimators=100, random_state=random_state),
        AdaBoostClassifier(n_estimators=100, random_state=random_state),
        DecisionTreeClassifier(random_state=random_state),
        # max_iter matches the setting used by the other cells of this notebook.
        LogisticRegression(max_iter=10000),
        SVC(probability=True),
        GaussianNB(),
        KNeighborsClassifier(),
        CatBoostClassifier(iterations=100, random_seed=random_state, verbose=False),
        XGBClassifier(random_state=random_state),
        LGBMClassifier(random_state=random_state),
    ]

    meta_model = LogisticRegression(max_iter=10000)

    x_train, x_test, y_train, y_test = train_test_split(
        x_values, y_values, test_size=0.3, random_state=random_state
    )

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    inner_cv = StratifiedKFold(n_splits=N_INNER_FOLDS, shuffle=True, random_state=random_state)

    train_meta_features = []
    test_meta_features = []

    for base_model in base_models:
        # Out-of-fold predictions for the training rows (made by clones that did
        # not see the predicted row), then a full-train fit for the test rows.
        train_meta_features.append(
            np.ravel(cross_val_predict(base_model, x_train, y_train, cv=inner_cv))
        )
        base_model.fit(x_train, y_train)
        test_meta_features.append(np.ravel(base_model.predict(x_test)))

    meta_model.fit(np.column_stack(train_meta_features), y_train)

    stacked_predictions = np.column_stack(test_meta_features)
    y_pred_stacked = meta_model.predict(stacked_predictions)

    accuracy = accuracy_score(y_test, y_pred_stacked)

    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent from
    # both y_test and the predictions, so the 4-way unpacking cannot fail.
    confusion_mat = confusion_matrix(y_test, y_pred_stacked, labels=[0, 1])
    tn, fp, fn, tp = confusion_mat.ravel()

    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0.0
    recall = recall_score(y_test, y_pred_stacked)
    precision = precision_score(y_test, y_pred_stacked)
    f1 = f1_score(y_test, y_pred_stacked)

    top_results[random_state] = {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'specificity': specificity}

# Average every metric over the splits.
average_accuracy = np.mean([result['accuracy'] for result in top_results.values()])
average_precision = np.mean([result['precision'] for result in top_results.values()])
average_recall = np.mean([result['recall'] for result in top_results.values()])
average_f1 = np.mean([result['f1'] for result in top_results.values()])
average_specificity = np.mean([result['specificity'] for result in top_results.values()])

print("Average Accuracy:", average_accuracy)
print("Average Precision:", average_precision)
print("Average Recall (Sn):", average_recall)
print("Average Specificity (Sp):", average_specificity)
print("Average F1-score:", average_f1)

# Each model using all related variables

In [None]:
# Evaluate every base model on its own, using all related features and the same
# repeated train/test splits as the stacked model.
x_values = data.iloc[:, 1:].values  # raw features; scaled per split below
y_values = data.iloc[:, 0].values


def calculate_metrics(y_true, y_pred):
    """Return (accuracy, precision, recall, f1, specificity) for binary labels 0/1.

    Precision, recall and F1 use scikit-learn's default positive class (label 1).
    Specificity is tn / (tn + fp), or 0.0 when there are no actual negatives.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs, so the
    # 4-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0.0

    return accuracy, precision, recall, f1, specificity


# Order of the values returned by calculate_metrics.
METRIC_NAMES = ('accuracy', 'precision', 'recall', 'f1', 'specificity')

# Seeded models use the first seed for every split; only the split itself varies.
base_models = [
    RandomForestClassifier(n_estimators=100, random_state=random_states[0]),
    GradientBoostingClassifier(n_estimators=100, random_state=random_states[0]),
    AdaBoostClassifier(n_estimators=100, random_state=random_states[0]),
    DecisionTreeClassifier(random_state=random_states[0]),
    LogisticRegression(max_iter=10000),
    SVC(probability=True),
    GaussianNB(),
    KNeighborsClassifier(),
    CatBoostClassifier(iterations=100, random_seed=random_states[0], verbose=False),
    XGBClassifier(random_state=random_states[0]),
    LGBMClassifier(random_state=random_states[0])
]

for base_model in base_models:
    print(f"Results for {base_model.__class__.__name__}:\n")

    per_split_metrics = []

    for random_state in random_states:
        x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.3, random_state=random_state)

        # Scaling statistics come from the training rows only.
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        base_model.fit(x_train, y_train)
        y_pred_base = np.ravel(base_model.predict(x_test))

        # Score the model's own test predictions. Passing them through a
        # logistic regression fitted on y_test would let the test labels
        # correct the predictions before they are scored.
        per_split_metrics.append(calculate_metrics(y_test, y_pred_base))

    # Column-wise mean over the splits, keyed by metric name.
    average_metrics = dict(zip(METRIC_NAMES, np.mean(per_split_metrics, axis=0)))

    # Print results
    print(f"Average Accuracy: {average_metrics['accuracy']:.4f}")
    print(f"Average Precision: {average_metrics['precision']:.4f}")
    print(f"Average Recall: {average_metrics['recall']:.4f}")
    print(f"Average F1 Score: {average_metrics['f1']:.4f}")
    print(f"Average Specificity: {average_metrics['specificity']:.4f}\n")

    print("="*40 + "\n")


# Model Results using Top 26 selected variables

In [None]:
# Number of top-ranked features kept for the reduced model.
TOP_N_FEATURES = 26

# Slice the related-feature list rather than the full MI ranking: `data` only
# holds the related features, so a name taken from beyond that list would raise
# a KeyError when the reduced frame is built. When at least TOP_N_FEATURES
# features are related, both slices are identical.
Top_Selected = All_Related_Features[:TOP_N_FEATURES]

print("Related features:", Top_Selected)

In [None]:
# Reduced frame: the label followed by the selected top features, in ranking order.
data2 = data.loc[:, ['Label', *Top_Selected]]

In [None]:
# Stacked ensemble on the selected top features, averaged over the repeated splits.
#
# Per split: the scaler is fitted on the training rows only, the meta-model is
# trained on out-of-fold base-model predictions of the training rows, and the
# test rows are used exclusively for scoring (including the AUC). Fitting the
# meta-model on `y_test` and scoring on the same rows would leak the test labels.
# Cost: each base model is fitted N_INNER_FOLDS + 1 times per split.
# NOTE(review): the notebook encodes Case -> 0 / Control -> 1, so precision,
# recall ("Sn"), F1 and AUC below treat label 1 as the positive class — confirm
# that this is the intended positive class.
N_INNER_FOLDS = 5

x_values = data2.iloc[:, 1:].values  # raw features; scaled per split below
y_values = data2.iloc[:, 0].values

top_results = {}

for random_state in random_states:
    base_models = [
        RandomForestClassifier(n_estimators=100, random_state=random_state),
        GradientBoostingClassifier(n_estimators=100, random_state=random_state),
        AdaBoostClassifier(n_estimators=100, random_state=random_state),
        DecisionTreeClassifier(random_state=random_state),
        # max_iter matches the setting used by the other cells of this notebook.
        LogisticRegression(max_iter=10000),
        SVC(probability=True),
        GaussianNB(),
        KNeighborsClassifier(),
        CatBoostClassifier(iterations=100, random_seed=random_state, verbose=False),
        XGBClassifier(random_state=random_state),
        LGBMClassifier(random_state=random_state),
    ]

    meta_model = LogisticRegression(max_iter=10000)

    x_train, x_test, y_train, y_test = train_test_split(
        x_values, y_values, test_size=0.3, random_state=random_state
    )

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    inner_cv = StratifiedKFold(n_splits=N_INNER_FOLDS, shuffle=True, random_state=random_state)

    train_meta_features = []
    test_meta_features = []

    for base_model in base_models:
        # Out-of-fold predictions for the training rows (made by clones that did
        # not see the predicted row), then a full-train fit for the test rows.
        train_meta_features.append(
            np.ravel(cross_val_predict(base_model, x_train, y_train, cv=inner_cv))
        )
        base_model.fit(x_train, y_train)
        test_meta_features.append(np.ravel(base_model.predict(x_test)))

    meta_model.fit(np.column_stack(train_meta_features), y_train)

    stacked_predictions = np.column_stack(test_meta_features)
    y_pred_stacked = meta_model.predict(stacked_predictions)

    accuracy = accuracy_score(y_test, y_pred_stacked)

    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent from
    # both y_test and the predictions, so the 4-way unpacking cannot fail.
    confusion_mat = confusion_matrix(y_test, y_pred_stacked, labels=[0, 1])
    tn, fp, fn, tp = confusion_mat.ravel()

    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0.0
    recall = recall_score(y_test, y_pred_stacked)
    precision = precision_score(y_test, y_pred_stacked)
    f1 = f1_score(y_test, y_pred_stacked)

    # AUC calculation, from the meta-model's probability for label 1.
    stacked_probabilities = meta_model.predict_proba(stacked_predictions)[:, 1]
    # The ROC points are not used by this cell; they stay available as
    # `fpr` / `tpr` (values of the last split) in case a later cell plots them.
    fpr, tpr, _ = roc_curve(y_test, stacked_probabilities)
    auc = roc_auc_score(y_test, stacked_probabilities)

    top_results[random_state] = {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'specificity': specificity, 'auc': auc}

# Average every metric over the splits.
average_accuracy = np.mean([result['accuracy'] for result in top_results.values()])
average_precision = np.mean([result['precision'] for result in top_results.values()])
average_recall = np.mean([result['recall'] for result in top_results.values()])
average_f1 = np.mean([result['f1'] for result in top_results.values()])
average_specificity = np.mean([result['specificity'] for result in top_results.values()])
average_auc = np.mean([result['auc'] for result in top_results.values()])


print("Average Accuracy:", average_accuracy)
print("Average Precision:", average_precision)
print("Average Recall (Sn):", average_recall)
print("Average Specificity (Sp):", average_specificity)
print("Average F1-score:", average_f1)
print("Average AUC:", average_auc)


# Plotting Results

In [None]:
# Pairwise correlation of the label and the selected top features.
corr_matrix = data2.corr()

# Heatmap of the correlation matrix without cell annotations. No `fmt` is
# passed because it only formats annotations, which are switched off here.
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')
plt.title('Correlation Heatmap for the Top 26 Variables')
plt.show()

# Training and Testing Each Model Separately Using the Top 26 Variables

In [None]:
# Evaluate every base model on its own, using the selected top features and the
# same repeated train/test splits as the stacked model.
x_values = data2.iloc[:, 1:].values  # raw features; scaled per split below
y_values = data2.iloc[:, 0].values


# Note: a function with the same name is also defined in an earlier cell; this
# definition shadows it from here on.
def calculate_metrics(y_true, y_pred):
    """Return (accuracy, precision, recall, f1, specificity) for binary labels 0/1.

    Precision, recall and F1 use scikit-learn's default positive class (label 1).
    Specificity is tn / (tn + fp), or 0.0 when there are no actual negatives.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs, so the
    # 4-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0.0

    return accuracy, precision, recall, f1, specificity


# Order of the values returned by calculate_metrics.
METRIC_NAMES = ('accuracy', 'precision', 'recall', 'f1', 'specificity')

# Seeded models use the first seed for every split; only the split itself varies.
base_models = [
    RandomForestClassifier(n_estimators=100, random_state=random_states[0]),
    GradientBoostingClassifier(n_estimators=100, random_state=random_states[0]),
    AdaBoostClassifier(n_estimators=100, random_state=random_states[0]),
    DecisionTreeClassifier(random_state=random_states[0]),
    LogisticRegression(max_iter=10000),
    SVC(probability=True),
    GaussianNB(),
    KNeighborsClassifier(),
    CatBoostClassifier(iterations=100, random_seed=random_states[0], verbose=False),
    XGBClassifier(random_state=random_states[0]),
    LGBMClassifier(random_state=random_states[0])
]

for base_model in base_models:
    print(f"Results for {base_model.__class__.__name__}:\n")

    per_split_metrics = []

    for random_state in random_states:
        x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.3, random_state=random_state)

        # Scaling statistics come from the training rows only.
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        base_model.fit(x_train, y_train)
        y_pred_base = np.ravel(base_model.predict(x_test))

        # Score the model's own test predictions. Passing them through a
        # logistic regression fitted on y_test would let the test labels
        # correct the predictions before they are scored.
        per_split_metrics.append(calculate_metrics(y_test, y_pred_base))

    # Column-wise mean over the splits, keyed by metric name.
    average_metrics = dict(zip(METRIC_NAMES, np.mean(per_split_metrics, axis=0)))

    # Print results
    print(f"Average Accuracy: {average_metrics['accuracy']:.4f}")
    print(f"Average Precision: {average_metrics['precision']:.4f}")
    print(f"Average Recall: {average_metrics['recall']:.4f}")
    print(f"Average F1 Score: {average_metrics['f1']:.4f}")
    print(f"Average Specificity: {average_metrics['specificity']:.4f}\n")

    print("="*40 + "\n")
