In [None]:
#Importing all the necessary libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, cohen_kappa_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif

In [None]:
#Loading the dataset
df = pd.read_csv("trial_dataframe.csv")

# X.head() shows an "Unnamed: 0" column holding 0, 1, 2, ... -- presumably the
# row index written out when the CSV was saved (TODO confirm). Left in place it
# is passed to every model as a feature, so drop any such unnamed column before
# building X. This is a no-op when the CSV has no unnamed column.
index_like_cols = [col for col in df.columns if str(col).startswith("Unnamed:")]
df = df.drop(columns=index_like_cols)

#Encoding the target variable as integer class labels
le = LabelEncoder()
df["Sample_Characteristics"] = le.fit_transform(df["Sample_Characteristics"])

#Defining features (every remaining column) and target
X = df.drop(columns=["Sample_Characteristics"])
y = df["Sample_Characteristics"]

# Splitting dataset into training (70%) and testing (30%) sets; stratify=y keeps
# the class proportions the same in both splits, random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
#Previewing the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,FIRRM,FGR,CFH,FUCA2,GCLC,NFYA,...,LOC105370174,C8orf44-SGK3,SNORA74C-2,ELOA3BP,NPBWR1,ELOA3DP,LNCDAT,LOC124902537,RNF228,PANO1
0,12.127672,3.321928,10.802516,10.41257,8.761551,9.859535,11.256209,12.131857,11.154185,11.294621,...,4.459432,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.459432,4.459432
1,11.79604,4.954196,11.309476,10.865733,9.799282,9.475733,12.087794,11.087463,11.272047,11.890644,...,3.584963,0.0,1.0,0.0,1.584963,0.0,0.0,0.0,3.169925,4.857981
2,9.5157,6.83289,12.538189,11.606868,10.055282,8.848623,12.170551,12.151017,11.145932,12.099677,...,5.285402,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.066089,5.0
3,11.951649,5.672425,11.074141,10.405141,8.891784,7.894818,11.253257,11.149112,11.365229,10.903129,...,4.0,1.0,0.0,0.0,1.584963,0.0,0.0,0.0,2.584963,5.0
4,11.905011,0.0,11.626622,10.463524,9.317413,8.873444,12.127027,11.338179,10.645658,10.631177,...,3.0,0.0,0.0,0.0,3.807355,0.0,3.169925,0.0,1.0,3.807355


In [None]:
#No feature selection + RF
#Baseline: a Random Forest trained on every feature of the training split
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

#Column 1 of predict_proba is the probability of the class encoded as 1 (used for AUC);
#predict gives the hard labels used by the other metrics
y_pred_proba = rf.predict_proba(X_test)[:, 1]
y_pred = rf.predict(X_test)

#Scoring on the held-out test split
accuracy, precision, kappa = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, cohen_kappa_score)
)
auc = roc_auc_score(y_test, y_pred_proba)

#Reporting each metric to four decimal places
for label, value in (("Accuracy", accuracy), ("Precision", precision), ("AUC", auc), ("Kappa", kappa)):
    print(f"{label}: {value:.4f}")


Accuracy: 0.9755
Precision: 0.9032
AUC: 0.9950
Kappa: 0.8482


In [None]:
# No feature selection + XGB
# Baseline: an XGBoost classifier trained on every feature of the training split
xgb_clf = xgb.XGBClassifier(eval_metric="logloss", random_state=42, n_estimators=100)
xgb_clf.fit(X_train, y_train)

# Probability of the class encoded as 1 (for AUC), then the hard labels
y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]
y_pred = xgb_clf.predict(X_test)

# Scoring on the held-out test split
kappa = cohen_kappa_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# One metric per line, four decimal places
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"AUC: {auc:.4f}",
    f"Kappa: {kappa:.4f}",
    sep="\n",
)


Accuracy: 0.9891
Precision: 0.9688
AUC: 0.9954
Kappa: 0.9334


In [None]:
#No feature selection + LR
#Baseline: Logistic Regression on every feature (features are not rescaled in this cell)
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train, y_train)

#Hard labels plus the probability of the class encoded as 1
y_pred = lr.predict(X_test)
class_probabilities = lr.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

#Label-based metrics use y_pred; AUC uses the probability scores
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

#Printing evaluation metrics, one per line
report_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"AUC: {auc:.4f}",
    f"Kappa: {kappa:.4f}",
]
print("\n".join(report_lines))

Accuracy: 0.9864
Precision: 0.9143
AUC: 0.9975
Kappa: 0.9200


In [None]:
#No feature selection + SVM
# Baseline: RBF-kernel SVM on every feature (features are not rescaled in this cell).
# probability=True is required so predict_proba is available for the AUC.
svm_clf = SVC(random_state=42, probability=True, kernel='rbf')
svm_clf.fit(X_train, y_train)

# Probability of the class encoded as 1, then the hard labels
y_pred_proba = svm_clf.predict_proba(X_test)[:, 1]
y_pred = svm_clf.predict(X_test)

# Scoring on the held-out test split
auc = roc_auc_score(y_test, y_pred_proba)
accuracy, precision, kappa = (
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    cohen_kappa_score(y_test, y_pred),
)

# Printing evaluation metrics in a fixed order, four decimal places each
for metric_name, metric_value in zip(
    ("Accuracy", "Precision", "AUC", "Kappa"),
    (accuracy, precision, auc, kappa),
):
    print(f"{metric_name}: {metric_value:.4f}")

Accuracy: 0.9864
Precision: 0.9394
AUC: 0.9974
Kappa: 0.9179


In [None]:
#RF + SVM
# Rank features with a Random Forest fitted on the training split
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Importance score per column, and the top-N sizes to evaluate
feature_counts = [10, 50, 100, 500, 1000]
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)

# For each N: keep the N most important features, standardise, fit an SVM, report
for num_features in feature_counts:
    ranked = feature_importances.nlargest(num_features)
    top_features = ranked.index.tolist()

    X_train_selected, X_test_selected = X_train[top_features], X_test[top_features]

    # Scaling statistics come from the training split only and are reused on the test split
    scaler = StandardScaler().fit(X_train_selected)
    X_train_scaled = scaler.transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    svm_clf = SVC(random_state=42, probability=True, kernel='rbf')
    svm_clf.fit(X_train_scaled, y_train)

    # Probability of the class encoded as 1 (for AUC) and the hard labels
    y_pred_proba = svm_clf.predict_proba(X_test_scaled)[:, 1]
    y_pred = svm_clf.predict(X_test_scaled)

    kappa = cohen_kappa_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    precision = precision_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\nResults for Top {num_features} Features:")
    print(
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"AUC: {auc:.4f}",
        f"Kappa: {kappa:.4f}",
        sep="\n",
    )


Results for Top 10 Features:
Accuracy: 0.8925
Precision: 0.8462
AUC: 0.9644
Kappa: 0.7852

Results for Top 50 Features:
Accuracy: 0.9140
Precision: 0.8958
AUC: 0.9681
Kappa: 0.8280

Results for Top 100 Features:
Accuracy: 0.9247
Precision: 0.8980
AUC: 0.9685
Kappa: 0.8495

Results for Top 500 Features:
Accuracy: 0.9247
Precision: 0.8980
AUC: 0.9648
Kappa: 0.8495

Results for Top 1000 Features:
Accuracy: 0.9140
Precision: 0.8800
AUC: 0.9681
Kappa: 0.8281


In [None]:
#RF + LR
# Feature selection using a Random Forest fitted on the training split
rf = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=2, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Importance score per training column, and the top-N sizes to evaluate
feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
feature_counts = [10, 50, 100, 500, 1000]

# Metric rows are collected in a plain list and turned into a DataFrame once,
# after the loop. Concatenating onto an empty DataFrame on every iteration
# raised a pandas warning at the pd.concat call and re-copied the frame each time.
result_columns = ["Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

# Looping through different feature counts
for num_features in feature_counts:
    # Names of the num_features highest-importance columns
    top_features = feature_importances.nlargest(num_features).index.tolist()

    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Scaling statistics come from the training split only and are reused on the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    lr_clf = LogisticRegression(solver="liblinear", C=1.0, max_iter=500, random_state=42)
    lr_clf.fit(X_train_scaled, y_train)

    # Hard labels, plus the probability of the class encoded as 1 for the AUC
    y_pred = lr_clf.predict(X_test_scaled)
    y_pred_proba = lr_clf.predict_proba(X_test_scaled)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append([num_features, auc, accuracy, precision, kappa])

results_df = pd.DataFrame(result_rows, columns=result_columns)

# Displaying final results
print(results_df)

  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],


  Features       AUC  Accuracy  Precision     Kappa
0       10  0.962535  0.892473   0.860000  0.785120
1       50  0.962997  0.946237   0.936170  0.892486
2      100  0.961147  0.935484   0.916667  0.871012
3      500  0.981036  0.935484   0.916667  0.871012
4     1000  0.976873  0.924731   0.882353  0.849619


In [None]:
#RF + RF
# Feature selection using a Random Forest fitted on the training split
rf_selector = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=2, random_state=42, n_jobs=-1)
rf_selector.fit(X_train, y_train)

# Importance score per training column, and the top-N sizes to evaluate
feature_importances = pd.Series(rf_selector.feature_importances_, index=X_train.columns)
feature_counts = [10, 50, 100, 500, 1000]

# Metric rows are collected in a plain list and turned into a DataFrame once,
# after the loop. Concatenating onto an empty DataFrame on every iteration
# raised a pandas warning at the pd.concat call and re-copied the frame each time.
result_columns = ["Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

# Looping through different feature counts
for num_features in feature_counts:
    # Names of the num_features highest-importance columns
    top_features = feature_importances.nlargest(num_features).index.tolist()

    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Second Random Forest, trained only on the selected columns (no scaling applied)
    rf_clf = RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_split=2,
        random_state=42,
        n_jobs=-1
    )
    rf_clf.fit(X_train_selected, y_train)

    # Hard labels, plus the probability of the class encoded as 1 for the AUC
    y_pred = rf_clf.predict(X_test_selected)
    y_pred_proba = rf_clf.predict_proba(X_test_selected)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append([num_features, auc, accuracy, precision, kappa])

results_df = pd.DataFrame(result_rows, columns=result_columns)

# Displaying final results
print(results_df)

  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],


  Features       AUC  Accuracy  Precision     Kappa
0       10  0.980342  0.946237   0.936170  0.892486
1       50  0.971323  0.935484   0.934783  0.870953
2      100  0.971554  0.935484   0.934783  0.870953
3      500  0.981499  0.956989   0.956522  0.913969
4     1000  0.978030  0.946237   0.936170  0.892486


In [None]:
#RF + XGB
# Feature selection using a Random Forest fitted on the training split
rf_selector = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=42, n_jobs=-1)
rf_selector.fit(X_train, y_train)

# Importance score per training column, and the top-N sizes to evaluate
feature_importances = pd.Series(rf_selector.feature_importances_, index=X_train.columns)
feature_counts = [10, 50, 100, 500, 1000]

# Metric rows are collected in a plain list and turned into a DataFrame once,
# after the loop. Concatenating onto an empty DataFrame on every iteration
# raised a pandas warning at the pd.concat call and re-copied the frame each time.
result_columns = ["Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

# Looping through different feature counts
for num_features in feature_counts:
    # Names of the num_features highest-importance columns
    top_features = feature_importances.nlargest(num_features).index.tolist()

    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Scaling statistics come from the training split only and are reused on the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # use_label_encoder is no longer passed: the installed XGBoost reported it as
    # an unused parameter on every fit, and the baseline XGB cell already omits it.
    xgb_clf = XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss"
    )
    xgb_clf.fit(X_train_scaled, y_train)

    # Hard labels, plus the probability of the class encoded as 1 for the AUC
    y_pred = xgb_clf.predict(X_test_scaled)
    y_pred_proba = xgb_clf.predict_proba(X_test_scaled)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append([num_features, auc, accuracy, precision, kappa])

results_df = pd.DataFrame(result_rows, columns=result_columns)

# Displaying final results
print(results_df)


Parameters: { "use_label_encoder" } are not used.

  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



  Features       AUC  Accuracy  Precision     Kappa
0       10  0.974534  0.920863   0.914286  0.841735
1       50  0.977847  0.920863   0.914286  0.841735
2      100  0.980538  0.920863   0.902778  0.841768
3      500  0.991097  0.964029   0.970588  0.928046
4     1000  0.989855  0.942446   0.942029  0.884886


In [None]:
#XGB + SVM
# Training XGBoost on the training split to get feature importances.
# use_label_encoder is no longer passed: the installed XGBoost reported it as
# an unused parameter, and the baseline XGB cell already omits it.
xgb_selector = XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6,
                             subsample=0.8, colsample_bytree=0.8, random_state=42,
                             eval_metric="logloss")

xgb_selector.fit(X_train, y_train)

# Importance score per training column, and the top-N sizes to evaluate
feature_importances = pd.Series(xgb_selector.feature_importances_, index=X_train.columns)
feature_counts = [10, 50, 100, 500, 1000]

# Metric rows are collected in a plain list and turned into a DataFrame once,
# after the loop. Concatenating onto an empty DataFrame on every iteration
# raised a pandas warning at the pd.concat call and re-copied the frame each time.
result_columns = ["Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

# Looping through different feature counts
for num_features in feature_counts:
    # Names of the num_features highest-importance columns
    top_features = feature_importances.nlargest(num_features).index.tolist()

    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Scaling statistics come from the training split only and are reused on the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # probability=True is required so predict_proba is available for the AUC
    svm_clf = SVC(kernel="rbf", C=1.0, probability=True, random_state=42)
    svm_clf.fit(X_train_scaled, y_train)

    # Hard labels, plus the probability of the class encoded as 1 for the AUC
    y_pred = svm_clf.predict(X_test_scaled)
    y_pred_proba = svm_clf.predict_proba(X_test_scaled)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append([num_features, auc, accuracy, precision, kappa])

results_df = pd.DataFrame(result_rows, columns=result_columns)

# Displaying final results
print(results_df)

Parameters: { "use_label_encoder" } are not used.

  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],


  Features       AUC  Accuracy  Precision     Kappa
0       10  0.994372  0.994083     0.9375  0.964488
1       50  0.994372  0.994083     0.9375  0.964488
2      100  0.996104  0.994083     0.9375  0.964488
3      500  0.996104  0.994083     0.9375  0.964488
4     1000  0.995671  0.994083     0.9375  0.964488


In [None]:
#XGB + LR
# Training XGBoost on the training split to get feature importances.
# use_label_encoder is no longer passed: the installed XGBoost reported it as
# an unused parameter, and the baseline XGB cell already omits it.
xgb_selector = XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6,
                             subsample=0.8, colsample_bytree=0.8, random_state=42,
                             eval_metric="logloss")

xgb_selector.fit(X_train, y_train)

# Importance score per training column
feature_importances = pd.Series(xgb_selector.feature_importances_, index=X_train.columns)

# List of feature selection counts
feature_counts = [10, 50, 100, 500, 1000]

# Metric rows are collected in a plain list and turned into a DataFrame once,
# after the loop. Concatenating onto an empty DataFrame on every iteration
# raised a pandas warning at the pd.concat call and re-copied the frame each time.
result_columns = ["Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

# Loop through different feature counts
for num_features in feature_counts:
    # Select top N features
    top_features = feature_importances.nlargest(num_features).index.tolist()

    # Filter dataset with selected features
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Standardize the features; statistics come from the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Guard: skip this feature count when y_train holds fewer than two classes
    if len(np.unique(y_train)) < 2:
        print(f"Warning: y_train contains only one class for num_features = {num_features}. Skipping Logistic Regression.")
        continue  # Skip to the next iteration

    # Train Logistic Regression classifier
    lr_clf = LogisticRegression(solver="liblinear", C=1.0, random_state=42)
    lr_clf.fit(X_train_scaled, y_train)

    # Hard labels, plus the probability of the class encoded as 1 for the AUC
    y_pred = lr_clf.predict(X_test_scaled)
    y_pred_proba = lr_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance on the held-out test split
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append([num_features, auc, accuracy, precision, kappa])

# Builds an empty frame with the right columns if every iteration was skipped
results_df = pd.DataFrame(result_rows, columns=result_columns)

# Displaying final results
print(results_df)

Parameters: { "use_label_encoder" } are not used.

  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],


  Features       AUC  Accuracy  Precision     Kappa
0       10  0.998151  0.991848   0.942857  0.952025
1       50  0.998151  0.989130   0.941176  0.935188
2      100  0.998503  0.991848   0.942857  0.952025
3      500  0.998503  0.991848   0.918919  0.953244
4     1000  0.998503  0.991848   0.918919  0.953244


In [None]:
#XGB + RF
# Train XGBoost on the training split to get feature importances.
# use_label_encoder is no longer passed: the installed XGBoost reported it as
# an unused parameter, and the baseline XGB cell already omits it.
xgb_selector = XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6,
                             subsample=0.8, colsample_bytree=0.8, random_state=42,
                             eval_metric="logloss")

xgb_selector.fit(X_train, y_train)

# Importance score per training column
feature_importances = pd.Series(xgb_selector.feature_importances_, index=X_train.columns)

# List of feature selection counts
feature_counts = [10, 50, 100, 500, 1000]

# Metric rows are collected in a plain list and turned into a DataFrame once,
# after the loop. Concatenating onto an empty DataFrame on every iteration
# raised a pandas warning at the pd.concat call and re-copied the frame each time.
result_columns = ["Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

# Loop through different feature counts
for num_features in feature_counts:
    # Select top N features
    top_features = feature_importances.nlargest(num_features).index.tolist()

    # Filter dataset with selected features
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Train Random Forest classifier on the selected columns (no scaling applied)
    rf_clf = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=2,
                                    random_state=42, n_jobs=-1)
    rf_clf.fit(X_train_selected, y_train)

    # Hard labels, plus the probability of the class encoded as 1 for the AUC
    y_pred = rf_clf.predict(X_test_selected)
    y_pred_proba = rf_clf.predict_proba(X_test_selected)[:, 1]

    # Evaluate model performance on the held-out test split
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append([num_features, auc, accuracy, precision, kappa])

results_df = pd.DataFrame(result_rows, columns=result_columns)

# Display final results
print(results_df)

Parameters: { "use_label_encoder" } are not used.

  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],


  Features       AUC  Accuracy  Precision     Kappa
0       10  0.975673  0.928058   0.953846  0.856048
1       50  0.988095  0.949640   0.955882  0.899265
2      100  0.988820  0.964029   0.984848  0.928031
3      500  0.987578  0.949640   0.955882  0.899265
4     1000  0.987371  0.949640   0.955882  0.899265


In [None]:
#Variance threshold + XGB
# Define variance thresholds to experiment with
variance_thresholds = [0.001, 0.005, 0.01, 0.05, 0.1]

# Metric rows are collected in a plain list and turned into a DataFrame once,
# after the loop. Concatenating onto an empty DataFrame on every iteration
# raised a pandas warning at the pd.concat call and re-copied the frame each time.
result_columns = ["Variance Threshold", "Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

# Loop through different variance thresholds
for threshold in variance_thresholds:
    # Fit the variance filter on the training split, then apply it to both splits
    selector = VarianceThreshold(threshold=threshold)
    X_train_selected = selector.fit_transform(X_train)
    X_test_selected = selector.transform(X_test)

    # Names and count of the columns the filter kept
    selected_features = X_train.columns[selector.get_support()]
    num_features = len(selected_features)

    # Train XGBoost classifier. use_label_encoder is no longer passed: the
    # installed XGBoost reported it as an unused parameter on every fit.
    xgb_clf = XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss"
    )
    xgb_clf.fit(X_train_selected, y_train)

    # Hard labels, plus the probability of the class encoded as 1 for the AUC
    y_pred = xgb_clf.predict(X_test_selected)
    y_pred_proba = xgb_clf.predict_proba(X_test_selected)[:, 1]

    # Evaluate model performance on the held-out test split
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append([threshold, num_features, auc, accuracy, precision, kappa])

results_df = pd.DataFrame(result_rows, columns=result_columns)

# Display final results
print(results_df)



Parameters: { "use_label_encoder" } are not used.

  results_df = pd.concat([results_df, pd.DataFrame([[threshold, num_features, auc, accuracy, precision, kappa]],
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



   Variance Threshold Num Features       AUC  Accuracy  Precision     Kappa
0               0.001        31504  0.994372  0.982249   0.928571  0.886856
1               0.005        31502  0.991991  0.976331   0.923077  0.844311
2               0.010        31498  0.993074  0.982249   0.928571  0.886856
3               0.050        31237  0.993290  0.976331   0.923077  0.844311
4               0.100        30372  0.993290  0.982249   0.928571  0.886856


In [None]:
#Variance Threshold + SVM
# Define variance thresholds to experiment with
variance_thresholds = [0.001, 0.005, 0.01, 0.05, 0.1]

# Metric rows are collected in a plain list and turned into a DataFrame once,
# after the loop. Concatenating onto an empty DataFrame on every iteration
# raised a pandas warning at the pd.concat call and re-copied the frame each time.
result_columns = ["Variance Threshold", "Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

# Loop through different variance thresholds
for threshold in variance_thresholds:
    # Fit the variance filter on the training split, then apply it to both splits
    selector = VarianceThreshold(threshold=threshold)
    X_train_selected = selector.fit_transform(X_train)
    X_test_selected = selector.transform(X_test)

    # Names and count of the columns the filter kept
    selected_features = X_train.columns[selector.get_support()]
    num_features = len(selected_features)

    # Standardize the features; statistics come from the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Train SVM classifier; probability=True is required so predict_proba is available
    svm_clf = SVC(kernel="rbf", probability=True, C=1, gamma="scale", random_state=42)
    svm_clf.fit(X_train_scaled, y_train)

    # Hard labels, plus the probability of the class encoded as 1 for the AUC
    y_pred = svm_clf.predict(X_test_scaled)
    y_pred_proba = svm_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance on the held-out test split
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append([threshold, num_features, auc, accuracy, precision, kappa])

results_df = pd.DataFrame(result_rows, columns=result_columns)

# Display final results
print(results_df)


  results_df = pd.concat([results_df, pd.DataFrame([[threshold, num_features, auc, accuracy, precision, kappa]],


   Variance Threshold Num Features       AUC  Accuracy  Precision     Kappa
0               0.001        31504  0.995238  0.994083     0.9375  0.964488
1               0.005        31502  0.995238  0.994083     0.9375  0.964488
2               0.010        31498  0.995238  0.994083     0.9375  0.964488
3               0.050        31237  0.995238  0.994083     0.9375  0.964488
4               0.100        30372  0.995238  0.994083     0.9375  0.964488


In [None]:
#Variance threshold + LR

# Define variance thresholds to test
variance_thresholds = [0.001, 0.005, 0.01, 0.05, 0.1]

# Metric rows are collected in a plain list and turned into a DataFrame once,
# after the loop. Concatenating onto an empty DataFrame on every iteration
# raised a pandas warning at the pd.concat call and re-copied the frame each time.
result_columns = ["Variance Threshold", "Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

# Loop through different variance thresholds
for threshold in variance_thresholds:
    # Fit the variance filter on the training split, then apply it to both splits
    selector = VarianceThreshold(threshold=threshold)
    X_train_selected = selector.fit_transform(X_train)
    X_test_selected = selector.transform(X_test)

    # Get the number of selected features
    num_features = X_train_selected.shape[1]

    # Standardize the features; statistics come from the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Train Logistic Regression model
    lr_clf = LogisticRegression(solver="liblinear", random_state=42)
    lr_clf.fit(X_train_scaled, y_train)

    # Hard labels, plus the probability of the class encoded as 1 for the AUC
    y_pred = lr_clf.predict(X_test_scaled)
    y_pred_proba = lr_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance on the held-out test split
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append([threshold, num_features, auc, accuracy, precision, kappa])

results_df = pd.DataFrame(result_rows, columns=result_columns)

# Display final results
print(results_df)


  results_df = pd.concat([results_df, pd.DataFrame([[threshold, num_features, auc, accuracy, precision, kappa]],


   Variance Threshold Num Features       AUC  Accuracy  Precision     Kappa
0               0.001        31504  0.996537  0.934911   0.576923  0.697674
1               0.005        31502  0.996537  0.934911   0.576923  0.697674
2               0.010        31498  0.996537  0.934911   0.576923  0.697674
3               0.050        31237  0.996537  0.934911   0.576923  0.697674
4               0.100        30372  0.996537  0.934911   0.576923  0.697674


In [None]:
#Variance threshold + RF
# Define variance thresholds to test
variance_thresholds = [0.001, 0.005, 0.01, 0.05, 0.1]

# Metric rows are collected in a plain list and turned into a DataFrame once,
# after the loop. Concatenating onto an empty DataFrame on every iteration
# raised a pandas warning at the pd.concat call and re-copied the frame each time.
result_columns = ["Variance Threshold", "Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

# Loop through different variance thresholds
for threshold in variance_thresholds:
    # Fit the variance filter on the training split, then apply it to both splits
    selector = VarianceThreshold(threshold=threshold)
    X_train_selected = selector.fit_transform(X_train)
    X_test_selected = selector.transform(X_test)

    # Get the number of selected features
    num_features = X_train_selected.shape[1]

    # Train Random Forest classifier on the filtered columns (no scaling applied)
    rf_clf = RandomForestClassifier(
        n_estimators=200,  # number of trees in the forest
        max_depth=None,  # no depth limit on individual trees
        min_samples_split=2,  # smallest node size that may still be split
        n_jobs=-1,  # use all available processors
        random_state=42
    )
    rf_clf.fit(X_train_selected, y_train)

    # Hard labels, plus the probability of the class encoded as 1 for the AUC
    y_pred = rf_clf.predict(X_test_selected)
    y_pred_proba = rf_clf.predict_proba(X_test_selected)[:, 1]

    # Evaluate model performance on the held-out test split
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append([threshold, num_features, auc, accuracy, precision, kappa])

results_df = pd.DataFrame(result_rows, columns=result_columns)

# Display final results
print(results_df)


  results_df = pd.concat([results_df, pd.DataFrame([[threshold, num_features, auc, accuracy, precision, kappa]],


   Variance Threshold Num Features       AUC  Accuracy  Precision     Kappa
0               0.001        31504  0.993506  0.994083   0.937500  0.964488
1               0.005        31502  0.993939  0.994083   0.937500  0.964488
2               0.010        31498  0.994372  0.988166   0.933333  0.926840
3               0.050        31237  0.994805  0.994083   0.937500  0.964488
4               0.100        30372  0.995238  0.994083   0.937500  0.964488


In [None]:
#Mutual info + XGB
# Rank every feature by its mutual information with the target (estimated on
# the training split only), then train and score an XGBoost classifier on the
# top-N ranked features for each N in `feature_counts`.

# Define feature selection sizes
feature_counts = [10, 50, 100, 500, 1000]

# Compute Mutual Information scores
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_series = pd.Series(mi_scores, index=X_train.columns)

# One row of metrics is collected per feature count and the DataFrame is built
# once after the loop. Growing an initially empty DataFrame with pd.concat
# inside the loop triggers a pandas warning and leaves the columns as object dtype.
results_columns = ["Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
results_rows = []

# Loop through different feature counts
for num_features in feature_counts:
    # Select top N features
    top_features = mi_series.nlargest(num_features).index.tolist()

    # Filter dataset with selected features
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Standardize the features: statistics are fitted on the training split
    # only and then re-applied to the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Train XGBoost model.
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # reports it as an unused parameter on every fit.
    xgb_clf = XGBClassifier(
        n_estimators=500,  # Number of boosting rounds
        learning_rate=0.05,  # Shrinkage applied to each boosting step
        max_depth=6,  # Maximum depth of each tree
        subsample=0.8,  # Fraction of rows sampled per tree
        colsample_bytree=0.8,  # Fraction of columns sampled per tree
        random_state=42,
        eval_metric="logloss"
    )
    xgb_clf.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = xgb_clf.predict(X_test_scaled)
    # Column 1 is the probability of the class encoded as 1 (binary target assumed)
    y_pred_proba = xgb_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    # Store this run's metrics
    results_rows.append([num_features, auc, accuracy, precision, kappa])

# Assemble the results table in one step
results_df = pd.DataFrame(results_rows, columns=results_columns)

# Display final results
results_df

Parameters: { "use_label_encoder" } are not used.

  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



  Num Features       AUC  Accuracy  Precision     Kappa
0           10  0.995455  0.976331   0.923077  0.844311
1           50  0.995238  0.982249   0.928571  0.886856
2          100  0.994805  0.982249   0.928571  0.886856
3          500  0.995022  0.982249   0.928571  0.886856
4         1000  0.994589  0.982249   0.928571  0.886856


In [None]:
#Mutual info + SVM
# Rank every feature by its mutual information with the target (estimated on
# the training split only), then train and score an RBF-kernel SVM on the
# top-N ranked features for each N in `feature_counts`.

# Define feature selection sizes
feature_counts = [10, 50, 100, 500, 1000]

# Compute Mutual Information scores
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_series = pd.Series(mi_scores, index=X_train.columns)

# One row of metrics is collected per feature count and the DataFrame is built
# once after the loop. Growing an initially empty DataFrame with pd.concat
# inside the loop triggers a pandas warning and leaves the columns as object dtype.
results_columns = ["Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
results_rows = []

# Loop through different feature counts
for num_features in feature_counts:
    # Select top N features
    top_features = mi_series.nlargest(num_features).index.tolist()

    # Filter dataset with selected features
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Standardize the features: statistics are fitted on the training split
    # only and then re-applied to the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Train SVM model
    svm_clf = SVC(
        kernel="rbf",  # RBF kernel for non-linearity
        C=1.0,  # Regularization strength
        probability=True,  # Required so predict_proba is available for the AUC
        random_state=42
    )
    svm_clf.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = svm_clf.predict(X_test_scaled)
    # Column 1 is the probability of the class encoded as 1 (binary target assumed)
    y_pred_proba = svm_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    # Store this run's metrics
    results_rows.append([num_features, auc, accuracy, precision, kappa])

# Assemble the results table in one step
results_df = pd.DataFrame(results_rows, columns=results_columns)

# Display final results
results_df


  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],


  Num Features       AUC  Accuracy  Precision     Kappa
0           10  0.996537  0.988166   0.933333  0.926840
1           50  0.994805  0.994083   0.937500  0.964488
2          100  0.994805  0.988166   0.933333  0.926840
3          500  0.994805  0.994083   0.937500  0.964488
4         1000  0.994805  0.994083   0.937500  0.964488


In [None]:
#Mutual info + LR
# Rank every feature by its mutual information with the target (estimated on
# the training split only), then train and score an L2-regularized logistic
# regression on the top-N ranked features for each N in `feature_counts`.

# Define feature selection sizes
feature_counts = [10, 50, 100, 500, 1000]

# Compute Mutual Information scores
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_series = pd.Series(mi_scores, index=X_train.columns)

# One row of metrics is collected per feature count and the DataFrame is built
# once after the loop. Growing an initially empty DataFrame with pd.concat
# inside the loop triggers a pandas warning and leaves the columns as object dtype.
results_columns = ["Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
results_rows = []

# Loop through different feature counts
for num_features in feature_counts:
    # Select top N features
    top_features = mi_series.nlargest(num_features).index.tolist()

    # Filter dataset with selected features
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Standardize the features: statistics are fitted on the training split
    # only and then re-applied to the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Train Logistic Regression model
    lr_clf = LogisticRegression(
        penalty="l2",  # Ridge regularization
        C=1.0,  # Inverse regularization strength
        solver="liblinear",
        random_state=42
    )
    lr_clf.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = lr_clf.predict(X_test_scaled)
    # Column 1 is the probability of the class encoded as 1 (binary target assumed)
    y_pred_proba = lr_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    # Store this run's metrics
    results_rows.append([num_features, auc, accuracy, precision, kappa])

# Assemble the results table in one step
results_df = pd.DataFrame(results_rows, columns=results_columns)

# Display final results
results_df


  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],


  Num Features       AUC  Accuracy  Precision     Kappa
0           10  0.995238  0.988166   0.933333  0.926840
1           50  0.996537  0.988166   0.882353  0.930992
2          100  0.995238  0.988166   0.882353  0.930992
3          500  0.995671  0.988166   0.882353  0.930992
4         1000  0.996970  0.988166   0.882353  0.930992


In [None]:
#Mutual info + RF
# Rank every feature by its mutual information with the target (estimated on
# the training split only), then train and score a random forest on the
# top-N ranked features for each N in `feature_counts`.

# Define feature selection sizes
feature_counts = [10, 50, 100, 500, 1000]

# Compute Mutual Information scores
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_series = pd.Series(mi_scores, index=X_train.columns)

# One row of metrics is collected per feature count and the DataFrame is built
# once after the loop. Growing an initially empty DataFrame with pd.concat
# inside the loop triggers a pandas warning and leaves the columns as object dtype.
results_columns = ["Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
results_rows = []

# Loop through different feature counts
for num_features in feature_counts:
    # Select top N features
    top_features = mi_series.nlargest(num_features).index.tolist()

    # Filter dataset with selected features
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Standardize the features: statistics are fitted on the training split
    # only and then re-applied to the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Train Random Forest model
    rf_clf = RandomForestClassifier(
        n_estimators=500,  # Number of trees in the forest
        max_depth=None,  # Let trees grow fully
        min_samples_split=2,  # Minimum samples needed to split a node
        min_samples_leaf=1,  # Minimum samples allowed in a leaf
        bootstrap=True,  # Bootstrap sampling
        random_state=42,
        n_jobs=-1  # Use all processors
    )
    rf_clf.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = rf_clf.predict(X_test_scaled)
    # Column 1 is the probability of the class encoded as 1 (binary target assumed)
    y_pred_proba = rf_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    # Store this run's metrics
    results_rows.append([num_features, auc, accuracy, precision, kappa])

# Assemble the results table in one step
results_df = pd.DataFrame(results_rows, columns=results_columns)

# Display final results
results_df


  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],


  Num Features       AUC  Accuracy  Precision     Kappa
0           10  0.995238  0.982249   0.928571  0.886856
1           50  0.994372  0.988166   0.933333  0.926840
2          100  0.995022  0.988166   0.933333  0.926840
3          500  0.994805  0.988166   0.933333  0.926840
4         1000  0.994805  0.988166   0.933333  0.926840


In [None]:
#XGB + XGB
from sklearn.preprocessing import StandardScaler # Importing the necessary class
# Rank features by the importance an XGBoost model assigns them when fitted on
# the full training split, then retrain and score a fresh XGBoost classifier on
# the top-N ranked features for each N in `feature_counts`.

# Define feature selection sizes
feature_counts = [10, 50, 100, 500, 1000]

# Hyper-parameters shared by the selector model and the per-N classifiers.
# `use_label_encoder` is intentionally not included: the installed XGBoost
# reports it as an unused parameter on every fit.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="logloss",
)

# Train XGBoost model for feature selection
xgb_selector = XGBClassifier(**xgb_params)
xgb_selector.fit(X_train, y_train)

# Get feature importances
# NOTE(review): features never used in a split get importance 0, so for large N
# the tail of the selection may consist of zero-importance columns picked by
# column order — confirm this is intended.
feature_importances = pd.Series(xgb_selector.feature_importances_, index=X_train.columns)

# One row of metrics is collected per feature count and the DataFrame is built
# once after the loop. Growing an initially empty DataFrame with pd.concat
# inside the loop triggers a pandas warning and leaves the columns as object dtype.
results_columns = ["Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
results_rows = []

# Loop through different feature counts
for num_features in feature_counts:
    # Select top N features
    top_features = feature_importances.nlargest(num_features).index.tolist()

    # Filter dataset with selected features
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Standardize the features: statistics are fitted on the training split
    # only and then re-applied to the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Train XGBoost model with the same settings as the selector
    xgb_clf = XGBClassifier(**xgb_params)
    xgb_clf.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = xgb_clf.predict(X_test_scaled)
    # Column 1 is the probability of the class encoded as 1 (binary target assumed)
    y_pred_proba = xgb_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    # Store this run's metrics
    results_rows.append([num_features, auc, accuracy, precision, kappa])

# Assemble the results table in one step
results_df = pd.DataFrame(results_rows, columns=results_columns)

# Display final results
results_df

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



  Num Features       AUC  Accuracy  Precision     Kappa
0           10  0.971843  0.920863   0.914286  0.841735
1           50  0.989234  0.942446   0.955224  0.884862
2          100  0.991925  0.942446   0.955224  0.884862
3          500  0.989441  0.942446   0.942029  0.884886
4         1000  0.987992  0.928058   0.927536  0.856108
