In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, cohen_kappa_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import shuffle
from xgboost import XGBClassifier

In [None]:
# Load the dataset; the CSV's "Unnamed: 0" column is used as the row index
df = pd.read_csv("Lung_gene_expression.csv", index_col="Unnamed: 0")

print(df.columns)

# Encode the target variable as integer class codes
le = LabelEncoder()
df["classes"] = le.fit_transform(df["classes"])

# Define features and target
X = df.drop(columns=["classes"])
y = df["classes"]

# Split dataset into training and testing sets: test_size=0.3 gives a 70-30 split
# (not 80-20), stratified on the class label so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'FIRRM', 'FGR', 'CFH', 'FUCA2',
       'GCLC', 'NFYA',
       ...
       'SNORA74C-2', 'ELOA3BP', 'NPBWR1', 'ELOA3DP', 'PDCD6-AHRR', 'LNCDAT',
       'LOC124902537', 'RNF228', 'PANO1', 'classes'],
      dtype='object', length=31506)


In [None]:
# Drops the first column of df by position.
# NOTE(review): read_csv already consumed "Unnamed: 0" as the index, so this removes
# the first data column instead (the frame displayed afterwards starts at TNMD, not
# TSPAN6). X, y and the train/test split were created before this line and are not
# affected by it. Re-running this cell drops one more column each time — confirm intent.
df = df.iloc[:,1:]

In [None]:
# Displays a copy of df with rows containing missing values removed; the result is
# not assigned, so df itself is unchanged.
# NOTE(review): X / y were derived from df in an earlier cell, so this does not
# influence any model below — confirm whether NaNs need handling before the split.
df.dropna()

Unnamed: 0,TNMD,DPM1,SCYL3,FIRRM,FGR,CFH,FUCA2,GCLC,NFYA,STPG1,...,SNORA74C-2,ELOA3BP,NPBWR1,ELOA3DP,PDCD6-AHRR,LNCDAT,LOC124902537,RNF228,PANO1,classes
TCGA-60-2712-01A-01R-0851-07,0.000000,10.835261,9.623881,9.335390,10.839204,12.055282,10.528454,12.670656,10.584963,8.861087,...,0.0,0.0,3.169925,0.0,0.0,0.000000,0.000000,6.339850,4.643856,0
TCGA-56-7221-01A-11R-2045-07,3.321928,11.707790,9.743151,10.067434,9.082149,11.060696,11.078818,15.292573,11.278449,10.490851,...,0.0,0.0,9.167418,0.0,0.0,0.000000,0.000000,8.717676,4.700440,0
TCGA-21-A5DI-01A-31R-A26W-07,0.000000,11.122828,8.174926,8.049849,7.339850,11.634357,9.807355,12.055282,10.453271,9.577429,...,0.0,0.0,7.851749,0.0,1.0,0.000000,0.000000,6.686501,3.459432,0
TCGA-43-7657-11A-01R-2125-07,0.000000,10.760720,9.636625,7.693487,11.921097,13.641262,11.542065,10.691744,10.557464,10.151017,...,0.0,0.0,3.459432,0.0,0.0,0.000000,3.321928,4.392317,3.321928,1
TCGA-43-7657-01A-31R-2125-07,0.000000,11.307201,8.957102,8.764872,8.751544,10.285402,11.861087,13.788311,10.868823,10.598983,...,0.0,0.0,13.429014,0.0,0.0,2.584963,0.000000,6.954196,4.087463,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-39-5028-01A-01R-1443-07,1.584963,11.633903,9.194757,8.294621,9.419960,13.058499,11.092757,11.814182,10.359750,10.213104,...,0.0,0.0,4.087463,0.0,0.0,1.000000,0.000000,4.000000,5.129283,0
TCGA-NC-A5HE-01A-11R-A26W-07,1.584963,11.428884,10.029287,9.897845,11.770251,11.486835,11.574594,13.891689,11.237807,8.159871,...,0.0,0.0,6.339850,0.0,1.0,3.000000,0.000000,5.584963,5.357552,0
TCGA-66-2783-01A-01R-1201-07,0.000000,10.348728,9.262095,9.189825,9.033423,10.631177,10.101976,12.842743,10.290019,10.030667,...,0.0,0.0,6.768184,0.0,0.0,0.000000,0.000000,6.475733,4.000000,0
TCGA-66-2795-01A-02R-0980-07,0.000000,11.051209,9.601771,9.731319,9.019591,11.106563,10.083479,14.560751,10.695228,8.375039,...,0.0,0.0,7.139551,0.0,0.0,1.584963,0.000000,5.807355,2.321928,0


In [None]:

# Rank all genes with an XGBoost classifier fit on the training split and export
# the 1000 highest-ranked genes to CSV.
# `use_label_encoder` is no longer passed: xgboost reported it as an unused parameter.
model = xgb.XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

# Feature importances, in the same order as the columns of X
feature_importances = model.feature_importances_
feature_names = X.columns

# One row per gene: name + importance
importance_df = pd.DataFrame({
    'Gene': feature_names,
    'Importance': feature_importances
})

# Keep the 1000 genes with the largest importance
top_genes_df = importance_df.sort_values(by='Importance', ascending=False).head(1000)

# Persist the ranking (without the positional index)
top_genes_df.to_csv('top_1000_genes.csv', index=False)

print("Top 1000 genes saved to 'top_1000_genes.csv'")


Parameters: { "use_label_encoder" } are not used.



Top 1000 genes saved to 'top_1000_genes.csv'


In [None]:
# None + RF
# Baseline without feature selection: Random Forest trained on every gene
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Hard labels feed accuracy/precision/kappa; class-1 probabilities feed the AUC
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)[:, 1]

# Score the held-out split
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
kappa = cohen_kappa_score(y_test, y_pred)

# Report each metric with four decimals
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("AUC", auc),
    ("Kappa", kappa),
):
    print(f"{metric_name}: {metric_value:.4f}")


Accuracy: 0.9941
Precision: 0.9375
AUC: 0.9952
Kappa: 0.9645


In [None]:
# None + XGB
# Baseline without feature selection: XGBoost trained on every gene
xgb_clf = xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric="logloss")
xgb_clf.fit(X_train, y_train)

# Hard labels feed accuracy/precision/kappa; class-1 probabilities feed the AUC
y_pred = xgb_clf.predict(X_test)
y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]

# Score the held-out split
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
kappa = cohen_kappa_score(y_test, y_pred)

# Report each metric with four decimals
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("AUC", auc),
    ("Kappa", kappa),
):
    print(f"{metric_name}: {metric_value:.4f}")


Accuracy: 0.9763
Precision: 0.9231
AUC: 0.9955
Kappa: 0.8443


In [None]:
#None + LR
# Baseline without feature selection: Logistic Regression trained on every gene.
# The in-cell imports were removed: pandas is already imported in the notebook's
# import cell and cross_val_score belongs there too, so the cell no longer
# hides its dependencies.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)

# Hard labels feed accuracy/precision/kappa; class-1 probabilities feed the AUC
y_pred = lr.predict(X_test)
y_pred_proba = lr.predict_proba(X_test)[:, 1]

# 5-fold cross-validated accuracy; note this is computed on the full X / y
# (train and test rows together), independently of the hold-out split above
scores = cross_val_score(lr, X, y, cv=5, scoring='accuracy')

# Evaluate model performance on the held-out split
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
kappa = cohen_kappa_score(y_test, y_pred)

# Distribution of predicted labels (sanity check that both classes are predicted)
print(np.unique(y_pred, return_counts=True))

# Print evaluation metrics
print("cvs:",scores.mean())
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"AUC: {auc:.4f}")
print(f"Kappa: {kappa:.4f}")

(array([0, 1]), array([153,  16]))
cvs: 0.9982142857142857
Accuracy: 0.9941
Precision: 0.9375
AUC: 0.9970
Kappa: 0.9645


In [None]:
# None + SVM
# Baseline without feature selection: RBF-kernel SVM trained on every gene.
# probability=True is required so predict_proba is available for the AUC.
svm_clf = SVC(kernel='rbf', probability=True, random_state=42)
svm_clf.fit(X_train, y_train)

# Hard labels feed accuracy/precision/kappa; class-1 probabilities feed the AUC
y_pred = svm_clf.predict(X_test)
y_pred_proba = svm_clf.predict_proba(X_test)[:, 1]

# Score the held-out split
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
kappa = cohen_kappa_score(y_test, y_pred)

# Report each metric with four decimals
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("AUC", auc),
    ("Kappa", kappa),
):
    print(f"{metric_name}: {metric_value:.4f}")

Accuracy: 0.9941
Precision: 0.9375
AUC: 0.9965
Kappa: 0.9645


In [None]:
#RF + SVM
# Rank genes by Random Forest importance (fit on the training split), then train
# an RBF-kernel SVM on the top-N genes for several values of N.
# The StandardScaler import that used to live in this cell was moved to the
# notebook's import cell, so other cells no longer depend on this one having run.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Importance of every gene, indexed by gene name
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)

# Numbers of top-ranked genes to evaluate
feature_counts = [10, 50, 100, 500, 1000]

for num_features in feature_counts:
    # Select the N highest-ranked genes
    top_features = feature_importances.nlargest(num_features).index.tolist()
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Fit the scaler on the training split only, then apply it to the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # probability=True is required so predict_proba is available for the AUC
    svm_clf = SVC(kernel='rbf', probability=True, random_state=42)
    svm_clf.fit(X_train_scaled, y_train)

    # Hard labels and class-1 probabilities on the held-out split
    y_pred = svm_clf.predict(X_test_scaled)
    y_pred_proba = svm_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    # Print results
    print(f"\nResults for Top {num_features} Features:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"AUC: {auc:.4f}")
    print(f"Kappa: {kappa:.4f}")


Results for Top 10 Features:
Accuracy: 0.9882
Precision: 0.9333
AUC: 0.9944
Kappa: 0.9268

Results for Top 50 Features:
Accuracy: 0.9941
Precision: 0.9375
AUC: 0.9939
Kappa: 0.9645

Results for Top 100 Features:
Accuracy: 0.9941
Precision: 0.9375
AUC: 0.9944
Kappa: 0.9645

Results for Top 500 Features:
Accuracy: 0.9941
Precision: 0.9375
AUC: 0.9948
Kappa: 0.9645

Results for Top 1000 Features:
Accuracy: 0.9941
Precision: 0.9375
AUC: 0.9957
Kappa: 0.9645


In [None]:
#RF + LR
# Rank genes by Random Forest importance (fit on the training split), then train
# Logistic Regression on the top-N genes for several values of N.
rf = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=2, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Importance of every gene, indexed by gene name
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)

# Numbers of top-ranked genes to evaluate
feature_counts = [10, 50, 100, 500, 1000]

# One record per feature count. The table is built once after the loop instead of
# concatenating onto an (initially empty) DataFrame on every iteration.
result_columns = ["Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

for num_features in feature_counts:
    # Select the N highest-ranked genes
    top_features = feature_importances.nlargest(num_features).index.tolist()
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Fit the scaler on the training split only, then apply it to the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Train the Logistic Regression classifier
    lr_clf = LogisticRegression(solver="liblinear", C=1.0, max_iter=500, random_state=42)
    lr_clf.fit(X_train_scaled, y_train)

    # Hard labels and class-1 probabilities on the held-out split
    y_pred = lr_clf.predict(X_test_scaled)
    y_pred_proba = lr_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append({
        "Features": num_features,
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Kappa": kappa,
    })

# Display final results
results_df = pd.DataFrame(result_rows, columns=result_columns)
print(results_df)


  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],


  Features       AUC  Accuracy  Precision     Kappa
0       10  0.996104  0.982249   0.875000  0.893465
1       50  0.994372  0.988166   0.882353  0.930992
2      100  0.996537  0.988166   0.882353  0.930992
3      500  0.996537  0.988166   0.882353  0.930992
4     1000  0.996104  0.982249   0.833333  0.899345


In [None]:
#RF + RF
# Rank genes by Random Forest importance (fit on the training split), then train
# a second Random Forest on the top-N genes for several values of N.
rf_selector = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=2, random_state=42, n_jobs=-1)
rf_selector.fit(X_train, y_train)

# Importance of every gene, indexed by gene name
feature_importances = pd.Series(rf_selector.feature_importances_, index=X.columns)

# Numbers of top-ranked genes to evaluate
feature_counts = [10, 50, 100, 500, 1000]

# One record per feature count. The table is built once after the loop instead of
# concatenating onto an (initially empty) DataFrame on every iteration.
result_columns = ["Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

for num_features in feature_counts:
    # Select the N highest-ranked genes
    top_features = feature_importances.nlargest(num_features).index.tolist()
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Train the Random Forest classifier (500 unbounded-depth trees, all cores)
    rf_clf = RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_split=2,
        random_state=42,
        n_jobs=-1
    )
    rf_clf.fit(X_train_selected, y_train)

    # Hard labels and class-1 probabilities on the held-out split
    y_pred = rf_clf.predict(X_test_selected)
    y_pred_proba = rf_clf.predict_proba(X_test_selected)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append({
        "Features": num_features,
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Kappa": kappa,
    })

# Display final results
results_df = pd.DataFrame(result_rows, columns=result_columns)
print(results_df)

  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],


  Features       AUC  Accuracy  Precision    Kappa
0       10  0.994372  0.988166   0.933333  0.92684
1       50  0.994156  0.988166   0.933333  0.92684
2      100  0.993723  0.988166   0.933333  0.92684
3      500  0.994156  0.988166   0.933333  0.92684
4     1000  0.994589  0.988166   0.933333  0.92684


In [None]:
#RF + XGB
# Rank genes by Random Forest importance (fit on the training split), then train
# XGBoost on the top-N genes for several values of N.
# XGBoost is reached through the `xgb` module alias from the import cell, and
# `use_label_encoder` is not passed (xgboost reports it as an unused parameter).
rf_selector = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=42, n_jobs=-1)
rf_selector.fit(X_train, y_train)

# Importance of every gene, indexed by gene name
feature_importances = pd.Series(rf_selector.feature_importances_, index=X.columns)

# Numbers of top-ranked genes to evaluate
feature_counts = [10, 50, 100, 500, 1000]

# One record per feature count. The table is built once after the loop instead of
# concatenating onto an (initially empty) DataFrame on every iteration.
result_columns = ["Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

for num_features in feature_counts:
    # Select the N highest-ranked genes
    top_features = feature_importances.nlargest(num_features).index.tolist()
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Fit the scaler on the training split only, then apply it to the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Train the XGBoost classifier
    xgb_clf = xgb.XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,         # row subsampling per tree
        colsample_bytree=0.8,  # column subsampling per tree
        random_state=42,
        eval_metric="logloss"
    )
    xgb_clf.fit(X_train_scaled, y_train)

    # Hard labels and class-1 probabilities on the held-out split
    y_pred = xgb_clf.predict(X_test_scaled)
    y_pred_proba = xgb_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append({
        "Features": num_features,
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Kappa": kappa,
    })

# Display final results
results_df = pd.DataFrame(result_rows, columns=result_columns)
print(results_df)


NameError: name 'StandardScaler' is not defined

In [None]:
#XGB + SVM
# Rank genes by XGBoost importance (fit on the training split), then train an
# RBF-kernel SVM on the top-N genes for several values of N.
# XGBoost is reached through the `xgb` module alias from the import cell, and
# `use_label_encoder` is not passed (xgboost reports it as an unused parameter).
xgb_selector = xgb.XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6,
                                 subsample=0.8, colsample_bytree=0.8, random_state=42,
                                 eval_metric="logloss")

xgb_selector.fit(X_train, y_train)

# Importance of every gene, indexed by gene name
feature_importances = pd.Series(xgb_selector.feature_importances_, index=X.columns)

# Numbers of top-ranked genes to evaluate
feature_counts = [10, 50, 100, 500, 1000]

# One record per feature count. The table is built once after the loop instead of
# concatenating onto an (initially empty) DataFrame on every iteration.
result_columns = ["Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

for num_features in feature_counts:
    # Select the N highest-ranked genes
    top_features = feature_importances.nlargest(num_features).index.tolist()
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Fit the scaler on the training split only, then apply it to the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # probability=True is required so predict_proba is available for the AUC
    svm_clf = SVC(kernel="rbf", C=1.0, probability=True, random_state=42)
    svm_clf.fit(X_train_scaled, y_train)

    # Hard labels and class-1 probabilities on the held-out split
    y_pred = svm_clf.predict(X_test_scaled)
    y_pred_proba = svm_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append({
        "Features": num_features,
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Kappa": kappa,
    })

# Display final results
results_df = pd.DataFrame(result_rows, columns=result_columns)
print(results_df)


Parameters: { "use_label_encoder" } are not used.

  results_df = pd.concat([results_df, pd.DataFrame([[num_features, auc, accuracy, precision, kappa]],


  Features       AUC  Accuracy  Precision     Kappa
0       10  0.994372  0.994083     0.9375  0.964488
1       50  0.994372  0.994083     0.9375  0.964488
2      100  0.996104  0.994083     0.9375  0.964488
3      500  0.996104  0.994083     0.9375  0.964488
4     1000  0.995671  0.994083     0.9375  0.964488


In [None]:
#XGB + LR
# Rank genes by XGBoost importance (fit on the training split), then train
# Logistic Regression on the top-N genes for several values of N.
# XGBoost is reached through the `xgb` module alias from the import cell, and
# `use_label_encoder` is not passed (xgboost reports it as an unused parameter).
xgb_selector = xgb.XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6,
                                 subsample=0.8, colsample_bytree=0.8, random_state=42,
                                 eval_metric="logloss")

xgb_selector.fit(X_train, y_train)

# Importance of every gene, indexed by gene name
feature_importances = pd.Series(xgb_selector.feature_importances_, index=X.columns)

# Numbers of top-ranked genes to evaluate
feature_counts = [10, 50, 100, 500, 1000]

# One record per feature count. The table is built once after the loop instead of
# concatenating onto an (initially empty) DataFrame on every iteration.
result_columns = ["Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

for num_features in feature_counts:
    # Select the N highest-ranked genes
    top_features = feature_importances.nlargest(num_features).index.tolist()
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Fit the scaler on the training split only, then apply it to the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Train the Logistic Regression classifier
    lr_clf = LogisticRegression(solver="liblinear", C=1.0, random_state=42)
    lr_clf.fit(X_train_scaled, y_train)

    # Hard labels and class-1 probabilities on the held-out split
    y_pred = lr_clf.predict(X_test_scaled)
    y_pred_proba = lr_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append({
        "Features": num_features,
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Kappa": kappa,
    })

# Display final results
results_df = pd.DataFrame(result_rows, columns=result_columns)
print(results_df)


Parameters: { "use_label_encoder" } are not used.



NameError: name 'StandardScaler' is not defined

In [None]:
#XGB + RF
# Rank genes by XGBoost importance (fit on the training split), then train a
# Random Forest on the top-N genes for several values of N.
# The bare name `XGBClassifier` was never imported in this cell (it raised a
# NameError); the `xgb` module alias from the import cell is used instead.
# `use_label_encoder` is not passed (xgboost reports it as an unused parameter).
xgb_selector = xgb.XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6,
                                 subsample=0.8, colsample_bytree=0.8, random_state=42,
                                 eval_metric="logloss")

xgb_selector.fit(X_train, y_train)

# Importance of every gene, indexed by gene name
feature_importances = pd.Series(xgb_selector.feature_importances_, index=X.columns)

# Numbers of top-ranked genes to evaluate
feature_counts = [10, 50, 100, 500, 1000]

# One record per feature count. The table is built once after the loop instead of
# concatenating onto an (initially empty) DataFrame on every iteration.
result_columns = ["Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

for num_features in feature_counts:
    # Select the N highest-ranked genes
    top_features = feature_importances.nlargest(num_features).index.tolist()
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Train the Random Forest classifier
    rf_clf = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=2,
                                    random_state=42, n_jobs=-1)
    rf_clf.fit(X_train_selected, y_train)

    # Hard labels and class-1 probabilities on the held-out split
    y_pred = rf_clf.predict(X_test_selected)
    y_pred_proba = rf_clf.predict_proba(X_test_selected)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append({
        "Features": num_features,
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Kappa": kappa,
    })

# Display final results
results_df = pd.DataFrame(result_rows, columns=result_columns)
print(results_df)

NameError: name 'XGBClassifier' is not defined

In [None]:
#Variance threshold + XGB
# Filter genes with VarianceThreshold (fit on the training split) at several
# thresholds and train XGBoost on the surviving genes.
# XGBoost is reached through the `xgb` module alias from the import cell, and
# `use_label_encoder` is not passed (xgboost reports it as an unused parameter).
variance_thresholds = [0.001, 0.005, 0.01, 0.05, 0.1]

# One record per threshold. The table is built once after the loop instead of
# concatenating onto an (initially empty) DataFrame on every iteration.
result_columns = ["Variance Threshold", "Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

for threshold in variance_thresholds:
    # Learn which genes pass the threshold on the training split, apply to both splits
    selector = VarianceThreshold(threshold=threshold)
    X_train_selected = selector.fit_transform(X_train)
    X_test_selected = selector.transform(X_test)

    # Names and count of the genes that were kept
    selected_features = X.columns[selector.get_support()]
    num_features = len(selected_features)

    # Train the XGBoost classifier
    xgb_clf = xgb.XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss"
    )
    xgb_clf.fit(X_train_selected, y_train)

    # Hard labels and class-1 probabilities on the held-out split
    y_pred = xgb_clf.predict(X_test_selected)
    y_pred_proba = xgb_clf.predict_proba(X_test_selected)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append({
        "Variance Threshold": threshold,
        "Num Features": num_features,
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Kappa": kappa,
    })

# Display final results
results_df = pd.DataFrame(result_rows, columns=result_columns)
print(results_df)



Parameters: { "use_label_encoder" } are not used.

  results_df = pd.concat([results_df, pd.DataFrame([[threshold, num_features, auc, accuracy, precision, kappa]],
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



   Variance Threshold Num Features       AUC  Accuracy  Precision     Kappa
0               0.001        31504  0.994372  0.982249   0.928571  0.886856
1               0.005        31502  0.991991  0.976331   0.923077  0.844311
2               0.010        31498  0.993074  0.982249   0.928571  0.886856
3               0.050        31237  0.993290  0.976331   0.923077  0.844311
4               0.100        30372  0.993290  0.982249   0.928571  0.886856


In [None]:
#Variance Threshold + SVM
# Filter genes with VarianceThreshold (fit on the training split) at several
# thresholds and train an RBF-kernel SVM on the surviving genes.
variance_thresholds = [0.001, 0.005, 0.01, 0.05, 0.1]

# One record per threshold. The table is built once after the loop instead of
# concatenating onto an (initially empty) DataFrame on every iteration.
result_columns = ["Variance Threshold", "Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

for threshold in variance_thresholds:
    # Learn which genes pass the threshold on the training split, apply to both splits
    selector = VarianceThreshold(threshold=threshold)
    X_train_selected = selector.fit_transform(X_train)
    X_test_selected = selector.transform(X_test)

    # Names and count of the genes that were kept
    selected_features = X.columns[selector.get_support()]
    num_features = len(selected_features)

    # Fit the scaler on the training split only, then apply it to the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # probability=True is required so predict_proba is available for the AUC
    svm_clf = SVC(kernel="rbf", probability=True, C=1, gamma="scale", random_state=42)
    svm_clf.fit(X_train_scaled, y_train)

    # Hard labels and class-1 probabilities on the held-out split
    y_pred = svm_clf.predict(X_test_scaled)
    y_pred_proba = svm_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append({
        "Variance Threshold": threshold,
        "Num Features": num_features,
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Kappa": kappa,
    })

# Display final results
results_df = pd.DataFrame(result_rows, columns=result_columns)
print(results_df)


  results_df = pd.concat([results_df, pd.DataFrame([[threshold, num_features, auc, accuracy, precision, kappa]],


   Variance Threshold Num Features       AUC  Accuracy  Precision     Kappa
0               0.001        31504  0.995238  0.994083     0.9375  0.964488
1               0.005        31502  0.995238  0.994083     0.9375  0.964488
2               0.010        31498  0.995238  0.994083     0.9375  0.964488
3               0.050        31237  0.995238  0.994083     0.9375  0.964488
4               0.100        30372  0.995238  0.994083     0.9375  0.964488


In [None]:
#Variance threshold + LR
# Filter genes with VarianceThreshold (fit on the training split) at several
# thresholds and train Logistic Regression on the surviving genes.
variance_thresholds = [0.001, 0.005, 0.01, 0.05, 0.1]

# One record per threshold. The table is built once after the loop instead of
# concatenating onto an (initially empty) DataFrame on every iteration.
result_columns = ["Variance Threshold", "Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

for threshold in variance_thresholds:
    # Learn which genes pass the threshold on the training split, apply to both splits
    selector = VarianceThreshold(threshold=threshold)
    X_train_selected = selector.fit_transform(X_train)
    X_test_selected = selector.transform(X_test)

    # Number of genes that were kept
    num_features = X_train_selected.shape[1]

    # Fit the scaler on the training split only, then apply it to the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Train the Logistic Regression classifier
    lr_clf = LogisticRegression(solver="liblinear", random_state=42)
    lr_clf.fit(X_train_scaled, y_train)

    # Hard labels and class-1 probabilities on the held-out split
    y_pred = lr_clf.predict(X_test_scaled)
    y_pred_proba = lr_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append({
        "Variance Threshold": threshold,
        "Num Features": num_features,
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Kappa": kappa,
    })

# Display final results
results_df = pd.DataFrame(result_rows, columns=result_columns)
print(results_df)


  results_df = pd.concat([results_df, pd.DataFrame([[threshold, num_features, auc, accuracy, precision, kappa]],


   Variance Threshold Num Features       AUC  Accuracy  Precision     Kappa
0               0.001        31504  0.996537  0.934911   0.576923  0.697674
1               0.005        31502  0.996537  0.934911   0.576923  0.697674
2               0.010        31498  0.996537  0.934911   0.576923  0.697674
3               0.050        31237  0.996537  0.934911   0.576923  0.697674
4               0.100        30372  0.996537  0.934911   0.576923  0.697674


In [None]:
#Variance threshold + RF
# Filter genes with VarianceThreshold (fit on the training split) at several
# thresholds and train a Random Forest on the surviving genes.
variance_thresholds = [0.001, 0.005, 0.01, 0.05, 0.1]

# One record per threshold. The table is built once after the loop instead of
# concatenating onto an (initially empty) DataFrame on every iteration.
result_columns = ["Variance Threshold", "Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

for threshold in variance_thresholds:
    # Learn which genes pass the threshold on the training split, apply to both splits
    selector = VarianceThreshold(threshold=threshold)
    X_train_selected = selector.fit_transform(X_train)
    X_test_selected = selector.transform(X_test)

    # Number of genes that were kept
    num_features = X_train_selected.shape[1]

    # Train the Random Forest classifier (200 unbounded-depth trees, all cores)
    rf_clf = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_split=2,
        n_jobs=-1,
        random_state=42
    )
    rf_clf.fit(X_train_selected, y_train)

    # Hard labels and class-1 probabilities on the held-out split
    y_pred = rf_clf.predict(X_test_selected)
    y_pred_proba = rf_clf.predict_proba(X_test_selected)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append({
        "Variance Threshold": threshold,
        "Num Features": num_features,
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Kappa": kappa,
    })

# Display final results
results_df = pd.DataFrame(result_rows, columns=result_columns)
print(results_df)


  results_df = pd.concat([results_df, pd.DataFrame([[threshold, num_features, auc, accuracy, precision, kappa]],


   Variance Threshold Num Features       AUC  Accuracy  Precision     Kappa
0               0.001        31504  0.993506  0.994083   0.937500  0.964488
1               0.005        31502  0.993939  0.994083   0.937500  0.964488
2               0.010        31498  0.994372  0.988166   0.933333  0.926840
3               0.050        31237  0.994805  0.994083   0.937500  0.964488
4               0.100        30372  0.995238  0.994083   0.937500  0.964488


In [None]:
#Mutual info + XGB
# Rank genes by mutual information with the class label (computed on the training
# split), then train XGBoost on the top-N genes for several values of N.
# XGBoost is reached through the `xgb` module alias from the import cell, and
# `use_label_encoder` is not passed (xgboost reports it as an unused parameter).
# mutual_info_classif comes from sklearn.feature_selection (notebook import cell).
feature_counts = [10, 50, 100, 500, 1000]

# Mutual-information score of every gene, indexed by gene name
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_series = pd.Series(mi_scores, index=X_train.columns)

# One record per feature count. The table is built once after the loop instead of
# concatenating onto an (initially empty) DataFrame on every iteration.
result_columns = ["Num Features", "AUC", "Accuracy", "Precision", "Kappa"]
result_rows = []

for num_features in feature_counts:
    # Select the N highest-scoring genes
    top_features = mi_series.nlargest(num_features).index.tolist()
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Fit the scaler on the training split only, then apply it to the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # Train the XGBoost classifier
    xgb_clf = xgb.XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,         # row subsampling per tree
        colsample_bytree=0.8,  # column subsampling per tree
        random_state=42,
        eval_metric="logloss"
    )
    xgb_clf.fit(X_train_scaled, y_train)

    # Hard labels and class-1 probabilities on the held-out split
    y_pred = xgb_clf.predict(X_test_scaled)
    y_pred_proba = xgb_clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred)

    result_rows.append({
        "Num Features": num_features,
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Kappa": kappa,
    })

# Display final results
results_df = pd.DataFrame(result_rows, columns=result_columns)
print(results_df)

NameError: name 'mutual_info_classif' is not defined