In [2]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 487.6 kB/s eta 0:04:17
   ---------------------------------------- 0.1/124.9 MB 1.1 MB/s eta 0:01:55
   ---------------------------------------- 0.1/124.9 MB 1.1 MB/s eta 0:01:55
   ---------------------------------------- 0.2/124.9 MB 958.4 kB/s eta 0:02:11
   ---------------------------------------- 0.2/124.9 MB 958.4 kB/s eta 0:02:11
   ---------------------------------------- 0.2/124.9 MB 958.4 kB/s eta 0:02:11
   ---------------------------------------- 0.2/124.9 MB 958.4 kB/s eta 0:02:11
   ---------------------------------------- 0.2/124.9 MB 958.4 kB/s eta 0:02:11
   ---------------------------------------- 0.2/124.9 MB 958.4 kB/s eta 0:

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.metrics import accuracy_score, classification_report, precision_score, f1_score
import xgboost as xgb

# Load your data
df = pd.read_csv('new_descriptors.csv')

# Extract features and target
X = df.drop(columns=['Target'])
y = df['Target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Feature Importance based selection
xgb_model.fit(X_train_scaled, y_train)
importances = xgb_model.feature_importances_

# Determine the threshold as a quantile (e.g., 75th percentile)
quantile = 0.75
threshold_value = np.quantile(importances, quantile)
print(f"Quantile Threshold Value: {threshold_value}")

# Feature selection with SelectFromModel using the quantile threshold
selector = SelectFromModel(xgb_model, threshold=threshold_value, prefit=True)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Check number of selected features
print(f"Number of features after feature importance selection: {X_train_selected.shape[1]}")

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(estimator=xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           scoring='accuracy')

grid_search.fit(X_train_selected, y_train)
best_xgb = grid_search.best_estimator_

# Train RFE with the best model
rfe = RFE(estimator=best_xgb, n_features_to_select=100)  # Adjust number of features as needed
X_train_rfe_selected = rfe.fit_transform(X_train_scaled, y_train)
X_test_rfe_selected = rfe.transform(X_test_scaled)

# Check number of features selected by RFE
print(f"Number of features after RFE: {X_train_rfe_selected.shape[1]}")

# Evaluate XGBoost with feature importance-based selection
xgb_final = xgb.XGBClassifier(
    n_estimators=grid_search.best_params_['n_estimators'],
    max_depth=grid_search.best_params_['max_depth'],
    learning_rate=grid_search.best_params_['learning_rate'],
    subsample=grid_search.best_params_['subsample'],
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)
xgb_final.fit(X_train_selected, y_train)
y_pred_selected = xgb_final.predict(X_test_selected)

# Evaluate XGBoost with RFE selection
xgb_final_rfe = xgb.XGBClassifier(
    n_estimators=grid_search.best_params_['n_estimators'],
    max_depth=grid_search.best_params_['max_depth'],
    learning_rate=grid_search.best_params_['learning_rate'],
    subsample=grid_search.best_params_['subsample'],
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)
xgb_final_rfe.fit(X_train_rfe_selected, y_train)
y_pred_rfe = xgb_final_rfe.predict(X_test_rfe_selected)

# Print results
print("XGBoost with Feature Importance Selection")
accuracy_selected = accuracy_score(y_test, y_pred_selected)
print("Overall Accuracy:", accuracy_selected)
print("Precision:", precision_score(y_test, y_pred_selected, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred_selected, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred_selected))

print("\nXGBoost with RFE Selection")
accuracy_rfe = accuracy_score(y_test, y_pred_rfe)
print("Overall Accuracy:", accuracy_rfe)
print("Precision:", precision_score(y_test, y_pred_rfe, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred_rfe, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred_rfe))


Parameters: { "use_label_encoder" } are not used.



Quantile Threshold Value: 0.00045244861394166946
Number of features after feature importance selection: 363


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features after RFE: 100


Parameters: { "use_label_encoder" } are not used.



XGBoost with Feature Importance Selection
Overall Accuracy: 0.823751178133836
Precision: 0.8150089742887542
F1 Score: 0.8123113063025524
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.51      0.60       277
           1       0.84      0.93      0.89       784

    accuracy                           0.82      1061
   macro avg       0.79      0.72      0.74      1061
weighted avg       0.82      0.82      0.81      1061


XGBoost with RFE Selection
Overall Accuracy: 0.8284637134778511
Precision: 0.8204943853642499
F1 Score: 0.817496030967798
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.52      0.61       277
           1       0.85      0.94      0.89       784

    accuracy                           0.83      1061
   macro avg       0.80      0.73      0.75      1061
weighted avg       0.82      0.83      0.82      1061



Parameters: { "use_label_encoder" } are not used.

