In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

# Load your data
df = pd.read_csv('new_descriptors.csv')

# Extract features and target
X = df.drop(columns=['Target'])
y = df['Target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the Gradient Boosting model
gb = GradientBoostingClassifier(random_state=42)

# Feature Importance based selection
gb.fit(X_train_scaled, y_train)
importances = gb.feature_importances_

# Determine the threshold as a quantile (e.g., 75th percentile)
quantile = 0.75
threshold_value = np.quantile(importances, quantile)
print(f"Quantile Threshold Value: {threshold_value}")

# Feature selection with SelectFromModel using the quantile threshold
selector = SelectFromModel(gb, threshold=threshold_value, prefit=True)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           scoring='accuracy')

grid_search.fit(X_train_selected, y_train)
best_gb = grid_search.best_estimator_

# Train Gradient Boosting with the best model
gb_final = GradientBoostingClassifier(n_estimators=grid_search.best_params_['n_estimators'],
                                       learning_rate=grid_search.best_params_['learning_rate'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
gb_final.fit(X_train_selected, y_train)
y_pred_selected = gb_final.predict(X_test_selected)

# Evaluate Gradient Boosting with feature importance-based selection
gb_final.fit(X_train_selected, y_train)
y_pred_selected = gb_final.predict(X_test_selected)

# Cross-validation scores for Gradient Boosting
cv_scores_selected = cross_val_score(gb_final, X_train_selected, y_train, cv=5)

# Print results
print("Gradient Boosting with Feature Importance Selection")
print("Accuracy:", accuracy_score(y_test, y_pred_selected))
print("Classification Report:\n", classification_report(y_test, y_pred_selected))
print("Cross-Validation Scores (Feature Importance):", cv_scores_selected)
print("Mean CV Score (Feature Importance):", np.mean(cv_scores_selected))


Quantile Threshold Value: 0.0
Gradient Boosting with Feature Importance Selection
Accuracy: 0.825636192271442
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.44      0.57       277
           1       0.83      0.96      0.89       784

    accuracy                           0.83      1061
   macro avg       0.82      0.70      0.73      1061
weighted avg       0.82      0.83      0.81      1061

Cross-Validation Scores (Feature Importance): [0.82222222 0.82020202 0.82626263 0.82828283 0.82591093]
Mean CV Score (Feature Importance): 0.8245761256287573
