In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.metrics import accuracy_score, classification_report, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier

# Load your data
df = pd.read_csv('new_descriptors.csv')

# Extract features and target
X = df.drop(columns=['Target'])
y = df['Target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the AdaBoost model with a DecisionTreeClassifier as the base estimator
base_model = DecisionTreeClassifier(max_depth=1, random_state=42)  # Base model for AdaBoost
adaboost = AdaBoostClassifier(base_model, random_state=42)

# Feature Importance based selection
adaboost.fit(X_train_scaled, y_train)
importances = adaboost.feature_importances_

# Determine the threshold as a quantile (e.g., 75th percentile)
quantile = 0.75
threshold_value = np.quantile(importances, quantile)
print(f"Quantile Threshold Value: {threshold_value}")

# Feature selection with SelectFromModel using the quantile threshold
selector = SelectFromModel(adaboost, threshold=threshold_value, prefit=True)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Check number of selected features
print(f"Number of features after feature importance selection: {X_train_selected.shape[1]}")

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 1.0]
}

grid_search = GridSearchCV(estimator=AdaBoostClassifier(base_model, random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           scoring='accuracy')

grid_search.fit(X_train_selected, y_train)
best_adaboost = grid_search.best_estimator_

# Train RFE with the best model
# Using the entire dataset to avoid potential overfitting with high-dimensional data
rfe = RFE(estimator=best_adaboost, n_features_to_select=100)  # Adjust number of features as needed
X_train_rfe_selected = rfe.fit_transform(X_train_scaled, y_train)
X_test_rfe_selected = rfe.transform(X_test_scaled)

# Check number of features selected by RFE
print(f"Number of features after RFE: {X_train_rfe_selected.shape[1]}")

# Evaluate AdaBoost with feature importance-based selection
adaboost_final = AdaBoostClassifier(
    base_model,
    n_estimators=grid_search.best_params_['n_estimators'],
    learning_rate=grid_search.best_params_['learning_rate'],
    random_state=42
)
adaboost_final.fit(X_train_selected, y_train)
y_pred_selected = adaboost_final.predict(X_test_selected)

# Evaluate AdaBoost with RFE selection
adaboost_final_rfe = AdaBoostClassifier(
    base_model,
    n_estimators=grid_search.best_params_['n_estimators'],
    learning_rate=grid_search.best_params_['learning_rate'],
    random_state=42
)
adaboost_final_rfe.fit(X_train_rfe_selected, y_train)
y_pred_rfe = adaboost_final_rfe.predict(X_test_rfe_selected)

# Print results
print("AdaBoost with Feature Importance Selection")
accuracy_selected = accuracy_score(y_test, y_pred_selected)
print("Overall Accuracy:", accuracy_selected)
print("Precision:", precision_score(y_test, y_pred_selected, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred_selected, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred_selected))

print("\nAdaBoost with RFE Selection")
accuracy_rfe = accuracy_score(y_test, y_pred_rfe)
print("Overall Accuracy:", accuracy_rfe)
print("Precision:", precision_score(y_test, y_pred_rfe, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred_rfe, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred_rfe))




Quantile Threshold Value: 0.0
Number of features after feature importance selection: 1451




Number of features after RFE: 100




AdaBoost with Feature Importance Selection
Overall Accuracy: 0.8171536286522149
Precision: 0.8077993970792529
F1 Score: 0.8084817532488617
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.53      0.60       277
           1       0.85      0.92      0.88       784

    accuracy                           0.82      1061
   macro avg       0.77      0.72      0.74      1061
weighted avg       0.81      0.82      0.81      1061


AdaBoost with RFE Selection
Overall Accuracy: 0.8171536286522149
Precision: 0.8077993970792529
F1 Score: 0.8084817532488617
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.53      0.60       277
           1       0.85      0.92      0.88       784

    accuracy                           0.82      1061
   macro avg       0.77      0.72      0.74      1061
weighted avg       0.81      0.82      0.81      1061

