In [6]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import numpy as np
import pandas as pd

# Load your data
df = pd.read_csv('new_descriptors.csv')

# Extract features and target
X = df.drop(columns=['Target'])
y = df['Target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the HistGradientBoosting model
hgb = HistGradientBoostingClassifier(random_state=42)

# Fit the model
hgb.fit(X_train_scaled, y_train)

# Feature importance using permutation_importance
perm_importance = permutation_importance(hgb, X_train_scaled, y_train, n_repeats=10, random_state=42)
importances = perm_importance.importances_mean

# Determine the threshold as a quantile (e.g., 75th percentile)
quantile = 0.75
threshold_value = np.quantile(importances, quantile)
print(f"Quantile Threshold Value: {threshold_value}")

# Feature selection with SelectFromModel using the quantile threshold
selector = SelectFromModel(hgb, threshold=threshold_value, prefit=False)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_iter': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [10, 20, 30]
}

grid_search = GridSearchCV(estimator=HistGradientBoostingClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           scoring='accuracy')

grid_search.fit(X_train_selected, y_train)
best_hgb = grid_search.best_estimator_

# Train HistGradientBoosting with the best model
hgb_final = HistGradientBoostingClassifier(learning_rate=grid_search.best_params_['learning_rate'],
                                           max_iter=grid_search.best_params_['max_iter'],
                                           max_depth=grid_search.best_params_['max_depth'],
                                           min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                           random_state=42)
hgb_final.fit(X_train_selected, y_train)
y_pred_selected = hgb_final.predict(X_test_selected)

# Cross-validation scores for HistGradientBoosting
cv_scores_selected = cross_val_score(hgb_final, X_train_selected, y_train, cv=5)

# Print results
print("HistGradientBoosting with Feature Importance Selection")
print("Accuracy:", accuracy_score(y_test, y_pred_selected))
print("Classification Report:\n", classification_report(y_test, y_pred_selected))
print("Cross-Validation Scores (Feature Importance):", cv_scores_selected)
print("Mean CV Score (Feature Importance):", np.mean(cv_scores_selected))


Quantile Threshold Value: 4.4408920985006264e-17


ValueError: when `importance_getter=='auto'`, the underlying estimator HistGradientBoostingClassifier should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_hist_gradient_boosting  # Needed to enable HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, f1_score
from sklearn.inspection import permutation_importance
from imblearn.over_sampling import SMOTE

# Load your data
df = pd.read_csv('new_descriptors.csv')

# Extract features and target
X = df.drop(columns=['Target'])
y = df['Target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the HistGradientBoosting model
hgb = HistGradientBoostingClassifier(random_state=42)

# Train the model
hgb.fit(X_train_scaled, y_train)

# Compute permutation feature importance
result = permutation_importance(hgb, X_train_scaled, y_train, n_repeats=10, random_state=42, n_jobs=-1)
importances = result.importances_mean

# Determine the threshold as a quantile (e.g., 75th percentile)
quantile = 0.75
threshold_value = np.quantile(importances, quantile)
print(f"Quantile Threshold Value: {threshold_value}")

# Feature selection based on permutation importance
selected_features_mask = importances >= threshold_value
selected_features = X.columns[selected_features_mask]

print("Selected Features:")
print(selected_features)

# Apply SMOTE to handle class imbalance
X_train_selected = X_train_scaled[:, selected_features_mask]
X_test_selected = X_test_scaled[:, selected_features_mask]

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_selected, y_train)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'max_iter': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [None, 3, 5, 10],
    'min_samples_leaf': [20, 50, 100]
}

grid_search = GridSearchCV(estimator=HistGradientBoostingClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           scoring='accuracy')

grid_search.fit(X_train_smote, y_train_smote)
best_hgb = grid_search.best_estimator_

# Evaluate model with cross-validation
cross_val_scores = cross_val_score(best_hgb, X_train_smote, y_train_smote, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean()}")

# Train the final model
best_hgb.fit(X_train_smote, y_train_smote)
y_pred = best_hgb.predict(X_test_selected)

# Identify rows predicted as inhibitors
# Assume inhibitors are labeled as 1; adjust if necessary
inhibitor_label = 1
inhibitor_predictions = X_test_selected[y_pred == inhibitor_label]

# Map predictions back to original dataframe
inhibitor_indices = np.where(y_pred == inhibitor_label)[0]
inhibitor_rows = df.iloc[X_test.index[inhibitor_indices]]

# Print identified rows
print("Rows predicted as inhibitors:")
print(inhibitor_rows)

# Print results
print("\nHistGradientBoosting with Permutation Importance Selection and SMOTE")
accuracy = accuracy_score(y_test, y_pred)
print("Overall Accuracy:", accuracy)
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))


Quantile Threshold Value: 4.4408920985006264e-17
Selected Features:
Index(['nAcid', 'SpMax_A', 'SpMAD_A', 'VE1_A', 'VR2_A', 'ATS0dv', 'ATS8dv',
       'ATS4s', 'ATS8s', 'ATS0m',
       ...
       'GGI3', 'GGI8', 'JGI2', 'JGI6', 'JGI7', 'JGI8', 'MWC08', 'SRW08',
       'SRW09', 'SRW10'],
      dtype='object', length=370)
Cross-Validation Accuracy Scores: [0.81545064 0.83118741 0.90701001 0.9241774  0.92550143]
Mean Cross-Validation Accuracy: 0.8806653795229369
Rows predicted as inhibitors:
      nAcid  nBase    SpAbs_A   SpMax_A  SpDiam_A     SpAD_A   SpMAD_A  \
325       0      3  42.760646  2.508523  5.015985  42.760646  1.257666   
2480      0      0  36.286579  2.462812  4.918740  36.286579  1.251261   
1862      0      0  39.235488  2.421269  4.816843  39.235488  1.307850   
2374      0      2  37.725101  2.444531  4.883653  37.725101  1.300866   
3356      0      0  38.839018  2.499519  4.851194  38.839018  1.294634   
...     ...    ...        ...       ...       ...        ...  

In [3]:
# Convert selected features to a DataFrame and save as CSV
selected_features_df = pd.DataFrame(selected_features, columns=['Selected_Features'])
selected_features_df.to_csv('selected_features_hg.csv', index=False)

# Save rows predicted as inhibitors to a CSV
inhibitor_rows.to_csv('rows_predicted_as_inhibitors_hg.csv', index=False)


In [4]:
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# Load your dataset (ensure your CSV has the target column labeled, e.g., 'target')
df = pd.read_csv('new_descriptors.csv')

# Separate features (X) and target (y)
X = df.drop(columns=['Target'])  # Replace 'target' with your actual target column name
y = df['Target']  # Target variable

# Split the data into training and test sets (optional, for testing the performance)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the HistGradientBoostingClassifier
hgb = HistGradientBoostingClassifier(random_state=42)
hgb.fit(X_train, y_train)

# Use SelectFromModel with the feature_importances_ after fitting the model
selector = SelectFromModel(hgb, prefit=True, threshold="mean")

# Transform the training and test sets based on selected features
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Get the names of the selected features
selected_feature_names = X_train.columns[selector.get_support()]

# Print the selected feature names
print("Selected Features:", selected_feature_names)

# Optionally: Train a new model on selected features
hgb_selected = HistGradientBoostingClassifier(random_state=42)
hgb_selected.fit(X_train_selected, y_train)

# Evaluate the model performance
accuracy = hgb_selected.score(X_test_selected, y_test)
print(f"Model accuracy with selected features: {accuracy}")


ValueError: when `importance_getter=='auto'`, the underlying estimator HistGradientBoostingClassifier should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.