In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# Step 1: Load your dataset
df = pd.read_csv('new_descriptors.csv')  # Replace 'your_file.csv' with your actual file name
X = df.drop('Target', axis=1)  # Replace 'target_column' with the name of your target column
y = df['Target']

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Step 4: Initialize the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 5: Apply SelectFromModel for feature selection
selector = SelectFromModel(estimator=rf)
selector.fit(X_train_smote, y_train_smote)
X_train_selected = selector.transform(X_train_smote)
X_test_selected = selector.transform(X_test)

# Step 6: Train the Random Forest model on the selected features
rf.fit(X_train_selected, y_train_smote)
y_pred = rf.predict(X_test_selected)

# Step 7: Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Step 8: Perform cross-validation to evaluate the model
cross_val_scores = cross_val_score(rf, X_train_selected, y_train_smote, cv=5)
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")


[WinError 2] The system cannot find the file specified
  File "c:\Users\bdeva\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\bdeva\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\bdeva\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\bdeva\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Model Accuracy: 0.8002
Cross-Validation Scores: [0.79971388 0.81831187 0.89270386 0.89985694 0.91977077]
Mean Cross-Validation Accuracy: 0.8661


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report, precision_score, f1_score
from imblearn.over_sampling import SMOTE

# Load your data
df = pd.read_csv('new_descriptors.csv')

# Extract features and target
X = df.drop(columns=['Target'])
y = df['Target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Feature Importance based selection
rf.fit(X_train_scaled, y_train)
importances = rf.feature_importances_

# Determine the threshold as a quantile (e.g., 75th percentile)
quantile = 0.75
threshold_value = np.quantile(importances, quantile)
print(f"Quantile Threshold Value: {threshold_value}")

# Feature selection with SelectFromModel using the quantile threshold
selector = SelectFromModel(rf, threshold=threshold_value, prefit=True)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Get the mask of selected features
selected_features_mask = selector.get_support()
selected_features = X.columns[selected_features_mask]

print("Selected Features:")
print(selected_features)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_selected, y_train)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           scoring='accuracy')

grid_search.fit(X_train_smote, y_train_smote)
best_rf = grid_search.best_estimator_

# Evaluate model with cross-validation
cross_val_scores = cross_val_score(best_rf, X_train_smote, y_train_smote, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean()}")

# Train the final model
best_rf.fit(X_train_smote, y_train_smote)
y_pred = best_rf.predict(X_test_selected)

# Identify rows predicted as inhibitors
# Assume inhibitors are labeled as 1; adjust if necessary
inhibitor_label = 1
inhibitor_predictions = X_test_selected[y_pred == inhibitor_label]

# Map predictions back to original dataframe
inhibitor_indices = np.where(y_pred == inhibitor_label)[0]
inhibitor_rows = df.iloc[X_test.index[inhibitor_indices]]

# Print identified rows
print("Rows predicted as inhibitors:")
print(inhibitor_rows)

# Print results
print("\nRandom Forest with Feature Importance Selection and SMOTE")
accuracy = accuracy_score(y_test, y_pred)
print("Overall Accuracy:", accuracy)
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Quantile Threshold Value: 0.0009058010038792962
Selected Features:
Index(['nAcid', 'VE1_A', 'VE2_A', 'nF', 'ATS6dv', 'ATS8dv', 'ATS3d', 'ATS2s',
       'ATS5s', 'ATS6s',
       ...
       'piPC9', 'TpiPC10', 'bpol', 'nG12FaHRing', 'TopoPSA(NO)', 'TopoPSA',
       'GGI3', 'GGI4', 'GGI6', 'GGI10'],
      dtype='object', length=363)


[WinError 2] The system cannot find the file specified
  File "c:\Users\bdeva\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\bdeva\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\bdeva\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\bdeva\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Cross-Validation Accuracy Scores: [0.83834049 0.85121602 0.87982833 0.89699571 0.89398281]
Mean Cross-Validation Accuracy: 0.8720726703313371
Rows predicted as inhibitors:
      nAcid  nBase    SpAbs_A   SpMax_A  SpDiam_A     SpAD_A   SpMAD_A  \
325       0      3  42.760646  2.508523  5.015985  42.760646  1.257666   
2480      0      0  36.286579  2.462812  4.918740  36.286579  1.251261   
1862      0      0  39.235488  2.421269  4.816843  39.235488  1.307850   
1637      0      1  56.541246  2.845339  5.459661  56.541246  1.229158   
2374      0      2  37.725101  2.444531  4.883653  37.725101  1.300866   
...     ...    ...        ...       ...       ...        ...       ...   
1647      1      0  41.585356  2.511985  5.023970  41.585356  1.223099   
1949      0      0  43.175258  2.509731  4.936039  43.175258  1.308341   
2271      0      0  41.999240  2.633193  5.244116  41.999240  1.235272   
2965      0      0  36.921710  2.578320  5.156639  36.921710  1.273162   
944       0   

In [2]:
# Convert selected features to a DataFrame and save as CSV
selected_features_df = pd.DataFrame(selected_features, columns=['Selected_Features'])
selected_features_df.to_csv('selected_features_rf.csv', index=False)

# Save rows predicted as inhibitors to a CSV
inhibitor_rows.to_csv('rows_predicted_as_inhibitors_rf.csv', index=False)
