### Imports

In [1]:
import numpy as np
import pandas as pd

from fingerprints.get_fp import output, input
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit import Chem

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV

import matplotlib as mpl
import matplotlib.pyplot as plt
from plotnine import *

In [2]:
# Read the three input tables, in order: odorants, labels, solubility.
odorants, labels, solubility = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/odorants.csv',
        'data/labels.csv',
        'data/solubility.csv',
    )
)

# Simple Random Forest (most common label only)

In [3]:
# Rank the label columns (the trailing 113 columns of `labels`) by their
# column sum, largest first.
label_sums = (
    labels.iloc[:, -113:]
    .sum()
    .sort_values(ascending=False)
)

# Keep only the top-ranked label as the prediction target.
most_common_labels = label_sums.index[:1]

# Columns to retain: the target label plus the two molecule identifier columns.
most_common_labels_list = most_common_labels.to_list() + ['IsomericSMILES', 'CID']

# Column selection only -- every row of `labels` is kept, including molecules
# that do not carry the selected label.
filtered_labels = labels[most_common_labels_list]



In [4]:
# Build one Morgan fingerprint bit vector (radius 2, `bitlength` bits -- an
# ECFP4-style fingerprint) per molecule. Row i of `fingerprints` corresponds to
# row i of `filtered_labels`.
bitlength = 1024
nmolecules = len(filtered_labels)
fingerprints = np.zeros([nmolecules, bitlength])  # one row per molecule, one 0/1 column per bit

for row_idx, smiles in enumerate(filtered_labels['IsomericSMILES']):
    mol = Chem.MolFromSmiles(smiles)  # parse the SMILES string into an RDKit molecule
    if mol is None:
        # MolFromSmiles signals an unparsable string by returning None; stop with
        # a message that identifies the offending entry instead of failing later
        # inside the fingerprint call.
        raise ValueError(
            f"Could not parse SMILES at position {row_idx}: {smiles!r}"
        )
    # nBits follows `bitlength` so the vector length always matches the matrix width.
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=bitlength)
    fingerprints[row_idx] = np.array(bit_vector)

# Targets: everything in `filtered_labels` except the identifier columns.
target_labels = np.array(filtered_labels.drop(["IsomericSMILES", "CID"], axis=1))



In [5]:
# Sanity check: feature matrix and target array shapes, one per line.
print(fingerprints.shape, target_labels.shape, sep='\n')

(3487, 1024)
(3487, 1)


In [6]:
# Features are the fingerprint matrix; the single-column target is flattened to 1-D.
x, y = fingerprints, target_labels.ravel()

# Hold out 20% of the molecules for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
# Base estimator with a fixed seed so repeated runs build the same trees.
# The target here is a single label column, so the estimator is used directly
# rather than being wrapped in a MultiOutputClassifier.
forest = RandomForestClassifier(
    random_state=42,
)

In [8]:
# Candidate forest sizes for the hyper-parameter search.
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500]}

# 5-fold cross-validated search over the grid, ranked by accuracy.
# fit() returns the fitted search object, so the two steps are chained.
grid_search = GridSearchCV(
    forest,
    param_grid,
    cv=5,
    scoring='accuracy',
).fit(x_train, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best number of trees:", grid_search.best_params_)
print("Best cross-validated accuracy:", grid_search.best_score_)

# Estimator with the best parameters, as exposed by the fitted search.
best_clf = grid_search.best_estimator_

# Evaluate on the held-out test split.
y_pred = best_clf.predict(x_test)

# Single-label evaluation: overall accuracy plus per-class precision/recall/F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Best number of trees: {'n_estimators': 500}
Best cross-validated accuracy: 0.7759045835665977
Accuracy: 0.7893982808022922
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       435
           1       0.74      0.68      0.71       263

    accuracy                           0.79       698
   macro avg       0.78      0.77      0.77       698
weighted avg       0.79      0.79      0.79       698

