In [1]:
conda install -c rdkit rdkit scikit-learn -y

Channels:
 - rdkit
 - defaults
 - conda-forge
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [3]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Read the CSV file into a DataFrame
df = pd.read_csv('train.csv')
mols = [Chem.MolFromSmiles(mol) for mol in df['SMILES_canonical']]
X = np.array([[d[1](m) for d in Descriptors._descList] for m in mols])
Y = np.array(df['target_feature'])

#splitting data in training and testing data
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2)

#Scale data
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_val = ss.transform(X_val)


## PCA 


In [4]:
from sklearn.decomposition import PCA

X_scale = ss.fit_transform(X)
pca = PCA(0.80)
pca.fit_transform(X_scale)
print(len(pca.explained_variance_ratio_))

43


## Random forest model

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score

#Random forrest algorithm
forest = RandomForestClassifier(max_depth=9)
forest.fit(X_train, y_train)

#Predict y values from X_val and produce balanced accuracy
y_pred_forest = forest.predict(X_val)
print('balanced accuracy:',balanced_accuracy_score(y_val, y_pred_forest),'\n', 'precision:',precision_score(y_val, y_pred_forest), '\n', 'recall:', recall_score(y_val, y_pred_forest))


balanced accuracy: 0.9170769230769231 
 precision: 0.9457013574660633 
 recall: 0.8461538461538461


## Random forest feature importance

In [112]:
forest_importances = forest.feature_importances_

#Create dictionary based on feature importance
importance_dict_forest = {}

for i, forest_importance in enumerate(forest_importances):
    feature_name = Descriptors._descList[i][0]
    importance_dict_forest[feature_name] = float(forest_importance)

#Sort dictionary from highest importance to lowest
sorted_importance_forest = dict(sorted(importance_dict_forest.items(), key=lambda item: item[1], reverse=True))

pprint(sorted_importance_forest, sort_dicts=False)

{'EState_VSA4': 0.08100879678791922,
 'BalabanJ': 0.0686459216951855,
 'TPSA': 0.039359002174653185,
 'PEOE_VSA8': 0.03546246379963587,
 'BCUT2D_CHGLO': 0.033869288135120273,
 'EState_VSA8': 0.032138674655223715,
 'MaxPartialCharge': 0.02841878052414004,
 'SMR_VSA6': 0.02833989651759313,
 'BCUT2D_MRLOW': 0.023290819951284954,
 'MinAbsPartialCharge': 0.022986676104748992,
 'fr_piperzine': 0.022795870475926617,
 'MinEStateIndex': 0.020134930786554987,
 'BCUT2D_LOGPLOW': 0.019947152591410503,
 'SPS': 0.019844196943378873,
 'EState_VSA2': 0.01944001170128206,
 'Chi4n': 0.016799880129717223,
 'PEOE_VSA12': 0.01274781683879838,
 'SlogP_VSA2': 0.012571761017275094,
 'EState_VSA10': 0.011647088425515118,
 'VSA_EState2': 0.01150972136814669,
 'VSA_EState8': 0.011470758475195766,
 'Chi3n': 0.009751365467577317,
 'fr_C_O': 0.00961849448489182,
 'MaxAbsEStateIndex': 0.009441975816957347,
 'FractionCSP3': 0.00923789721071165,
 'Chi1n': 0.008685309867916496,
 'MinAbsEStateIndex': 0.00822185030405326

## Logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score

#Logistic regression algorithm
reg = LogisticRegression(penalty='l2')
reg.fit(X_train, y_train)

#Predict y values from X_val and produce balanced accuracy 
y_pred_reg = reg.predict(X_val)
print('balanced accuracy:',balanced_accuracy_score(y_val, y_pred_reg),'\n', 'precision:',precision_score(y_val, y_pred_reg), '\n', 'recall:', recall_score(y_val, y_pred_reg))

0
balanced accuracy: 0.926746963562753 
 precision: 0.9113924050632911 
 recall: 0.8744939271255061


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Regression feature importance

In [115]:
from sklearn.linear_model import LogisticRegression

reg = LogisticRegression()
reg.fit(X_train, y_train)

reg_importances = reg.coef_[0]
importance_dict_reg = {}

for i in range(len((reg_importances))):
    feature_name = Descriptors._descList[i][0]
    importance_dict_reg[feature_name] = reg_importances[i]

#Sort dictionary from highest importance to lowest
sorted_importance_reg = dict(sorted(importance_dict_reg.items(), key=lambda item: item[1], reverse=True))

pprint(sorted_importance_reg, sort_dicts=False)


{'Chi4n': 2.214086194891956,
 'SlogP_VSA2': 1.1178187854801565,
 'SlogP_VSA7': 1.0600018424540236,
 'fr_phenol': 0.9170814962305223,
 'VSA_EState1': 0.9112087703054336,
 'fr_Ar_OH': 0.8023930502639474,
 'BCUT2D_MRHI': 0.790215794420888,
 'HallKierAlpha': 0.7532774717865001,
 'EState_VSA6': 0.7515011447144464,
 'SlogP_VSA10': 0.7414209085278457,
 'FpDensityMorgan2': 0.6536489279768123,
 'NumHAcceptors': 0.5918109771231861,
 'SMR_VSA2': 0.5624694230336,
 'SlogP_VSA1': 0.5141337990558298,
 'NumAromaticCarbocycles': 0.5110718351634533,
 'HeavyAtomMolWt': 0.5093905259535129,
 'Chi1v': 0.4408105423582465,
 'EState_VSA4': 0.43953334256389986,
 'PEOE_VSA7': 0.4338580822581422,
 'PEOE_VSA8': 0.4300213556732536,
 'ExactMolWt': 0.4231086104266787,
 'NumRotatableBonds': 0.41915639072956346,
 'MolWt': 0.4058458094646085,
 'SMR_VSA1': 0.40154294386924383,
 'SMR_VSA10': 0.4003338617464615,
 'BCUT2D_LOGPHI': 0.39271739194123156,
 'EState_VSA7': 0.3786376828706805,
 'fr_ester': 0.3769613685780716,
 'SM

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
