In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
import math

In [2]:
import argparse
from joblib import dump, load
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem import Descriptors

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

In [4]:
def ecfc_molstring(molecule, radius=3, size=4096):
    """Return the hashed Morgan fingerprint of ``molecule`` as a NumPy array.

    Parameters
    ----------
    molecule
        RDKit molecule handed to ``AllChem.GetHashedMorganFingerprint``.
    radius : int
        Morgan radius (default 3).
    size : int
        Third positional argument of the fingerprint call (default 4096).

    Returns
    -------
    numpy.ndarray
        The integer array filled in by ``DataStructs.ConvertToNumpyArray``.
    """
    fingerprint = AllChem.GetHashedMorganFingerprint(
        molecule, radius, size, useFeatures=False
    )
    # One-element placeholder; ConvertToNumpyArray writes the fingerprint into it.
    counts = np.zeros((1,), dtype=int)
    DataStructs.ConvertToNumpyArray(fingerprint, counts)
    return counts

In [5]:
def number_of_atoms(atom_list, df):
    """Add one ``num_of_<symbol>_atoms`` column to ``df`` per entry of ``atom_list``.

    Each entry is parsed as SMILES and used as a substructure query against
    every molecule in ``df['mol']``; the new column stores the number of
    matches found. ``df`` is modified in place and nothing is returned.

    Parameters
    ----------
    atom_list : iterable of str
        SMILES strings (element symbols in this notebook) to count.
    df : pandas.DataFrame
        Frame with a ``'mol'`` column of RDKit molecules.
    """
    for symbol in atom_list:
        # Parse the query once per symbol instead of once per row.
        pattern = Chem.MolFromSmiles(symbol)
        df['num_of_{}_atoms'.format(symbol)] = df['mol'].apply(
            lambda mol: len(mol.GetSubstructMatches(pattern))
        )

In [7]:
# The file is read without a header row; the two columns are named explicitly.
df = pd.read_csv(
    'train.csv',
    header=None,
    names=['smiles', 'logP'],
)
df

Unnamed: 0,smiles,logP
0,CCCC(=O)OCC(Cc1cncn1C)C(CC)C(=O)OCc1ccccc1,3.78
1,CCOc1ccccc1O,1.68
2,O=[N+]([O-])c1ccc(Oc2ccc(Cl)cc2Cl)cc1,4.64
3,Cc1cccc(C)n1,1.68
4,CC(=O)/C=C/C1C(C)=CCCC1(C)C,3.85
...,...,...
9995,CNC1CCc2c(OC)cccc2C1C,2.42
9996,Nc1ncc(Cc2cccc(Cl)c2Cl)c(N)n1,2.81
9997,c1ccc(N2CCCCC2)cc1,2.98
9998,CCCCCCN(SN(C)C(=O)O/N=C(\C)SC)C(=O)N(C)C,3.30


In [8]:
# Parse every SMILES string and add explicit hydrogens to the resulting molecule.
df['mol'] = df['smiles'].apply(lambda smi: Chem.AddHs(Chem.MolFromSmiles(smi)))

# Atom counts read from the molecule object itself.
df['num_of_atoms'] = df['mol'].apply(lambda mol: mol.GetNumAtoms())
df['num_of_heavy_atoms'] = df['mol'].apply(lambda mol: mol.GetNumHeavyAtoms())

# RDKit descriptor functions are called with the molecule as their only
# argument, so they are handed to ``apply`` directly.
df['tpsa'] = df['mol'].apply(Descriptors.TPSA)
df['mol_w'] = df['mol'].apply(Descriptors.ExactMolWt)
df['num_valence_electrons'] = df['mol'].apply(Descriptors.NumValenceElectrons)
df['num_heteroatoms'] = df['mol'].apply(Descriptors.NumHeteroatoms)
df['num_rings'] = df['mol'].apply(Descriptors.RingCount)
df['Fp_Density_Morgan1'] = df['mol'].apply(Descriptors.FpDensityMorgan1)
df['Fp_Density_Morgan2'] = df['mol'].apply(Descriptors.FpDensityMorgan2)
df['Fp_Density_Morgan3'] = df['mol'].apply(Descriptors.FpDensityMorgan3)

In [9]:
# Element symbols for which a per-molecule atom-count column is generated.
symbols = [
    'C',
    'N',
    'O',
    'F',
    'P',
    'S',
    'Cl',
]

In [10]:
# Adds a num_of_<symbol>_atoms column to df (in place) for every entry in symbols.
number_of_atoms(symbols, df)

In [11]:
# One fingerprint per molecule, stacked row-wise into a 2-D array and wrapped in a frame.
features_test = pd.DataFrame(
    np.array(list(map(ecfc_molstring, df['mol']))),
    index=None,
)

In [12]:
# Convert the integer column labels to strings.
# NOTE(review): presumably done so the labels match the string-named descriptor
# columns they are concatenated with later — confirm.
features_test.columns = features_test.columns.astype(str)

In [14]:
# Display the fingerprint frame (one row per molecule, one column per fingerprint position).
features_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Swap the molecule-object column for the fingerprint columns, joined side by side.
# Re-running this cell on its own fails because 'mol' has already been dropped.
df = pd.concat(
    [df.drop(columns='mol'), features_test],
    axis='columns',
)

In [16]:
# Display the combined frame: SMILES, logP, descriptor columns and fingerprint columns.
df

Unnamed: 0,smiles,logP,num_of_atoms,num_of_heavy_atoms,tpsa,mol_w,num_valence_electrons,num_heteroatoms,num_rings,Fp_Density_Morgan1,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,CCCC(=O)OCC(Cc1cncn1C)C(CC)C(=O)OCc1ccccc1,3.78,58,28,70.42,386.220557,152,6,2,1.321429,...,1,0,0,0,0,1,0,0,0,0
1,CCOc1ccccc1O,1.68,20,10,29.46,138.068080,54,2,1,1.900000,...,0,0,0,0,0,0,0,0,0,0
2,O=[N+]([O-])c1ccc(Oc2ccc(Cl)cc2Cl)cc1,4.64,25,18,52.37,282.980298,92,6,2,1.166667,...,0,0,0,0,0,0,0,0,0,0
3,Cc1cccc(C)n1,1.68,17,8,12.89,107.073499,42,1,1,1.500000,...,0,0,0,0,0,0,0,0,0,0
4,CC(=O)/C=C/C1C(C)=CCCC1(C)C,3.85,34,14,17.07,192.151415,78,1,1,2.000000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,CNC1CCc2c(OC)cccc2C1C,2.42,34,15,21.26,205.146664,82,2,2,1.800000,...,0,0,0,0,0,0,0,0,0,0
9996,Nc1ncc(Cc2cccc(Cl)c2Cl)c(N)n1,2.81,27,17,77.82,268.028252,88,6,2,1.352941,...,0,0,0,0,0,0,0,1,0,0
9997,c1ccc(N2CCCCC2)cc1,2.98,27,12,3.24,161.120449,64,1,2,1.083333,...,1,0,0,0,0,0,0,0,0,0
9998,CCCCCCN(SN(C)C(=O)O/N=C(\C)SC)C(=O)N(C)C,3.30,51,23,65.45,364.160283,134,9,0,1.260870,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Target is the logP column; features are every column except the raw SMILES and the target.
y = df['logP']
X = df.drop(['smiles', 'logP'], axis=1)

In [18]:
# Hold out 10% of the rows for evaluation. random_state is fixed so the split,
# and every RMSE computed from it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [19]:
def evaluation(model, X_test, y_test):
    """Print the root-mean-squared error of ``model`` on the given data.

    Parameters
    ----------
    model
        Any object exposing ``predict(X_test)``.
    X_test
        Feature matrix passed to ``model.predict``.
    y_test
        True target values compared against the predictions.

    The error is printed rounded to four decimals; nothing is returned.
    """
    rmse = math.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    print('RMSE score:', round(rmse, 4))

# CatBoost

In [30]:
# CatBoost regressor: 1500 iterations at learning rate 0.1, loss-guided tree
# growth and an RMSE objective; depth and min_data_in_leaf are passed as None.
cb = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.1,
    depth=None,
    min_data_in_leaf=None,
    grow_policy='Lossguide',
    loss_function='RMSE',
    verbose=False,
)

In [31]:
# Fit on the training split only. X_test is a subset of X, so fitting on (X, y)
# and then scoring on (X_test, y_test) would report a leaked, optimistic RMSE.
# Refit on (X, y) after evaluation if the exported model should use every row.
cb.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x7a9106e34d60>

In [22]:
# Print the RMSE of the CatBoost model on the (X_test, y_test) split.
evaluation(cb, X_test, y_test)

RMSE score: 0.5422


In [32]:
# Persist the fitted CatBoost model to cb.joblib with joblib.
dump(cb, 'cb.joblib')

['cb.joblib']

# RandomForest

In [26]:
# Random forest of 800 trees with no depth limit, trained on all cores.
# random_state is fixed so the fitted forest, its RMSE and the exported model
# are reproducible between runs.
rf = RandomForestRegressor(
    n_jobs=-1,
    criterion='squared_error',
    max_depth=None,
    n_estimators=800,
    verbose=0,
    random_state=42,
)

In [27]:
# Fit the random forest on the training split.
rf.fit(X_train, y_train)

In [28]:
# Print the RMSE of the random forest on the (X_test, y_test) split.
evaluation(rf, X_test, y_test)

RMSE score: 0.6728


In [51]:
# Persist the fitted random forest to rf.joblib with joblib.
dump(rf, 'rf.joblib')

['rf.joblib']

# Linear model

In [24]:
from sklearn.linear_model import Ridge

In [47]:
# Ridge regression with regularization strength alpha=1, using the 'sparse_cg' solver.
ridge = Ridge(alpha=1, solver='sparse_cg')

In [48]:
# Fit the ridge model on the training split.
ridge.fit(X_train, y_train)

In [49]:
# Print the RMSE of the ridge model on the (X_test, y_test) split.
evaluation(ridge, X_test, y_test)

RMSE score: 0.9846


# XGBoost


In [25]:
!pip install xgboost



In [33]:
from xgboost import XGBRegressor

In [34]:
# XGBoost regressor: eta (learning rate) 0.1, loss-guided tree growth,
# RMSE as the evaluation metric, 1000 estimators, logging silenced.
xgb = XGBRegressor(verbosity=0, eta=0.1, grow_policy='lossguide', eval_metric='rmse', n_estimators=1000)

In [28]:
# Fit on the training split so the evaluation that follows uses unseen rows.
xgb.fit(X_train, y_train)

In [29]:
# Print the RMSE of the XGBoost model on the (X_test, y_test) split.
evaluation(xgb, X_test, y_test)

RMSE score: 0.5591


In [35]:
# Refit on every row (training and test rows together) before exporting.
# After this call, scoring xgb on X_test no longer measures held-out error.
xgb.fit(X, y)

In [36]:
# Persist the fitted XGBoost model to xgb.joblib with joblib.
dump(xgb, 'xgb.joblib')

['xgb.joblib']

# Stacking

In [71]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV

In [75]:
# Base learners for the stack, as (name, estimator) pairs.
# NOTE(review): StackingRegressor is expected to clone and refit these, so the
# fitted state from the earlier cells should not carry over — confirm against
# the scikit-learn documentation.
estimators = [
              ('Regression', ridge),
              ('Catboost', cb),
              ('RandomForest', rf),
              ('XGBoost', xgb)
              ]

In [76]:
# Stack the base learners with a RidgeCV final estimator, using 5-fold
# cross-validation (cv=5) and all available cores (n_jobs=-1).
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=RidgeCV(), n_jobs=-1, cv=5)

In [77]:
# Fit on the training split only. X_test is a subset of X, so fitting on (X, y)
# and then scoring on (X_test, y_test) would report a leaked, optimistic RMSE.
# Refit on (X, y) after evaluation if the exported model should use every row.
stacking_regressor.fit(X_train, y_train)

In [None]:
# Print the RMSE of the stacked model on the (X_test, y_test) split.
evaluation(stacking_regressor, X_test, y_test)

In [None]:
# Persist the fitted stacking model to stacking.joblib with joblib.
dump(stacking_regressor, 'stacking.joblib')