In [1]:
!pip install gensim



In [2]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 30.5/30.5 MB 48.1 MB/s eta 0:00:00
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.2


In [None]:
!pip install git+https://github.com/samoturk/mol2vec

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [6]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import requests
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec
from gensim.models import word2vec

In [7]:
# Read the training table without a header row and name its two columns here.
df = pd.read_csv(
    '/content/drive/MyDrive/NTO/train.csv',
    header=None,
    names=['smiles', 'logP'],
)
df.head()

Unnamed: 0,smiles,logP
0,CCCC(=O)OCC(Cc1cncn1C)C(CC)C(=O)OCc1ccccc1,3.78
1,CCOc1ccccc1O,1.68
2,O=[N+]([O-])c1ccc(Oc2ccc(Cl)cc2Cl)cc1,4.64
3,Cc1cccc(C)n1,1.68
4,CC(=O)/C=C/C1C(C)=CCCC1(C)C,3.85


In [21]:
# NOTE(review): this cell carries execution count 21, and its saved output already
# lists the feature columns that are created by the cells below. On a fresh
# top-to-bottom run, df holds only 'smiles' and 'logP' at this point.
df.head()

Unnamed: 0,smiles,logP,mol,num_of_atoms,num_of_heavy_atoms,num_of_C_atoms,num_of_O_atoms,num_of_N_atoms,num_of_Cl_atoms,num_of_P_atoms,num_of_Br_atoms,num_of_F_atoms,tpsa,mol_w,num_valence_electrons,num_heteroatoms,num_rings,sentence,mol2vec
0,CCCC(=O)OCC(Cc1cncn1C)C(CC)C(=O)OCc1ccccc1,3.78,<rdkit.Chem.rdchem.Mol object at 0x7b93d2a83ed0>,58,28,22,4,2,0,0,0,0,70.42,386.220557,152,6,2,"(2246728737, 245196591, 2245384272, 2395932594...","(100,) dimensional vector"
1,CCOc1ccccc1O,1.68,<rdkit.Chem.rdchem.Mol object at 0x7b93d2a83f40>,20,10,8,2,0,0,0,0,0,29.46,138.06808,54,2,1,"(2246728737, 245196591, 2245384272, 219572169,...","(100,) dimensional vector"
2,O=[N+]([O-])c1ccc(Oc2ccc(Cl)cc2Cl)cc1,4.64,<rdkit.Chem.rdchem.Mol object at 0x7b93d2a8c040>,25,18,12,3,1,2,0,0,0,52.37,282.980298,92,6,2,"(864942730, 2378779377, 848127915, 271903915, ...","(100,) dimensional vector"
3,Cc1cccc(C)n1,1.68,<rdkit.Chem.rdchem.Mol object at 0x7b93d2a8c0b0>,17,8,7,0,1,0,0,0,0,12.89,107.073499,42,1,1,"(2246728737, 447750523, 3217380708, 4036277955...","(100,) dimensional vector"
4,CC(=O)/C=C/C1C(C)=CCCC1(C)C,3.85,<rdkit.Chem.rdchem.Mol object at 0x7b93d2a8c120>,34,14,13,1,0,0,0,0,0,17.07,192.151415,78,1,1,"(2246728737, 2113675549, 2246699815, 385453016...","(100,) dimensional vector"


In [8]:
# Parse every SMILES string into an RDKit molecule object.
df['mol'] = df['smiles'].apply(Chem.MolFromSmiles)

# MolFromSmiles returns None for a string it cannot parse. Stop here and name the
# offending rows instead of failing later with an opaque error on a None molecule.
unparsed_smiles = df.loc[df['mol'].isna(), 'smiles']
if not unparsed_smiles.empty:
    raise ValueError(
        'Could not parse {} SMILES string(s), e.g.: {}'.format(
            len(unparsed_smiles), unparsed_smiles.head().tolist()
        )
    )

In [9]:
# Replace each molecule with the result of Chem.AddHs before counting atoms.
df['mol'] = df['mol'].apply(Chem.AddHs)

# Two size features: total atom count and heavy-atom count.
df['num_of_atoms'] = df['mol'].apply(lambda mol: mol.GetNumAtoms())
df['num_of_heavy_atoms'] = df['mol'].apply(lambda mol: mol.GetNumHeavyAtoms())

In [10]:
def number_of_atoms(atom_list, df):
    """Add one ``num_of_<symbol>_atoms`` column to ``df`` per entry of ``atom_list``.

    Each entry is parsed as a SMILES string with ``Chem.MolFromSmiles`` and used
    as a substructure query; the new column holds, for every molecule in
    ``df['mol']``, the number of matches returned by ``GetSubstructMatches``.

    Parameters
    ----------
    atom_list : iterable of str
        Element symbols (SMILES fragments) to count, e.g. ``['C', 'O']``.
    df : pandas.DataFrame
        Frame with a ``'mol'`` column of molecule objects. Modified in place.

    Raises
    ------
    ValueError
        If an entry of ``atom_list`` cannot be parsed into a query molecule.
    """
    for symbol in atom_list:
        # Build the query once per element; it does not depend on the row.
        pattern = Chem.MolFromSmiles(symbol)
        if pattern is None:
            raise ValueError('Cannot build a query molecule from {!r}'.format(symbol))
        df['num_of_{}_atoms'.format(symbol)] = df['mol'].apply(
            lambda mol: len(mol.GetSubstructMatches(pattern))
        )

In [11]:
# Add a per-element count column to df for each symbol listed here.
number_of_atoms(
    ['C', 'O', 'N', 'Cl', 'P', 'Br', 'F'],
    df,
)

In [12]:
# Whole-molecule RDKit descriptors: one new df column per descriptor function.
descriptor_columns = {
    'tpsa': Descriptors.TPSA,
    'mol_w': Descriptors.ExactMolWt,
    'num_valence_electrons': Descriptors.NumValenceElectrons,
    'num_heteroatoms': Descriptors.NumHeteroatoms,
}
for column_name, descriptor in descriptor_columns.items():
    df[column_name] = df['mol'].apply(descriptor)

In [13]:
# Ring count per molecule, as reported by RDKit's RingCount descriptor.
df['num_rings'] = df['mol'].apply(Descriptors.RingCount)

In [14]:
# Fetch the pretrained mol2vec model file and save it to the working directory.
url = 'https://github.com/samoturk/mol2vec_notebooks/raw/master/Notebooks/model_300dim.pkl'
file_to_download = requests.get(url, allow_redirects=True)
# Fail here on an HTTP error instead of saving an error page that would only
# break later, when the file is loaded as a model.
file_to_download.raise_for_status()

# The context manager closes (and flushes) the file before the next cell reads it.
with open('model_300dim.pkl', 'wb') as model_file:
    bytes_written = model_file.write(file_to_download.content)

# Keep the number of bytes written as the cell's displayed result.
bytes_written

26567327

In [15]:
# Load the model file saved by the download cell into a gensim Word2Vec object.
# NOTE(review): the file is a .pkl fetched over the network; loading presumably
# unpickles it, so only use a source you trust — confirm against gensim's docs.
w2vec_model = word2vec.Word2Vec.load('model_300dim.pkl')

In [19]:
def sentences2vec_new(sentences, model, unseen=None):
    """Turn each sentence into a single vector by summing its word vectors.

    Parameters
    ----------
    sentences : iterable of iterables of str
        One inner iterable of vocabulary tokens per sentence.
    model : object
        Model exposing ``wv.key_to_index``, ``wv.get_vector`` and
        ``wv.vector_size`` (the gensim 4 ``Word2Vec`` interface).
    unseen : str, optional
        Vocabulary key whose vector stands in for every token that is not in
        the model's vocabulary. When falsy, such tokens are skipped instead.

    Returns
    -------
    numpy.ndarray
        One row per sentence. A sentence that contributes no vectors (empty, or
        all tokens skipped) yields a zero row, so the result stays rectangular.
    """
    wv = model.wv
    keys = set(wv.key_to_index)
    unseen_vec = wv.get_vector(unseen) if unseen else None

    vec = []
    for sentence in sentences:
        # Membership is tested against the vocabulary set directly; there is no
        # need to rebuild a per-sentence set for every token.
        if unseen:
            word_vectors = [wv.get_vector(word) if word in keys else unseen_vec
                            for word in sentence]
        else:
            word_vectors = [wv.get_vector(word) for word in sentence if word in keys]

        if word_vectors:
            vec.append(sum(word_vectors))
        else:
            # sum([]) would be the int 0 and make the stacked result ragged.
            # float32 is assumed to match the model's vectors — TODO confirm.
            vec.append(np.zeros(wv.vector_size, dtype=np.float32))
    return np.array(vec)

In [16]:
def _row_to_sentence(row):
    """Wrap the mol2alt_sentence output for ``row['mol']`` in a MolSentence.

    The literal 1 is passed as the second argument of mol2alt_sentence
    (presumably the substructure radius — confirm against the mol2vec docs).
    """
    return MolSentence(mol2alt_sentence(row['mol'], 1))


df['sentence'] = df.apply(_row_to_sentence, axis=1)

In [20]:
# Sum the model's vectors over each sentence ('UNK' is passed as the stand-in key
# for unseen tokens), then wrap every resulting row in a DfVec.
sentence_vectors = sentences2vec_new(df['sentence'], w2vec_model, unseen='UNK')
df['mol2vec'] = list(map(DfVec, sentence_vectors))

In [22]:
# Unwrap the DfVec objects and stack their .vec arrays into a DataFrame, one row
# per molecule.
embedding_rows = [wrapped.vec for wrapped in df['mol2vec']]
X = pd.DataFrame(np.array(embedding_rows))

In [24]:
# Descriptor columns: everything in df except the target and the raw or
# intermediate object columns.
descriptor_features = df.drop(columns=["mol2vec", "sentence", "smiles", "mol", "logP"])

# Drop any descriptor columns that an earlier run of this cell already appended
# to X, so that re-running the cell does not duplicate them.
X = pd.concat(
    (X.drop(columns=descriptor_features.columns, errors='ignore'), descriptor_features),
    axis=1,
)

In [33]:
# Convert every column label of X to a string.
X.columns = X.columns.map(str)

In [25]:
# Regression target: the logP column as a plain NumPy array.
y = df['logP'].to_numpy()

In [34]:
# Display the assembled feature matrix (embedding columns followed by the
# descriptor columns).
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,num_of_N_atoms,num_of_Cl_atoms,num_of_P_atoms,num_of_Br_atoms,num_of_F_atoms,tpsa,mol_w,num_valence_electrons,num_heteroatoms,num_rings
0,-10.073109,9.706021,-18.493412,2.311213,-4.283033,-18.438519,17.174843,-4.850701,-18.096525,-42.489532,...,2,0,0,0,0,70.42,386.220557,152,6,2
1,-4.916669,5.189954,-6.624788,-0.833421,-2.796013,-4.197730,5.200408,-2.395915,-6.224814,-15.157603,...,0,0,0,0,0,29.46,138.068080,54,2,1
2,-4.958972,8.621200,-6.275642,1.453087,-3.005895,-3.968354,0.400630,-2.572672,-4.666282,-18.916910,...,1,2,0,0,0,52.37,282.980298,92,6,2
3,-3.228622,3.541736,-6.030561,0.328273,-1.985676,-3.974290,5.883745,-1.597206,-6.042399,-14.112975,...,1,0,0,0,0,12.89,107.073499,42,1,1
4,-9.828001,7.613701,-12.206541,0.232475,-3.548656,-10.543606,14.241396,-3.158611,-10.459246,-23.485481,...,0,0,0,0,0,17.07,192.151415,78,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-8.549610,8.040452,-10.950447,-1.120951,-4.784032,-8.410338,12.122510,-3.016298,-10.807934,-24.831055,...,1,0,0,0,0,21.26,205.146664,82,2,2
9996,-4.652695,4.362294,-5.307521,-0.670224,-7.070165,-5.641707,6.223618,-2.636048,-6.477015,-23.217495,...,4,2,0,0,0,77.82,268.028252,88,6,2
9997,-7.974234,5.791468,-8.786148,-1.181761,-3.995589,-6.579283,9.954315,-0.852773,-8.072239,-21.455757,...,1,0,0,0,0,3.24,161.120449,64,1,2
9998,-10.806989,7.436241,-11.483689,1.968850,0.192227,-20.346392,17.583517,-10.276894,-16.361986,-41.849747,...,4,0,0,0,0,65.45,364.160283,134,9,0


In [27]:
# Display the target array taken from the 'logP' column.
y

array([3.78, 1.68, 4.64, ..., 2.98, 3.3 , 3.76])

In [45]:
# First split: hold out 20% of the rows from training.
X_train, X_remain, y_train, y_remain = train_test_split(
    X, y, test_size=0.2, random_state=23
)
# Second split: divide the held-out rows evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_remain, y_remain, test_size=0.5, random_state=24
)

In [46]:
# Fit a RidgeCV model with its default settings on the training split.
ridge_model = RidgeCV()
ridge_model.fit(X=X_train, y=y_train)

In [47]:
# Predict on the test split and score the predictions against y_test.
prediction = ridge_model.predict(X_test).squeeze()
mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
mse = mean_squared_error(y_true=y_test, y_pred=prediction)

In [48]:
# Display the test-split errors as (mean absolute error, mean squared error).
mae, mse

(0.6503952586284986, 0.7235364113661792)