In [101]:
!pip install rdkit rdkit;



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [102]:
!pip install git+https://github.com/samoturk/mol2vec

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/samoturk/mol2vec
  Cloning https://github.com/samoturk/mol2vec to /tmp/pip-req-build-nlye6_9w
  Running command git clone -q https://github.com/samoturk/mol2vec /tmp/pip-req-build-nlye6_9w


In [105]:
# import required libraries
from rdkit import Chem
from mol2vec.features import mol2alt_sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec
from rdkit.Chem import Descriptors
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from numpy import mean
from numpy import std
from sklearn.model_selection import KFold

In [106]:
train_df = pd.read_csv('Smiles.csv')
test_df = pd.read_csv("test.csv")

In [107]:
#checking for null values and dropping them
NoValue_train = train_df[ (train_df['Smiles'] == 'Did not work') ].index
train_df.drop(NoValue_train , inplace=True)

NoValues_test = test_df[ (test_df['Smiles'] == 'Did not work') ].index
test_df.drop(NoValues_test , inplace=True)

In [108]:
print("creating mol from smiles")
train_df['mol'] = train_df['Smiles'].apply(lambda x: Chem.MolFromSmiles(x)) 
test_df['mol'] = test_df['Smiles'].apply(lambda x: Chem.MolFromSmiles(x)) 

creating mol from smiles


In [109]:
#checking for null
train_df['mol'].isna().sum()


5

In [110]:
#checking for null
test_df['mol'].isna().sum()

4

In [111]:
train_df['mol'].replace('', np.nan, inplace=True)
test_df['mol'].replace('', np.nan, inplace=True)

In [112]:
train_df.dropna(subset=['mol'], inplace=True)
test_df.dropna(subset=['mol'], inplace=True)

In [113]:
print("generating sentence from molecules  ")

train_df['sentence'] = train_df.apply(lambda x: MolSentence(mol2alt_sentence(x['mol'], radius=1)), axis=1)

test_df['sentence'] = test_df.apply(lambda x: MolSentence(mol2alt_sentence(x['mol'], radius=1)), axis=1)

generating sentence from molecules  


In [114]:
#Loading pre-trained word2vec
from gensim.models import word2vec
model = word2vec.Word2Vec.load('model_300dim.pkl')

In [115]:
from pandas.core.algorithms import mode
print("Create embedding from sentence")
train_df['embedding'] = [DfVec(x) for x in sentences2vec(train_df['sentence'], model, unseen ='UNK')]
test_df['embedding'] = [DfVec(x) for x in sentences2vec(test_df['sentence'], model, unseen ='UNK')]

Create embedding from sentence


In [116]:
X_train = np.array([x.vec for x in train_df['embedding']])
X_test_final = np.array([x.vec for x in test_df['embedding']]) 

In [117]:

Y_train = train_df["Label"]

X_train.shape, Y_train.shape, X_test_final.shape

((1686, 100), (1686,), (5846, 100))

In [120]:
# Using ADASYN generating synthetic data
from imblearn.over_sampling import ADASYN
from collections import Counter
counter = Counter(Y_train)
print("before ADASYN",counter)

ada = ADASYN(random_state=150)
X_train_ada,y_train_ada = ada.fit_resample(X_train, Y_train)
counter = Counter(y_train_ada)
print("after ADASYN",counter)

before ADASYN Counter({0: 1533, 1: 153})
after ADASYN Counter({1: 1543, 0: 1533})


In [121]:
#sacling 
X_train_ada = StandardScaler().fit_transform(X_train_ada)
X_test_final = StandardScaler().fit_transform(X_test_final)


In [122]:
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
model_LR =  LogisticRegression(C=10, random_state=22)
model_RF= RandomForestClassifier(max_depth=2, random_state=22)
model_SVM = LinearSVC(random_state=12)
# evaluate model
metrics = ['accuracy', 'f1']
scores_LR = cross_validate(model_LR, X_train_ada, y_train_ada, scoring= metrics, cv=cv, n_jobs=-1)
scores_RF = cross_validate(model_RF, X_train_ada, y_train_ada, scoring=metrics , cv=cv, n_jobs=-1)
Scores_SVM = cross_validate(model_SVM, X_train_ada, y_train_ada, scoring=metrics, cv=cv, n_jobs=-1)

#calculating accuracy and F-1 Score
LR_acc = scores_LR['test_accuracy']
LRF1_scores = scores_LR['test_f1']
RF_acc = scores_RF['test_accuracy']
RFF1_scores = scores_RF['test_f1']
SVC_acc = Scores_SVM['test_accuracy']
SVCF1_scores = Scores_SVM['test_f1']

print("Mean LR accuracy %0.3f " % (LR_acc.mean() * 100))
print("Mean  LR F-1 Score %0.3f" % (LRF1_scores.mean()))

print("Mean RF accuracy %0.3f" % (RF_acc.mean() * 100))
print("Mean RF F-1 Score %0.3f" % (RFF1_scores.mean()))

print("Mean SVM accuracy %0.3f" % (SVC_acc.mean() * 100))
print("Mean SVM F-1 Score %0.3f " % (SVCF1_scores.mean()))


Mean LR accuracy 81.665 
Mean  LR F-1 Score 0.821
Mean RF accuracy 71.780
Mean RF F-1 Score 0.708
Mean SVM accuracy 81.599
Mean SVM F-1 Score 0.820 
