In [85]:
!pip install rdkit rdkit;



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [86]:
!pip install git+https://github.com/samoturk/mol2vec

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/samoturk/mol2vec
  Cloning https://github.com/samoturk/mol2vec to /tmp/pip-req-build-ryob9m13
  Running command git clone -q https://github.com/samoturk/mol2vec /tmp/pip-req-build-ryob9m13


In [87]:
# import rdkit/mol2vec/word2vec 
from rdkit import Chem
from mol2vec.features import mol2alt_sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec
from rdkit import Chem
from rdkit.Chem import Descriptors

# import numpy/pandas 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [88]:
train_df = pd.read_csv('Smiles.csv')
test_df = pd.read_csv("test.csv")

In [89]:
#checking for null values and dropping them
NoValue_train = train_df[ (train_df['Smiles'] == 'Did not work') ].index
train_df.drop(NoValue_train , inplace=True)

NoValues_test = test_df[ (test_df['Smiles'] == 'Did not work') ].index
test_df.drop(NoValues_test , inplace=True)

In [90]:
print("creating mol from smiles")
train_df['mol'] = train_df['Smiles'].apply(lambda x: Chem.MolFromSmiles(x))
test_df['mol'] = test_df['Smiles'].apply(lambda x: Chem.MolFromSmiles(x)) 

creating mol from smiles


In [91]:
#checking for null
train_df['mol'].isna().sum()


5

In [92]:

#checking for null
test_df['mol'].isna().sum()

4

In [93]:
train_df['mol'].replace('', np.nan, inplace=True)
test_df['mol'].replace('', np.nan, inplace=True)

In [94]:
train_df.dropna(subset=['mol'], inplace=True)
test_df.dropna(subset=['mol'], inplace=True)

In [95]:
print("generating sentence from molecules structure ")

train_df['sentence'] = train_df.apply(lambda x: MolSentence(mol2alt_sentence(x['mol'], radius=1)), axis=1)

test_df['sentence'] = test_df.apply(lambda x: MolSentence(mol2alt_sentence(x['mol'], radius=1)), axis=1)

generating sentence from molecules structure 


In [96]:
#Loading pre-trained model via word2vec
from gensim.models import word2vec
model = word2vec.Word2Vec.load('model_300dim.pkl')

In [97]:
from pandas.core.algorithms import mode
print("Create embedding from sentence")
train_df['embedding'] = [DfVec(x) for x in sentences2vec(train_df['sentence'], model, unseen ='UNK')]
test_df['embedding'] = [DfVec(x) for x in sentences2vec(test_df['sentence'], model, unseen ='UNK')]

Create embedding from sentence


In [98]:
X_train = np.array([x.vec for x in train_df['embedding']])
X_test_final = np.array([x.vec for x in test_df['embedding']]) 

In [99]:
Y_train = train_df["Label"]

X_train.shape, Y_train.shape, X_test_final.shape

((1686, 100), (1686,), (5846, 100))

In [100]:
# Using ADASYN generating synthetic data
from imblearn.over_sampling import ADASYN
from collections import Counter
counter = Counter(Y_train)
print("Before ADASYN ",counter)

ada = ADASYN(random_state=150)
X_train_ada,y_train_ada = ada.fit_resample(X_train, Y_train)
counter = Counter(y_train_ada)
print("After ADASYN ",counter)

Before ADASYN  Counter({0: 1533, 1: 153})
After ADASYN  Counter({1: 1543, 0: 1533})


In [101]:
#splitting into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_ada, y_train_ada, test_size=0.2, random_state=1)

In [102]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score

X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)


rf_train = RandomForestClassifier(max_depth=2, random_state=22)
lr_train = LogisticRegression(C=5,solver='lbfgs', max_iter=5000, random_state=12)
svc = LinearSVC(random_state=12)
rf_train.fit(X_train, y_train)
lr_train.fit(X_train,y_train)
svc.fit(X_train, y_train)

prediction_rf = rf_train.predict(X_test)
predict_lr = lr_train.predict(X_test)
predcit_svc = svc.predict(X_test)


print("Random Forest",round(accuracy_score(y_test, prediction_rf) * 100, 2))
print("LR",round(accuracy_score(y_test, predict_lr)* 100, 2))
print("SVM",round(accuracy_score(y_test, predcit_svc) * 100, 2))


Random Forest 74.51
LR 81.33
SVM 81.01




In [103]:
#F1 score
from sklearn.metrics import accuracy_score,  f1_score
print('F1 Score Random-Forest: %.3f' % f1_score(y_test, prediction_rf))
print('F1 Score Logistic Regression: %.3f' % f1_score(y_test, predict_lr))
print('F1 Score Linear SVM: %.3f' % f1_score(y_test, predcit_svc))


F1 Score Random-Forest: 0.721
F1 Score Logistic Regression: 0.823
F1 Score Linear SVM: 0.822


In [104]:
#final Predictions
predict_lr_final = lr_train.predict(X_test_final)

In [107]:
test_df['Labels'] = predict_lr_final

In [None]:
#saving prediction to CSV
test_df.to_csv('Final_LR_pred.csv')