In [1]:
#importing required packages
import rdkit;
#import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import QED
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Lipinski
from xgboost import XGBClassifier
from rdkit.Chem import Descriptors

In [2]:
# Load the training and test tables from CSV
train = pd.read_csv('train_II.csv')
test = pd.read_csv('test_II.csv')

# The identifier column ('Id' in train, 'x' in test) packs two fields
# separated by ';'. Split it into a 'Chemicals' column (parsed as SMILES
# further down) and an 'AssayID' column.
train[['Chemicals','AssayID']] = train['Id'].str.split(';',expand=True)
test[['Chemicals','AssayID']] = test['x'].str.split(';',expand=True)

In [3]:
# Per-column count of missing values in the training table
train.isna().sum()

Id           0
Expected     0
Chemicals    0
AssayID      0
dtype: int64

In [4]:
# Convert each SMILES string into an RDKit molecule object.
# SMILES that cannot be parsed give a null entry, which is discarded in a later cell.

# Parse a single SMILES string into a molecule
def generating_Molecules(smiles):
    """Parse one SMILES string with RDKit.

    Returns whatever ``Chem.MolFromSmiles`` returns for ``smiles``; in this
    notebook, strings RDKit rejects show up as null entries in the
    resulting column.
    """
    return Chem.MolFromSmiles(smiles)

# Add a 'Feature_1' column of molecule objects to both tables,
# parsed from each table's 'Chemicals' (SMILES) column
for frame in (train, test):
    frame['Feature_1'] = frame['Chemicals'].apply(generating_Molecules)

# Rows whose molecule is null are removed in a later cell.




[20:38:38] Explicit valence for atom # 1 Si, 8, is greater than permitted
[20:38:40] Explicit valence for atom # 1 Si, 8, is greater than permitted
[20:38:42] Explicit valence for atom # 1 Si, 8, is greater than permitted
[20:38:42] Explicit valence for atom # 1 Si, 8, is greater than permitted
[20:38:44] Explicit valence for atom # 1 Si, 8, is greater than permitted
[20:38:45] Explicit valence for atom # 1 Si, 8, is greater than permitted


In [5]:
# Re-check missing values: null 'Feature_1' entries are SMILES that failed to parse
train.isna().sum()

Id           0
Expected     0
Chemicals    0
AssayID      0
Feature_1    6
dtype: int64

In [6]:
# Discard rows that have no molecule in 'Feature_1'.
# The surviving rows keep their original index labels, so the index may
# contain gaps after this step.
train = train.dropna(subset=['Feature_1'])
test = test.dropna(subset=['Feature_1'])
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75377 entries, 0 to 75382
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Id         75377 non-null  object
 1   Expected   75377 non-null  int64 
 2   Chemicals  75377 non-null  object
 3   AssayID    75377 non-null  object
 4   Feature_1  75377 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.5+ MB


In [7]:
# Renumber the training rows 0..n-1, discarding the old index labels
train = train.reset_index(drop=True)

In [8]:
# Show the training table summary after the index reset
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75377 entries, 0 to 75376
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Id         75377 non-null  object
 1   Expected   75377 non-null  int64 
 2   Chemicals  75377 non-null  object
 3   AssayID    75377 non-null  object
 4   Feature_1  75377 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.9+ MB


In [9]:
#Loading RDKIT Descriptors
def rdkit_Descriptors(data):
    """Compute the RDKit descriptors for every molecule in ``data["Feature_1"]``.

    Parameters
    ----------
    data
        Table indexable by column name (the notebook passes DataFrames) with
        a "Feature_1" column holding the molecules to describe.

    Returns
    -------
    pandas.DataFrame
        One row per molecule, in iteration order, with one column per
        descriptor named in ``Descriptors._descList``. The frame has a fresh
        default index; the index of ``data`` is not carried over.
    """
    # Build one calculator covering every descriptor name RDKit registers
    all_names = [entry[0] for entry in Descriptors._descList]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(all_names)
    column_names = calculator.GetDescriptorNames()

    # One tuple of descriptor values per molecule
    rows = [calculator.CalcDescriptors(mol) for mol in data["Feature_1"]]
    return pd.DataFrame(rows, columns=column_names)

In [10]:
# Descriptor tables for the training and test molecules
d_train, d_test = rdkit_Descriptors(train), rdkit_Descriptors(test)

In [11]:
# Display the training descriptor table (pandas truncates the rendering)
d_train

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,9.316200,-1.533785,9.316200,0.150485,0.794714,317.599,306.511,315.982463,100,0,...,0,0,0,0,0,0,0,0,0,0
1,10.532611,0.333788,10.532611,0.333788,0.516641,156.269,136.109,156.151415,66,0,...,0,0,0,0,0,0,0,0,4,0
2,2.433032,0.000000,2.433032,0.000000,0.251327,362.086,313.702,361.347528,148,0,...,0,0,0,0,0,0,0,0,12,0
3,10.355080,-0.613825,10.355080,0.282361,0.487998,255.665,245.585,255.052302,90,0,...,0,0,0,0,0,0,0,0,0,0
4,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,149.894,149.894242,8,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75372,11.460021,-3.868472,11.460021,0.053611,0.712426,230.245,220.165,230.036128,82,0,...,0,1,0,0,0,0,0,0,0,0
75373,5.928972,-2.841623,5.928972,0.082346,0.720533,313.747,296.611,313.041677,104,0,...,0,0,0,0,0,0,0,0,0,0
75374,4.975926,0.848333,4.975926,0.848333,0.596343,167.258,162.218,166.986341,50,0,...,0,0,0,0,0,1,0,0,0,0
75375,10.241948,0.324028,10.241948,0.324028,0.519485,128.215,112.087,128.120115,54,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Attach the assay id, plus the original identifier (test) or the label
# (train), to the descriptor tables.
#
# The descriptor tables are built with a fresh 0..n-1 index, while `test`
# still carries the index labels it had before rows without a molecule were
# dropped. Assigning a Series aligns on index labels, so if any test row was
# dropped, values would be paired with the wrong descriptor row and NaN would
# appear. Copy by position instead: row i of each descriptor table was
# computed from row i of its source table. A length mismatch raises.
d_test["AssayID"] = test["AssayID"].to_numpy()
d_test["x"] = test["x"].to_numpy()
d_train["AssayID"] = train["AssayID"].to_numpy()
d_train["Expected"] = train["Expected"].to_numpy()

In [13]:
# Write both descriptor tables to CSV (the index is written as the first column)
for table, out_path in ((d_test, "d_test.csv"), (d_train, "d_train.csv")):
    table.to_csv(out_path)