In [1]:
import csv
from pathlib import Path

import pandas as pd
import requests
from rdkit import Chem
from rdkit.Chem import Descriptors

In [2]:
# Endpoint queried by the download cell below (the ".txt" suffix presumably
# selects a plain-text listing -- confirm against the ZINC15 API docs).
url = "http://zinc15.docking.org/substances.txt"

In [3]:
# Download the substance listing from `url` and store it as a two-column CSV
# ("Substance ID", "SMILES") under data_25286/.
# NOTE(review): 'count' presumably caps the number of returned records and
# 'mwt-between' presumably restricts the molecular-weight range -- confirm
# against the ZINC15 API documentation.
params = {'count': 25286, 'mwt-between': '100 1000'}

# requests has no default timeout: without one, a stalled server would block
# this cell forever. (connect timeout, read timeout) in seconds.
response = requests.get(url, params=params, timeout=(10, 300))

if response.status_code == 200:
    lines = response.text.splitlines()

    output_path = Path('data_25286/zinc_data.csv')
    # open() does not create missing parent directories, so a fresh checkout
    # would otherwise fail with FileNotFoundError.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # newline='' is what the csv module requires; the explicit encoding keeps
    # the file independent of the platform's locale default.
    with open(output_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Substance ID", "SMILES"])
        for line in lines:
            # Each response line is split on whitespace into CSV fields.
            row = line.split()
            if row:  # skip blank lines instead of emitting empty CSV rows
                writer.writerow(row)
    print("Data saved")
else:
    # Non-200 responses are reported but do not abort the notebook.
    print(f"Error: {response.status_code}")

Data saved


In [4]:
# Load the CSV produced by the download cell.
data = pd.read_csv('data_25286/zinc_data.csv')

# Rows whose SMILES string already occurred earlier in the table
# (the first occurrence of each string is not flagged).
duplicated_smiles = data.loc[data.duplicated(subset='SMILES')]
print(f"duplication count: {len(duplicated_smiles)}")

# Keep only the first row for every distinct SMILES string and display it.
data_unique = data.drop_duplicates(subset='SMILES')
data_unique

duplication count: 0


Unnamed: 0,Substance ID,SMILES
0,ZINC000000000007,C=CCc1ccc(OCC(=O)N(CC)CC)c(OC)c1
1,ZINC000000000010,C[C@@]1(c2ccccc2)OC(C(=O)O)=CC1=O
2,ZINC000000000011,COc1cc(Cc2cnc(N)nc2N)cc(OC)c1N(C)C
3,ZINC000000000012,O=C(C[S@@](=O)C(c1ccccc1)c1ccccc1)NO
4,ZINC000000000014,CC[C@H]1[C@H](O)N2[C@H]3C[C@@]45c6ccccc6N(C)[C...
...,...,...
25281,ZINC000000033247,CC(C)COC(=O)Oc1c(Cl)cc2oc(=O)sc2c1[N+](=O)[O-]
25282,ZINC000000033248,Cc1c(C(=O)NN=C2CCCC2)nnn1-c1nonc1N
25283,ZINC000000033249,CN(C)c1nc2nonc2nc1Nc1cccc(Cl)c1
25284,ZINC000000033252,COc1cccc(NC(=O)CSC(N)=O)c1


In [5]:
def get_bertzct(smiles):
    """Return the RDKit ``BertzCT`` descriptor for a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    The value of ``Descriptors.BertzCT`` for the parsed molecule, or ``None``
    when ``smiles`` is not a string or ``Chem.MolFromSmiles`` returns ``None``.
    """
    # Empty CSV cells are read by pandas as NaN (a float). Handing a
    # non-string to MolFromSmiles is expected to raise instead of returning
    # None, which would abort the whole .apply() -- treat it as unparseable.
    if not isinstance(smiles, str):
        return None

    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None
    return Descriptors.BertzCT(molecule)

def get_kappa2(smiles):
    """Return the RDKit ``Kappa2`` descriptor for a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    The value of ``Descriptors.Kappa2`` for the parsed molecule, or ``None``
    when ``smiles`` is not a string or ``Chem.MolFromSmiles`` returns ``None``.
    """
    # Empty CSV cells are read by pandas as NaN (a float). Handing a
    # non-string to MolFromSmiles is expected to raise instead of returning
    # None, which would abort the whole .apply() -- treat it as unparseable.
    if not isinstance(smiles, str):
        return None

    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None
    return Descriptors.Kappa2(molecule)

# Add the two descriptor columns, dropping rows for which a descriptor could
# not be computed (the helper functions return None in that case).
#
# .assign() builds a new frame rather than writing a column into the result of
# drop_duplicates()/dropna(); item assignment on such derived frames can emit
# SettingWithCopyWarning on pandas versions without copy-on-write.
data_unique = data_unique.assign(BertzCT=data_unique['SMILES'].apply(get_bertzct))
data_u = data_unique.dropna(subset=['BertzCT'])
data_u = data_u.assign(Kappa2=data_u['SMILES'].apply(get_kappa2))
data_u2 = data_u.dropna(subset=['Kappa2'])

# to_csv() does not create missing directories, so make sure result/ exists.
result_path = Path("result/zinc_25286.csv")
result_path.parent.mkdir(parents=True, exist_ok=True)
data_u2.to_csv(result_path, index=False)
data_u2

Unnamed: 0,Substance ID,SMILES,BertzCT,Kappa2
0,ZINC000000000007,C=CCc1ccc(OCC(=O)N(CC)CC)c(OC)c1,453.035249,8.250757
1,ZINC000000000010,C[C@@]1(c2ccccc2)OC(C(=O)O)=CC1=O,475.051945,3.483475
2,ZINC000000000011,COc1cc(Cc2cnc(N)nc2N)cc(OC)c1N(C)C,647.972650,6.379815
3,ZINC000000000012,O=C(C[S@@](=O)C(c1ccccc1)c1ccccc1)NO,547.082326,6.977663
4,ZINC000000000014,CC[C@H]1[C@H](O)N2[C@H]3C[C@@]45c6ccccc6N(C)[C...,723.345329,4.169324
...,...,...,...,...
25281,ZINC000000033247,CC(C)COC(=O)Oc1c(Cl)cc2oc(=O)sc2c1[N+](=O)[O-],794.604059,6.323358
25282,ZINC000000033248,Cc1c(C(=O)NN=C2CCCC2)nnn1-c1nonc1N,692.896724,5.306088
25283,ZINC000000033249,CN(C)c1nc2nonc2nc1Nc1cccc(Cl)c1,756.542291,4.932991
25284,ZINC000000033252,COc1cccc(NC(=O)CSC(N)=O)c1,395.405461,6.016134
