In [32]:
import pandas as pd
import pubchempy as pcp
from pubchempy import get_compounds, Compound

def read_file_lines(file_path):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        file_path: Path of the file to open in text mode.

    Returns:
        A list with one stripped string per line of the file. If the file
        cannot be opened or read, an error message is printed and whatever
        was collected so far (an empty list when the open itself failed) is
        returned instead of raising.
    """
    stripped = []
    try:
        with open(file_path, 'r') as handle:
            # extend() consumes the file lazily, so lines read before a
            # mid-file I/O error are still kept in the result.
            stripped.extend(map(str.strip, handle))
    except OSError as err:
        # IOError is an alias of OSError; a missing file gets its own message.
        if isinstance(err, FileNotFoundError):
            print(f"Error: File '{file_path}' not found.")
        else:
            print(f"Error: Unable to read from file '{file_path}'.")
    return stripped


def pubchem_get_wln(wln):
    """Look up a WLN string on PubChem and return the first hit's canonical SMILES.

    The query is sent through ``pcp.get_compounds`` in the ``'name'``
    namespace, using the text ``"WLN: <wln>"`` as the identifier.

    Args:
        wln: The WLN string to search for.

    Returns:
        The ``canonical_smiles`` attribute of the first compound returned,
        or the placeholder string ``"null"`` when the search has no results.
    """
    hits = pcp.get_compounds(f"WLN: {wln}", 'name')
    # Only the first match is used; any further matches are ignored.
    return hits[0].canonical_smiles if len(hits) > 0 else "null"

In [65]:
# Build the working table: one row per line of the input file in "WLN", and a
# "SMILES" column pre-filled with the "null" placeholder for every row.
pubchem = pd.DataFrame(
    read_file_lines("../data/pubchem.txt"),
    columns=["WLN"],
)
pubchem["SMILES"] = ["null"] * len(pubchem)

In [69]:
# Display the frame as this cell's output (bare last expression).
# NOTE(review): the saved output shows SMILES already filled for some rows,
# which the visible cells above do not produce — presumably state from cells
# not shown here; confirm with a fresh-kernel run.
pubchem

Unnamed: 0,WLN,SMILES
0,Z1YQ1,CC(CN)O
1,WNR BG ENW,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl
2,GR BG DG,C1=CC(=C(C=C1Cl)Cl)Cl
3,Q2G,C(CCl)O
4,ZR BQ FVQ,C1=CC(=C(C(=C1)O)N)C(=O)O
...,...,...
6584,L E6 C666 BV FV CU GUTTT&J DQ EQ HQ IN1&1 MQ M...,
6585,T5VOYJ CUYR&VO1 DQ ER,
6586,T5MV EHJ CV1 DO EY2&1 &-NA-,
6587,T66 BOVJ DMVR DQ C2UY1&1& EQ IO- FT6OTJ B1 B1 ...,


In [75]:
# Fill in SMILES for each row that still holds the "null" placeholder; rows
# with any other value are skipped. One log line is printed per lookup.
for idx, current_smiles in pubchem["SMILES"].items():
    if current_smiles != "null":
        continue

    wln = pubchem.at[idx, "WLN"]
    pubchem.at[idx, "SMILES"] = pubchem_get_wln(wln)
    # Report the value as stored in the frame, not the raw return value.
    print(f"fetched {pubchem.at[idx, 'SMILES']} from {wln}")

fetched null from T56 BN DN FMYMVJ GUM D-BT5OTJ CQ DQ E1Q
fetched null from T5N CNJA2Q B1 ENW
fetched null from T56 BM DN FMYMVJ GUM &GH
fetched C1CN(CCN1CCOCCO)C(C2=CC=CC=C2)C3=CC=CC=C3 from T6N DNTJ AYR&R& D2O2Q
fetched CN1C(S(=O)(=O)CCC1=O)C2=CC(=C(C=C2)Cl)Cl from T6VN DSWTJ B1 CR CG DG
fetched CC(C)(C)CC(C)(C)C1=CC=C(C=C1)O.C=O.C1CO1 from /QR DX1&1&1X1&1&1/ &/*O2*/ &/*O1*/
fetched COC1=C(C=C(C=C1)CCN)OC.Cl from Z2R CO1 DO1 &GH
fetched COC1=CC(=CC(=C1OC)OC)CCN.Cl from Z2R CO1 DO1 EO1 &GH
fetched C(CC(C(=O)O)N)CN.Cl from Z3YZVQ &GH -L
fetched C(C(CO)(CCl)CCl)O from Q1X1Q1G1G
fetched CCN(CC)CCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)Cl.Cl from T C666 BN ISJ B3N2&2 EG &GH
fetched C1=CC=C2C(=C1)N(C=C[N+]2=O)[O-] from T66 BN ENJ BO EO
fetched B1(OCC(CO1)(C)CCC)C2=CC=C(C=C2)C from T6OBOTJ BR D1& E3 E1
fetched C[N+]1(CCCC(C1)OC(=O)C(C2=CC=CC=C2)(C3=CC=CC=C3)O)C.[Br-] from T6KTJ A1 A1 COVXQR&R &E
fetched COC1=CC(=CC(=C1OC)OC)C2C3C(COC3=O)C(C4=CC5=C(C=C24)OCO5)O from T E5 C665 FVO NO PO OHTT&&J DR CO1 D

In [78]:
# Write the table as tab-separated text, without a header row or index column.
pubchem.to_csv(
    "./pubchem.tsv",
    sep="\t",
    header=False,
    index=False,
)