In [1]:
import pandas as pd
import bson
import time
import itertools
from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt
from rdkit.Chem.rdMolDescriptors import CalcMolFormula


BSON_FILE_PATH = './NPOC2021/lotusUniqueNaturalProduct.bson'
# Upper bound on the number of documents read from the dump.
# Set to None to load every document (itertools.islice treats None as "no limit").
MAX_RECORDS = 100

start_time = time.time()

with open(BSON_FILE_PATH, 'rb') as f:
    # islice stops pulling documents from the decoder once MAX_RECORDS
    # have been read, so a small sample never touches the rest of the file.
    data = list(itertools.islice(bson.decode_file_iter(f), MAX_RECORDS))

df = pd.DataFrame(data)

end_time = time.time()
print(f"Загрузка завершена за {end_time - start_time:.2f} секунд.")
print(f"Всего загружено записей: {len(df)}")

# errors='ignore' keeps the cell from raising KeyError when no documents were
# read and the resulting frame therefore has no '_id' column.
df = df.drop(columns='_id', errors='ignore')


Загрузка завершена за 0.02 секунд.
Всего загружено записей: 100


In [None]:
# Uncomment to view all columns

# df.columns

In [None]:
# Raise pandas' column-width display limit to 1000 characters; this is a
# global option, so it also applies to frames rendered by later cells.
pd.set_option("display.max_colwidth", 1000)

# Show the taxonomy annotation stored for the first loaded record.
df["taxonomyReferenceObjects"].iloc[0]

{'10$x$x$1021/NP068075P': {'iNaturalist': [{'cleaned_organism_id': '61369',
    'organism_value': 'Momordica charantia',
    'kingdom': 'Plantae',
    'phylum': 'Tracheophyta',
    'classx': 'Magnoliopsida',
    'family': 'Cucurbitaceae',
    'genus': 'Momordica',
    'species': 'Momordica charantia',
    '_class': 'de.unijena.cheminf.lotusfiller.mongocollections.UncomplicatedTaxonomy'}],
  'ITIS': [{'cleaned_organism_id': '22399',
    'organism_value': 'Momordica charantia',
    'kingdom': 'Plantae',
    'classx': 'Magnoliopsida',
    'family': 'Cucurbitaceae',
    'genus': 'Momordica',
    'species': 'Momordica charantia',
    '_class': 'de.unijena.cheminf.lotusfiller.mongocollections.UncomplicatedTaxonomy'}],
  'Open Tree of Life': [{'cleaned_organism_id': '955521',
    'organism_value': 'Momordica charantia',
    'wikidata_id': 'http://www.wikidata.org/entity/Q428750',
    'reference_wikidata_id': 'http://www.wikidata.org/entity/Q34660859',
    'domain': 'Eukaryota',
    'kingdom':

In [4]:
def is_from_plant_revised(cell_data):
    """Tell whether a taxonomy cell mentions a plant-kingdom organism.

    The value is walked as ``dict -> dict -> list -> dict``. Any level that
    does not have the expected type is skipped rather than raising.

    Args:
        cell_data: Value of one ``taxonomyReferenceObjects`` cell. Anything
            that is not a dict yields ``False``.

    Returns:
        ``True`` if at least one innermost dict has a ``'kingdom'`` entry equal
        to ``'Viridiplantae'`` or ``'Plantae'``; ``False`` otherwise.
    """
    plant_kingdoms = ('Viridiplantae', 'Plantae')

    if not isinstance(cell_data, dict):
        return False

    # Outer level: keep only the values that are themselves dicts.
    per_reference = (
        sources for sources in cell_data.values() if isinstance(sources, dict)
    )
    # Middle level: keep only the values that are lists.
    organism_lists = (
        organisms
        for sources in per_reference
        for organisms in sources.values()
        if isinstance(organisms, list)
    )
    # Inner level: any() stops at the first entry with a plant kingdom.
    return any(
        entry.get('kingdom') in plant_kingdoms
        for organisms in organism_lists
        for entry in organisms
        if isinstance(entry, dict)
    )

In [5]:
import pandas as pd
import numpy as np

def toICHK(smiles):
    """Return the InChIKey RDKit generates for the molecule parsed from ``smiles``."""
    # NOTE(review): no guard for SMILES that RDKit cannot parse - confirm inputs are valid.
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToInchiKey(mol)
def get_formula(smiles):
    """Return the molecular formula RDKit computes for the molecule parsed from ``smiles``."""
    # NOTE(review): no guard for SMILES that RDKit cannot parse - confirm inputs are valid.
    mol = Chem.MolFromSmiles(smiles)
    return CalcMolFormula(mol)
def get_weight(smiles_string):
    """Return RDKit's ``ExactMolWt`` for the molecule parsed from ``smiles_string``."""
    # NOTE(review): no guard for SMILES that RDKit cannot parse - confirm inputs are valid.
    mol = Chem.MolFromSmiles(smiles_string)
    return ExactMolWt(mol)


# Boolean mask: True for records whose taxonomy annotation contains a
# 'Plantae' / 'Viridiplantae' kingdom entry.
mask = df["taxonomyReferenceObjects"].apply(is_from_plant_revised)

# Build the output table in one chain:
#   1. keep the masked rows and only the identifier / structure columns,
#   2. rename 'inchikey' to 'standard_inchikey',
#   3. reset the index so it is stored on df (the original called
#      reset_index without assigning the result, so df kept the gappy
#      index left over from filtering),
#   4. derive formula and weight from the SMILES strings,
#   5. add placeholder columns, filled with NaN here.
df = (
    df.loc[mask, ["inchikey", "smiles"]]
    .rename(columns={"inchikey": "standard_inchikey"})
    .reset_index(drop=True)
    .assign(
        molecular_formula=lambda frame: frame["smiles"].apply(get_formula),
        molecular_weight=lambda frame: frame["smiles"].apply(get_weight),
        iupac_name=np.nan,
        common_names=np.nan,
        plant_sources=np.nan,
        classifications=np.nan,
    )
)

# Display the resulting table as the cell output.
df



Unnamed: 0,standard_inchikey,smiles,molecular_formula,molecular_weight,iupac_name,common_names,plant_sources,classifications
0,NMIXDARFKVGBJR-FSHMXENQSA-N,CO[C@H](C=C(C)C)C[C@@H](C)[C@H]1CC[C@@]2(C)[C@@H]3C=C[C@@]45OC[C@]3(CC[C@]12C)[C@@H]4CC[C@H](O[C@@H]1O[C@H](CO)[C@@H](O)[C@@H](O)[C@H]1O)C5(C)C,C37H60O8,632.428819,,,,
1,SKHRNCAZROJNIS-UHFFFAOYSA-N,CC(=O)OC1C(OC2C(OC3CC(O)CC4=CCC5C6CC7OC8(CCC(C)CO8)C(C)C7C6(C)CCC5C43C)OC(C)C(O)C2OC2OCC(O)C(O)C2O)OC(C)C(O)C1O,C46H72O17,896.476951,,,,
2,MYDXZQQTJDVANI-LWLJHPHVSA-N,C[C@H](CO)[C@H]1OC(=O)C=C2C1=C[C@H]1OC(=O)[C@]3(C)[C@H]1[C@]2(C)[C@@H](Cl)[C@H](O)[C@@H]3O,C19H23ClO7,398.113231,,,,
3,QVJRMEOIOMVCKY-UHFFFAOYSA-N,CS(=O)(=O)C=CCO,C4H8O3S,136.019415,,,,
4,HAFTVEPNLRPRIQ-UHFFFAOYSA-N,CC1C(=O)OC2(C)C1C(=O)OC1CCN3CC=C(COC(=O)C2(C)O)C13,C18H23NO7,365.147452,,,,
...,...,...,...,...,...,...,...,...
63,XEBPNCWIPRFZFV-UHFFFAOYSA-N,COc1cc(-c2cc(=O)c3c(O)cc(OC(=O)C4OC(O)C(O)C(O)C4O)cc3o2)ccc1O,C22H20O12,476.095476,,,,
64,YCHJWRVSZGGWFE-MQFNCDSZSA-N,CC(=O)O[C@@H]1C(=O)C(C)(C)C/C=C(/C)C(=O)[C@@]2(OC(C)=O)C[C@@H](C)[C@H](OC(=O)c3ccccc3)[C@@H]2/C=C(\C)[C@@H]1OC(C)=O,C33H40O10,596.262147,,,,
65,BSRLPROUTURBPN-WDYNHAJCSA-N,COc1cc(OC)c(Oc2cc3c(cc2OC)-c2c(OC)c(OC)c(OC)c4c2[C@H](C3)N(C)CC4)cc1C[C@@H]1c2cc(OC)c(OC)c(O)c2CCN1C,C42H50N2O10,742.346546,,,,
66,FATJTRUVRFSESL-HZYNXAPGSA-N,COC(=O)[C@@]1(C)C[C@@H]2[C@](C)(CC1=O)C[C@H](O)[C@]1(C)C3=CC=C4C(=CC(=O)C(O)=C4C)[C@]3(C)CC[C@@]21C,C30H38O6,494.266839,,,,
