In [3]:
import pandas as pd
import bson
import sys
import time
import itertools
from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt
from rdkit.Chem.rdMolDescriptors import CalcMolFormula


# --- Configuration -----------------------------------------------------------
BSON_FILE_PATH = './NPOC2021/lotusUniqueNaturalProduct.bson'

all_strings = 275518  # hard-coded record count of the full file, only reported below
iters = 10_000        # maximum number of documents to decode from the dump

# --- Load --------------------------------------------------------------------
start_time = time.time()

with open(BSON_FILE_PATH, 'rb') as f:
    # Decode lazily and stop after `iters` documents; drop the islice() wrapper
    # to read the whole file instead.
    data = list(itertools.islice(bson.decode_file_iter(f), iters))

df = pd.DataFrame(data)

end_time = time.time()
times = end_time - start_time

# --- Report ------------------------------------------------------------------
print(f"Загрузка завершена за {times} секунд.")
print(f"Всего загружено записей: {len(df)}")
print(f"Всего строк в файле: {all_strings}")
# deep=True asks pandas to include the contents of object columns in the total.
total_memory_mb = df.memory_usage(deep=True).sum() / (1024**2)
print(f"Затраченная память: {total_memory_mb} МБ")

# The '_id' column is not needed downstream.
df = df.drop(columns='_id')



Загрузка завершена за 2.5426390171051025 секунд.
Всего загружено записей: 10000
Всего строк в файле: 275518
Затраченная память: 91.17564582824707 МБ


In [4]:
import pandas as pd

def extract_species(cell_data):
    """Return the 'species' of the first organism record in a nested cell.

    The lookup path is: first value of ``cell_data`` -> first value of that
    mapping -> element 0 of that sequence -> its ``'species'`` key.

    Parameters
    ----------
    cell_data : dict
        Nested mapping of the form ``{outer_key: {inner_key: [record, ...]}}``.

    Returns
    -------
    The value stored under ``'species'`` in the first record, or ``None`` when
    the key is absent or when any level of the structure is missing, empty, or
    of an unexpected type.
    """
    # Guard each level explicitly: an empty mapping would make next() raise
    # StopIteration, and an empty list would make [0] raise IndexError.
    if not isinstance(cell_data, dict) or not cell_data:
        return None

    sources_dict = next(iter(cell_data.values()))
    if not isinstance(sources_dict, dict) or not sources_dict:
        return None

    info_list = next(iter(sources_dict.values()))
    if not isinstance(info_list, (list, tuple)) or not info_list:
        return None

    first_record = info_list[0]
    if not isinstance(first_record, dict):
        return None

    return first_record.get('species')

def is_from_plant_revised(cell_data):
    """Tell whether any organism record in a nested cell has a plant kingdom.

    ``cell_data`` is walked as ``{key: {key: [record, ...]}}``. Any level that
    is not of the expected type (dict / dict / list / dict) is skipped.

    Returns ``True`` as soon as a record whose ``'kingdom'`` is
    ``'Viridiplantae'`` or ``'Plantae'`` is found, otherwise ``False``.
    """
    plant_kingdoms = ('Viridiplantae', 'Plantae')

    if isinstance(cell_data, dict):
        for per_database in cell_data.values():
            if isinstance(per_database, dict):
                for records in per_database.values():
                    if isinstance(records, list) and any(
                        isinstance(record, dict)
                        and record.get('kingdom') in plant_kingdoms
                        for record in records
                    ):
                        return True

    return False

def to_inchek(smiles):
    """Return the InChIKey RDKit produces for the molecule parsed from `smiles`."""
    # NOTE(review): no check is made that parsing succeeded — confirm inputs are valid SMILES.
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToInchiKey(mol)
def get_formula(smiles):
    """Return the molecular formula RDKit calculates for the molecule parsed from `smiles`."""
    # NOTE(review): no check is made that parsing succeeded — confirm inputs are valid SMILES.
    mol = Chem.MolFromSmiles(smiles)
    return CalcMolFormula(mol)
def get_weight(smiles_string):
    """Return RDKit's `ExactMolWt` for the molecule parsed from `smiles_string`."""
    # NOTE(review): no check is made that parsing succeeded — confirm inputs are valid SMILES.
    mol = Chem.MolFromSmiles(smiles_string)
    return ExactMolWt(mol)
def to_canonical_smiles(smiles):
    """Round-trip `smiles` through RDKit and return the SMILES string it writes back."""
    # NOTE(review): no check is made that parsing succeeded — confirm inputs are valid SMILES.
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol)


# Keep only rows whose taxonomy references contain at least one organism record
# with a plant kingdom (see is_from_plant_revised).
# NOTE: this cell rebinds `df`, so re-running it without re-running the load
# cell will fail (the source columns are gone after the first run).
mask = df["taxonomyReferenceObjects"].apply(is_from_plant_revised)
df = df[mask].copy()

# Snapshot of the filtered frame with all original columns; the extra output
# columns below are pulled from it and aligned on the shared (pre-reset) index.
source_df = df.copy()

# Explicit .copy() so the column assignments below write to an independent
# frame instead of a slice of the filtered one (avoids SettingWithCopyWarning).
df = df[["inchikey", "smiles"]].copy()
df.columns = ["standard_inchikey", "smiles"]

# Canonicalise the SMILES first; the InChIKey, formula and weight are then all
# derived from the canonical string (the original 'inchikey' values are replaced).
df["smiles"] = df["smiles"].apply(to_canonical_smiles)
df["standard_inchikey"] = df["smiles"].apply(to_inchek)
df["molecular_formula"] = df["smiles"].apply(get_formula)
df["molecular_weight"] = df["smiles"].apply(get_weight)

df['iupac_name'] = source_df["iupac_name"]
df["common_names"] = source_df["traditional_name"]
df["plant_sources"] = source_df["taxonomyReferenceObjects"].apply(extract_species)
df["classifications"] = source_df["allChemClassifications"]
# NOTE(review): 'tpsa' is filled from the 'tpsaEfficiency' column — confirm this
# is the intended source field.
df['calculated_properties'] = source_df.apply(
    lambda row: {'xlogp': row['xlogp'], 'tpsa': row['tpsaEfficiency']},
    axis=1
)

# Placeholders: no values are computed for these columns here.
df["associated_targets"] = None
df["synthetic_accessibility_score"] = None

# reset_index returns a new frame; the result must be assigned back, otherwise
# the filtered (gappy) index is kept and written to the CSV.
df = df.reset_index(drop=True)

df.to_csv("lotus.csv")