In [None]:
import pandas as pd


# Read lotus.csv (resolved relative to the working directory) into the
# DataFrame that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer="lotus.csv")

In [8]:
# Elemental composition of the dataset: total number of atoms of each element,
# summed over every molecular formula in df.

import re
from collections import Counter

# An element symbol (uppercase letter + optional lowercase letters) followed by
# an optional run of digits giving its count.
element_pattern = re.compile(r'([A-Z][a-z]*)(\d*)')

element_counts = Counter()
for formula in df["molecular_formula"]:
    for match in element_pattern.finditer(formula):
        symbol, digits = match.groups()
        # A symbol with no digits after it stands for a single atom.
        element_counts[symbol] += int(digits or 1)


# Plain dict so the cell output renders as an ordinary mapping.
result_dict = dict(element_counts)
result_dict

{'C': 181094,
 'H': 245545,
 'O': 54414,
 'Cl': 85,
 'S': 100,
 'N': 1892,
 'Br': 113,
 'P': 21,
 'I': 2,
 'Si': 10}

In [9]:
import re

def get_atoms(formula):
    """Parse a molecular formula string into per-element atom counts.

    Every element symbol (an uppercase letter followed by optional lowercase
    letters) is paired with the digits that directly follow it; a symbol with
    no digits counts as one atom. Repeated symbols are summed.

    Args:
        formula: Molecular formula string, e.g. "C6H12O6".

    Returns:
        dict mapping each element symbol to its total number of atoms.
        Symbols whose count is zero do not appear in the result.
    """
    atom_counts = Counter()
    for elem, count in re.findall(r'([A-Z][a-z]*)(\d*)', formula):
        n_atoms = int(count) if count else 1
        # Add the count in one step rather than emitting one item per atom.
        # Explicit zero counts are skipped so no zero-valued keys are created.
        if n_atoms:
            atom_counts[elem] += n_atoms
    return dict(atom_counts)

def count_atoms(formula):
    """Return the total number of atoms in a per-element count mapping.

    Args:
        formula: Mapping of element symbol to atom count. Despite the
            parameter name, this is a dict (such as the one get_atoms
            returns), not a formula string.

    Returns:
        Sum of all the counts; 0 for an empty mapping.
    """
    total = 0
    for n_atoms in formula.values():
        total += n_atoms
    return total
    


# Work on a copy so the helper columns added here do not end up in df.
ranged_df = df.copy()

# Per-row element -> count mapping, and the total atom count derived from it.
atom_maps = ranged_df['molecular_formula'].apply(get_atoms)
ranged_df['atoms'] = atom_maps
ranged_df['total_atoms'] = atom_maps.apply(count_atoms)


# Largest molecules first, so the first row holds the maximum atom count.
ranged_df = ranged_df.sort_values(by="total_atoms", ascending=False)
largest_atom_count = ranged_df["total_atoms"].iloc[0]
print("Максимально большое количество атомов:", largest_atom_count)
print("Датафрейм ranged_df отсортирован по количеству атомов в молекуле")

Максимально большое количество атомов: 382
Датафрейм ranged_df отсортирован по количеству атомов в молекуле


In [10]:
from rdkit import Chem


def get_electrons(smiles):
    """Count the electrons of the molecule described by a SMILES string.

    The count is the sum of the atomic numbers of all atoms (hydrogens are
    added explicitly first so that they are included in the sum) minus the
    molecule's net formal charge.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        Total number of electrons as an int.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of an opaque error inside AddHs.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    mol = Chem.AddHs(mol)
    total_protons = sum(atom.GetAtomicNum() for atom in mol.GetAtoms())

    # A positive net charge means electrons are missing, a negative one means
    # extra electrons, hence the subtraction.
    return total_protons - Chem.GetFormalCharge(mol)


# Attach an electron count to every row, then order the rows from the most
# electrons to the fewest so the first row holds the maximum.
df["electrons"] = df["smiles"].apply(get_electrons)
df = df.sort_values(by="electrons", ascending=False)
# The value printed is an electron count, so the label says "электронов"
# (electrons) rather than "атомов" (atoms).
print("Максимально большое количество электронов:", df["electrons"].iloc[0])
print("Датафрейм df отсортирован по количеству электронов в молекуле")

Максимально большое количество атомов: 1944
Датафрейм df отсортирован по количеству электронов в молекуле


In [11]:
def check_openshell(electrons):
    """Tell whether an electron count is odd.

    Args:
        electrons: Number of electrons in the molecule.

    Returns:
        True when ``electrons % 2 == 1``, otherwise False.
    """
    # bool() keeps the result a plain Python bool whatever numeric type is passed.
    return bool(electrons % 2 == 1)

# Flag every row whose electron count is odd, then display only those rows.
troubles_flags = df["electrons"].apply(check_openshell)
df["troubles"] = troubles_flags
df[df["troubles"].eq(True)]

Unnamed: 0.1,Unnamed: 0,standard_inchikey,smiles,molecular_formula,molecular_weight,iupac_name,common_names,plant_sources,classifications,calculated_properties,associated_targets,synthetic_accessibility_score,electrons,troubles
6206,8850,KRMAJVJAPORVMD-KQSYCOFHSA-N,CCC1=CN2CCc3c([nH]c4ccccc34)[C]2C=C1C1=CN2c3cc...,C38H39N4O2,583.307301,"3-ethyl-2-[(1R,11S,12R,13R,14E,19S,21S)-11-hyd...","3-ethyl-2-[(1r,11s,12r,13r,14e,19s,21s)-11-hyd...",Strychnos nux-vomica,"['Alkaloids', 'Alkaloids and derivatives', 'Tr...","{'xlogp': 4.3599999999999985, 'tpsa': 0.112901...",,,311,True
5748,8204,MPWHMYZPTQZGPC-RJHTZORGSA-N,C=C[C@H]1[C@H](O[C@@H]2O[C@H](CO)[C@@H](O)[C@H...,C25H32NO12,538.19245,"5-carboxy-3-[(1E)-2-[(2S,3R,4R)-3-ethenyl-5-(m...","5-carboxy-3-[(1e)-2-[(2s,3r,4r)-3-ethenyl-5-(m...",Lonicera japonica,"['Monoterpenoids', 'Terpenoids', 'Secoiridoid ...","{'xlogp': -0.456, 'tpsa': 0.36290782626954354}",,,285,True
4670,6661,OIZFQAFWYYKPMR-PEVLUNPASA-N,OC[C@H]1O[C@@H](OC2=C(c3cc(O)c(O)c(O)c3)OC3=CC...,C21H21O12,465.103301,"5,7-dihydroxy-3-{[(2S,3R,4S,5S,6R)-3,4,5-trihy...","5,7-dihydroxy-3-{[(2s,3r,4s,5s,6r)-3,4,5-trihy...",Vaccinium padifolium,"['Flavonoids', 'Organooxygen compounds', 'O-gl...","{'xlogp': 0.14999999999999974, 'tpsa': 0.45002...",,,243,True
4148,5920,HMOCYHSZAUMCTH-UHFFFAOYSA-M,C[N+]1CCC2=CN=C3C(=O)C([N-]CCc4ccc(O)cc4)=CC1=C23,C19H18N3O2,320.139902,10-{[2-(4-hydroxyphenyl)ethyl]azanidyl}-7-meth...,10-{[2-(4-hydroxyphenyl)ethyl]azanidyl}-7-meth...,,"['', 'Indoles and derivatives', 'Organic compo...","{'xlogp': 0.4400000000000002, 'tpsa': 0.168458...",,,169,True
