In [3]:
import os
import dotenv

import rdkit
import pandas as pd
from neo4j import GraphDatabase

dotenv.load_dotenv()
os.chdir(os.environ['PYTHONPATH'])

In [4]:
driver = GraphDatabase.driver(
    uri=os.environ['NEO4J_URL'],
    auth=(os.environ['NEO4J_USER'], os.environ['NEO4J_PASSWORD'])
)

In [3]:
with driver.session(database="neo4j") as session:
    smiles = session.run("""
        match (n:small_molecule) return distinct n.name as name, n.content as content, id(n) as nodeid
        """)
    smiles_df = smiles.to_df()



In [4]:
smiles_df

Unnamed: 0,name,content,nodeid
0,Hypusine,NCCC(O)CNCCCCCC(=O)O,12559
1,"2-[3-(2-HYDROXY-1,1-DIHYDROXYMETHYL-ETHYLAMINO...",OCC(CO)NCCCNC(CO)(CO)CO,12560
2,"3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,5...",OCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCO...,12561
3,"2-(2-HYDROXY-1,1-DIHYDROXYMETHYL-ETHYLAMINO)-E...",O=S(=O)(O)CCNC(CO)CO,12562
4,6-O-alpha-D-mannopyranosyl-alpha-D-mannopyranose,CC1OC(OCC2OC(O)C(O)C(O)C2O)C(O)C(O)C1O,12563
...,...,...,...
1264335,Voriconazole,CC(C1=NC=NC=C1F)C(CN2C=NC=N2)(C3=C(C=C(C=C3)F)F)O,1381896
1264336,4-Hydroxy-2-oxoglutaric acid,C(C(C(=O)O)O)C(=O)C(=O)O,1381897
1264337,Mycophenolic acid,CC1=C2COC(=O)C2=C(C(=C1OC)CC=C(C)CCC(=O)O)O,1381898
1264338,Dermatan Sulfate,CC(=O)NC1C(C(C(OC1O)CO)OS(=O)(=O)[O-])OC2C(C(C...,1381899


In [5]:
from rdkit import Chem

def get_canonical_smiles(smiles: str) -> str:
    return Chem.MolToSmiles(Chem.MolFromSmiles(smiles), canonical=True)

smiles_df['canonical_smiles'] = smiles_df['content'].apply(get_canonical_smiles)



In [8]:
smiles_df.to_csv("data/canonical_smiles.csv")

In [5]:
smiles_df = pd.read_csv("data/canonical_smiles.csv")

In [7]:
not_match = smiles_df[smiles_df['content'] != smiles_df['canonical_smiles']]

In [9]:
not_match.rename(columns={'canonical_smiles': 'updated_content'}, inplace=True)
not_match[['nodeid', 'updated_content']].to_csv("data/canonical_smiles_renaming.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_match.rename(columns={'canonical_smiles': 'updated_content'}, inplace=True)
