In [1]:
import numpy as np
import pandas as pd
import pubchempy as pcp

import re

In [2]:
annotations_file = '../data/mouse_data/MOLECULAR-LIBRARYSEARCH-V2-3f27f407-download_all_identifications-main.tsv'
annotations = pd.read_csv(annotations_file, sep='\t')
print(len(annotations))

31119


In [3]:
no_inchikey = annotations[annotations['InChIKey'].isnull()]
print(len(no_inchikey))

14989


In [4]:
no_inchi1 = no_inchikey[no_inchikey['INCHI'].isnull()]
no_inchi2 = no_inchikey[no_inchikey['INCHI'] == ' ']
no_inchi = pd.concat([no_inchi1, no_inchi2])
print(len(no_inchi))

14985


In [5]:
no_smiles1 = no_inchi[no_inchi['Smiles'].isnull()]
no_smiles2 = no_inchi[no_inchi['Smiles'] == ' ']
no_smiles = pd.concat([no_smiles1, no_smiles2])
print(len(no_smiles))

14985


In [6]:
no_name = no_smiles[no_smiles['Compound_Name'].isnull()]
len(no_name)

0

In [7]:
compound_queries = pd.unique(no_smiles['Compound_Name'])
len(compound_queries)

264

In [8]:
def _compound_name_cleaning(compound_names: np.ndarray) -> pd.DataFrame:

    compound_names['Compound_Name'] = compound_names['Compound_Name'].str.replace('Spectral Match to ', '')
    compound_names['Compound_Name'] = compound_names['Compound_Name'].str.replace(' from NIST14', '')
    compound_names['Compound_Name'] = compound_names['Compound_Name'].str.replace(r'Massbank:PR[0-9*] ', '', regex=True)
    compound_names['Compound_Name'] = compound_names['Compound_Name'].str.replace(r'\|.*', '', regex=True)
    compound_names['Compound_Name'] = compound_names['Compound_Name'].str.replace(r'; \[M\+H\].*', '', regex=True)
    compound_names['Compound_Name'] = compound_names['Compound_Name'].str.replace(r'PC\(.*\) - ', '', regex=True)

    return compound_names

In [9]:
# create lookup table for annotations without inchikey
compound_queries_clean = pd.unique(_compound_name_cleaning(no_inchikey)['Compound_Name']) # compounds to query

lookup_table = pd.DataFrame(columns=['Compound_Name', 'InChIKey'])
not_found_list = []

for cn in compound_queries_clean:
    try: 
        compound = pcp.get_compounds(cn, 'name')[0]
        inchikey = compound.inchikey
        lookup_table.loc[len(lookup_table.index)] = {'Compound_Name': cn, 'InChIKey': inchikey}
    except Exception as e:
        not_found_list.append(cn)

print(len(lookup_table))
print(len(not_found_list))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compound_names['Compound_Name'] = compound_names['Compound_Name'].str.replace('Spectral Match to ', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compound_names['Compound_Name'] = compound_names['Compound_Name'].str.replace(' from NIST14', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  com

195
70


In [10]:
def lookup_inchikey(row, lookup_table):
    if str(row['InChIKey']) == 'nan':
        if row['Compound_Name'] in lookup_table['Compound_Name'].values:
            return (lookup_table[lookup_table['Compound_Name'] == row['Compound_Name']])['InChIKey'].values[0]
        else:
            return np.nan
    else:
        return row['InChIKey']

In [51]:
annotations_clean = _compound_name_cleaning(annotations)
annotations_clean['InChIKey'] = annotations_clean.apply(lambda r: lookup_inchikey(r, lookup_table), axis=1)
annotations_clean_drop = annotations_clean.dropna(subset=['InChIKey'])

In [52]:
print(len(annotations_clean[annotations_clean['InChIKey'].isna()]))

3670


In [54]:
def extract_inchi_planar(row):
    try:
        return row['InChIKey'].split('-')[0]
    except:
        print(row['InChIKey'])
        print(row.dropna())

In [55]:
annotations_clean_drop.loc[:,'inchikey_p1'] = annotations_clean_drop.apply(lambda r: extract_inchi_planar(r), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotations_clean_drop.loc[:,'inchikey_p1'] = annotations_clean_drop.apply(lambda r: extract_inchi_planar(r), axis=1)


In [56]:
annotations_clean_drop.to_csv('../data/mouse_data/annotations.tsv', sep='\t')