In [1]:
%reset -f

In [2]:
import pandas as pd
from pathlib import Path
import re

In [3]:
bacdive_resource_dir = Path.cwd().parent/ "data/raw/bacdive"
bacdive_transform_dir = Path.cwd().parent / "kg_microbe/transform_utils/bacdive/tmp"

CHEBI_PREFIX = "CHEBI:"
CAS_RN_PREFIX = "CAS-RN:"
KEGG_CPD_PREFIX = "KEGG:"
EC_PREFIX = "EC:"

In [4]:
# Function to apply the rules and return the CURIE value
# UNUSED as of now
def get_curies(row):
    ids_dict = {}
    if 'ID_CHEBI' in row.index and pd.notnull(row['ID_CHEBI']):
        ids_dict["CHEBI_ID"] = CHEBI_PREFIX + str(row['ID_CHEBI']).rstrip(".0")
    if 'kegg_comp' in row.index and pd.notnull(row['kegg_comp']):
        ids_dict["KEGG_ID"] = KEGG_CPD_PREFIX + str(row['kegg_comp'])
    if 'CAS' in row.index and pd.notnull(row['CAS']):
        ids_dict["CAS_RN_ID"] = CAS_RN_PREFIX + str(row['CAS']).strip()
    if 'EC_number' in row.index and pd.notnull(row['EC_number']):
        ids_dict["EC_ID"] = EC_PREFIX + str(row['EC_number'])
    return pd.Series(ids_dict)
    
    # else:
    #     if pd.notnull(row['ID_microbiol']):
    #         return 'API:' + str(row['ID_microbiol'])
    #     else:
    #         return None  # or some default value if needed

In [5]:
# Initialize an empty dictionary to hold the dataframes
dataframes = {}


# Iterate over each item in the directory
for path in bacdive_resource_dir.iterdir():
    # Check if the item is a file and has a .csv extension
    if path.is_file() and path.suffix == '.csv':
        # Create a variable name based on the filename (without extension)
        var_name = path.stem

        first_column = f"ID_{var_name}" #! Same as the 'cupule' column
        if "zym" not in var_name:
            columns_of_interest = [
                                   # first_column,
                                   'cupule',
                                   # 'cupule_Name_Kit',
                                   'name_bacdive',
                                   'reaction_name',
                                   'external_Link',
                                   'ID_microbiol',
                                   'substrate',
                                   'ID_CHEBI',
                                   'CAS',
                                   'kegg_comp',
                                   'enzyme',
                                   'EC_number']
        else:
            columns_of_interest = [
                                    # first_column,
                                    "cupule",
                                    "Enzyme_Name_Kit",
                                    "name_bacdive",
                                    "Substrate",
                                    "EC",
                                    "ID_microbiol",
                                    "ID_CHEBI",
                                    "CAS",
                                    "kegg_comp",
                                  ]
        # Read the CSV file into a DataFrame
        df = pd.read_csv(path, usecols=columns_of_interest)
        if "zym" in var_name:
            df = df.rename(columns={'EC': 'EC_number', "Substrate":"substrate", "Enzyme_Name_Kit":"enzyme"})
            df['reaction_name'] = pd.NA
        df =  df.apply(get_curies, axis = 1).join(df)
        match = re.search(r'kit_api_(.*?)_meta', var_name)
        if match:
            prefix = f'assay:API_{match.group(1)}_'
            df['pseudo_CURIE'] = prefix + df["name_bacdive"].astype(str)
        if 'EC_ID' not in df.columns:
            df['EC_ID'] = pd.NA
        else:
            df['EC_ID'] = df['EC_ID'].str.replace('EC ', '', regex=False) # TO address EC 1.9.3.1 in kit_api_20E_meta
        dataframes[var_name] = df

dataframes.keys()


dict_keys(['kit_api_20A_meta', 'kit_api_rID32A_meta', 'kit_api_CAM_meta', 'kit_api_zym_ec', 'kit_api_ID32E_meta', 'kit_api_20STR_meta', 'kit_api_coryne_meta', 'kit_api_ID32STA_meta', 'kit_api_STA_meta', 'kit_api_50CHas_meta', 'kit_api_rID32STR_meta', 'kit_api_NH_meta', 'kit_api_LIST_meta', 'kit_api_20E_meta', 'kit_api_20NE_meta'])

In [6]:
dataframes['kit_api_20A_meta'].head()

Unnamed: 0,CAS_RN_ID,CHEBI_ID,EC_ID,KEGG_ID,cupule,name_bacdive,reaction_name,external_Link,ID_microbiol,substrate,ID_CHEBI,CAS,kegg_comp,enzyme,EC_number,pseudo_CURIE
0,CAS-RN:73-22-3,CHEBI:16828,EC:4.1.99.1,KEGG:C00078,1,IND,Indole production,http://www.brenda-enzymes.org/enzyme.php?ecno=...,IND_20A,L-tryptophan,16828.0,73-22-3,C00078,tryptophanase,4.1.99.1,assay:API_20A_IND
1,CAS-RN:57-13-6,CHEBI:16199,EC:3.5.1.5,KEGG:C00086,2,URE,Urease/urea hydrolysis,http://www.brenda-enzymes.org/enzyme.php?ecno=...,URE_20A,Urea,16199.0,57-13-6,C00086,Urease,3.5.1.5,assay:API_20A_URE
2,CAS-RN:50-99-7,CHEBI:17634,,KEGG:C00031,3,GLU,Acid from D-glucose,http://www.genome.jp/dbget-bin/www_bget?cpd:C0...,GLU_20A,D-glucose,17634.0,50-99-7,C00031,,,assay:API_20A_GLU
3,CAS-RN:69-65-8,CHEBI:16899,,KEGG:C00392,4,MAN,Acid from D-mannitol,http://www.genome.jp/dbget-bin/www_bget?cpd:C0...,MAN_20A,D-mannitol,16899.0,69-65-8,C00392,,,assay:API_20A_MAN
4,CAS-RN:63-42-3,CHEBI:17716,,KEGG:C00243,5,LAC,Acid from lactose,http://www.genome.jp/dbget-bin/www_bget?cpd:C0...,LAC_20A,D-lactose (lactose),17716.0,63-42-3,C00243,,,assay:API_20A_LAC


In [7]:
dataframes['kit_api_20E_meta']['EC_ID'].drop_duplicates()

0             NaN
1     EC:4.1.1.17
2     EC:4.1.1.18
3      EC:3.5.3.6
4     EC:3.2.1.23
6      EC:3.5.1.5
8     EC:4.1.99.1
20     EC:1.9.3.1
Name: EC_ID, dtype: object

In [8]:
# dataframes['kit_api_zym_ec'] = dataframes['kit_api_zym_ec'].rename(columns={'EC': 'EC_number', "Substrate":"substrate", "Enzyme_Name_Kit":"enzyme"})
# dataframes['kit_api_zym_ec'] = dataframes['kit_api_zym_ec'].apply( get_curies, axis=1).join(dataframes['kit_api_zym_ec'])
dataframes['kit_api_zym_ec']['pseudo_CURIE'] = "assay:API_zym_" + dataframes['kit_api_zym_ec']["name_bacdive"].astype(str)
dataframes['kit_api_zym_ec']['KEGG_ID'] = pd.NA
dataframes['kit_api_zym_ec'].head()

Unnamed: 0,CAS_RN_ID,CHEBI_ID,EC_ID,cupule,enzyme,name_bacdive,substrate,EC_number,ID_microbiol,ID_CHEBI,CAS,kegg_comp,reaction_name,pseudo_CURIE,KEGG_ID
0,CAS-RN:65322-97-6,CHEBI:90426,,8,Cystine arylamidase,Cystine arylamidase,L-cystyl-2-naphthylamide,,test_8_ZYM,90426.0,65322-97-6,,,assay:API_zym_Cystine arylamidase,
1,CAS-RN:0913-04-02,,EC:3.4.21.4,9,Trypsin,Trypsin,N-benzoyl-DL-arginine-2-naphthylamide,3.4.21.4,test_9_ZYM,,0913-04-02,,,assay:API_zym_Trypsin,
2,,,EC:3.4.21.1,10,alpha-Chymotrypsin,alpha- Chymotrypsin,N-glutaryl-phenylalanine-2-naphthylamide,3.4.21.1,test_10_ZYM,,,,,assay:API_zym_alpha- Chymotrypsin,
3,CAS-RN:14463-68-4,,EC:3.1.3.2,11,Acid phosphatase,Acid phosphatase,2-naphthyl phosphate,3.1.3.2,test_11_ZYM,,14463-68-4,,,assay:API_zym_Acid phosphatase,
4,CAS-RN:1919-91-1,,,12,Naphthol-AS-BI-phosphohydrolase,Naphthol-AS-BI-phosphohydrolase,Naphthol-AS-BI-phosphate,,test_12_ZYM,,1919-91-1,,,assay:API_zym_Naphthol-AS-BI-phosphohydrolase,


In [9]:
column_subset = [
    "CHEBI_ID",
    "substrate",
    "KEGG_ID",
    "CAS_RN_ID",
    "EC_ID",
    "enzyme",
    "pseudo_CURIE",
    "reaction_name"
]

df_subset_dict = {k: df[column_subset] for k, df in dataframes.items()}

combined_df = pd.concat(df_subset_dict.values(), ignore_index=True)
combined_df.drop_duplicates(inplace=True)
combined_df.head()


Unnamed: 0,CHEBI_ID,substrate,KEGG_ID,CAS_RN_ID,EC_ID,enzyme,pseudo_CURIE,reaction_name
0,CHEBI:16828,L-tryptophan,KEGG:C00078,CAS-RN:73-22-3,EC:4.1.99.1,tryptophanase,assay:API_20A_IND,Indole production
1,CHEBI:16199,Urea,KEGG:C00086,CAS-RN:57-13-6,EC:3.5.1.5,Urease,assay:API_20A_URE,Urease/urea hydrolysis
2,CHEBI:17634,D-glucose,KEGG:C00031,CAS-RN:50-99-7,,,assay:API_20A_GLU,Acid from D-glucose
3,CHEBI:16899,D-mannitol,KEGG:C00392,CAS-RN:69-65-8,,,assay:API_20A_MAN,Acid from D-mannitol
4,CHEBI:17716,D-lactose (lactose),KEGG:C00243,CAS-RN:63-42-3,,,assay:API_20A_LAC,Acid from lactose


In [10]:
combined_df.to_csv(bacdive_transform_dir/"bacdive_mappings.tsv", sep="\t", index=False)