In [1]:
%reset -f

In [2]:
import pandas as pd
from pathlib import Path
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Input and output locations, both resolved against the parent of the
# current working directory.
bacdive_resource_dir = Path.cwd().parent.joinpath("data/raw/bacdive")
bacdive_transform_dir = Path.cwd().parent.joinpath("kg_microbe/transform_utils/bacdive/tmp")

# CURIE prefixes, one per identifier namespace.
CHEBI_PREFIX, CAS_RN_PREFIX = "CHEBI:", "CAS-RN:"
KEGG_CPD_PREFIX, EC_PREFIX = "kegg.compound:", "EC:"

In [4]:
# Function to apply the rules and return the CURIE value
def _format_chebi_id(value):
    """Return a ChEBI identifier as text without a trailing '.0' float artefact."""
    # Integral floats (e.g. 17790.0, as produced when a numeric column also
    # holds missing values) go through int() so that zeros belonging to the
    # number itself are preserved.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    text = str(value)
    # Remove only the literal suffix ".0". str.rstrip(".0") would strip every
    # trailing '0' and '.' character and turn "17790.0" into "1779".
    return text[:-2] if text.endswith(".0") else text


def get_curie(row):
    """Build a CURIE for one mapping row from the first identifier present.

    Identifiers are tried in this order: ``ID_CHEBI``, ``kegg_comp``, ``CAS``,
    ``EC_number`` (only when the row has that column) and ``ID_microbiol``.

    Args:
        row: pandas Series for one DataFrame row; it must contain the
            ``ID_CHEBI``, ``kegg_comp``, ``CAS`` and ``ID_microbiol`` labels.

    Returns:
        The prefixed identifier string, or ``None`` when every identifier
        is missing.
    """
    if pd.notnull(row['ID_CHEBI']):
        return CHEBI_PREFIX + _format_chebi_id(row['ID_CHEBI'])
    if pd.notnull(row['kegg_comp']):
        return KEGG_CPD_PREFIX + str(row['kegg_comp'])
    if pd.notnull(row['CAS']):
        return CAS_RN_PREFIX + str(row['CAS'])
    # Not every input table has an EC_number column, so check before reading it.
    if 'EC_number' in row.index and pd.notnull(row['EC_number']):
        return EC_PREFIX + str(row['EC_number'])
    if pd.notnull(row['ID_microbiol']):
        return 'API:' + str(row['ID_microbiol'])
    return None

In [5]:
# Load every kit-mapping CSV found in the BacDive resource directory into a
# dict of DataFrames keyed by file stem, adding a CURIE column to each and a
# pseudo_CURIE column to the files named kit_api_<kit>_meta.
dataframes = {}

# Columns read from the regular kit files. The per-file "ID_<stem>" column is
# not read (the original author notes it is the same as 'cupule'), nor is
# 'cupule_Name_Kit'.
kit_columns = [
    'cupule',
    'name_bacdive',
    'reaction_name',
    'external_Link',
    'ID_microbiol',
    'substrate',
    'ID_CHEBI',
    'CAS',
    'kegg_comp',
    'enzyme',
    'EC_number',
]

# Columns read from files whose name contains "zym"; these use the headers
# 'Substrate' and 'EC'. 'Enzyme_Name_Kit' is not read.
zym_columns = [
    "cupule",
    "name_bacdive",
    "Substrate",
    "EC",
    "ID_microbiol",
    "ID_CHEBI",
    "CAS",
    "kegg_comp",
]

# Path.iterdir() yields entries in arbitrary order; sorting makes the dict,
# and anything later concatenated from it, come out in the same order on
# every run.
for path in sorted(bacdive_resource_dir.iterdir()):
    # Only regular files with a .csv extension are mapping tables.
    if not (path.is_file() and path.suffix == '.csv'):
        continue

    # The file name without its extension doubles as the dictionary key.
    var_name = path.stem
    columns_of_interest = zym_columns if "zym" in var_name else kit_columns

    # Read only the columns that are needed and derive the CURIE per row.
    df = pd.read_csv(path, usecols=columns_of_interest)
    df['CURIE'] = df.apply(get_curie, axis=1)

    # Files named kit_api_<kit>_meta also get a kit-scoped pseudo CURIE.
    match = re.search(r'kit_api_(.*?)_meta', var_name)
    if match:
        prefix = f'API_{match.group(1)}:'
        df['pseudo_CURIE'] = prefix + df["name_bacdive"].astype(str)

    dataframes[var_name] = df

dataframes.keys()


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
# Preview the first five rows of the API 20A kit mapping table.
dataframes['kit_api_20A_meta'].head(n=5)

In [None]:
# Rows of the API 20A kit table for which no CURIE could be derived.
dataframes['kit_api_20A_meta'].loc[lambda kit: kit['CURIE'].isna()]

In [None]:
# Bring the zym table in line with the other kit tables: rename its headers,
# recompute CURIE now that EC_number is present, and add the pseudo_CURIE and
# (empty) reaction_name columns.
dataframes['kit_api_zym_ec'] = (
    dataframes['kit_api_zym_ec']
    .rename(columns={'EC': 'EC_number', "Substrate":"substrate"})
    .assign(
        CURIE=lambda zym: zym.apply(get_curie, axis=1),
        pseudo_CURIE=lambda zym: "API_zym:" + zym["name_bacdive"].astype(str),
        reaction_name=pd.NA,
    )
)
dataframes['kit_api_zym_ec'].head()

In [None]:
# Rows of the zym table for which no CURIE could be derived.
dataframes['kit_api_zym_ec'].loc[lambda zym: zym['CURIE'].isna()]

In [None]:
# Columns kept in the combined mapping table. Each label appears exactly once:
# repeating a label (the original listed "ID_microbiol" twice) makes the
# selection below return that column twice, and the duplicate would end up in
# the exported table.
column_subset = [
    "cupule",
    "name_bacdive",
    "reaction_name",
    "ID_microbiol",
    "substrate",
    "EC_number",
    "ID_CHEBI",
    "CAS",
    "kegg_comp",
    "CURIE",
    "pseudo_CURIE",
]

# Restrict every loaded table to the shared columns, in the same order.
df_subset_dict = {k: df[column_subset] for k, df in dataframes.items()}

# Stack all kit tables into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(df_subset_dict.values(), ignore_index=True)

combined_df.head()


In [None]:
# Write the combined mapping table as a tab-separated file without the index.
# The output directory is created first because to_csv does not create
# missing parent directories.
bacdive_transform_dir.mkdir(parents=True, exist_ok=True)
combined_df.to_csv(bacdive_transform_dir / "bacdive_mappings.tsv", sep="\t", index=False)