# Mapping metabolites to strains on MiMeDB

## Importing modules and defining functions

In [21]:
#import modules
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
#import tqdm.notebook as tqdm

In [1]:
# Build the MiMeDB page URL for a given species.
def build_url(species_name, df_species):
    """Return the MiMeDB microbe page URL for ``species_name``.

    The lookup compares ``species_name`` against the 'species' column of
    ``df_species`` and uses the 'microbe_id' of the first matching row.
    When nothing matches, the message "Species not found in the DataFrame"
    is returned in place of a URL.
    """
    is_requested_species = df_species['species'] == species_name
    candidates = df_species[is_requested_species]

    # Guard clause: no row carries this species name.
    if candidates.empty:
        return "Species not found in the DataFrame"

    # Only the first match is used, even if several rows share the name.
    first_microbe_id = candidates.iloc[0]['microbe_id']
    return f'https://mimedb.org/microbes/{first_microbe_id}'

In [85]:
def _read_metabolite_rows(driver, attempts=3):
    """Collect the metabolite records from the table page currently displayed.

    The table may be re-rendered while it is being read, which invalidates
    elements that were already located (stale element reference). When that
    happens the table is located again and the whole page is re-read, up to
    ``attempts`` times; the last failure is re-raised.

    Returns a list of dicts, one per table row that has at least 7 cells.
    """
    # Local import so this block does not depend on the notebook's import cell.
    from selenium.common.exceptions import StaleElementReferenceException

    for attempt in range(1, attempts + 1):
        try:
            table = driver.find_element(By.ID, 'related-metabolites-table')
            page_records = []
            # The first <tr> is skipped (presumably the header row).
            for row in table.find_elements(By.TAG_NAME, 'tr')[1:]:
                cells = row.find_elements(By.TAG_NAME, 'td')
                # cells[6] is read below, so a row needs at least 7 cells.
                if len(cells) >= 7:
                    page_records.append({
                        'Metabolite ID': cells[0].text.strip(),
                        'Metabolite Name': cells[1].text.strip(),
                        # cells[2] is intentionally not collected.
                        'Class': cells[3].text.strip(),
                        'Microbial Link': cells[4].text.strip(),
                        'Data Source': cells[5].text.strip(),
                        'Reference': cells[6].text.strip()
                    })
            return page_records
        except StaleElementReferenceException:
            # Partial results of this attempt are discarded, so a retry
            # cannot produce duplicated rows.
            if attempt == attempts:
                raise
            time.sleep(1)
    return []


def scrape_metabolites(species_name, df_species):
    """Scrape the "related metabolites" table of a species page on MiMeDB.

    Parameters
    ----------
    species_name : value looked up in the 'species' column via ``build_url``.
    df_species : DataFrame handed to ``build_url`` to resolve the page URL.

    Returns
    -------
    list of dict
        One record per metabolite row with the keys 'Metabolite ID',
        'Metabolite Name', 'Class', 'Microbial Link', 'Data Source' and
        'Reference'. Errors raised while paging through the table are
        printed and the records gathered so far are returned.

    Raises
    ------
    Exception
        Failures while opening the page or waiting for the expand button
        propagate to the caller. The browser is closed in every case.
    """
    url = build_url(species_name, df_species)
    metabolites = []

    driver = webdriver.Chrome()  # Replace with your browser's driver
    try:
        # Page load and the initial wait are inside try/finally so the
        # browser is always closed, even when they fail.
        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "related_metabolites-expand-button"))
        )
        time.sleep(3)

        try:
            expand_button = driver.find_element(By.ID, "related_metabolites-expand-button")
            if expand_button.is_displayed():
                expand_button.click()
                time.sleep(5)

            # Show 100 rows per page to reduce the number of page turns.
            dropdown = Select(driver.find_element(By.NAME, "related-metabolites-table_length"))
            dropdown.select_by_value("100")
            time.sleep(3)

            while True:
                metabolites.extend(_read_metabolite_rows(driver))

                next_button = driver.find_element(By.ID, "related-metabolites-table_next")
                # get_attribute may return None when the attribute is absent.
                if "disabled" in (next_button.get_attribute("class") or ""):
                    break
                next_button.click()
                time.sleep(3)

        except Exception as e:
            # Best effort: report the problem and keep what was collected.
            print(f"An error occurred: {e}")

    finally:
        driver.quit()

    return metabolites

## Loading the species database and mapping metabolites to strains
Information and databases were retrieved from https://mimedb.org/

The database used here is mimedb_microbes_v1.csv

In [84]:
# Location of the MiMeDB microbes export on the local machine.
file_path = '/home/marco/microbiota/MiMEDB/mimedb_microbes_v1.csv' #change file_path
# Only the identifier and the two name columns are loaded.
species_columns = ['microbe_id', 'species', 'name']
df_species = pd.read_csv(file_path, usecols=species_columns)

In [87]:
species_metabolites_data = []

# Column order of the final table. Passing it to the DataFrame constructor
# keeps the schema intact even when no metabolite could be scraped.
output_columns = [
    'Species Name', 'MiMeDB Name', 'Species ID',
    'Metabolite ID', 'Metabolite Name', 'Class',
    'Microbial Link', 'Data Source', 'Reference',
]

for index, row in df_species.iterrows():
    # Read the row before the try block so the error message below can
    # always name the species being processed.
    species_name = row['species']
    species_fullname = row['name']
    species_id = row['microbe_id']

    try:
        # Hand over only the current row: scrape_metabolites resolves the page
        # from the first row whose 'species' matches, so with the full frame
        # every row sharing a species name would be scraped from the same page
        # while being recorded under its own 'Species ID'.
        metabolites = scrape_metabolites(species_name, df_species.loc[[index]])
    except Exception as e:
        # Best effort: report the failure and continue with the next row.
        print(f"Error processing {species_name}: {e}")
        continue

    for metabolite in metabolites:
        species_metabolites_data.append({
            'Species Name': species_name,
            'MiMeDB Name': species_fullname,
            'Species ID': species_id,
            'Metabolite ID': metabolite['Metabolite ID'],
            'Metabolite Name': metabolite['Metabolite Name'],
            'Class': metabolite['Class'],
            'Microbial Link': metabolite['Microbial Link'],
            'Data Source': metabolite['Data Source'],
            'Reference': metabolite['Reference']
        })

# Convert the collected data to a DataFrame
final_metabolites_df = pd.DataFrame(species_metabolites_data, columns=output_columns)

An error occurred: Message: stale element reference: stale element not found
  (Session info: chrome=119.0.6045.159); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
#0 0x55666b6bc463 <unknown>
#1 0x55666b38e247 <unknown>
#2 0x55666b394405 <unknown>
#3 0x55666b396716 <unknown>
#4 0x55666b39684c <unknown>
#5 0x55666b3db29c <unknown>
#6 0x55666b3db671 <unknown>
#7 0x55666b3cf683 <unknown>
#8 0x55666b40219d <unknown>
#9 0x55666b3cf0d3 <unknown>
#10 0x55666b40233e <unknown>
#11 0x55666b41dd00 <unknown>
#12 0x55666b401f43 <unknown>
#13 0x55666b3cd7f3 <unknown>
#14 0x55666b3ce79e <unknown>
#15 0x55666b68620a <unknown>
#16 0x55666b689ff4 <unknown>
#17 0x55666b675852 <unknown>
#18 0x55666b68aa10 <unknown>
#19 0x55666b65bbde <unknown>
#20 0x55666b6aacf8 <unknown>
#21 0x55666b6aaeea <unknown>
#22 0x55666b6bb588 <unknown>
#23 0x7fe5afd15b43 <unknown>

Error processing Acidaminococc

## Cleaning and saving the final database in csv

In [88]:
# Split the combined table into one sub-frame per 'Species ID'.
grouped = final_metabolites_df.groupby('Species ID')

# Iterating a GroupBy yields (key, sub-frame) pairs, which dict() consumes
# directly to give a {species_id: sub-frame} mapping.
dfs_per_species_id = dict(iter(grouped))

In [91]:
# Save without the positional index; otherwise the file gains an extra
# unnamed first column that shows up as 'Unnamed: 0' when it is read back.
final_metabolites_df.to_csv('metabolites_and_species_and_names.csv', index=False)

In [31]:
# Reload the saved table; a leftover index column ('Unnamed: 0') is removed
# when present and silently skipped when absent.
df = pd.read_csv('metabolites_and_species_and_names.csv').drop(
    columns='Unnamed: 0', errors='ignore'
)

In [35]:
## Check species names that need manual correction

In [48]:
# Species names consisting of a single word (no space-separated second part).
names_to_fix = [
    parts[0]
    for parts in (species.split(' ') for species in df['Species Name'])
    if len(parts) == 1
]

# Species names that do not occur as a substring of the MiMeDB full name.
names_to_check = [
    species
    for species, full_name, _ in zip(df['Species Name'], df['MiMeDB Name'], df['Species ID'])
    if species not in full_name
]

# De-duplicated views of both lists.
ntf = set(names_to_fix)
ntc = set(names_to_check)
# Print ntf / ntc to check the names manually.

In [38]:
## Test whether the manual corrections were saved

In [61]:
# Reload the manually corrected file. It is decoded as cp1252 instead of the
# pandas default -- presumably the encoding used by the tool that saved the
# manual edits; TODO confirm.
corrected_csv = 'metabolites_and_species_and_names.csv'
df = pd.read_csv(corrected_csv, encoding='cp1252')

In [65]:
# Repeat the single-word check on the reloaded table.
names_to_fix = [
    parts[0]
    for parts in (species.split(' ') for species in df['Species Name'])
    if len(parts) == 1
]
ntf = set(names_to_fix)

# No single-word species names left means the manual fixes are in the file.
if not ntf:
    print('The database has been corrected')
# Otherwise print ntf to check the remaining names manually.

The database has been corrected
