In [1]:
import re
from urllib.parse import quote_plus

import pandas as pd
import requests

In [2]:
# Base URL of the MeSH Browser CGI search on www.nlm.nih.gov (2015 path); the
# search term is appended directly after `term=` by the match functions.
# NOTE(review): plain-http, year-pinned endpoint - confirm it is still served.
MeSH_url = 'http://www.nlm.nih.gov/cgi/mesh/2015/MB_cgi?mode=&term='

In [3]:
# Pattern capturing the contents of the "Unique ID" table cell in the HTML
# returned by the MeSH Browser.
_UNIQUE_ID_RE = re.compile('Unique ID</TH><TD colspan=1>(.+?)<')

# Seconds to wait for the MeSH server; without a timeout a stalled connection
# would block the whole batch of lookups indefinitely.
_MESH_TIMEOUT_SECONDS = 30


def _mesh_unique_id(term):
    """Query the MeSH Browser for `term` and return the scraped Unique ID.

    Returns '' when `term` is not a string (for example a missing value read
    from an input table) or when the response has no "Unique ID" cell.
    Exceptions raised by `requests` (connection errors, timeouts) propagate.
    """
    if not isinstance(term, str):
        # Nothing sensible to search for; report "no ID" without a request.
        return ''
    # quote_plus encodes spaces as '+' (same as the previous manual replace)
    # and additionally escapes characters such as '&', '#' or ',' that would
    # otherwise corrupt the query string.
    response = requests.get(MeSH_url + quote_plus(term),
                            timeout=_MESH_TIMEOUT_SECONDS)
    match = _UNIQUE_ID_RE.search(response.text)
    return match.group(1) if match else ''


# takes in a disease and finds the corresponding MeSH disease ID
def MeSH_disease_match(disease):
    """Return `[disease, mesh_id]`, where `mesh_id` is '' if no ID was found."""
    return [disease, _mesh_unique_id(disease)]


# takes in a symptom and finds the corresponding MeSH symptom ID
def MeSH_symptom_match(symptom):
    """Return `[symptom, mesh_id]`, where `mesh_id` is '' if no ID was found."""
    return [symptom, _mesh_unique_id(symptom)]

In [4]:
# functions to extract the list of diseases & symptoms from the input files
def MeSH_disease_extract(file):
    """Read a tab-separated table (first row is the header) from `file` and
    return the values of its 'MeSH Disease Term' column as a list."""
    table = pd.read_csv(file, sep='\t', header=0)
    return [term for term in table['MeSH Disease Term']]

def MeSH_symptom_extract(file):
    """Read a tab-separated table (first row is the header) from `file` and
    return the values of its 'MeSH Symptom Term' column as a list."""
    table = pd.read_csv(file, sep='\t', header=0)
    return [term for term in table['MeSH Symptom Term']]

In [5]:
# creates reference for MeSH disease IDs
# Each distinct term is looked up once (dict.fromkeys keeps first-seen order):
# repeated terms would otherwise cost one HTTP request each and leave duplicate
# key rows in `output`, which multiply rows in any later merge on the term.
your_list = list(dict.fromkeys(MeSH_disease_extract('Disease-Occurence-Input.tsv')))
output = list(map(MeSH_disease_match, your_list))
header = ['MeSH Disease Term', 'MeSH Disease ID']
output = pd.DataFrame(output, columns = header)

# creates reference for MeSH symptom IDs (same de-duplication as above)
your_list = list(dict.fromkeys(MeSH_symptom_extract('Symptom-Occurence-Input.tsv')))
output2 = list(map(MeSH_symptom_match, your_list))
header = ['MeSH Symptom Term', 'MeSH Symptom ID']
output2 = pd.DataFrame(output2, columns = header)



In [6]:
# Load each input table and attach the retrieved MeSH IDs. No join keys are
# given, so pandas merges on the column names both frames share, using its
# default inner join.
df_combined = (
    pd.read_csv('Combined-Input.tsv', sep='\t', header=0)
    .merge(output)
    .merge(output2)
)
df_symptom = pd.read_csv('Symptom-Occurence-Input.tsv', sep='\t', header=0).merge(output2)
df_disease = pd.read_csv('Disease-Occurence-Input.tsv', sep='\t', header=0).merge(output)

In [7]:
# Write each merged table to its tab-separated output file. `index` is left at
# the pandas default, so the row index is written as the first column.
for frame, path in (
    (df_combined, 'Combined-Output.tsv'),
    (df_symptom, 'Symptom-Occurence-Output.tsv'),
    (df_disease, 'Disease-Occurence-Output.tsv'),
):
    frame.to_csv(path, sep='\t')