# Phase 1: HPO ID extraction

In [15]:
import json

# Load the hp.json file
with open("hp.json", "r", encoding="utf-8") as file:
    hpoData = json.load(file)

# Build a lookup dictionary for label and synonyms -> HPO ID
termToHpoId = {}

for node in hpoData['graphs'][0]['nodes']:
    fullId = node.get('id', '')
    # `or ''` guards against a node whose 'lbl' is present but null.
    label = (node.get('lbl') or '').lower()

    if fullId.startswith("http://purl.obolibrary.org/obo/HP_"):
        hpoId = fullId.replace("http://purl.obolibrary.org/obo/HP_", "HP:")
    else:
        continue  # Skip nodes that are not HPO terms

    if label:
        # A primary label always claims its key.
        termToHpoId[label] = hpoId

    # Include synonyms if available
    synonyms = node.get('meta', {}).get('synonyms', [])
    for syn in synonyms:
        synonymText = (syn.get('val') or '').lower()
        if synonymText:
            # setdefault: a synonym must never displace an entry that is
            # already present (e.g. another term's primary label), otherwise
            # the mapping depends on the order of nodes in the file.
            termToHpoId.setdefault(synonymText, hpoId)

# Function to get HPO ID and display details
def getHpoDetails(term):
    """Print the HPO ID and browser URL for ``term``.

    The term is lower-cased and stripped, then looked up in the module-level
    ``termToHpoId`` table (exact match only).  Prints "HPO ID not found" when
    the table has no entry.  Returns ``None``.
    """
    term = term.lower().strip()
    hpoId = termToHpoId.get(term)

    if hpoId:
        # The browse URL embeds the ID unchanged (e.g. "HP:0001250"), so no
        # prefix rewriting or slicing is needed.
        hpoUrl = f"https://hpo.jax.org/app/browse/term/{hpoId}"
        print(f"HPO ID     : {hpoId}")
        print(f"HPO Website: {hpoUrl}")
    else:
        print("HPO ID not found")

# Example searches (exact, case-insensitive lookups against labels and synonyms)
getHpoDetails("Seizure")            # HP:0001250
getHpoDetails("epileptic episode")  # no exact entry: the recorded run prints "HPO ID not found"
getHpoDetails("short stature")      # HP:0004322
getHpoDetails("Parkinsonism")       # HP:0001300

HPO ID     : HP:0001250
HPO Website: https://hpo.jax.org/app/browse/term/HP:0001250
HPO ID not found
HPO ID     : HP:0004322
HPO Website: https://hpo.jax.org/app/browse/term/HP:0004322
HPO ID     : HP:0001300
HPO Website: https://hpo.jax.org/app/browse/term/HP:0001300


# Phase 2: Display and Visualization

In [19]:
import json
import pandas as pd

def loadHpoTerms(jsonFilePath):
    """Build a lower-cased term -> HPO ID lookup from an ``hp.json`` file.

    Parameters
    ----------
    jsonFilePath : str
        Path to a JSON file whose ``graphs[0].nodes`` entries carry ``id``,
        ``lbl`` and optionally ``meta.synonyms[*].val``.

    Returns
    -------
    dict
        Maps every lower-cased label and synonym to an ID of the form
        ``"HP:0001250"``.  Nodes whose ``id`` does not start with the HPO IRI
        prefix are skipped.  A synonym is only added when its text is not
        already a key, so it can never displace an existing label.
    """
    iriPrefix = "http://purl.obolibrary.org/obo/HP_"

    with open(jsonFilePath, "r", encoding="utf-8") as file:
        hpoData = json.load(file)

    termToHpoId = {}

    for node in hpoData['graphs'][0]['nodes']:
        fullId = node.get('id', '')
        if not fullId.startswith(iriPrefix):
            continue  # Skip non-HPO terms
        hpoId = fullId.replace(iriPrefix, "HP:")

        # `or ''` tolerates a key that is present but null.
        label = (node.get('lbl') or '').lower()
        if label:
            termToHpoId[label] = hpoId

        # Include synonyms
        for syn in node.get('meta', {}).get('synonyms', []):
            synonymText = (syn.get('val') or '').lower()
            if synonymText:
                # Previously a synonym overwrote whatever was stored, making
                # the result depend on node order in the file.
                termToHpoId.setdefault(synonymText, hpoId)

    return termToHpoId

def getHpoDetails(term, hpoMap):
    """Describe the exact-match lookup of ``term`` as one table row.

    ``term`` is lower-cased and stripped before being looked up in ``hpoMap``
    (a dict of term text -> HPO ID).  The returned dict always has the keys
    "Term", "Mapped Term", "HPO ID" and "HPO Website"; on a miss the last two
    are "Not Found" and "Mapped Term" is "N/A".
    """
    normalized = term.lower().strip()
    matchedId = hpoMap.get(normalized)

    # Guard clause: report the miss first, then fall through to the hit.
    if not matchedId:
        return {
            "Term": term,
            "Mapped Term": "N/A",
            "HPO ID": "Not Found",
            "HPO Website": "Not Found"
        }

    return {
        "Term": term,
        "Mapped Term": normalized,
        "HPO ID": matchedId,
        "HPO Website": f"https://hpo.jax.org/app/browse/term/{matchedId}"
    }


# Example usage: build the lookup once, then map a handful of sample terms.
hpoFilePath = "hp.json"
hpoMap = loadHpoTerms(hpoFilePath)

# "Parkinson's Disease" has no exact label/synonym entry, so it exercises the
# "Not Found" branch (see the recorded output).
inputTerms = ['Seizure', "Parkinson's Disease", 'Ataxia', 'Headache']

results = [getHpoDetails(term, hpoMap) for term in inputTerms]
df = pd.DataFrame(results)

# Display the result
df

Unnamed: 0,Term,Mapped Term,HPO ID,HPO Website
0,Seizure,seizure,HP:0001250,https://hpo.jax.org/app/browse/term/HP:0001250
1,Parkinson's Disease,,Not Found,Not Found
2,Ataxia,ataxia,HP:0001251,https://hpo.jax.org/app/browse/term/HP:0001251
3,Headache,headache,HP:0002315,https://hpo.jax.org/app/browse/term/HP:0002315


# Making it Intelligent

In [3]:
import json
import pandas as pd
from rapidfuzz import process, fuzz

# Abbreviation dictionary: shorthand -> full term.
# Keys are lower-case because expand_abbreviation() lower-cases and strips its
# argument before the lookup.
abbreviation_dict = {
    "dm": "diabetes mellitus",
    "htn": "hypertension",
    "copd": "chronic obstructive pulmonary disease",
    "cad": "coronary artery disease",
    "ckd": "chronic kidney disease",
    "tb": "tuberculosis",
    "ra": "rheumatoid arthritis",
    "cvd": "cardiovascular disease",

}

def expand_abbreviation(term):
    """Return the expansion of ``term`` from ``abbreviation_dict``.

    The lookup key is ``term`` lower-cased and stripped.  When there is no
    entry, ``term`` itself is returned unchanged.
    """
    lookup_key = term.lower().strip()
    if lookup_key in abbreviation_dict:
        return abbreviation_dict[lookup_key]
    return term

def loadHpoTerms(jsonFilePath):
    """Build a lower-cased term -> HPO ID lookup from an ``hp.json`` file.

    Parameters
    ----------
    jsonFilePath : str
        Path to a JSON file whose ``graphs[0].nodes`` entries carry ``id``,
        ``lbl`` and optionally ``meta.synonyms[*].val``.

    Returns
    -------
    dict
        Maps every lower-cased label and synonym to an ID of the form
        ``"HP:0001250"``.  Nodes whose ``id`` does not start with the HPO IRI
        prefix are skipped.  A synonym is only added when its text is not
        already a key, so it can never displace an existing label.
    """
    iriPrefix = "http://purl.obolibrary.org/obo/HP_"

    with open(jsonFilePath, "r", encoding="utf-8") as file:
        hpoData = json.load(file)

    termToHpoId = {}

    for node in hpoData['graphs'][0]['nodes']:
        fullId = node.get('id', '')
        if not fullId.startswith(iriPrefix):
            continue  # Skip non-HPO terms
        hpoId = fullId.replace(iriPrefix, "HP:")

        # `or ''` tolerates a key that is present but null.
        label = (node.get('lbl') or '').lower()
        if label:
            termToHpoId[label] = hpoId

        # Include synonyms
        for syn in node.get('meta', {}).get('synonyms', []):
            synonymText = (syn.get('val') or '').lower()
            if synonymText:
                # Previously a synonym overwrote whatever was stored, making
                # the result depend on node order in the file.
                termToHpoId.setdefault(synonymText, hpoId)

    return termToHpoId

def getHpoDetails(term, hpoMap, confidence_threshold=85):
    """Map ``term`` to an HPO ID, falling back to fuzzy matching.

    The term is lower-cased, stripped and passed through
    ``expand_abbreviation``.  An exact key in ``hpoMap`` wins; otherwise the
    best ``fuzz.token_sort_ratio`` candidate is accepted when its score is at
    least ``confidence_threshold``.

    Returns a dict with the keys "Term" (the input as given), "Mapped Term"
    (the ``hpoMap`` key that matched), "HPO ID" and "HPO Website"; the last
    three are "Not Found" when nothing matched.
    """
    not_found = {
        "Term": term,
        "Mapped Term": "Not Found",
        "HPO ID": "Not Found",
        "HPO Website": "Not Found"
    }

    lookup_key = expand_abbreviation(term.lower().strip())

    if lookup_key not in hpoMap:
        # Fuzzy match to handle variations
        candidate = process.extractOne(lookup_key, hpoMap.keys(), scorer=fuzz.token_sort_ratio)
        if not (candidate and candidate[1] >= confidence_threshold):
            return not_found
        lookup_key = candidate[0]

    resolved_id = hpoMap[lookup_key]
    if not resolved_id:
        return not_found

    return {
        "Term": term,
        "Mapped Term": lookup_key,
        "HPO ID": resolved_id,
        "HPO Website": f"https://hpo.jax.org/app/browse/term/{resolved_id}"
    }

# Example usage
if __name__ == "__main__":
    hpoFilePath = "hp.json"
    hpoMap = loadHpoTerms(hpoFilePath)

    # Mix of plain terms, a near-miss ("Parkinson's Disease") that relies on
    # the fuzzy fallback, and abbreviations expanded via abbreviation_dict.
    inputTerms = ['Seizure', "Parkinson's Disease", 'Ataxia', 'Headache', 'DM', 'HTN', 'COPD', 'RA']

    results = [getHpoDetails(term, hpoMap) for term in inputTerms]
    df = pd.DataFrame(results)
    print(df)

                  Term                            Mapped Term      HPO ID  \
0              Seizure                                seizure  HP:0001250   
1  Parkinson's Disease                   parkinsonian disease  HP:0001300   
2               Ataxia                                 ataxia  HP:0001251   
3             Headache                               headache  HP:0002315   
4                   DM                      diabetes mellitus  HP:0000819   
5                  HTN                           hypertension  HP:0000822   
6                 COPD  chronic obstructive pulmonary disease  HP:0006510   
7                   RA                   rheumatoid arthritis  HP:0001370   

                                      HPO Website  
0  https://hpo.jax.org/app/browse/term/HP:0001250  
1  https://hpo.jax.org/app/browse/term/HP:0001300  
2  https://hpo.jax.org/app/browse/term/HP:0001251  
3  https://hpo.jax.org/app/browse/term/HP:0002315  
4  https://hpo.jax.org/app/browse/term/HP:0000

In [4]:
# Rich display of the results table built by the previous cell.
df

Unnamed: 0,Term,Mapped Term,HPO ID,HPO Website
0,Seizure,seizure,HP:0001250,https://hpo.jax.org/app/browse/term/HP:0001250
1,Parkinson's Disease,parkinsonian disease,HP:0001300,https://hpo.jax.org/app/browse/term/HP:0001300
2,Ataxia,ataxia,HP:0001251,https://hpo.jax.org/app/browse/term/HP:0001251
3,Headache,headache,HP:0002315,https://hpo.jax.org/app/browse/term/HP:0002315
4,DM,diabetes mellitus,HP:0000819,https://hpo.jax.org/app/browse/term/HP:0000819
5,HTN,hypertension,HP:0000822,https://hpo.jax.org/app/browse/term/HP:0000822
6,COPD,chronic obstructive pulmonary disease,HP:0006510,https://hpo.jax.org/app/browse/term/HP:0006510
7,RA,rheumatoid arthritis,HP:0001370,https://hpo.jax.org/app/browse/term/HP:0001370


In [14]:
import json
import pandas as pd
from rapidfuzz import process, fuzz

# Abbreviation dictionary: shorthand / variant spelling -> phrase to look up.
# Keys are lower-case because expandAbbreviation() lower-cases and strips its
# argument before the lookup.
abbreviationDict = {
    # Common shorthand / abbreviations
    "dm": "diabetes mellitus",
    "htn": "hypertension",
    "ht": "hypertension",
    "cad": "coronary artery disease",
    "copd": "chronic obstructive pulmonary disease",
    "ckd": "chronic kidney disease",
    "tb": "tuberculosis",
    "ra": "rheumatoid arthritis",
    "aiha": "autoimmune hemolytic anemia",
    "asd": "atrial septal defect",
    "pda": "patent ductus arteriosus",
    "aki": "acute kidney injury",
    "vt": "ventricular tachycardia",
    "bph": "benign prostatic hyperplasia",
    "kco": "known case of",
    "f/u": "follow up",
    "f/u/c": "follow up case",
    "lgmd": "limb-girdle muscular dystrophy",
    "wes": "whole exome sequencing",
    "rds": "respiratory distress syndrome",
    "rta": "renal tubular acidosis",
    "sos": "sum of symptoms",  # NOTE(review): confirm this expansion with a clinician
    "mod": "multiple organ dysfunction",
    "mods": "multiple organ dysfunction syndrome",
    "loc": "loss of consciousness",
    "adl": "activities of daily living",
    "dtr": "deep tendon reflexes",
    "ul": "upper limb",
    "ll": "lower limb",
    "bl": "bilateral",
    
    # Conditions in shorthand or miswritten
    "hypothyroid": "hypothyroidism",
    "old d 12 fracture": "fractured thoracic vertebra",
    "encephlopathy": "encephalopathy",
    "dyslipidemia": "lipid metabolism disorder",  # interpreted
    "delayed milestones": "global developmental delay",
    "sick sinus syndrome": "sinus node dysfunction",
    "bicornate uterus": "uterine anomaly",
    "hearing loss": "sensorineural hearing impairment",
    "coagulopathy": "abnormality of coagulation",
    "pseudo obstruction": "intestinal pseudo-obstruction",
    
    # Specific syndromes / diseases
    "west syndrome": "epileptic encephalopathy",
    "actg2": "visceral myopathy",
    "msa": "multiple system atrophy",
    "meningitis": "meningitis",  # identity entry: expands to itself
    "becker's": "becker muscular dystrophy",
    "glycogen storage disorder": "abnormal hepatic glycogen storage",
    "spinocerebellar ataxia": "hereditary ataxia",
    "cidp": "chronic inflammatory demyelinating polyneuropathy",
    "cerebral palsy": "ataxic cerebral palsy",
    "neurocut syndrome": "neurocutaneous syndrome",
    "heart block": "atrioventricular block",
    "stroke": "cerebral infarct",
    "fatty acid oxidation": "fatty-acid oxidation disorder",
}


def expandAbbreviation(term):
    """Return the ``abbreviationDict`` expansion of ``term``.

    The lookup key is ``term`` lower-cased and stripped; ``term`` itself is
    returned unchanged when the dictionary has no entry for that key.
    """
    lookupKey = term.lower().strip()
    if lookupKey in abbreviationDict:
        return abbreviationDict[lookupKey]
    return term

def loadHpoTerms(jsonFilePath):
    """Read an HPO JSON export and index its terms by lower-cased text.

    Returns a dict mapping each lower-cased label or synonym to a
    ``(hpoId, officialLabel)`` tuple such as ``("HP:0001250", "Seizure")``.
    A label always claims its key; a synonym is added only when its text is
    not already present.  Nodes whose ``id`` lies outside the HPO IRI prefix
    are ignored.
    """
    hpoIriPrefix = "http://purl.obolibrary.org/obo/HP_"

    with open(jsonFilePath, "r", encoding="utf-8") as handle:
        ontology = json.load(handle)

    lookup = {}  # term -> (HPO ID, official label)

    for entry in ontology['graphs'][0]['nodes']:
        iri = entry.get('id', '')
        officialLabel = entry.get('lbl', '').strip()

        if not iri.startswith(hpoIriPrefix):
            continue  # not an HPO term

        record = (iri.replace(hpoIriPrefix, "HP:"), officialLabel)

        labelKey = officialLabel.lower()
        if labelKey:
            lookup[labelKey] = record

        # Synonyms point at the same record, i.e. at the official label.
        for synonym in entry.get('meta', {}).get('synonyms', []):
            synonymKey = synonym.get('val', '').lower()
            if synonymKey:
                lookup.setdefault(synonymKey, record)

    return lookup

def getHpoDetails(term, hpoMap, confidenceThreshold=85):
    """Map ``term`` to an HPO ID and official label, with a fuzzy fallback.

    The term is lower-cased, stripped and expanded via ``expandAbbreviation``.
    An exact key in ``hpoMap`` wins; otherwise the best
    ``fuzz.token_sort_ratio`` candidate is used when it scores at least
    ``confidenceThreshold``.  ``hpoMap`` values are ``(hpoId, label)`` tuples.

    Returns a dict with the keys "Term" (input as given), "Mapped Term"
    (official label), "HPO ID" and "HPO Website"; the last three are
    "Not Found" when nothing matched.
    """
    query = expandAbbreviation(term.lower().strip())

    if query in hpoMap:
        matchedKey = query
    else:
        best = process.extractOne(query, hpoMap.keys(), scorer=fuzz.token_sort_ratio)
        matchedKey = best[0] if best and best[1] >= confidenceThreshold else None

    if matchedKey is None:
        foundId, foundLabel = None, None
    else:
        foundId, foundLabel = hpoMap[matchedKey]

    if not foundId:
        return {
            "Term": term,
            "Mapped Term": "Not Found",
            "HPO ID": "Not Found",
            "HPO Website": "Not Found"
        }

    return {
        "Term": term,
        "Mapped Term": foundLabel,
        "HPO ID": foundId,
        "HPO Website": f"https://hpo.jax.org/app/browse/term/{foundId}"
    }

# Example usage
if __name__ == "__main__":
    hpoFilePath = "hp.json"
    hpoMap = loadHpoTerms(hpoFilePath)

    # Sample inputs: exact labels, a misspelling ("Encephlopathy"), a near-miss
    # that needs the fuzzy fallback, and abbreviations.  'HTN' appears twice,
    # so the result table has two identical rows for it.
    inputTerms = [
        'Seizure', 
        "Parkinson's Disease",
        "Encephlopathy",
        'Ataxia', 
        'Headache', 
        'DM', 
        'HTN', 
        'COPD',
        'CKD',
        'HTN',
        'Abnormal Metabolism',
        'AIHA',
        'Acute Pancreatitis',
        'RA'
    ]

    results = [getHpoDetails(term, hpoMap) for term in inputTerms]
    df = pd.DataFrame(results)
    print(df)

                   Term                    Mapped Term      HPO ID  \
0               Seizure                        Seizure  HP:0001250   
1   Parkinson's Disease                   Parkinsonism  HP:0001300   
2         Encephlopathy                 Encephalopathy  HP:0001298   
3                Ataxia                         Ataxia  HP:0001251   
4              Headache                       Headache  HP:0002315   
5                    DM              Diabetes mellitus  HP:0000819   
6                   HTN                   Hypertension  HP:0000822   
7                  COPD  Chronic pulmonary obstruction  HP:0006510   
8                   CKD         Chronic kidney disease  HP:0012622   
9                   HTN                   Hypertension  HP:0000822   
10  Abnormal Metabolism            Abnormal metabolism  HP:0032245   
11                 AIHA    Autoimmune hemolytic anemia  HP:0001890   
12   Acute Pancreatitis             Acute pancreatitis  HP:0001735   
13                  

In [15]:
# Rich display of the results table built by the previous cell.
df

Unnamed: 0,Term,Mapped Term,HPO ID,HPO Website
0,Seizure,Seizure,HP:0001250,https://hpo.jax.org/app/browse/term/HP:0001250
1,Parkinson's Disease,Parkinsonism,HP:0001300,https://hpo.jax.org/app/browse/term/HP:0001300
2,Encephlopathy,Encephalopathy,HP:0001298,https://hpo.jax.org/app/browse/term/HP:0001298
3,Ataxia,Ataxia,HP:0001251,https://hpo.jax.org/app/browse/term/HP:0001251
4,Headache,Headache,HP:0002315,https://hpo.jax.org/app/browse/term/HP:0002315
5,DM,Diabetes mellitus,HP:0000819,https://hpo.jax.org/app/browse/term/HP:0000819
6,HTN,Hypertension,HP:0000822,https://hpo.jax.org/app/browse/term/HP:0000822
7,COPD,Chronic pulmonary obstruction,HP:0006510,https://hpo.jax.org/app/browse/term/HP:0006510
8,CKD,Chronic kidney disease,HP:0012622,https://hpo.jax.org/app/browse/term/HP:0012622
9,HTN,Hypertension,HP:0000822,https://hpo.jax.org/app/browse/term/HP:0000822


# Tkinter App

In [18]:
import json
import pandas as pd
from tkinter import Tk, filedialog, Button, Text, Scrollbar, RIGHT, Y, END, LEFT, BOTH
from tkinter.ttk import Treeview
from rapidfuzz import process, fuzz
import os
import re

# -------------------------
# Abbreviation Dictionary
# -------------------------
# Shorthand / variant spelling -> phrase to look up.  Keys are lower-case
# because expandAbbreviation() lower-cases and strips its argument first.
# NOTE(review): extractTermsFromText() in this cell splits its input on "/",
# so the keys "f/u" and "f/u/c" cannot be produced from file input.
abbreviationDict = {
    "dm": "diabetes mellitus", "htn": "hypertension", "ht": "hypertension", "cad": "coronary artery disease",
    "copd": "chronic obstructive pulmonary disease", "ckd": "chronic kidney disease", "tb": "tuberculosis",
    "ra": "rheumatoid arthritis", "aiha": "autoimmune hemolytic anemia", "asd": "atrial septal defect",
    "pda": "patent ductus arteriosus", "aki": "acute kidney injury", "vt": "ventricular tachycardia",
    "bph": "benign prostatic hyperplasia", "kco": "known case of", "f/u": "follow up",
    "f/u/c": "follow up case", "lgmd": "limb-girdle muscular dystrophy", "wes": "whole exome sequencing",
    "rds": "respiratory distress syndrome", "rta": "renal tubular acidosis", "sos": "sum of symptoms",
    "mod": "multiple organ dysfunction", "mods": "multiple organ dysfunction syndrome", "loc": "loss of consciousness",
    "adl": "activities of daily living", "dtr": "deep tendon reflexes", "ul": "upper limb", "ll": "lower limb",
    "bl": "bilateral", "hypothyroid": "hypothyroidism", "old d 12 fracture": "fractured thoracic vertebra",
    "encephlopathy": "encephalopathy", "dyslipidemia": "lipid metabolism disorder",
    "delayed milestones": "global developmental delay", "sick sinus syndrome": "sinus node dysfunction",
    "bicornate uterus": "uterine anomaly", "hearing loss": "sensorineural hearing impairment",
    "coagulopathy": "abnormality of coagulation", "pseudo obstruction": "intestinal pseudo-obstruction",
    "west syndrome": "epileptic encephalopathy", "actg2": "visceral myopathy", "msa": "multiple system atrophy",
    "meningitis": "meningitis", "becker's": "becker muscular dystrophy", "glycogen storage disorder": "abnormal hepatic glycogen storage",
    "spinocerebellar ataxia": "hereditary ataxia", "cidp": "chronic inflammatory demyelinating polyneuropathy",
    "cerebral palsy": "ataxic cerebral palsy", "neurocut syndrome": "neurocutaneous syndrome",
    "heart block": "atrioventricular block", "stroke": "cerebral infarct", "fatty acid oxidation": "fatty-acid oxidation disorder"
}

# -------------------------
# Helper Functions
# -------------------------

def expandAbbreviation(term):
    """Return the ``abbreviationDict`` expansion of ``term``.

    The lookup key is ``term`` lower-cased and stripped; ``term`` itself is
    returned unchanged when the dictionary has no entry for that key.
    """
    lookupKey = term.lower().strip()
    try:
        return abbreviationDict[lookupKey]
    except KeyError:
        return term

def loadHpoTerms(jsonFilePath):
    """Read an HPO JSON export and index its terms by lower-cased text.

    Returns a dict mapping each lower-cased label or synonym to a
    ``(hpoId, officialLabel)`` tuple.  A label always claims its key; a
    synonym is added only when its text is not already present.  Nodes whose
    ``id`` lies outside the HPO IRI prefix are ignored.
    """
    hpoIriPrefix = "http://purl.obolibrary.org/obo/HP_"

    with open(jsonFilePath, "r", encoding="utf-8") as handle:
        ontology = json.load(handle)

    lookup = {}

    for entry in ontology['graphs'][0]['nodes']:
        iri = entry.get('id', '')
        officialLabel = entry.get('lbl', '').strip()

        if iri.startswith(hpoIriPrefix):
            record = (iri.replace(hpoIriPrefix, "HP:"), officialLabel)

            labelKey = officialLabel.lower()
            if labelKey:
                lookup[labelKey] = record

            # Synonyms resolve to the same record as the official label.
            synonymEntries = entry.get('meta', {}).get('synonyms', [])
            for synonymKey in (s.get('val', '').lower() for s in synonymEntries):
                if synonymKey:
                    lookup.setdefault(synonymKey, record)

    return lookup

def getHpoDetails(term, hpoMap, confidenceThreshold=85):
    """Map ``term`` to an HPO ID and official label, with a fuzzy fallback.

    The stripped term is lower-cased and expanded via ``expandAbbreviation``.
    An exact key in ``hpoMap`` wins; otherwise the best
    ``fuzz.token_sort_ratio`` candidate is used when it scores at least
    ``confidenceThreshold``.  ``hpoMap`` values are ``(hpoId, label)`` tuples.

    Returns a dict with the keys "Term" (the stripped input), "Mapped Term",
    "HPO ID" and "HPO Website"; missing values are reported as "Not Found".
    """
    cleaned = term.strip()
    query = expandAbbreviation(cleaned.lower())

    foundId = foundLabel = None

    if query in hpoMap:
        foundId, foundLabel = hpoMap[query]
    else:
        best = process.extractOne(query, hpoMap.keys(), scorer=fuzz.token_sort_ratio)
        if best and best[1] >= confidenceThreshold:
            foundId, foundLabel = hpoMap[best[0]]

    if foundId:
        website = f"https://hpo.jax.org/app/browse/term/{foundId}"
    else:
        website = "Not Found"

    return {
        "Term": cleaned,
        "Mapped Term": foundLabel or "Not Found",
        "HPO ID": foundId or "Not Found",
        "HPO Website": website
    }

def extractTermsFromText(text):
    """Split free text into lower-cased candidate terms.

    The text is lower-cased and split on runs of "/", ",", ";", newline and
    "-"; each piece is stripped and empty pieces are dropped.  Note that
    hyphenated phrases are therefore split at the hyphen.
    """
    pieces = re.split(r'[\/,;\n\-]+', text.lower())
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]

def processFile(filePath, hpoMap):
    """Extract candidate terms from a file and map each one to HPO.

    Parameters
    ----------
    filePath : str
        Path to an ``.xls``/``.xlsx``, ``.csv`` or ``.txt`` file (the
        extension is compared case-insensitively).
    hpoMap : dict
        Lookup table handed through to ``getHpoDetails``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct extracted term, in order of first appearance.
        An empty DataFrame is returned for any other file extension.
    """
    ext = os.path.splitext(filePath)[-1].lower()

    if ext in ['.xls', '.xlsx']:
        df = pd.read_excel(filePath, dtype=str)
    elif ext == '.csv':
        df = pd.read_csv(filePath, dtype=str)
    elif ext == '.txt':
        with open(filePath, 'r', encoding='utf-8') as f:
            lines = f.read()
        df = pd.DataFrame({'text': [lines]})
    else:
        return pd.DataFrame()

    allTerms = []
    for cell in df.to_numpy().ravel():
        # Blank spreadsheet/CSV cells are read as NaN; stringifying them
        # would inject a bogus "nan" term into the results.
        if pd.isna(cell):
            continue
        allTerms.extend(extractTermsFromText(str(cell)))

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result table is stable between runs (set() ordering of str is not).
    uniqueTerms = list(dict.fromkeys(t for t in allTerms if t))
    results = [getHpoDetails(term, hpoMap) for term in uniqueTerms]

    return pd.DataFrame(results)

# -------------------------
# GUI Setup
# -------------------------

def browseFile():
    """Let the user pick a file, map its terms and show them in the table.

    Does nothing when the dialog is cancelled.  Otherwise the module-level
    ``tree`` is emptied and refilled with one row per result of
    ``processFile``.
    """
    supportedTypes = [
        ("All supported", "*.xls *.xlsx *.csv *.txt"),
        ("Excel", "*.xls *.xlsx"),
        ("CSV", "*.csv"),
        ("Text", "*.txt"),
    ]
    chosenPath = filedialog.askopenfilename(filetypes=supportedTypes)
    if chosenPath:
        resultDf = processFile(chosenPath, hpoMap)

        # Drop the rows of the previous run before inserting the new ones.
        for existingRow in tree.get_children():
            tree.delete(existingRow)
        for _, record in resultDf.iterrows():
            tree.insert("", "end", values=tuple(record))

# -------------------------
# Run the App
# -------------------------

# Load the ontology lookup once; browseFile() reads this module-level name.
hpoMap = loadHpoTerms("hp.json")

root = Tk()
root.title("HPO Term Extractor")

btn = Button(root, text="Upload File", command=browseFile)
btn.pack(pady=10)

tree = Treeview(root, columns=("Term", "Mapped Term", "HPO ID", "HPO Website"), show="headings")
tree.heading("Term", text="Term")
tree.heading("Mapped Term", text="Mapped Term")
tree.heading("HPO ID", text="HPO ID")
tree.heading("HPO Website", text="HPO Website")

scroll = Scrollbar(root, orient="vertical", command=tree.yview)
tree.configure(yscrollcommand=scroll.set)

# Pack the scrollbar BEFORE the tree.  The packer hands out space in packing
# order: a top-packed, expanding tree packed first takes the full width, so a
# scrollbar packed afterwards only gets a short strip underneath the table
# instead of running alongside it.
scroll.pack(side=RIGHT, fill=Y)
tree.pack(side=LEFT, fill=BOTH, expand=True)

root.geometry("1000x500")
root.mainloop()


# Tkinter App UI/UX Improved

In [31]:
import json
import pandas as pd
from tkinter import Tk, filedialog, Button, Text, Scrollbar, RIGHT, Y, END, LEFT, BOTH, Frame, Label
from tkinter.ttk import Treeview, Style
from rapidfuzz import process, fuzz
import os
import re

# -------------------------
# Abbreviation Dictionary
# -------------------------
# Shorthand / variant spelling -> phrase to look up.  Keys are lower-case
# because expandAbbreviation() lower-cases and strips its argument first.
# NOTE(review): extractTermsFromText() in this cell splits its input on "/",
# so the keys "f/u" and "f/u/c" cannot be produced from file input.
abbreviationDict = {
    "dm": "diabetes mellitus", "htn": "hypertension", "ht": "hypertension", "cad": "coronary artery disease",
    "pivd": "prolapsed intervertebral disc",
    "copd": "chronic obstructive pulmonary disease", "ckd": "chronic kidney disease", "tb": "tuberculosis",
    "ra": "rheumatoid arthritis", "aiha": "autoimmune hemolytic anemia", "asd": "atrial septal defect",
    "pda": "patent ductus arteriosus", "aki": "acute kidney injury", "vt": "ventricular tachycardia",
    "bph": "benign prostatic hyperplasia", "kco": "known case of", "f/u": "follow up",
    "f/u/c": "follow up case", "lgmd": "limb-girdle muscular dystrophy", "wes": "whole exome sequencing",
    "rds": "respiratory distress syndrome", "rta": "renal tubular acidosis", "sos": "sum of symptoms",
    "mod": "multiple organ dysfunction", "mods": "multiple organ dysfunction syndrome", "loc": "loss of consciousness",
    "adl": "activities of daily living", "dtr": "deep tendon reflexes", "ul": "upper limb", "ll": "lower limb",
    "bl": "bilateral", "hypothyroid": "hypothyroidism", "old d 12 fracture": "fractured thoracic vertebra",
    "encephlopathy": "encephalopathy", "dyslipidemia": "lipid metabolism disorder",
    "delayed milestones": "global developmental delay", "sick sinus syndrome": "sinus node dysfunction",
    "bicornate uterus": "uterine anomaly", "hearing loss": "sensorineural hearing impairment",
    "coagulopathy": "abnormality of coagulation", "pseudo obstruction": "intestinal pseudo-obstruction",
    "west syndrome": "epileptic encephalopathy", "actg2": "visceral myopathy", "msa": "multiple system atrophy",
    "meningitis": "meningitis", "becker's": "becker muscular dystrophy", "glycogen storage disorder": "abnormal hepatic glycogen storage",
    "spinocerebellar ataxia": "hereditary ataxia", "cidp": "chronic inflammatory demyelinating polyneuropathy",
    "cerebral palsy": "ataxic cerebral palsy", "neurocut syndrome": "neurocutaneous syndrome",
    "heart block": "atrioventricular block", "stroke": "cerebral infarct", "fatty acid oxidation": "fatty-acid oxidation disorder"
}

# -------------------------
# Helper Functions
# -------------------------

def expandAbbreviation(term):
    """Return the ``abbreviationDict`` expansion of ``term``.

    The lookup key is ``term`` lower-cased and stripped; ``term`` itself is
    returned unchanged when the dictionary has no entry for that key.
    """
    lookupKey = term.lower().strip()
    return abbreviationDict[lookupKey] if lookupKey in abbreviationDict else term

def loadHpoTerms(jsonFilePath):
    """Read an HPO JSON export and index its terms by lower-cased text.

    Returns a dict mapping each lower-cased label or synonym to a
    ``(hpoId, officialLabel)`` tuple.  A label always claims its key; a
    synonym is added only when its text is not already present.  Nodes whose
    ``id`` lies outside the HPO IRI prefix are ignored.
    """
    hpoIriPrefix = "http://purl.obolibrary.org/obo/HP_"

    with open(jsonFilePath, "r", encoding="utf-8") as handle:
        ontology = json.load(handle)

    lookup = {}

    for entry in ontology['graphs'][0]['nodes']:
        iri = entry.get('id', '')
        officialLabel = entry.get('lbl', '').strip()

        if not iri.startswith(hpoIriPrefix):
            continue  # not an HPO term

        record = (iri.replace(hpoIriPrefix, "HP:"), officialLabel)

        labelKey = officialLabel.lower()
        if labelKey:
            lookup[labelKey] = record

        # Synonyms resolve to the same record as the official label.
        for synonym in entry.get('meta', {}).get('synonyms', []):
            synonymKey = synonym.get('val', '').lower()
            if synonymKey:
                lookup.setdefault(synonymKey, record)

    return lookup

def getHpoDetails(term, hpoMap, confidenceThreshold=70):
    """Map ``term`` to an HPO ID and official label, with a fuzzy fallback.

    The stripped term is lower-cased and expanded via ``expandAbbreviation``.
    An exact key in ``hpoMap`` wins; otherwise the best
    ``fuzz.token_sort_ratio`` candidate is used when it scores at least
    ``confidenceThreshold``.  ``hpoMap`` values are ``(hpoId, label)`` tuples.

    Returns a dict with the keys "Term" (the stripped input), "Mapped Term",
    "HPO ID" and "HPO Website"; missing values are reported as "Not Found".
    """
    cleaned = term.strip()
    query = expandAbbreviation(cleaned.lower())

    foundId = foundLabel = None

    if query in hpoMap:
        foundId, foundLabel = hpoMap[query]
    else:
        best = process.extractOne(query, hpoMap.keys(), scorer=fuzz.token_sort_ratio)
        if best and best[1] >= confidenceThreshold:
            foundId, foundLabel = hpoMap[best[0]]

    if foundId:
        website = f"https://hpo.jax.org/app/browse/term/{foundId}"
    else:
        website = "Not Found"

    return {
        "Term": cleaned,
        "Mapped Term": foundLabel or "Not Found",
        "HPO ID": foundId or "Not Found",
        "HPO Website": website
    }

def extractTermsFromText(text):
    """Split free text into lower-cased candidate terms.

    The text is lower-cased and split on runs of "/", ",", ";", newline and
    "-"; each piece is stripped and empty pieces are dropped.  Note that
    hyphenated phrases are therefore split at the hyphen.
    """
    pieces = re.split(r'[\/,;\n\-]+', text.lower())
    return list(filter(None, map(str.strip, pieces)))

def processFile(filePath, hpoMap):
    """Extract candidate terms from a file and map each one to HPO.

    Parameters
    ----------
    filePath : str
        Path to an ``.xls``/``.xlsx``, ``.csv`` or ``.txt`` file (the
        extension is compared case-insensitively).
    hpoMap : dict
        Lookup table handed through to ``getHpoDetails``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct extracted term, in order of first appearance.
        An empty DataFrame is returned for any other file extension.
    """
    ext = os.path.splitext(filePath)[-1].lower()

    if ext in ['.xls', '.xlsx']:
        df = pd.read_excel(filePath, dtype=str)
    elif ext == '.csv':
        df = pd.read_csv(filePath, dtype=str)
    elif ext == '.txt':
        with open(filePath, 'r', encoding='utf-8') as f:
            lines = f.read()
        df = pd.DataFrame({'text': [lines]})
    else:
        return pd.DataFrame()

    allTerms = []
    for cell in df.to_numpy().ravel():
        # Blank spreadsheet/CSV cells are read as NaN; stringifying them
        # would inject a bogus "nan" term into the results.
        if pd.isna(cell):
            continue
        allTerms.extend(extractTermsFromText(str(cell)))

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result table is stable between runs (set() ordering of str is not).
    uniqueTerms = list(dict.fromkeys(t for t in allTerms if t))
    results = [getHpoDetails(term, hpoMap) for term in uniqueTerms]

    return pd.DataFrame(results)

# -------------------------
# Animation Functions
# -------------------------

def on_upload_button_press(event):
    """Mouse-down on the upload button: draw it sunken with a thinner border."""
    pressed_look = {"relief": "sunken", "bd": 2}
    upload_btn.configure(**pressed_look)
    
def on_upload_button_release(event):
    """Mouse-up on the upload button: draw it raised again with border 4."""
    released_look = {"relief": "raised", "bd": 4}
    upload_btn.configure(**released_look)
    
def on_upload_button_enter(event):
    """Pointer entered the upload button: apply the hover look.

    Darkens the background, thickens the border, shows a grey highlight ring
    and re-packs the button with asymmetric vertical padding (7 above, 13
    below) so it appears to lift.
    """
    hover_look = dict(
        bg="#FF8C00",
        activebackground="#FF7F00",
        relief="raised",
        bd=6,
        highlightbackground="#888888",
        highlightcolor="#888888",
        highlightthickness=2,
    )
    upload_btn.configure(**hover_look)
    # Less padding on top than below shifts the button upwards.
    upload_btn.pack_configure(pady=(7, 13))
    
def on_upload_button_leave(event):
    """Pointer left the upload button: apply the resting look.

    Restores the lighter background and border 4, hides the highlight ring
    and re-packs the button with symmetric vertical padding of 10.
    """
    resting_look = dict(
        bg="#FFA500",
        activebackground="#FF8C00",
        relief="raised",
        bd=4,
        highlightthickness=0,
    )
    upload_btn.configure(**resting_look)
    upload_btn.pack_configure(pady=10)

def on_download_button_press(event):
    """Mouse-down on the download button: draw it sunken with a thinner border."""
    pressed_look = {"relief": "sunken", "bd": 2}
    download_btn.configure(**pressed_look)
    
def on_download_button_release(event):
    """Mouse-up on the download button: draw it raised again with border 4."""
    released_look = {"relief": "raised", "bd": 4}
    download_btn.configure(**released_look)
    
def on_download_button_enter(event):
    """Pointer entered the download button: apply the hover look.

    Darkens the background, thickens the border, shows a grey highlight ring
    and re-packs the button with asymmetric vertical padding (7 above, 13
    below) so it appears to lift.
    """
    hover_look = dict(
        bg="#FF8C00",
        activebackground="#FF7F00",
        relief="raised",
        bd=6,
        highlightbackground="#888888",
        highlightcolor="#888888",
        highlightthickness=2,
    )
    download_btn.configure(**hover_look)
    # Less padding on top than below shifts the button upwards.
    download_btn.pack_configure(pady=(7, 13))
    
def on_download_button_leave(event):
    """Pointer left the download button: apply the resting look.

    Restores the lighter background and border 4, hides the highlight ring
    and re-packs the button with symmetric vertical padding of 10.
    """
    resting_look = dict(
        bg="#FFA500",
        activebackground="#FF8C00",
        relief="raised",
        bd=4,
        highlightthickness=0,
    )
    download_btn.configure(**resting_look)
    download_btn.pack_configure(pady=10)

# -------------------------
# GUI Setup
# -------------------------

def browseFile():
    """Let the user pick a file, map its terms and show them in the table.

    Does nothing when the dialog is cancelled.  On success the module-level
    ``current_results_df`` is replaced, the table is refilled with striped
    rows, and the download button is enabled.  If reading or processing the
    file raises, the error is shown in the status label and the previous
    results and table contents are left untouched.
    """
    global current_results_df

    filePath = filedialog.askopenfilename(
        title="Select HPO Terms File",
        filetypes=[("All supported", "*.xls *.xlsx *.csv *.txt"), 
                   ("Excel", "*.xls *.xlsx"), 
                   ("CSV", "*.csv"), 
                   ("Text", "*.txt")]
    )
    if not filePath:
        return

    # Update status; root.update() forces a redraw before the blocking work.
    status_label.config(text="Processing file...", fg="#FF8C00")
    root.update()

    try:
        newResults = processFile(filePath, hpoMap)
    except Exception as e:
        # Without this the status would stay on "Processing file..." forever
        # and the failure would only appear on the console.
        status_label.config(text=f"Error processing file: {str(e)}", fg="#DC143C")
        return

    current_results_df = newResults

    # Clear previous results
    for row in tree.get_children():
        tree.delete(row)

    # Insert new results with alternating colors
    for i, (_, row) in enumerate(current_results_df.iterrows()):
        item = tree.insert("", "end", values=tuple(row))
        # Apply alternating row colors
        if i % 2 == 0:
            tree.item(item, tags=('evenrow',))
        else:
            tree.item(item, tags=('oddrow',))

    # Update status and enable download button
    status_label.config(text=f"Loaded {len(current_results_df)} terms successfully", fg="#228B22")
    download_btn.config(state="normal")

def downloadSheet():
    """Save the current results to a file chosen by the user.

    Reports "No data to download" when ``current_results_df`` is empty and
    does nothing when the save dialog is cancelled.  A path ending in
    ``.csv`` (any letter case) is written with ``to_csv``; every other path
    is written with ``to_excel``.  Any exception raised while writing is
    shown in the status label instead of propagating.
    """
    if current_results_df.empty:
        status_label.config(text="No data to download", fg="#DC143C")
        return

    # Ask user where to save the file
    filePath = filedialog.asksaveasfilename(
        title="Save HPO Results",
        defaultextension=".xlsx",
        filetypes=[("Excel files", "*.xlsx"), ("CSV files", "*.csv")]
    )

    if not filePath:
        return

    try:
        # Update status
        status_label.config(text="Downloading file...", fg="#FF8C00")
        root.update()

        # Save based on file extension.  Compare case-insensitively so that
        # "results.CSV" is written as CSV rather than handed to to_excel.
        if filePath.lower().endswith('.csv'):
            current_results_df.to_csv(filePath, index=False)
        else:
            current_results_df.to_excel(filePath, index=False)

        status_label.config(text=f"File saved successfully to {os.path.basename(filePath)}", fg="#228B22")
    except Exception as e:
        status_label.config(text=f"Error saving file: {str(e)}", fg="#DC143C")

# -------------------------
# Run the App
# -------------------------

# Load the ontology lookup once; browseFile() reads this module-level name.
hpoMap = loadHpoTerms("hp.json")

# Global variable to store current results (replaced by browseFile, read by
# downloadSheet).  Starts empty so downloadSheet can detect "nothing loaded".
current_results_df = pd.DataFrame()

# Create main window
root = Tk()
root.title("HPO Term Extractor")
root.configure(bg="#FFF8DC")  # Light orange/cream background

# Create main frame: every other widget lives inside it.
main_frame = Frame(root, bg="#FFF8DC", padx=20, pady=20)
main_frame.pack(fill=BOTH, expand=True)

# Title Label
title_label = Label(
    main_frame, 
    text="HPO Term Extractor",
    font=("Arial", 18, "bold"),
    bg="#FFF8DC",
    fg="#FF6347",
    pady=10
)
title_label.pack()

# Subtitle Label
subtitle_label = Label(
    main_frame, 
    text="Upload your file to extract and map HPO terms",
    font=("Arial", 10),
    bg="#FFF8DC",
    fg="#696969",
    pady=5
)
subtitle_label.pack()

# Button Frame: holds the upload button below the subtitle.
btn_frame = Frame(main_frame, bg="#FFF8DC", pady=10)
btn_frame.pack()

# Enhanced Upload Button with 3D effect
upload_btn = Button(
    btn_frame,
    text="Upload File",
    font=("Arial", 12, "bold"),
    bg="#FFA500",  # Orange background
    fg="white",
    activebackground="#FF8C00",
    activeforeground="white",
    relief="raised",
    bd=4,
    padx=20,
    pady=10,
    cursor="hand2",
    command=browseFile
)
# Start with the same padding that on_upload_button_leave() restores
# (pady=10).  Packing with no padding made the layout jump by 20 px the first
# time the pointer hovered over the button and never return.
upload_btn.pack(pady=10)

# Bind animation events for upload button
upload_btn.bind("<Button-1>", on_upload_button_press)
upload_btn.bind("<ButtonRelease-1>", on_upload_button_release)
upload_btn.bind("<Enter>", on_upload_button_enter)
upload_btn.bind("<Leave>", on_upload_button_leave)

# Status Label: browseFile() and downloadSheet() write progress/errors here.
status_label = Label(
    main_frame,
    text="Ready to process files",
    font=("Arial", 9),
    bg="#FFF8DC",
    fg="#808080",
    pady=5
)
status_label.pack()

# Table Frame: contains the results tree and its scrollbar.
table_frame = Frame(main_frame, bg="#FFF8DC", pady=10)
table_frame.pack(fill=BOTH, expand=True)

# Configure TTK Style for the table
style = Style()
style.theme_use('clam')

# Configure Treeview styles
style.configure(
    "Custom.Treeview",
    background="white",
    foreground="black",
    rowheight=25,
    fieldbackground="white",
    borderwidth=1,
    relief="solid"
)

style.configure(
    "Custom.Treeview.Heading",
    background="#4169E1",  # Royal blue
    foreground="white",
    font=("Arial", 10, "bold"),
    relief="raised",
    borderwidth=1
)

# Map the selection colors
style.map(
    "Custom.Treeview",
    background=[('selected', '#FFE4B5')],  # Light orange selection
    foreground=[('selected', 'black')]
)

style.map(
    "Custom.Treeview.Heading",
    background=[('active', '#1E90FF')],  # Lighter blue on hover
    foreground=[('active', 'white')]
)

# Replace the Custom.Treeview layout with a single tree-area element that
# sticks to all four sides.
style.layout("Custom.Treeview", [
    ('Custom.Treeview.treearea', {'sticky': 'nswe'})
])

# Option-database entry for the selection colour of Listbox widgets inside a
# TCombobox.  NOTE(review): no combobox is created in this cell, so this line
# appears to have no visible effect here.
root.option_add('*TCombobox*Listbox.selectBackground', '#FFE4B5')

# Create Treeview with custom style (no extra tree column)
tree = Treeview(
    table_frame, 
    columns=("Term", "Mapped Term", "HPO ID", "HPO Website"), 
    show="headings",  # Only show headings, not the tree column
    style="Custom.Treeview"
)

# Row tags used for zebra striping (assigned in browseFile).
tree.tag_configure('oddrow', background='#F8F8F8')
tree.tag_configure('evenrow', background='white')

# Configure column headings
tree.heading("Term", text="Original Term")
tree.heading("Mapped Term", text="Mapped Term")
tree.heading("HPO ID", text="HPO ID")
tree.heading("HPO Website", text="HPO Website")

# Configure column widths and text alignment
tree.column("Term", width=200, minwidth=150, anchor="w")
tree.column("Mapped Term", width=300, minwidth=200, anchor="w")
tree.column("HPO ID", width=150, minwidth=100, anchor="center")
tree.column("HPO Website", width=350, minwidth=250, anchor="w")

# Add alternating row colors function
def insert_with_alternating_colors(parent, index, **kwargs):
    """Insert a row into the module-level ``tree`` and tag it for zebra striping.

    ``parent``, ``index`` and ``kwargs`` are forwarded unchanged to
    ``tree.insert``. The stripe is chosen from the number of items returned by
    ``tree.get_children()`` *after* the insert: an even count gets the
    'evenrow' tag, an odd count gets 'oddrow'. The row's tags are replaced by
    that single tag.

    Returns the item id produced by ``tree.insert``.
    """
    new_row = tree.insert(parent, index, **kwargs)
    # The count already includes the row that was just inserted.
    total_rows = len(tree.get_children())
    stripe_tag = 'evenrow' if total_rows % 2 == 0 else 'oddrow'
    tree.item(new_row, tags=(stripe_tag,))
    return new_row

# Vertical scrollbar wired both ways: dragging it scrolls the table
# (command=tree.yview) and scrolling the table moves it (yscrollcommand).
scroll = Scrollbar(table_frame, orient="vertical", command=tree.yview)
tree.configure(yscrollcommand=scroll.set)

# Pack table and scrollbar side by side inside table_frame
tree.pack(side=LEFT, fill=BOTH, expand=True)
scroll.pack(side=RIGHT, fill=Y)

# Download Button Frame (centered at bottom)
download_frame = Frame(main_frame, bg="#FFF8DC", pady=15)
download_frame.pack(side="bottom")

# Download button. It starts disabled; this section never enables it.
# NOTE(review): presumably code outside this section re-enables it once
# results exist - confirm. `downloadSheet` is not defined in this section.
download_btn = Button(
    download_frame,
    text="Download Sheet",
    font=("Arial", 12, "bold"),
    bg="#FFA500",  # Orange background
    fg="white",
    activebackground="#FF8C00",
    activeforeground="white",
    relief="raised",
    bd=4,
    padx=25,
    pady=10,
    cursor="hand2",
    command=downloadSheet,
    state="disabled"  # Initially disabled
)
download_btn.pack()

# Bind press/release/hover events to the animation handlers.
# NOTE(review): the on_download_button_* handlers are defined outside this
# section; these bindings are attached regardless of the button's state.
download_btn.bind("<Button-1>", on_download_button_press)
download_btn.bind("<ButtonRelease-1>", on_download_button_release)
download_btn.bind("<Enter>", on_download_button_enter)
download_btn.bind("<Leave>", on_download_button_leave)

# Footer
footer_label = Label(
    main_frame,
    text="Â© 2024 HPO Term Extractor - Enhanced UI Version",
    font=("Arial", 8),
    bg="#FFF8DC",
    fg="#A0A0A0",
    pady=5
)
footer_label.pack(side="bottom")

# Set window properties: initial size, minimum size and background colour
root.geometry("1200x700")
root.minsize(800, 500)
root.configure(bg="#FFF8DC")

# Center the window on the screen. update_idletasks() runs first so that
# winfo_width()/winfo_height() reflect the geometry requested above.
root.update_idletasks()
x = (root.winfo_screenwidth() // 2) - (root.winfo_width() // 2)
y = (root.winfo_screenheight() // 2) - (root.winfo_height() // 2)
root.geometry(f"+{x}+{y}")

# Start the Tk event loop
root.mainloop()

# Streamlit app

In [1]:
import streamlit as st
import pandas as pd
import requests
import time
import random
import logging
from typing import Dict, List, Optional
from urllib.parse import quote
from openai import OpenAI

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the Groq API client (Groq exposes an OpenAI-compatible endpoint,
# which is why the OpenAI SDK is used with a custom base_url).
#
# SECURITY: the API key must never be written into the notebook. It is read
# from the GROQ_API_KEY environment variable instead. The key that used to be
# hardcoded on this line has been published with the notebook and should be
# revoked in the Groq console.
import os

_groq_api_key = os.environ.get("GROQ_API_KEY")
if not _groq_api_key:
    logger.warning(
        "GROQ_API_KEY is not set; LLM-based lookups will fail until it is provided."
    )

openai = OpenAI(
    # A non-empty placeholder keeps client construction from raising, so the
    # rest of the cell still loads; requests made with it are rejected by the
    # server and handled by the callers' error handling.
    api_key=_groq_api_key or "missing-groq-api-key",
    base_url="https://api.groq.com/openai/v1"
)

class HPODataExtractor:
    """Map free-text clinical terms to Human Phenotype Ontology (HPO) IDs.

    For each term the extractor:
      1. normalises it (spelling corrections, abbreviation expansion);
      2. searches the NLM Clinical Table Search Service HPO endpoint;
      3. if the search returns nothing, asks the LLM behind the module-level
         ``openai`` client. IDs obtained this way are model output and are
         not checked against the ontology.
    """

    HPO_SEARCH_URL = "https://clinicaltables.nlm.nih.gov/api/hpo/v3/search"
    # Seconds to wait for the HPO search service before giving up on a term.
    REQUEST_TIMEOUT = 15

    def __init__(self):
        # Browser User-Agent strings. No method of this class reads them.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15"
        ]
        # Upper-case abbreviation -> full term.
        self.abbreviation_map = {
            'HTN': 'Hypertension',
            'DM': 'Diabetes Mellitus',
            'CKD': 'Chronic Kidney Disease',
            'PIVD': 'Herniation of intervertebral nuclei',
            'HLD': 'Hyperlipidemia',
            'CAD': 'Coronary Artery Disease',
            'COPD': 'Chronic Obstructive Pulmonary Disease',
            'CHF': 'Congestive Heart Failure',
            'MI': 'Myocardial Infarction',
            'AF': 'Atrial Fibrillation'
        }
        # Lower-case misspelling / shorthand -> replacement term.
        self.corrections = {
            'encephlopathy': 'Encephalopathy',
            'hypothyroid': 'Hypothyroidism',
            'non dm': 'Diabetes Mellitus',
            'old d 12 fracture': 'Fractured thoracic vertebra',
            'ckd': 'Chronic Kidney Disease',
            'pivd': 'Herniation of intervertebral nuclei'
        }

    def map_abbreviation(self, term: str) -> str:
        """Return the normalised form of ``term``.

        The term is stripped and lower-cased, replaced by its entry in
        ``self.corrections`` if there is one, and then looked up (upper-cased)
        in ``self.abbreviation_map``. When it is not an abbreviation the
        result is ``str.capitalize()``-d, i.e. first letter upper-case and the
        rest lower-case.
        """
        term_clean = term.strip().lower()
        term_clean = self.corrections.get(term_clean, term_clean)
        return self.abbreviation_map.get(term_clean.upper(), term_clean.capitalize())

    @staticmethod
    def _parse_search_response(data) -> List[Dict]:
        """Turn a Clinical Tables search response into ``{"id", "label"}`` dicts.

        The service answers with a JSON array: index 1 holds the matching
        codes (HPO IDs) and index 3 holds, per match, the list of display
        fields (requested here as ``id,name``). Index 2 is reserved for extra
        fields and is null unless they are requested, so it must not be
        indexed.
        """
        if not isinstance(data, list) or len(data) < 2 or not data[1]:
            return []
        codes = data[1]
        display_rows = data[3] if len(data) > 3 and isinstance(data[3], list) else []
        results = []
        for position, code in enumerate(codes):
            row = display_rows[position] if position < len(display_rows) else None
            # Last display field is the term name; fall back to the code
            # itself if the service sent no display row for this match.
            label = row[-1] if isinstance(row, (list, tuple)) and row else code
            results.append({"id": code, "label": label})
        return results

    @staticmethod
    def _parse_llm_answer(answer: str) -> List[Dict]:
        """Extract an HPO ID (and name, if given) from the LLM's reply.

        Returns an empty list when the reply contains no ``HP:`` + 7 digits
        identifier, so callers treat it the same way as "no result".
        """
        import re

        answer = answer or ""
        id_match = re.search(r"HP:\d{7}", answer)
        if not id_match:
            return []
        hpo_name = 'Not found'
        for line in answer.splitlines():
            key, separator, value = line.partition(':')
            if separator and key.strip().lower() == 'hpo name' and value.strip():
                hpo_name = value.strip()
                break
        return [{"id": id_match.group(0), "label": hpo_name}]

    def query_hpo_api(self, term: str):
        """Search the HPO endpoint for ``term``.

        Returns a list of ``{"id": ..., "label": ...}`` dicts (best match
        first, at most 20), or an empty list when nothing matched or the
        request failed. Failures are logged, never raised.
        """
        # "df" pins the display fields so the label position is known.
        params = {"terms": term, "maxList": 20, "df": "id,name"}
        try:
            response = requests.get(
                self.HPO_SEARCH_URL, params=params, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            return self._parse_search_response(response.json())
        except Exception as e:
            logger.error(f"Error querying HPO API: {e}")
            return []

    def query_hpo_fallback(self, term: str):
        """Ask the LLM for an HPO ID when the search API found nothing.

        Returns a one-element list ``[{"id": ..., "label": ...}]`` when the
        reply contains an HPO ID, otherwise an empty list. Errors are logged
        and also yield an empty list. The ID is model output and unverified.
        """
        mapped_term = self.map_abbreviation(term)
        try:
            prompt = (
                f"Given the medical term '{term}' (mapped to '{mapped_term}'), "
                "find the most relevant Human Phenotype Ontology (HPO) ID and term name. "
                "Return only the HPO ID (format: HP:XXXXXXX) and HPO Name. "
                "If no HPO term applies, return 'Not found' for both."
            )
            response = openai.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[
                    {"role": "system", "content": "You are a medical ontology expert with access to HPO."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=100,
                temperature=0.3
            )
            result = (response.choices[0].message.content or "").strip()
            return self._parse_llm_answer(result)
        except Exception as e:
            logger.error(f"Grok API error for {term}: {e}")
            return []

    def get_hpo_id(self, term: str) -> Optional[Dict]:
        """Resolve one term to a result record.

        The record has the keys 'Term', 'Mapped Term', 'HPO ID', 'HPO Name'
        and 'Verified'. 'non dm' is special-cased as "Not applicable" because
        it states the absence of a condition. When neither the API nor the
        LLM yields an ID, the last three fields are 'Not found'.
        """
        term_lower = term.strip().lower()
        mapped_term = self.map_abbreviation(term)
        if term_lower == 'non dm':
            return {
                'Term': term,
                'Mapped Term': 'Diabetes Mellitus',
                'HPO ID': 'Not applicable',
                'HPO Name': 'No HPO ID for absence of condition',
                'Verified': 'Not applicable'
            }
        results = self.query_hpo_api(mapped_term)
        if not results:
            results = self.query_hpo_fallback(mapped_term)
        if results:
            return {
                'Term': term,
                'Mapped Term': mapped_term,
                'HPO ID': results[0]['id'],
                'HPO Name': results[0]['label'],
                'Verified': 'From API/LLM'
            }
        return {
            'Term': term,
            'Mapped Term': mapped_term,
            'HPO ID': 'Not found',
            'HPO Name': 'Not found',
            'Verified': 'Not found'
        }

    def process_term(self, term: str) -> Optional[Dict]:
        """Resolve ``term``, or return ``None`` for empty / placeholder input.

        ``None``, blank strings and the placeholders 'nan' / 'null' (any
        case, surrounding whitespace ignored) are skipped. Non-string values
        are converted with ``str()`` first.
        """
        if term is None:
            return None
        # Strip before the placeholder check so "  " and " nan " are skipped too.
        term = str(term).strip()
        if term.lower() in ('', 'nan', 'null'):
            return None
        return self.get_hpo_id(term)

    def process_terms_from_text(self, text: str) -> List[Dict]:
        """Split ``text`` on '/' and resolve every non-empty piece.

        NOTE(review): every '/' is treated as a separator, so shorthand such
        as 'C/O' or dates such as '15/03/2022' are split into fragments too.
        """
        results = []
        for piece in text.split('/'):
            result = self.process_term(piece)
            if result:
                results.append(result)
        return results

def create_hpo_analysis_interface():
    """Render the upload -> analyse -> results/download UI.

    The uploaded file (CSV, Excel, or one term per line of text) is loaded
    into a DataFrame. On "Analyze HPO IDs", the first non-empty cell of each
    row is split on '/' and resolved by the ``HPODataExtractor`` kept in
    ``st.session_state``. Results are stored in
    ``st.session_state.hpo_results``.

    The results table and the CSV download button are rendered from session
    state, outside the "Analyze" branch: ``st.button`` is only true for the
    single rerun caused by its click, so anything nested under it disappears
    as soon as another widget (such as the download button) triggers a rerun.
    """
    st.header("ðŸ§¬ HPO ID Extractor")
    if 'hpo_extractor' not in st.session_state:
        st.session_state.hpo_extractor = HPODataExtractor()
    uploaded_file = st.file_uploader("Upload a CSV, Excel, or Text file", type=['csv', 'xlsx', 'txt'])
    if uploaded_file:
        try:
            # Compare case-insensitively so "DATA.CSV" is not parsed as text.
            file_name = uploaded_file.name.lower()
            if file_name.endswith('.csv'):
                df = pd.read_csv(uploaded_file)
            elif file_name.endswith('.xlsx'):
                df = pd.read_excel(uploaded_file)
            else:
                content = uploaded_file.read().decode('utf-8')
                terms = [line.strip() for line in content.split('\n') if line.strip()]
                df = pd.DataFrame({'terms': terms})
            st.write("Preview of uploaded data:")
            st.dataframe(df.head())
            if st.button("Analyze HPO IDs"):
                if len(df) > 0:
                    results = []
                    progress_bar = st.progress(0)
                    status_text = st.empty()
                    total_rows = len(df)
                    # Count rows by position: the index label yielded by
                    # iterrows() need not be 0..n-1 (or even numeric).
                    for position, (_, row) in enumerate(df.iterrows(), start=1):
                        terms = None
                        # Use the first non-empty cell of the row.
                        for col in df.columns:
                            if pd.notna(row[col]) and str(row[col]).strip():
                                terms = str(row[col]).strip()
                                break
                        if terms:
                            status_text.text(f"Analyzing: {terms}")
                            term_results = st.session_state.hpo_extractor.process_terms_from_text(terms)
                            results.extend(term_results)
                            for result in term_results:
                                st.write(f"âœ“ {result['Term']} -> {result['HPO ID']} ({result['HPO Name']}) - Verified: {result['Verified']}")
                            time.sleep(0.3)
                        progress_bar.progress(position / total_rows)
                    if results:
                        st.session_state.hpo_results = results
                        st.success(f"Analyzed {len(results)} terms!")
                    else:
                        st.warning("No valid terms found")
                else:
                    st.warning("No data found in the uploaded file")
        except Exception as e:
            st.error(f"Error processing file: {e}")
    if 'hpo_results' in st.session_state and st.session_state.hpo_results:
        st.subheader("HPO Analysis Results")
        results_df = pd.DataFrame(st.session_state.hpo_results)
        st.dataframe(results_df)
        st.download_button(
            label="Download Results as CSV",
            data=results_df.to_csv(index=False),
            file_name=f"hpo_analysis_{time.strftime('%Y%m%d_%H%M%S')}.csv",
            mime='text/csv'
        )

def main():
    """Configure the Streamlit page, inject the header CSS and render the app."""
    page_settings = {
        "page_title": "HPO ID Extractor",
        "page_icon": "ðŸ§¬",
        "layout": "wide",
    }
    st.set_page_config(**page_settings)

    # Style for the <h1 class='main-header'> emitted right below.
    header_css = """
    <style>
    .main-header { text-align: center; color: #2E86AB; margin-bottom: 30px; }
    </style>
    """
    st.markdown(header_css, unsafe_allow_html=True)

    title_html = "<h1 class='main-header'>ðŸ§¬ HPO ID Extractor</h1>"
    st.markdown(title_html, unsafe_allow_html=True)

    create_hpo_analysis_interface()


if __name__ == "__main__":
    main()


2025-08-20 12:18:33.098 
  command:

    streamlit run C:\ProgramData\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-08-20 12:18:33.099 Session state does not function when running a script without `streamlit run`


In [2]:
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time
import random
import logging
from typing import Dict, List, Optional
from urllib.parse import quote
from openai import OpenAI

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the Groq API client (Groq exposes an OpenAI-compatible endpoint,
# which is why the OpenAI SDK is used with a custom base_url).
#
# SECURITY: do not paste the key into this cell. It is read from the
# GROQ_API_KEY environment variable so the secret never ends up in the saved
# notebook.
import os

_groq_api_key = os.environ.get("GROQ_API_KEY")
if not _groq_api_key:
    logger.warning(
        "GROQ_API_KEY is not set; LLM-based lookups will fail until it is provided."
    )

openai = OpenAI(
    # A non-empty placeholder keeps client construction from raising, so the
    # rest of the cell still loads; requests made with it are rejected by the
    # server and handled by the callers' error handling.
    api_key=_groq_api_key or "missing-groq-api-key",
    base_url="https://api.groq.com/openai/v1"
)

class HPODataExtractor:
    """Map free-text clinical terms to HPO IDs using an LLM only.

    Each term is normalised (spelling corrections, abbreviation expansion)
    and sent to the chat model behind the module-level ``openai`` client.
    The first ``HP:`` + 7 digits identifier in the reply is taken as the
    result. The ID is model output and is not checked against the ontology.
    """

    def __init__(self):
        # Browser User-Agent strings, picked at random by get_random_header().
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15"
        ]
        # Upper-case abbreviation -> full term.
        self.abbreviation_map = {
            'HTN': 'Hypertension',
            'DM': 'Diabetes Mellitus',
            'CKD': 'Chronic Kidney Disease',
            'PIVD': 'Herniation of intervertebral nuclei',
            'HLD': 'Hyperlipidemia',
            'CAD': 'Coronary Artery Disease',
            'COPD': 'Chronic Obstructive Pulmonary Disease',
            'CHF': 'Congestive Heart Failure',
            'MI': 'Myocardial Infarction',
            'AF': 'Atrial Fibrillation'
        }
        # Lower-case misspelling / shorthand -> replacement term.
        self.corrections = {
            'encephlopathy': 'Encephalopathy',
            'hypothyroid': 'Hypothyroidism',
            'non dm': 'Diabetes Mellitus',
            'old d 12 fracture': 'Fractured thoracic vertebra',
            'ckd': 'Chronic Kidney Disease',
            'pivd': 'Herniation of intervertebral nuclei'
        }

    def get_random_header(self):
        """Return browser-like HTTP headers with a randomly chosen User-Agent.

        No other method of this class calls it.
        """
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://hpo.jax.org/'
        }

    def map_abbreviation(self, term: str) -> str:
        """Map medical abbreviations to full forms and correct misspellings.

        The term is stripped and lower-cased, replaced by its entry in
        ``self.corrections`` if there is one, and then looked up (upper-cased)
        in ``self.abbreviation_map``. When it is not an abbreviation the
        result is ``str.capitalize()``-d.
        """
        term_clean = term.strip().lower()
        term_clean = self.corrections.get(term_clean, term_clean)
        return self.abbreviation_map.get(term_clean.upper(), term_clean.capitalize())

    def get_hpo_id(self, term: str) -> Optional[Dict]:
        """Ask the Groq-hosted model (llama-3.3-70b-versatile) for an HPO ID.

        Returns ``{'Term': term, 'HPO ID': value}`` where ``value`` is the
        extracted ID, 'Not found' when the reply contains none,
        'Not applicable' for 'non dm', or 'Error' when the API call raised
        (the error is logged).
        """
        # Ignore surrounding whitespace when checking the special case.
        term_lower = term.strip().lower()
        mapped_term = self.map_abbreviation(term)

        # Handle non-phenotypic terms
        if term_lower == 'non dm':
            return {'Term': term, 'HPO ID': 'Not applicable'}

        try:
            # Construct the prompt for the model
            prompt = (
                f"Given the medical term '{term}' (mapped to '{mapped_term}'), "
                "find the most relevant Human Phenotype Ontology (HPO) ID. "
                "Return only the HPO ID (format: HP:XXXXXXX). "
                "If no HPO term applies, return 'Not found'."
            )
            response = openai.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[
                    {"role": "system", "content": "You are a medical ontology expert with access to the Human Phenotype Ontology (HPO)."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=50,
                temperature=0.3
            )
            # Message content can be None; treat that as an empty reply.
            result = (response.choices[0].message.content or "").strip()

            # Extract HPO ID
            hpo_id = "Not found"
            match = re.search(r"(HP:\d{7})", result)
            if match:
                hpo_id = match.group(1)

            return {'Term': term, 'HPO ID': hpo_id}
        except Exception as e:
            logger.error(f"Grok API error for {term}: {e}")
            return {'Term': term, 'HPO ID': 'Error'}

    def process_term(self, term: str) -> Optional[Dict]:
        """Process a single term to get its HPO ID.

        ``None``, blank strings and the placeholders 'nan' / 'null' (any
        case, surrounding whitespace ignored) return ``None``. Non-string
        values are converted with ``str()`` first.
        """
        if term is None:
            return None
        # Strip before the placeholder check so "  " and " nan " are skipped too.
        term = str(term).strip()
        if term.lower() in ('', 'nan', 'null'):
            return None
        return self.get_hpo_id(term)

    def process_terms_from_text(self, text: str) -> List[Dict]:
        """Process a text string containing multiple terms separated by '/'.

        NOTE(review): every '/' is treated as a separator, so shorthand such
        as 'C/O' or dates such as '15/03/2022' are split into fragments too.
        """
        results = []
        for piece in text.split('/'):
            result = self.process_term(piece)
            if result:
                results.append(result)
        return results

def create_hpo_analysis_interface():
    """Create the HPO analysis interface.

    The uploaded file (CSV, Excel, or one term per line of text) is loaded
    into a DataFrame. On "Analyze HPO IDs", the first non-empty cell of each
    row is split on '/' and resolved by the ``HPODataExtractor`` kept in
    ``st.session_state``. Results are stored in
    ``st.session_state.hpo_results``.

    The results table and the CSV download button are rendered from session
    state, outside the "Analyze" branch: ``st.button`` is only true for the
    single rerun caused by its click, so anything nested under it disappears
    as soon as another widget (such as the download button) triggers a rerun.
    """
    st.header("ðŸ§¬ HPO ID Extractor")

    # Initialize HPO extractor
    if 'hpo_extractor' not in st.session_state:
        st.session_state.hpo_extractor = HPODataExtractor()

    # File upload
    uploaded_file = st.file_uploader("Upload a CSV, Excel, or Text file", type=['csv', 'xlsx', 'txt'])

    if uploaded_file:
        try:
            # Compare case-insensitively so "DATA.CSV" is not parsed as text.
            file_name = uploaded_file.name.lower()
            if file_name.endswith('.csv'):
                df = pd.read_csv(uploaded_file)
            elif file_name.endswith('.xlsx'):
                df = pd.read_excel(uploaded_file)
            else:
                content = uploaded_file.read().decode('utf-8')
                terms = [line.strip() for line in content.split('\n') if line.strip()]
                df = pd.DataFrame({'terms': terms})

            st.write("Preview of uploaded data:")
            st.dataframe(df.head())

            if st.button("Analyze HPO IDs"):
                if len(df) > 0:
                    results = []
                    progress_bar = st.progress(0)
                    status_text = st.empty()
                    total_rows = len(df)

                    # Count rows by position: the index label yielded by
                    # iterrows() need not be 0..n-1 (or even numeric).
                    for position, (_, row) in enumerate(df.iterrows(), start=1):
                        terms = None
                        # Use the first non-empty cell of the row.
                        for col in df.columns:
                            if pd.notna(row[col]) and str(row[col]).strip():
                                terms = str(row[col]).strip()
                                break

                        if terms:
                            status_text.text(f"Analyzing: {terms}")
                            term_results = st.session_state.hpo_extractor.process_terms_from_text(terms)
                            results.extend(term_results)
                            for result in term_results:
                                st.write(f"âœ“ {result['Term']} -> {result['HPO ID']}")
                            time.sleep(0.5)  # Delay to prevent rate limiting

                        progress_bar.progress(position / total_rows)

                    if results:
                        st.session_state.hpo_results = results
                        st.success(f"Successfully analyzed {len(results)} terms!")
                    else:
                        st.warning("No valid terms found in the file")
                else:
                    st.warning("No data found in the uploaded file")

        except Exception as e:
            st.error(f"Error processing file: {e}")

    # Display results and offer the CSV download
    if 'hpo_results' in st.session_state and st.session_state.hpo_results:
        st.subheader("HPO Analysis Results")
        results_df = pd.DataFrame(st.session_state.hpo_results, columns=['Term', 'HPO ID'])
        st.dataframe(results_df)
        st.download_button(
            label="Download Results as CSV",
            data=results_df.to_csv(index=False),
            file_name=f"hpo_analysis_{time.strftime('%Y%m%d_%H%M%S')}.csv",
            mime='text/csv'
        )

def main():
    """Main application: page setup, header styling, then the analysis UI."""
    page_settings = {
        "page_title": "HPO ID Extractor",
        "page_icon": "ðŸ§¬",
        "layout": "wide",
    }
    st.set_page_config(**page_settings)

    # Custom CSS for the <h1 class='main-header'> rendered below
    header_css = """
    <style>
    .main-header {
        text-align: center;
        color: #2E86AB;
        margin-bottom: 30px;
    }
    </style>
    """
    st.markdown(header_css, unsafe_allow_html=True)

    # Main header
    title_html = "<h1 class='main-header'>ðŸ§¬ HPO ID Extractor</h1>"
    st.markdown(title_html, unsafe_allow_html=True)

    create_hpo_analysis_interface()


if __name__ == "__main__":
    main()


# Fill the database

In [12]:
import os

import psycopg2
import requests
import pandas as pd

# Upper bound (seconds) for one /extract call, so a hung API cannot stall the run.
API_TIMEOUT_SECONDS = 300

# 1. Connect to Postgres
# SECURITY: the database password must not live in the notebook. Connection
# settings are read from the standard libpq environment variables; only the
# non-secret ones fall back to the values this cell used before. The password
# that was previously written here has been published with the notebook and
# should be rotated.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "vide"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ["PGPASSWORD"],
    host=os.environ.get("PGHOST", "122.160.11.246"),
    port=os.environ.get("PGPORT", "5432")
)
cursor = conn.cursor()

try:
    # 2. Fetch rows that need processing
    cursor.execute("""
        SELECT internal_id, clinical_history 
        FROM master_sheet 
        WHERE clinical_history IS NOT NULL;
    """)

    rows = cursor.fetchall()
    print(f"Fetched {len(rows)} rows.")

    # 3. Loop over rows and call your API
    api_url = "http://127.0.0.1:8000/extract"

    for internal_id, clinical_history in rows:
        payload = {"text": clinical_history}
        try:
            response = requests.post(api_url, json=payload, timeout=API_TIMEOUT_SECONDS)

            if response.status_code == 200:
                data = response.json()

                # Debug: print raw response so we know what API sends back
                print(f"\nRaw API response for {internal_id}: {data}")

                if isinstance(data, list) and all(isinstance(d, dict) for d in data):
                    # normalize keys (lowercase and replace spaces)
                    normalized = [
                        {k.lower().replace(" ", "_"): v for k, v in d.items()}
                        for d in data
                    ]

                    hpo_terms = []
                    hpo_ids = []

                    for d in normalized:
                        term = d.get("hpo_term") or d.get("term")
                        hpo_id = d.get("hpo_id")

                        # Keep "NA" if that's all we got, but skip empty strings
                        if term:
                            hpo_terms.append(term)
                        if hpo_id:
                            hpo_ids.append(hpo_id)

                    hpo_term_str = ", ".join(hpo_terms)
                    hpo_id_str = ", ".join(hpo_ids)

                    print("hpo_term_str:", hpo_term_str)
                    print("hpo_terms:", hpo_terms)
                    print("hpo_ids:", hpo_ids)

                    cursor.execute("""
                        UPDATE master_sheet
                        SET hpo_term = %s, hpo_id = %s
                        WHERE internal_id = %s;
                    """, (hpo_term_str, hpo_id_str, internal_id))
                    conn.commit()
                    print(f" Updated row {internal_id}")

                else:
                    print(f" Unexpected response format for row {internal_id}, skipping.")
            else:
                print(f" API error for row {internal_id}: {response.text}")
        except Exception as e:
            # A failed statement leaves the transaction aborted; without a
            # rollback every later UPDATE on this connection would fail too.
            conn.rollback()
            print(f" Error processing row {internal_id}: {e}")
finally:
    # Close connection, also when the fetch or the loop raised
    cursor.close()
    conn.close()


Fetched 192 rows.

Raw API response for 202500108: [{'Term': 'Bilateral breast cancer', 'HPO ID': 'HP:0003002'}, {'Term': 'Left Breast Biopsy: Malignant; (B5b) ,Invasive carcinoma; Grade 2.', 'HPO ID': 'HP:0003002'}]
hpo_term_str: Bilateral breast cancer, Left Breast Biopsy: Malignant; (B5b) ,Invasive carcinoma; Grade 2.
hpo_terms: ['Bilateral breast cancer', 'Left Breast Biopsy: Malignant; (B5b) ,Invasive carcinoma; Grade 2.']
hpo_ids: ['HP:0003002', 'HP:0003002']
 Updated row 202500108

Raw API response for 202500147: [{'Term': 'C', 'HPO ID': 'Not found'}, {'Term': 'O Breathlessness & chest heavinessClinical & radiological evaluation strong suspicion of ATTR type Cardiac Amyloidosis', 'HPO ID': 'Error'}, {'Term': 'CT angiography shows :- Extensive coronary calcifications, right coronary dominance Coronary Artery Disease -SVDP', 'HPO ID': 'Error'}, {'Term': 'PTCA + Stent to RCA(15', 'HPO ID': 'Error'}, {'Term': '03', 'HPO ID': 'Error'}, {'Term': '2022).history of herpes (2022).', 'HPO

In [2]:
import os

import psycopg2
import requests
import pandas as pd
import time   #  for sleep

# Upper bound (seconds) for one /extract call, so a hung API cannot stall the run.
API_TIMEOUT_SECONDS = 300

# 1. Connect to Postgres
# SECURITY: the database password must not live in the notebook. Connection
# settings are read from the standard libpq environment variables; only the
# non-secret ones fall back to the values this cell used before. The password
# that was previously written here has been published with the notebook and
# should be rotated.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "vide"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ["PGPASSWORD"],
    host=os.environ.get("PGHOST", "122.160.11.246"),
    port=os.environ.get("PGPORT", "5432")
)
cursor = conn.cursor()

try:
    # 2. Fetch rows that need processing
    cursor.execute("""
        SELECT internal_id, clinical_history 
        FROM master_sheet 
        WHERE clinical_history IS NOT NULL;
    """)

    rows = cursor.fetchall()
    print(f"Fetched {len(rows)} rows.")

    # 3. Loop over rows and call your API
    api_url = "http://127.0.0.1:8000/extract"

    for internal_id, clinical_history in rows:
        payload = {"text": clinical_history}
        try:
            response = requests.post(api_url, json=payload, timeout=API_TIMEOUT_SECONDS)

            if response.status_code == 200:
                data = response.json()

                # Debug: print raw response so we know what API sends back
                print(f"\nRaw API response for {internal_id}: {data}")

                if isinstance(data, list) and all(isinstance(d, dict) for d in data):
                    # normalize keys (lowercase and replace spaces)
                    normalized = [
                        {k.lower().replace(" ", "_"): v for k, v in d.items()}
                        for d in data
                    ]

                    hpo_terms = []
                    hpo_ids = []

                    for d in normalized:
                        term = d.get("hpo_term") or d.get("term")
                        hpo_id = d.get("hpo_id")

                        # Keep "NA" if that's all we got, but skip empty strings
                        if term and term.strip():
                            hpo_terms.append(term)
                        if hpo_id and hpo_id.strip():
                            hpo_ids.append(hpo_id)

                    hpo_term_str = ", ".join(hpo_terms)
                    hpo_id_str = ", ".join(hpo_ids)

                    print("hpo_term_str:", hpo_term_str)
                    print("hpo_terms:", hpo_terms)
                    print("hpo_ids:", hpo_ids)

                    cursor.execute("""
                        UPDATE master_sheet
                        SET hpo_term = %s, hpo_id = %s
                        WHERE internal_id = %s;
                    """, (hpo_term_str, hpo_id_str, internal_id))
                    conn.commit()
                    print(f"  Updated row {internal_id}")

                else:
                    print(f" Unexpected response format for row {internal_id}, skipping.")
            else:
                print(f" API error for row {internal_id}: {response.text}")
        except Exception as e:
            # A failed statement leaves the transaction aborted; without a
            # rollback every later UPDATE on this connection would fail too.
            conn.rollback()
            print(f" Error processing row {internal_id}: {e}")

        #  Give some rest between requests to avoid overload
        time.sleep(2)
finally:
    # Close connection, also when the fetch or the loop raised
    cursor.close()
    conn.close()


Fetched 192 rows.

Raw API response for 202500002: [{'Term': 'Right UL involuntary movements X 2011 predominantly postural, Difficulty in holding cup,Similar involuntary movements in left UL since 6 months, No h', 'HPO ID': 'HP:0001334'}, {'Term': 's', 'HPO ID': 'Not found'}, {'Term': 'o bradykinesia,Difficulty walking in narrow passages No fall', 'HPO ID': 'HP:0002065'}, {'Term': 'imbalance,  mood issues, irritability, P', 'HPO ID': 'HP:0012366'}, {'Term': 'H Multiple episode of jaundice- bil2-3', 'HPO ID': 'HP:0002908'}]
hpo_term_str: Right UL involuntary movements X 2011 predominantly postural, Difficulty in holding cup,Similar involuntary movements in left UL since 6 months, No h, s, o bradykinesia,Difficulty walking in narrow passages No fall, imbalance,  mood issues, irritability, P, H Multiple episode of jaundice- bil2-3
hpo_terms: ['Right UL involuntary movements X 2011 predominantly postural, Difficulty in holding cup,Similar involuntary movements in left UL since 6 months, No