#Class Project II - Data Acquisition from XML and UMLS Files

Description: In this program we read dataset.xml and the UMLS files MRCONSO.RRF and MRREL.RRF from a directory and pull out the specified data (MRN, ConceptID, Medication Name, Drug Class, Mechanism of Action, Diagnosis) using the given MRN Identifiers and associated Concept IDs, and then write that data to a csv file.

Logic: Using the ElementTree module, we parse the XML into a tree and iterate through it to build a list of dictionaries, one per xml dataset record, each holding the MRN and its associated Concept IDs. Then, using the Pandas module, we load the UMLS files into dataframes and iterate through their rows, extracting the needed data by matching on the concept number and relationship. Finally, we use the csv writer to write the extracted data to a csv file.

In [2]:
import csv 
import os
import pandas as pd
import xml.etree.ElementTree as ET

# Directory holding dataset.xml, MRCONSO.RRF and MRREL.RRF. The trailing slash
# is required: file paths below are built by plain string concatenation.
data_directory = "/opt/class/umls/"

def getConceptNums(data_frame, concept_num):
    """Collect the concept IDs related to ``concept_num`` in a relationship table.

    The table is read by integer column label: column 0 and column 4 hold the
    two concept IDs of a relationship, column 3 and column 7 hold the
    relationship labels that are tested below.

    A row is classified into at most one category, in this order of precedence:

    1. drug class  - column 3 is "PAR"/"CHD" or column 7 is "isa"/"inverse_isa"
    2. diagnosis   - column 7 is "may_be_treated_by"
    3. mechanism   - column 7 is "mechanism_of_action_of"

    For a classified row, the concept on the *other* side of ``concept_num``
    is collected (column 4 if column 0 matches, otherwise column 0 if
    column 4 matches).

    Args:
        data_frame: relationship DataFrame with integer column labels.
        concept_num: concept ID to look up.

    Returns:
        ``[drug_class_ids, diagnosis_ids, mechanism_ids]`` - three lists of
        distinct concept IDs, in no particular order.
    """
    # An empty frame may not even have the columns indexed below.
    if data_frame.empty:
        return [[], [], []]

    left_ids = data_frame[0]
    right_ids = data_frame[4]
    rel = data_frame[3]
    rela = data_frame[7]

    # Whole-column comparisons replace the per-row Python loop; the table is
    # scanned once per category test instead of once per row object.
    on_left = left_ids == concept_num
    # The right side only counts when the left side did not match.
    on_right = ~on_left & (right_ids == concept_num)

    is_class = rel.isin(["PAR", "CHD"]) | rela.isin(["isa", "inverse_isa"])
    # Later categories only apply to rows the earlier ones did not claim.
    is_diagnosis = ~is_class & (rela == "may_be_treated_by")
    is_mechanism = ~is_class & (rela == "mechanism_of_action_of")

    def partners(category):
        # Distinct concept IDs found opposite ``concept_num`` in this category.
        found = set(right_ids[category & on_left])
        found.update(left_ids[category & on_right])
        return list(found)

    return [partners(is_class), partners(is_diagnosis), partners(is_mechanism)]

def getConcept(concept_num, concept_left, concept_right, nums):
    """Append the concept paired with ``concept_num`` to ``nums`` and return it.

    If ``concept_left`` equals ``concept_num`` the right-hand concept is
    appended; otherwise, if ``concept_right`` equals it, the left-hand concept
    is appended. When neither side matches, ``nums`` is returned untouched.
    The list is modified in place and the same list object is returned.
    """
    if concept_left == concept_num:
        partner = concept_right
    elif concept_right == concept_num:
        partner = concept_left
    else:
        # Row does not involve the requested concept at all.
        return nums

    nums.append(partner)
    return nums

def remove_duplicates(nums):
    """Return a new list with each distinct element of ``nums`` exactly once.

    The elements pass through a set, so the order of the result is not
    related to the order of the input.
    """
    distinct = set(nums)
    return list(distinct)

def getData(data_frame, nums):
    """Return the column-14 value of the first row whose column-0 ID is in ``nums``.

    Rows are considered in frame order, so when several rows match, the one
    that appears first in ``data_frame`` wins regardless of the order of
    ``nums``.

    Args:
        data_frame: DataFrame with integer column labels; column 0 is compared
            against ``nums`` and column 14 supplies the returned value.
        nums: iterable of concept IDs to search for.

    Returns:
        The column-14 value of the first matching row, or ``None`` when
        ``nums`` is empty, the frame is empty, or no row matches.
    """
    wanted = list(nums)
    # Nothing to look for, or nothing to search: skip the table scan entirely.
    if not wanted or data_frame.empty:
        return None

    # One vectorized membership test instead of a Python loop over every
    # row and every candidate ID.
    hits = data_frame.loc[data_frame[0].isin(wanted), 14]
    if hits.empty:
        return None
    return hits.iloc[0]

def getMedicationName(data_frame, num):
    """Return the column-14 value of the first row whose column-0 ID equals ``num``.

    Args:
        data_frame: DataFrame with integer column labels (the caller passes the
            frame loaded from the MRCONSO file); column 0 is compared against
            ``num`` and column 14 supplies the returned value.
        num: concept ID of the medication.

    Returns:
        The column-14 value of the first matching row in frame order, or
        ``None`` when the frame is empty or no row matches.
    """
    # An empty frame may not even have the columns indexed below.
    if data_frame.empty:
        return None

    # Single vectorized comparison instead of building a Series per row.
    names = data_frame.loc[data_frame[0] == num, 14]
    if names.empty:
        return None
    return names.iloc[0]
        
def getRecordDataFrame():
    """Parse ``dataset.xml`` into a DataFrame with one row per top-level record.

    Each child of the XML root becomes one row. Its grandchildren are
    inspected: an ``MRN`` element contributes ``{"MRN": <value attribute>}``
    to the PATIENT column, and every ``Item`` element contributes its
    ``value`` attribute to the list in the MEDICATION column. A record
    without an ``MRN`` element keeps ``""`` in PATIENT.

    Returns:
        DataFrame with columns ``MEDICATION`` (list of values) and
        ``PATIENT`` (dict holding the MRN), in that order.

    Raises:
        OSError: if ``dataset.xml`` cannot be opened in ``data_directory``.
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        KeyError: if an ``MRN`` or ``Item`` element has no ``value`` attribute.
    """
    # Context manager so the file handle is released as soon as it is read.
    with open(data_directory + 'dataset.xml') as xml_file:
        xml_data = xml_file.read()
    root = ET.XML(xml_data)

    all_records = []
    for child in root:
        record = {"PATIENT": "", "MEDICATION": []}
        patient = {}

        for subchild in child:
            for inner_child in subchild:
                if inner_child.tag == "MRN":
                    patient["MRN"] = inner_child.attrib["value"]
                    record["PATIENT"] = patient
                elif inner_child.tag == "Item":
                    record["MEDICATION"].append(inner_child.attrib["value"])

        all_records.append(record)

    # Pin the column order to MEDICATION, PATIENT - the order positional
    # consumers of this frame were written against. Without it the order
    # depends on the pandas version (sorted keys in old releases, dict
    # insertion order - PATIENT first - in newer ones).
    return pd.DataFrame(all_records, columns=["MEDICATION", "PATIENT"])
                    
def processData(records):
    """Write one CSV row per (patient, medication) pair found in ``records``.

    Args:
        records: DataFrame with a ``PATIENT`` column (dict containing the
            ``"MRN"`` key) and a ``MEDICATION`` column (iterable of concept
            IDs), one row per patient record.

    Each concept ID is passed, together with the patient's MRN, to
    ``writeData``.
    """
    for _, record in records.iterrows():
        # Look the fields up by label, not by position: the column order of a
        # frame built from dicts differs between pandas versions, and integer
        # keys on a string-labelled Series rely on a deprecated positional
        # fallback.
        mrn = record["PATIENT"]["MRN"]
        concept_nums = record["MEDICATION"]

        for concept_num in concept_nums:
            writeData(concept_num, mrn)
                
def writeData(concept_num, mrn):
    """Look up one medication's details and append them to the CSV as a row.

    Uses the module-level ``MRREF_FRAME`` to find the related concept IDs
    (via ``getConceptNums``), the module-level ``MRCONSO_FRAME`` to resolve
    the medication name (``getMedicationName``) and the drug class, diagnosis
    and mechanism of action (``getData``), and writes the result through the
    module-level csv writer ``w``.

    Row layout: MRN, concept ID, medication name, drug class, mechanism of
    action, diagnosis.
    """
    related = getConceptNums(MRREF_FRAME, concept_num)
    medication_name = getMedicationName(MRCONSO_FRAME, concept_num)

    # related is ordered [drug class IDs, diagnosis IDs, mechanism IDs].
    drug_class, diagnosis, mechanism = [
        getData(MRCONSO_FRAME, related[position]) for position in range(3)
    ]

    w.writerow(
        [mrn, concept_num, medication_name, drug_class, mechanism, diagnosis]
    )
    
# Listing of the data directory. Not referenced elsewhere in this section, but
# it raises early if the directory does not exist.
file_list = os.listdir(data_directory)

header = ["Patient MRN", "Medication CUI", "Medication Name", "Medication Class", "Mechanism of Action", "Mechanism Diagnosis"]

# `with` guarantees the output file is flushed and closed even when processing
# raises part-way through.
with open("jake77_UMLS_Class_ProjectII.csv", "w+", newline='') as csv_file:
    # `w` stays a module-level name: writeData() writes its rows through it.
    w = csv.writer(csv_file)
    w.writerow(header)

    # The input handles are only needed while pandas parses them; the frames
    # are fully loaded in memory afterwards. MRCONSO_FRAME and MRREF_FRAME stay
    # module-level names because writeData() reads them.
    # NOTE(review): pandas applies CSV quote handling by default; if names
    # containing '"' parse incorrectly, consider quoting=csv.QUOTE_NONE.
    with open(data_directory + "MRCONSO.RRF") as MRCONSO_FILE, \
            open(data_directory + "MRREL.RRF") as MRREF_FILE:
        MRCONSO_FRAME = pd.read_csv(MRCONSO_FILE, header=None, delimiter="|")
        MRREF_FRAME = pd.read_csv(MRREF_FILE, header=None, delimiter="|")

    records = getRecordDataFrame()
    processData(records)

Output: Writes a csv file to the current working directory containing the specified UMLS data for each patient in the dataset, looked up in the corresponding UMLS files by Patient MRN Identifier and associated Concept IDs.