# Unit 10 HW - Data Wrangling from UMLS RRF Files

Description: In this file we read the UMLS files MRCONSO.RRF and MRREL.RRF from a directory, wrangle the specified data (ConceptID, Medication Name, Drug Class, Diagnosis, Mechanism of Action) for the given Concept IDs, and write that data to a csv file.

Logic: Using the Pandas module, we load each file into its own dataframe. For a given Concept ID, we iterate through each row of the relationship dataframe (MRREL) and collect the IDs of the concepts related to it, grouped by relationship type. Those related IDs serve as keys for looking up the matching names in the concept dataframe (MRCONSO). Finally, we use the csv writer to write the extracted data to a csv file.

In [1]:
import csv 
import os
import pandas as pd

# Directory holding the UMLS RRF files (MRCONSO.RRF and MRREL.RRF) that are
# opened further down in this cell.
data_directory = "/opt/class/umls/"

def getConceptNums(data_frame, concept_num):
    """Collect the concept IDs related to ``concept_num``, grouped by category.

    Each row of ``data_frame`` is read through four integer-labelled columns:
    0 and 4 hold the two concept IDs of the relationship, 3 holds the
    relationship code and 7 the relationship attribute.  A row contributes
    only when one of its two concept IDs equals ``concept_num``; the ID on the
    opposite side is then filed under the first matching category:

    * drug class  -- column 3 is "PAR"/"CHD" or column 7 is "isa"/"inverse_isa"
    * diagnosis   -- column 7 is "may_be_treated_by"
    * mechanism   -- column 7 is "mechanism_of_action_of"

    Args:
        data_frame: pandas DataFrame with integer column labels (as produced
            by ``read_csv(..., header=None)``).
        concept_num: the concept ID whose related concepts are wanted.

    Returns:
        ``[drug_nums, diagnosis_nums, moa_nums]`` -- three lists of unique
        concept IDs, each in first-seen row order.
    """
    drug_nums = []
    diagnosis_nums = []
    moa_nums = []

    # An empty frame may not even have the expected columns; nothing to scan.
    if data_frame.empty:
        return [drug_nums, diagnosis_nums, moa_nums]

    hierarchy_rels = ("PAR", "CHD")
    hierarchy_relas = ("isa", "inverse_isa")

    # Walking the four needed columns in lockstep avoids building a Series
    # per row, which is what makes iterrows() slow on large frames.
    rows = zip(data_frame[0], data_frame[3], data_frame[4], data_frame[7])
    for concept_left, rel, concept_right, rela in rows:
        # Match against the parameter itself rather than a module-level
        # global, so the function works for any caller-supplied ID.
        if concept_left == concept_num:
            related = concept_right
        elif concept_right == concept_num:
            related = concept_left
        else:
            continue

        if rel in hierarchy_rels or rela in hierarchy_relas:
            drug_nums.append(related)
        elif rela == "may_be_treated_by":
            diagnosis_nums.append(related)
        elif rela == "mechanism_of_action_of":
            moa_nums.append(related)

    # dict.fromkeys drops duplicates while keeping a deterministic order.
    return [
        list(dict.fromkeys(drug_nums)),
        list(dict.fromkeys(diagnosis_nums)),
        list(dict.fromkeys(moa_nums)),
    ]

def getConcept(concept_left, concept_right, nums, concept_id=None):
    """Append the concept on the opposite side of a relationship to ``nums``.

    Args:
        concept_left: concept ID on one side of the relationship.
        concept_right: concept ID on the other side of the relationship.
        nums: list that collects the related concept IDs; it is modified in
            place and also returned.
        concept_id: the concept ID being looked for.  When omitted, the
            module-level ``concept_num`` is used, which is how this function
            behaved before the parameter existed.

    Returns:
        ``nums`` (the same list object), extended by at most one ID.
    """
    if concept_id is None:
        # Backward-compatible fallback for callers that rely on the global.
        concept_id = concept_num

    if concept_left == concept_id:
        nums.append(concept_right)
    elif concept_right == concept_id:
        nums.append(concept_left)

    return nums

def remove_duplicates(nums):
    """Return the distinct values of ``nums`` as a new list (order not kept)."""
    distinct = set(nums)
    return [*distinct]

def getData(data_frame, nums):
    """Return column 14 of the first row whose column 0 is one of ``nums``.

    "First" means first in ``data_frame`` row order, regardless of the order
    of ``nums``.

    Args:
        data_frame: pandas DataFrame with integer column labels; column 0 is
            compared against ``nums`` and column 14 supplies the result.
        nums: list of concept IDs to look for.

    Returns:
        The column-14 value of the first matching row, or ``None`` when no
        row matches (including an empty frame or empty ``nums``).
    """
    # An empty frame may not have column 0 at all.
    if data_frame.empty:
        return None

    # One vectorised membership test replaces a Python-level scan of every
    # row against every candidate ID.
    matches = data_frame[0].isin(nums).to_numpy()
    if not matches.any():
        return None

    # argmax on a boolean array yields the position of the first True.
    first_match = int(matches.argmax())
    return data_frame.iloc[first_match][14]

def getMedicationName(data_frame, num):
    """Return column 14 of the first row whose column 0 equals ``num``.

    Args:
        data_frame: pandas DataFrame with integer column labels; column 0 is
            compared against ``num`` and column 14 supplies the result.
        num: the concept ID to look up.

    Returns:
        The column-14 value of the first matching row, or ``None`` when no
        row matches (including an empty frame).
    """
    # An empty frame may not have column 0 at all.
    if data_frame.empty:
        return None

    # A vectorised comparison avoids building a Series for every row.
    matches = (data_frame[0] == num).to_numpy()
    if not matches.any():
        return None

    # argmax on a boolean array yields the position of the first True.
    first_match = int(matches.argmax())
    return data_frame.iloc[first_match][14]


# Listing the directory is not used below, but it is kept so the cell still
# fails early when the data directory is missing.
file_list = os.listdir(data_directory)

header = ["ConceptID", "Medication Name", "Drug Class", "Diagnosis", "Mechanism of Action"]

# Concept IDs to export, one output row each, in this order.
concept_ids = ("C0016860", "C0012265")

# Load both RRF files up front.  The context manager closes the handles even
# if parsing fails.  UMLS distributes its RRF files as UTF-8, so the encoding
# is stated instead of being left to the platform default.
with open(os.path.join(data_directory, "MRCONSO.RRF"), encoding="utf-8") as MRCONSO_FILE, \
        open(os.path.join(data_directory, "MRREL.RRF"), encoding="utf-8") as MRREF_FILE:
    MRCONSO_FRAME = pd.read_csv(MRCONSO_FILE, header=None, delimiter="|")
    MRREF_FRAME = pd.read_csv(MRREF_FILE, header=None, delimiter="|")

# The output file is opened only once the inputs have loaded, so a failed
# read does not leave an empty CSV behind.
with open("jake77_UMLS.csv", "w+", newline='') as csv_file:
    w = csv.writer(csv_file)
    w.writerow(header)

    # The loop variable is deliberately the module-level name ``concept_num``:
    # getConcept() reads that global when matching relationship rows.
    for concept_num in concept_ids:
        # Related concept IDs: [drug class, diagnosis, mechanism of action].
        nums = getConceptNums(MRREF_FRAME, concept_num)
        med = getMedicationName(MRCONSO_FRAME, concept_num)
        drugClass = getData(MRCONSO_FRAME, nums[0])
        Diagnosis = getData(MRCONSO_FRAME, nums[1])
        MOA = getData(MRCONSO_FRAME, nums[2])

        row = [concept_num, med, drugClass, Diagnosis, MOA]
        w.writerow(row)



Output: Writes a csv file, jake77_UMLS.csv, to the current working directory (the home directory when the notebook is run from there) containing the specified UMLS data associated with each Concept ID.