## extract data from biolincc_dedupLUT

In [3]:
from xml.etree import ElementTree as ET
import pandas as pd

def extract_data_from_xml(file_path, output_file_path,
                          record_tag='BIOLINCC_DEDUPLUT',
                          columns=('Study', 'Variable', 'Label', 'UID')):
    """Flatten repeated XML record elements into a tab-separated file.

    Every descendant of the document root whose tag equals ``record_tag``
    becomes one output row; the text of its direct child named after each
    entry of ``columns`` becomes the cell value.

    Args:
        file_path: Path of the XML file to parse.
        output_file_path: Path of the TSV file to write (with a header row,
            without the DataFrame index).
        record_tag: Tag name of the elements that represent one record.
        columns: Child tag names to extract; they are also used, in the same
            order, as the TSV column headers.

    Raises:
        xml.etree.ElementTree.ParseError: If the input is not well-formed XML.
        OSError: If the input cannot be read or the output cannot be written.
    """
    root = ET.parse(file_path).getroot()
    columns = list(columns)

    # findtext() does a single lookup per child and yields '' both when the
    # child is missing (via ``default``) and when it is present but empty,
    # so no None values leak into the table.
    rows = [
        [record.findtext(column, default='') for column in columns]
        for record in root.findall(f'.//{record_tag}')
    ]

    # Build the table and write it out tab-separated, without the index.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(output_file_path, sep='\t', index=False)

# Specify the file paths: the XML file to read and the TSV file to produce.
# Both are relative paths, so they resolve against the current working directory.
file_path = '../raw_data/v2/biolincc_dedupLUT.xml'  # Update this to your XML file path
output_file_path = '../raw_data/v2/biolincc_dedupLUT.tsv'  # Update this to your desired output file path

# Call the function; it returns nothing and writes the TSV file as a side effect.
extract_data_from_xml(file_path, output_file_path)

print(f"Data extracted and saved to {output_file_path}")


Data extracted and saved to ../raw_data/v2/biolincc_dedupLUT.tsv


## extract data from biolincc_deduplabelsclean

In [4]:
from xml.etree import ElementTree as ET
import pandas as pd

def extract_data_from_xml(file_path, output_file_path,
                          record_tag='BIOLINCC_DEDUPLABELSCLEAN',
                          columns=('label_clean', 'UID')):
    """Flatten repeated XML record elements into a tab-separated file.

    Every descendant of the document root whose tag equals ``record_tag``
    becomes one output row; the text of its direct child named after each
    entry of ``columns`` becomes the cell value.

    Args:
        file_path: Path of the XML file to parse.
        output_file_path: Path of the TSV file to write (with a header row,
            without the DataFrame index).
        record_tag: Tag name of the elements that represent one record.
        columns: Child tag names to extract; they are also used, in the same
            order, as the TSV column headers.

    Raises:
        xml.etree.ElementTree.ParseError: If the input is not well-formed XML.
        OSError: If the input cannot be read or the output cannot be written.
    """
    root = ET.parse(file_path).getroot()
    columns = list(columns)

    # findtext() does a single lookup per child and yields '' both when the
    # child is missing (via ``default``) and when it is present but empty,
    # so no None values leak into the table.
    rows = [
        [record.findtext(column, default='') for column in columns]
        for record in root.findall(f'.//{record_tag}')
    ]

    # Build the table and write it out tab-separated, without the index.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(output_file_path, sep='\t', index=False)

# Specify the file paths: the XML file to read and the TSV file to produce.
# Both are relative paths, so they resolve against the current working directory.
file_path = '../raw_data/v2/biolincc_deduplabelsclean.xml'  # Update this to your XML file path
output_file_path = '../raw_data/v2/biolincc_deduplabelsclean.tsv'  # Update this to your desired output file path

# Call the function; it returns nothing and writes the TSV file as a side effect.
extract_data_from_xml(file_path, output_file_path)

print(f"Data extracted and saved to {output_file_path}")


Data extracted and saved to ../raw_data/v2/biolincc_deduplabelsclean.tsv


In [7]:
# IPython shell escape: write the output of `pip freeze` to requirements.txt
# in the current working directory (`>` overwrites any existing file).
!pip freeze > requirements.txt