In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import bs4 as bs

In [43]:
def clinical_trial_xml_reader(file):
    """Open a clinical trial .xml file and parse it with BeautifulSoup.

    Parameters
    ----------
    file : str or path-like
        The path and file name together, i.e. the full location of the .xml file.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed document ("soup"), built with the "html.parser" backend.

    Raises
    ------
    OSError
        If the file cannot be opened (propagated from ``open``).
    """
    # Passing a bare open(...) to BeautifulSoup leaves the handle open until it is
    # garbage collected.  The context manager closes it deterministically once the
    # soup has been built.
    with open(file, "r") as xml_file:
        xml_soup = bs.BeautifulSoup(xml_file, "html.parser")
    return xml_soup


def get_tag_text(soup, tag="title"):
    """Return the text of the first occurrence of ``tag``, or NaN when it is absent.

    Parameters
    ----------
    soup : object exposing ``find(tag)``
        The parsed document (or any sub-tree of it) to search.
    tag : str, default "title"
        Name of the tag to look up.  Remember to put the tag in quotes.

    Returns
    -------
    str or float
        The tag's text, or ``np.nan`` if the lookup hits an ``AttributeError``
        (for example because ``find`` returned ``None``).
    """
    try:
        return soup.find(tag).get_text()
    except AttributeError:
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0,
        # where using it here would itself raise AttributeError inside this handler.
        return np.nan


def parse_clinical_trial_xml(soup, trial_data_categories):
    """Collect the text of several tags from one clinical trial document.

    Each entry of ``trial_data_categories`` is lower-cased and looked up with
    get_tag_text(); the original (un-lowered) entry is kept as the label.

    Arguments:
        soup: the html/xml text produced by the xml reader function.
        trial_data_categories: list of labels, one per field to extract.

    Returns a Series indexed by the labels, suitable for use as one row of a DataFrame.
    Fields whose tag is missing hold whatever get_tag_text() returns for a missing tag.
    """
    # Labels keep their casing; only the tag name used for the lookup is lower-cased.
    text_by_label = {
        label: get_tag_text(soup, label.lower())
        for label in trial_data_categories
    }
    return pd.Series(data=text_by_label, dtype=None)

def clinical_trial_scrape(folder_path):
    """Scrape basic information about every clinical trial .xml file in a folder.

    Uses clinical_trial_xml_reader() and parse_clinical_trial_xml() on each file.  If a
    field contains NaN, the trial's file had no such tag, which could be either improper
    reporting or simply absence of information.

    Parameters
    ----------
    folder_path : str or path-like
        Folder containing the .xml files (e.g. from ClinicalTrials.gov) to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per .xml file (files processed in sorted name order), with the columns
        listed in ``trial_data_categories``.  Start_Date and Completion_Date are converted
        with ``pd.to_datetime`` and Enrollment is cast to int64.

    Raises
    ------
    ValueError
        From the int64 cast if any trial has a missing or non-integer Enrollment value.
    """
    # The categories to scrape double as the column names of the resulting DataFrame.
    trial_data_categories = ["NCT_ID",
                             "Acronym",
                             "Brief_Title",
                             "Phase",
                             "Agency",
                             "URL",
                             "Overall_Status",
                             "Start_Date",
                             "Completion_Date",
                             "Enrollment",
                             "Number_of_Arms"]

    # Use the folder_path argument (not a notebook-level global) so the function works
    # for any folder and on a fresh kernel.
    xml_files = sorted(name for name in os.listdir(folder_path) if name.endswith(".xml"))

    # Collect the rows first and build the DataFrame once: DataFrame.append copied the
    # whole frame on every iteration and no longer exists in pandas 2.x.
    rows = []
    for name in xml_files:
        soup = clinical_trial_xml_reader(os.path.join(folder_path, name))
        rows.append(parse_clinical_trial_xml(soup, trial_data_categories))

    clinical_trial_df = pd.DataFrame(rows, columns=trial_data_categories).reset_index(drop=True)

    clinical_trial_df["Start_Date"] = pd.to_datetime(clinical_trial_df["Start_Date"])
    clinical_trial_df["Completion_Date"] = pd.to_datetime(clinical_trial_df["Completion_Date"])
    clinical_trial_df["Enrollment"] = clinical_trial_df["Enrollment"].astype('int64')

    return clinical_trial_df

In [46]:
# Folder holding the trial .xml files.  `path` is a notebook-level global that later
# cells also pass to the adverse-event helpers.
# NOTE(review): this is an absolute, machine-specific location — consider making it configurable.
path = "/Users/blixt007/HTML/xml/MM_Trials"

# Scrape information from all .xml files in the above path.
# This includes 311 separate trials involving multiple myeloma.
MM_trials = clinical_trial_scrape(path)

# Save the clinical trial information as a .csv file in the same folder as the trial .xml files.
# index=False leaves the row numbers out of the file.
MM_trials.to_csv(os.path.join(path, "MM_Trials.csv"), index=False)

# Display the first five rows of trial information (the cell's last expression is its output).
MM_trials.head()

Unnamed: 0,NCT_ID,Acronym,Brief_Title,Phase,Agency,URL,Overall_Status,Start_Date,Completion_Date,Enrollment,Number_of_Arms
0,NCT00002850,,Antibiotic Therapy in Preventing Early Infecti...,Phase 3,Gary Morrow,https://clinicaltrials.gov/show/NCT00002850,Completed,1997-03-01,2012-01-01,212,3
1,NCT00006184,,"Chemotherapy, Stem Cell Transplantation and Do...",Phase 2,National Cancer Institute (NCI),https://clinicaltrials.gov/show/NCT00006184,Completed,2001-02-08,2008-01-12,20,2
2,NCT00006244,,"Melphalan, Peripheral Stem Cell Transplantatio...",Phase 2,Fred Hutchinson Cancer Research Center,https://clinicaltrials.gov/show/NCT00006244,Completed,2000-02-01,2016-04-01,36,1
3,NCT00027560,,"Melphalan, Fludarabine, and Alemtuzumab Follow...",Phase 2,Memorial Sloan Kettering Cancer Center,https://clinicaltrials.gov/show/NCT00027560,Completed,2001-07-01,2009-04-01,51,1
4,NCT00040937,,"S0204 Thalidomide, Chemotherapy, and Periphera...",Phase 2,Southwest Oncology Group,https://clinicaltrials.gov/show/NCT00040937,Completed,2002-06-01,2015-10-01,147,1


In [47]:
# Display the last five rows of trial information (the cell's last expression is its output).
MM_trials.tail()

Unnamed: 0,NCT_ID,Acronym,Brief_Title,Phase,Agency,URL,Overall_Status,Start_Date,Completion_Date,Enrollment,Number_of_Arms
306,NCT02481934,NK-VS-MM,Clinical Trial of Expanded and Activated Autol...,Phase 1,"Joaquín Martínez López, MD, PhD",https://clinicaltrials.gov/show/NCT02481934,Completed,2013-03-01,2016-10-01,5,1
307,NCT02566265,SHIVERING 2,Study of High-dose Influenza Vaccine Efficacy ...,Phase 2,Yale University,https://clinicaltrials.gov/show/NCT02566265,Completed,2015-09-01,2018-06-01,122,2
308,NCT02632786,PRONTO,"The PRONTO Study, a Global Phase 2b Study of N...",Phase 2,Prothena Therapeutics Ltd.,https://clinicaltrials.gov/show/NCT02632786,Completed,2016-03-01,2018-03-01,129,2
309,NCT02669615,,Pharmacokinetic Study of Propylene Glycol-Free...,Phase 2,Medical College of Wisconsin,https://clinicaltrials.gov/show/NCT02669615,Completed,2016-11-01,2017-07-19,24,1
310,NCT03000452,FUSION-MM-005,A Study to Determine the Efficacy of the Combi...,Phase 2,Celgene,https://clinicaltrials.gov/show/NCT03000452,Completed,2017-03-14,2017-12-04,18,1


In [20]:
def parse_adverse_events(soup):
    """Parse all reported adverse events into a table of percent-affected per treatment arm.

    Every <default_assessment> tag is followed (via ``find_next()``) to the element that
    holds its <event> entries.  Each <event> contributes one row: its <sub_title> text is
    the row label, and each of its <counts> tags contributes one cell, keyed by the tag's
    ``group_id`` attribute.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        A parsed clinical trial document.

    Returns
    -------
    pandas.DataFrame
        Index: adverse event sub-titles, in document order (so events found under the first
        <default_assessment> come before those under the second; re-sorting the index makes
        them hard to separate again).  Columns: ``group_id`` values (see get_treatments()
        for the treatment description behind each id).  Values:
        ``subjects_affected / subjects_at_risk * 100`` rounded to 2 decimals, or NaN when
        the event has no count for that group or the percentage cannot be computed.
        An empty DataFrame is returned when no events are found.
    """

    def _percent_affected(count):
        # Percentage of at-risk subjects that were affected; NaN when either attribute is
        # missing or non-numeric, or when nobody was at risk (division by zero).
        try:
            affected = int(count.get("subjects_affected"))
            at_risk = int(count.get("subjects_at_risk"))
            return round((affected / at_risk) * 100, 2)
        except (TypeError, ValueError, ZeroDivisionError):
            return np.nan

    sub_titles = []
    rows = []
    for assessment in soup.find_all("default_assessment"):
        container = assessment.find_next()
        if container is None:
            continue
        for event in container.find_all("event"):
            sub_title = event.find("sub_title")
            sub_titles.append(sub_title.get_text() if sub_title is not None else np.nan)
            # Key each percentage by its own group_id instead of relying on every event
            # listing the same groups in the same order; groups missing from an event
            # simply become NaN in that row.
            rows.append({count.get("group_id"): _percent_affected(count)
                         for count in event.find_all("counts")})

    # One record per event; the columns are the group ids in order of first appearance.
    adverse_events_df = pd.DataFrame(rows, index=sub_titles)

    return adverse_events_df

In [24]:
def get_treatments(soup):
    """Build a lookup from group_id to treatment arm description.

    The <group> tags nested under <reported_events> classify the treatment arms used when
    reporting adverse effects.  Each one contributes an entry whose key is the tag's
    ``group_id`` attribute and whose value is the text of its <description>.

    Takes a clinical trial soup as the only argument.
    Returns a dictionary matching each group_id to its description text.
    """
    groups = soup.reported_events.find_all("group")
    return {group.get("group_id"): group.description.get_text() for group in groups}

{'E1': 'Tabalumab 100 milligram (mg) administered once intravenously (IV) over 30 minutes on Day 1 every 21 days for 8 cycles.\nDexamethasone 20 mg administered once orally on Days 1, 2, 4, 5, 8, 9, 11 and 12 every 21 days for 8 cycles.\nBortezomib 1.3 milligram per square meter (mg/m^2) administered once subcutaneously (SQ) on Days 1, 4, 8 and 11 every 21 days for 8 cycles.',
 'E2': 'Tabalumab 300 mg administered once IV over 30 minutes on Day 1 every 21 days for 8 cycles.\nDexamethasone 20 mg administered orally on Days 1, 2, 4, 5, 8, 9, 11 and 12 every 21 days for 8 cycles.\nBortezomib 1.3 mg/m^2 administered once SQ on Days 1, 4, 8 and 11 every 21 days for 8 cycles.',
 'E3': 'Placebo administered once IV on Day 1 every 21 days for 8 cycles. Dexamethasone 20 mg administered once orally on Days 1, 2, 4, 5, 8, 9, 11 and 12 every 21 days for 8 cycles.\nBortezomib 1.3 mg/m^2 administered once SQ on Days 1, 4, 8 and 11 every 21 days for 8 cycles.'}

In [16]:
# NOTE(review): min_max_adverse_event is defined in a cell further down this notebook, so on a
# fresh kernel that cell has to be run before this one.
# Per-trial minimum and maximum percentage for adverse events whose sub-title contains "neuropathy".
df_style = min_max_adverse_event(path, "neuropathy")
df_style


Unnamed: 0,Min % neuropathy,Max % neuropathy
NCT00006184,0,50
NCT00075881,6.98,58.14
NCT00081939,0.33,38.94
NCT00084747,,36.67
NCT00103506,0,14.47
NCT00111813,0,50
NCT00148317,,73.68
NCT00153920,7.81,60.94
NCT00287872,,83.33
NCT00307086,NR,NR


In [13]:
def min_max_adverse_event(path, event):
    """Determine the minimum and maximum percentage of participants in any treatment arm
    that experienced the specified adverse event, for every trial .xml file in a folder.

    Parameters
    ----------
    path : str or path-like
        Folder containing the trial .xml files.
    event : str
        Text to search for (case-insensitively) inside each <sub_title>.

    Returns
    -------
    pandas.DataFrame
        Indexed by the trial's NCT ID, with columns ``"Min % " + event`` and
        ``"Max % " + event``.  Per trial:
        * no matching event with a computable percentage: both columns hold the string "NR";
        * smallest and largest percentage are equal (e.g. a single count): only the Max
          column is filled, the Min column is NaN;
        * otherwise both columns hold percentages rounded to 2 decimals.

    Note: many studies report similar adverse events with slightly different names.  For this
    reason it is best to search for the essential portion of the adverse event's name instead
    of a very specific format.  For instance, some studies report only "neuropathy", while
    others report "neuropathy peripheral" or even "peripheral neuropathy."
    """

    min_adverse_event_dict = {}
    max_adverse_event_dict = {}
    wanted = event.lower()
    files = sorted(name for name in os.listdir(path) if name.endswith(".xml"))
    for name in files:
        soup = clinical_trial_xml_reader(os.path.join(path, name))
        nct_id = soup.nct_id.get_text()

        adverse_events = [sub_title for sub_title in soup.find_all("sub_title")
                          if wanted in sub_title.get_text().lower()]

        # For every matching event, record the percentage of each group that experienced it.
        # Keys combine group id and event name, so a repeated (group, name) pair keeps only
        # the value seen last.
        all_adverse_event_dict = {}
        for adverse_event in adverse_events:
            label = adverse_event.get_text()
            for count in adverse_event.parent.find_all("counts"):
                at_risk = int(count["subjects_at_risk"])
                if at_risk == 0:
                    # Percentage is undefined when nobody was at risk; skip rather than
                    # abort the whole scrape with ZeroDivisionError.
                    continue
                all_adverse_event_dict[count.get("group_id") + "_" + label] = (
                    round(int(count["subjects_affected"]) / at_risk * 100, 2))

        if not all_adverse_event_dict:
            # Nothing usable was reported for this event in this trial.
            min_adverse_event_dict[nct_id] = "NR"
            max_adverse_event_dict[nct_id] = "NR"
            continue

        lowest = min(all_adverse_event_dict.values())
        highest = max(all_adverse_event_dict.values())
        max_adverse_event_dict[nct_id] = highest
        # Compare by value: `is not` tests object identity, which only happens to work
        # for floats as an implementation detail.
        if lowest != highest:
            min_adverse_event_dict[nct_id] = lowest

    return pd.DataFrame([min_adverse_event_dict, max_adverse_event_dict],
                        index=["Min % " + event, "Max % " + event]).transpose()



def percent_adverse_events(path, event_list=["neuropathy", "paraesthesia"]):
    """Run min_max_adverse_event() for several adverse events and combine the results.

    Parameters
    ----------
    path : str or path-like
        Folder containing the trial .xml files.
    event_list : list of str
        Adverse event names to search for.  The default list is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        The per-event frames stacked row-wise (``pd.concat`` along the index), so each trial
        ID appears once per event and the Min/Max columns of the other events are NaN in
        those rows.  An empty DataFrame is returned for an empty ``event_list``.
        NOTE(review): if one row per trial was intended, the frames would need to be
        joined column-wise instead — confirm with the callers before changing it.
    """
    # Build every per-event frame first and concatenate once, instead of re-copying a
    # growing DataFrame on each loop iteration.
    per_event_frames = [min_max_adverse_event(path, event) for event in event_list]
    if not per_event_frames:
        return pd.DataFrame()
    return pd.concat(per_event_frames, sort=False)
    
# Combined min/max table for two adverse events.  The names are passed capitalised, and
# min_max_adverse_event() builds its column labels from them ("Min % Neuropathy", ...).
df = percent_adverse_events(path, ["Neuropathy", "Constipation"])


# TODO(review): placeholder for a planned adverse_event_publications(df) helper; no
# implementation is visible in this notebook.

In [21]:
# Trials in which more than 50% of some treatment arm reported a neuropathy-type event.
# The column is labelled "Max % Neuropathy" ("Max % " + event), and trials that did not report
# the event hold the string "NR", so coerce to numbers (NR -> NaN) before comparing.
pd.to_numeric(df["Max % Neuropathy"], errors="coerce").loc[lambda pct: pct > 50].index

Index(['NCT00075881', 'NCT00148317', 'NCT00153920', 'NCT00287872'], dtype='object')