# In [206]:
import pandas as pd
import os
import re
import random
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys 
from selenium.webdriver.common.by import By 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select

from bs4 import BeautifulSoup

# In [207]:
def clean_data_table2(table2,column_name_old, column_name_new,delimiter, confounder_text):
    """
    Dataframe preprocessing for tables from VigilApp.

    Takes one column of table2 whose cell holds all values in a single
    delimited string, splits it by delimiter and transposes the result so that
    every value becomes its own row of a one-column dataframe. The delimiter
    (removed by the split) is re-appended to every value, the
    'More results Show/Hide' artefact is removed and a final row holding
    confounder_text is added.

    Input:

    table2 -- df holding the delimited values (not modified); the selected
              column must contain exactly one row, otherwise assigning the
              single new column name fails
    column_name_old -- name of the column in table2 to split
    column_name_new -- name of the single column in the result
    delimiter -- string to split by
    confounder_text -- text to add as the last row

    Returns

    new df with one column named column_name_new and a fresh 0..n index

    """

    # One column is one future confounder table: all values sit in one cell,
    # so split them into columns and transpose to get one value per row.
    values = table2[column_name_old].str.split(delimiter, expand=True).transpose()
    values.columns = [column_name_new]

    # Cast to str (values are int in some cases) and put the delimiter back.
    values[column_name_new] = values[column_name_new].astype(str) + delimiter

    # Remove the string artefact; it is a literal, so match it literally
    # regardless of the pandas default for `regex`.
    values[column_name_new] = values[column_name_new].str.replace('More results Show/Hide', '', regex=False)

    # DataFrame.append no longer exists in pandas >= 2.0; concat is the
    # equivalent that works on old and new versions alike.
    explanation_row = pd.DataFrame([{column_name_new: confounder_text}])
    return pd.concat([values, explanation_row], ignore_index=True)

def drop_row(df,column_name,value):

    """
    Drop, in place, every row whose column_name entry equals value.

    Input:
    df -- dataframe to modify (mutated in place)
    column_name -- column that is compared against value
    value -- rows holding this value are removed

    Returns

    df -- the same dataframe object, after the rows were dropped

    """

    matches = df[column_name] == value
    labels_to_drop = df[matches].index  # index labels of the matching rows
    df.drop(labels_to_drop, inplace=True)
    return df

def initiate_driver(path_to_driver, website_address):
    """ Initiates chrome webdriver with website_address based on driver path

    On windows: provide chromedriver.exe location (here in current dir) | mac: provide binary path to chromedriver: chromedriver executable file must be moved to usr/local/bin/chromedriver, eg. with command mv chromedriver /usr/local/bin;

    Input:
    path_to_driver -- path to the chromedriver executable (str)
    website_address -- URL to open (str)

    Returns
    driver -- the started webdriver with website_address loaded

    If loading the page fails, the browser is closed before the error is
    re-raised, because the caller never receives a handle it could close.

    """

    # NOTE(review): passing the driver path positionally is the older Selenium
    # calling convention; newer releases expect a Service object -- confirm
    # against the Selenium version this project pins.
    driver = webdriver.Chrome(path_to_driver)

    try:
        sleep(2)
        driver.get(website_address)
        sleep(random.randint(5,6)) #wait until fully loaded, alternative: WebDriverWait, check ways to apply selenium waits to iframe
    except Exception:
        driver.quit() #do not leave an orphaned browser behind
        raise

    return driver

def search_enter_keys(path,keys_to_send):

    """Looks for element by xpath. Sends keys.

    Operates on the module-level `driver`, which must already be initiated.

    Input:
    path -- xpath of the element to type into
    keys_to_send -- text to type into the element

    Returns
    driver -- the module-level webdriver

    """
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
    # Selenium releases no longer provide.
    field = driver.find_element(By.XPATH, path)
    field.send_keys(keys_to_send)
    sleep(random.randint(1,3))

    return driver


def scrape_signal(path_to_signal, signal_col_name, events, event):

    """Switches to second tab, grabs text in path_to_signal, inputs text in signal_col_name.

    Operates on the module-level `driver`.

    Input:
    path_to_signal -- xpath to info in vigilapp (whether event unrelated or putative adverse event)
    signal_col_name -- name of col in events df to write signal info into
    events -- df with events list & drug names, has 'event' col (modified in place)
    event -- row value of col 'event' in events df where to write signal info;
             every row with this event value receives the signal

    Returns
    driver -- the module-level webdriver, now focused on the second tab
    signal -- signal text (ADR or unrelated) as shown in vigilapp

    Raises a selenium TimeoutException if the second tab or the signal element
    does not show up within 20 seconds.

    """
    # Wait for the second tab instead of assuming it is already open;
    # indexing window_handles[1] too early raises IndexError.
    WebDriverWait(driver,20).until(lambda d: len(d.window_handles) > 1)
    driver.switch_to.window(driver.window_handles[1])

    element = WebDriverWait(driver,20).until(EC.presence_of_element_located((By.XPATH,path_to_signal)))
    signal = element.text #read once so the stored and the returned value cannot differ
    events.loc[(events['event'] == event), signal_col_name] = signal

    return driver,signal

def click_all_buttons(button_text):

    """Clicks all the links whose text is button_text

    Operates on the module-level `driver`.

    Input:
    button_text -- exact link text of the elements to click

    Returns
    driver -- the module-level webdriver

    """

    # find_elements(By.LINK_TEXT, ...) replaces find_elements_by_link_text,
    # which newer Selenium releases no longer provide.
    shows = driver.find_elements(By.LINK_TEXT, button_text)
    sleep(4)

    for show in shows:
        show.click()
        sleep(3) #pause between clicks

    return driver

def find_all_tables(driver, table_tag):
    """Gets all the tables found in the current page source of driver

    Input:
    driver -- webdriver whose page_source is parsed with BeautifulSoup (lxml)
    table_tag -- tag name to collect, passed to soup.find_all

    Returns
    all_tables -- list of dataframes as returned by pd.read_html

    """
    from io import StringIO

    html_page = driver.page_source
    soup = BeautifulSoup(html_page, 'lxml')

    tables = soup.find_all(table_tag)

    # read_html expects a path, URL or file-like object; handing it raw markup
    # as a plain string is deprecated, so wrap the markup in StringIO.
    all_tables = pd.read_html(StringIO(str(tables)))

    return all_tables

def process_tables(all_tables):
    """Return the first three tables; the third one is transposed and its
    first row (after transposing) is promoted to the column header.

    Input:
    all_tables -- sequence holding at least three dataframes

    Returns
    tuple of (first table, second table, reshaped third table); the first two
    are returned untouched
    """

    contingency = all_tables[0]
    indicators = all_tables[1]

    flipped = all_tables[2].transpose()
    confounders = flipped[1:] #everything below the header row
    confounders.columns = flipped.iloc[0] #first row becomes the header

    return contingency, indicators, confounders

def get_DPA_indicators(DA_table,contingency_table):

    """Read PRR, chi-squared and number of reports for the DPA evaluation.

    Input:
    DA_table -- df with a 'Value' column; the entry labelled 3 is taken as PRR
                and the entry labelled 1 as chi-squared
    contingency_table -- df whose second column, first entry holds the report
                count as a string containing 'DE'

    Returns
    (PRR, chi, no_reports) -- no_reports is that string with 'DE' removed
    """

    indicator_values = DA_table.Value
    prr = indicator_values[3]
    chi_squared = indicator_values[1]

    second_column = contingency_table.columns[1]
    de_cell = contingency_table[second_column][0]
    report_count = de_cell.replace(r'DE','') #strip the cell label, keep the count

    return prr, chi_squared, report_count

def add_interpretation(tableDA, signal,text,suffix_related,suffix_unrelated):

    """ Add row to DPA table explaining how to interpret results, depending on the signal.
    Text is a base info string, suffixes depend on signal evaluation

    Input:
    tableDA -- DPA table (not modified)
    signal -- 'ADVERSE DRUG REACTION' or 'unrelated'; for any other value the
              table is returned unchanged
    text -- base explanation string
    suffix_related -- appended to text when signal is 'ADVERSE DRUG REACTION'
    suffix_unrelated -- appended to text when signal is 'unrelated'

    Returns
    new df with the explanation as last row of the
    'Disproportionality indicators' column (other columns are empty in that
    row), or tableDA itself when the signal is not recognised

    """
    suffix_by_signal = {
        'ADVERSE DRUG REACTION': suffix_related,
        'unrelated': suffix_unrelated,
    }
    if signal not in suffix_by_signal:
        return tableDA

    # DataFrame.append no longer exists in pandas >= 2.0; concat is the
    # equivalent that works on old and new versions alike.
    explanation_row = pd.DataFrame([{'Disproportionality indicators': text + suffix_by_signal[signal]}])
    return pd.concat([tableDA, explanation_row], ignore_index=True)

def add_rows_contingency(first_col_name,table_contingency,formulaPRR,formula_chi):

    """Add remaining info on formulas for PRR, chi-squared used for calculations. Adding into first col.

    Input:
    first_col_name -- name of the column that receives the formula texts
    table_contingency -- contingency table (not modified)
    formulaPRR -- text added as second to last row
    formula_chi -- text added as last row

    Returns
    new df with both rows appended and a fresh 0..n index; all other columns
    are empty in the two new rows
    """
    # DataFrame.append no longer exists in pandas >= 2.0; a single concat adds
    # both rows in the same order as two successive appends did.
    formula_rows = pd.DataFrame({first_col_name: [formulaPRR, formula_chi]})
    return pd.concat([table_contingency, formula_rows], ignore_index=True)

def write_to_excel(path,engine,tables,table_names):

    """Write all DPA analysis to one excel with multiple sheets

    Input:
    path -- target path of the workbook
    engine -- pandas ExcelWriter engine name
    tables -- dataframes to write, one per sheet
    table_names -- sheet names, paired with tables by position (surplus
                   entries on either side are ignored)
    """
    # The context manager saves and closes the workbook, also when writing a
    # sheet fails; ExcelWriter.save() no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(path, engine=engine) as writer:
        for table, table_name in zip(tables, table_names):
            # Write each dataframe to a different worksheet.
            table.to_excel(writer, sheet_name=table_name, index=False)

# In [208]:
# Output directories, created next to the current working directory.
all_adrs_dir = os.path.join(os.getcwd(), 'signals')
da = os.path.join(os.getcwd(), 'DA') #one workbook per drug:event pair is written here
dirs = [all_adrs_dir,da]
for directory in dirs:
    # exist_ok creates the directory only when missing, without the
    # check-then-create race of a separate os.path.exists test.
    os.makedirs(directory, exist_ok=True)

# In [210]:
# Input data -----------------------------------------------------------------
# events: list of events per drug
events = pd.read_excel('events.xlsx')
# event names were copied from VigilApp and carry stray whitespace
events['event'] = events['event'].str.strip()

# references: needs an .xlsx with the references to append
references = pd.read_excel('references.xlsx')

# Explanation texts written into the result sheets ----------------------------
formulaPRR = (
    'Additional explanation: calculations are made using following formulas: '
    'PRR = ( DE / D ) / (dE/ d)'
)
formula_chi = (
    "Chi squared with Yate's correction = "
    'N * ( | DE*de – dE*De | - N/2 )2/ (D * d * E * e)'
)
confounder_text = (
    'Aditional explanation: this sheet contains possible confounders: '
    'ie. drugs, events, indications, ages or sex that are most frequent '
    'among the subpopulation selected for this drug:event pair. '
    'You can use them to refine the subpopulations you want to compare '
    '(e.g., unmasking signals by using (NOT this drug)) '
    'or include them into the background correction'
)

# Sheet names of the result workbook, in writing order
table_names = [
    'Contingency Table',
    'DA results',
    'Gender Distribution',
    'Age Distribution',
    'Top Drugs',
    'Top Product Names',
    'Top Drug Classes',
    'Top ADRs',
    'Top Indications',
    'References',
]

# Interpretation text: base string plus a suffix chosen by the signal.
# Placeholders: {0} report count, {1} PRR, {2} chi-squared.
# (original note: only use when events sorted by rel/unrel for unrel events)
text = (
    'Additional explanation: interpretation of the DA results: '
    'According to the criteria by Evans 2001, which requires '
    'a report count > 3 (this combination: {0}) '
    'and a PRR > 2 (here: {1}) '
    'and a Chi-squared > 4 (here: {2}) '
    'this drug and event are '
)
suffix_related = 'statistically significantly related (=putative ADVERSE DRUG REACTION).'
suffix_unrelated = 'probably unrelated.'
    

# In [211]:
# Confounder sheets derived from the transposed third table, listed in the
# order in which they are written to the workbook:
# (column in table2, column name in the sheet, delimiter between the values)
confounder_specs = [
    ('Gender distribution', 'Gender distribution', 'le'),
    ('Age distributionAge (Number of reports)', 'Age (number of reports)', ')'),
    ('Top drugsGeneric Name (Number of reports)', 'Top drugs by Generic Name (number of reports)', ')'),
    ('Top medicinalproductsProductname (Number of reports)', 'Top medicinal products by Product name (Number of reports)', ')'),
    ('Top drugclasses MoAMechanism of Action (Number of reports)', 'Top drug classes by Mechanism of Action [MoA] (number of reports)', ')'),
    ('Top adverse eventsEvent (Number of reports)', 'Top adverse events by Event (number of reports)', ')'),
    ('Top indicationsIndication (Number of reports)', 'Top indications by Indication (number of reports)', ')'),
]

for event, drug_name in zip(events.event, events.drug_name):

    # A fresh browser is started for every drug:event pair.
    driver = initiate_driver('/usr/local/bin/chromedriver', 'https://openvigil.pharmacology.uni-kiel.de/openvigilfda.php')
    try:
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # newer Selenium releases no longer provide.
        driver.find_element(By.XPATH, '/html/body/form/ul/ul[2]/li/input').click() #checkbox for DPA as method of data analysis

        driver = search_enter_keys('//input[@name="wdrugname2"]', drug_name) #input: drug name
        driver = search_enter_keys('//input[@name="waevent2"]', event) # input: event

        driver.find_element(By.XPATH, '//input[@name="query"]').click() #query
        sleep(7)

        driver,signal = scrape_signal("/html/body/strong", 'signal', events, event) #write info on signal (unrelated or ADR)

        driver = click_all_buttons("Show") #click all show buttons

        all_tables = find_all_tables(driver,'table') #get all tables on the page
    finally:
        driver.quit() #close the browser even if scraping this pair failed

    table0,table1,table2 = process_tables(all_tables) #get first three tables, clean table3

    PRR,chi,no_reports = get_DPA_indicators(table1,table0) #get DPA indicators from DPA & contingency tables

    text_to_add = text.format(no_reports, PRR, chi) #current DPA indicators - put in text

    table1 = add_interpretation(table1,signal,text_to_add,suffix_related,suffix_unrelated) #add additional info on interpretation of signal to DPA table

    table0 = add_rows_contingency('Groups',table0,formulaPRR,formula_chi) #add remaining explanation info to contingency table

    # Get all additional tables (confounders) by splitting the columns of
    # table2. Splitting leaves one row that holds only the delimiter, which is
    # dropped again.
    confounder_tables = []
    for source_column, sheet_column, delimiter in confounder_specs:
        confounder = clean_data_table2(table2, source_column, sheet_column, delimiter, confounder_text)
        confounder_tables.append(drop_row(confounder, sheet_column, delimiter))

    gender, age, drugs_top, top_prod_names, top_drug_clases, top_events, top_indicat = confounder_tables

    tables = [table0,table1,gender,age,drugs_top,top_prod_names,top_drug_clases,top_events,top_indicat,references]

    #write all to excel with multiple sheets
    write_to_excel(os.path.join(da,'DA_{0}_{1}.xlsx'.format(drug_name,event)),'xlsxwriter',tables,table_names)


#change "ADR" to "Related" to follow related/unrelated convention
events.loc[(events['signal'] == 'ADVERSE DRUG REACTION'), 'signal'] = 'Related'
events['signal'] = events['signal'].str.capitalize() #capitalise unrelated

events.to_excel('events_signals.xlsx', index=False) #save to excel events+drugs+signals
    