# Duplicate & Overlapping Adverse Event Check using GPT

In [1]:
# Import the necessary libraries, modules and functions:
import pandas as pd
import numpy as np
import openai
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity
from datetime import datetime
from getpass import getpass

# Prompt for the OpenAI API key without echoing it to the screen. An explicit prompt is passed because getpass() otherwise asks
# for a generic "Password: ", and surrounding whitespace is stripped because a pasted key often carries a stray space or newline:
openai.api_key = getpass('OpenAI API key: ').strip()

········


In [2]:
# Load the AE dataset as a dataframe, showing only the necessary rows and columns for the check. Every column is read as a
# string so that identifiers, dates and times are not re-typed by pandas on load:
ae_columns = ['CENTRE', 'SUBJECT_ID', 'FORM_OCCURENCE', 'AETERM', 'AESTDAT', 'AESTTIM', 'AEONGO', 'AEENDAT', 'AEENTIM', 'AESEV']
dtypes1 = {column: str for column in ['AEYN'] + ae_columns}

# The path is written with a forward slash: the previous '\A' was an invalid escape sequence (a SyntaxWarning on recent Python
# versions), and a forward slash is accepted as a separator on Windows as well as on other platforms:
df1 = pd.read_csv('Input (EDC Datasets)/AE.csv', dtype = dtypes1)

# Keep only the rows where AEYN is 'Y', then drop AEYN itself and expose the form occurrence number as 'AE#':
df1 = df1[df1['AEYN'] == 'Y'].reset_index(drop = True)
df1 = df1[ae_columns]
df1 = df1.rename(columns = {'FORM_OCCURENCE': 'AE#'})

# Fill all empty AETERM fields with a blank space so the 'get_embedding' function won't throw an error later on:
df1['AETERM'] = df1['AETERM'].fillna(" ")

# Remove seconds from the AE time fields so only hours and minutes remain, by dropping the last three characters (e.g. ':00').
# This may or may not be necessary, depending on the format of the raw datasets you're working with. Blank times end up as
# empty strings:
for time_column in ['AESTTIM', 'AEENTIM']:
    df1[time_column] = df1[time_column].fillna(" ").str[:-3]

# Fill all blank AE start and end dates to their theoretical extremes. Missing data edit checks in the EDC will generally avoid
# the need for this, however blank dates may still occur on occasion. This allows column-wise date comparisons to still compute
# later on without error if blank dates are present:
df1['AESTDAT'] = df1['AESTDAT'].fillna('01/01/0001')
df1['AEENDAT'] = df1['AEENDAT'].fillna('31/12/9999')


def _to_dashed_date(date_str):
    """Return a '/'-separated date string re-joined with '-', each part zero-padded to at least two characters.

    The order of the parts is kept as-is (e.g. '1/2/2023' becomes '01-02-2023'). The earlier version reversed the parts twice,
    which had no effect; if your input is in yyyy/mm/dd order, reverse the split parts once here so the result is dd-mm-yyyy.
    """
    return '-'.join(part.zfill(2) for part in date_str.split('/'))


# Convert the AE date string formats to a format the later date comparison functions will accept. You may need to tweak this
# based on your specific scenario and input dataset:
for date_column in ['AESTDAT', 'AEENDAT']:
    df1[date_column] = df1[date_column].apply(_to_dashed_date)

# Get the LLM vector embeddings for all AE terms. Vector embeddings are numerical representations of the underlying meaning
# (semantics) of natural language. This allows the program to perform computations on free text entries in the AE term field.
# With the code below, only the isolated AE terms are sent to OpenAI, with no other context.
#
# Each distinct term is embedded once and the result is looked up for every row holding that term, so a term that is repeated
# across many AEs costs a single API call instead of one call per row. Rows with the same term share the same embedding object,
# which is fine because the embeddings are only read afterwards:
embedding_by_term = {term: get_embedding(term, engine = 'text-embedding-ada-002') for term in df1['AETERM'].unique()}
df1['AETERM_Embedding'] = df1['AETERM'].map(embedding_by_term.get)

# Join the AE dataframe to itself on Centre and Subject_ID (left join). Every AE is thereby placed side-by-side with every AE
# of the same subject; columns from the left copy get the '_x' suffix and columns from the right copy get '_y':
df2 = df1.merge(df1, how = 'left', on = ['CENTRE', 'SUBJECT_ID'])

# A row whose two AE numbers are equal is an AE paired with itself. Flag those rows and keep only the others:
df2['Self_Match'] = df2['AE#_x'].eq(df2['AE#_y'])
df2 = df2.loc[~df2['Self_Match']].reset_index(drop = True)

# Minimum cosine similarity for two AE terms to be treated as a semantic match. Tweak this threshold to meet your specific needs:
SIMILARITY_THRESHOLD = 0.9

# Calculate the cosine similarity between the two AE term embedding vectors of each pair. This is the cosine of the angle between
# the vectors in n-dimensional space and numerically represents the semantic similarity between the two pieces of text.
#
# The values are built with a list comprehension rather than DataFrame.apply(axis = 1): when no subject has more than one AE,
# df2 has no rows, and apply would then hand back a whole DataFrame that cannot be assigned to a single column. With this form
# an empty df2 simply gets an empty 'Similarity' column and the later 'if not df2.empty' checks take over:
df2['Similarity'] = pd.Series(
    [cosine_similarity(vector_x, vector_y) for vector_x, vector_y in zip(df2['AETERM_Embedding_x'], df2['AETERM_Embedding_y'])],
    index = df2.index,
    dtype = float,
)

# Identify all AE pairs where the cosine similarity is sufficiently high to indicate a semantic match:
df2['Similarity_Match'] = df2['Similarity'] >= SIMILARITY_THRESHOLD

In [3]:
# Helper used to check whether the start and/or end date of a given AE falls within the timeframe of its paired AE:

def is_within_range(start_date_str, end_date_str, date_str):
    """Return True when start_date <= date <= end_date (both ends inclusive), otherwise False.

    All three arguments are date strings in dd-mm-yyyy format (e.g. 01-01-2023). If any of them cannot be parsed in that
    format, the result is False rather than an error.
    """
    try:
        target, range_start, range_end = [
            datetime.strptime(text, "%d-%m-%Y")
            for text in (date_str, start_date_str, end_date_str)
        ]
    except ValueError:
        # At least one of the strings is not a valid dd-mm-yyyy date.
        return False

    return range_start <= target <= range_end

In [4]:
if not df2.empty:

    # For each AE pair, record whether each start/end date lies inside the other AE's start-to-end range, using the
    # 'is_within_range' function. Each entry maps the new flag column to (range start column, range end column, date column).
    # The '1' flags test the dates of the '_x' AE against the '_y' AE's range; the '2' flags test the reverse:
    range_checks = {
        'AESTDAT1_Within_Range': ('AESTDAT_y', 'AEENDAT_y', 'AESTDAT_x'),
        'AEENDAT1_Within_Range': ('AESTDAT_y', 'AEENDAT_y', 'AEENDAT_x'),
        'AESTDAT2_Within_Range': ('AESTDAT_x', 'AEENDAT_x', 'AESTDAT_y'),
        'AEENDAT2_Within_Range': ('AESTDAT_x', 'AEENDAT_x', 'AEENDAT_y'),
    }

    for flag_column, (range_start_column, range_end_column, date_column) in range_checks.items():
        df2[flag_column] = [
            is_within_range(range_start, range_end, date_value)
            for range_start, range_end, date_value in zip(df2[range_start_column], df2[range_end_column], df2[date_column])
        ]

In [5]:
if df2.empty:
    # There are no AE pairs to assess at all:
    df5 = pd.DataFrame({"No Issues Found": []})
else:

    # An AE pair temporally overlaps when at least one of the four within-range flags is True. 'Similar_Overlap' is True only
    # for pairs that semantically match AND temporally overlap, otherwise it is False:
    within_range_flags = ['AESTDAT1_Within_Range', 'AEENDAT1_Within_Range', 'AESTDAT2_Within_Range', 'AEENDAT2_Within_Range']
    df2['Similar_Overlap'] = df2['Similarity_Match'] & df2[within_range_flags].any(axis = 1)

    # Keep only the semantically matching AND temporally overlapping AE pairs:
    df3 = df2[df2['Similar_Overlap']].reset_index(drop = True)

    if df3.empty:
        # Pairs exist, but none of them was flagged:
        df5 = pd.DataFrame({"No Issues Found": []})
    else:

        # Turn the placeholder dates used for blank start/end dates back into empty strings, and blank out missing AEONGO and
        # AESEV values, so these fields simply show as blank on the output:
        placeholder_dates = {'AESTDAT_x': '01-01-0001', 'AEENDAT_x': '31-12-9999'}
        for column_name, placeholder in placeholder_dates.items():
            df3[column_name] = df3[column_name].replace(placeholder, '')
        for column_name in ['AEONGO_x', 'AESEV_x']:
            df3[column_name] = df3[column_name].fillna('')

        # Show only the columns of the '_x' AE of each pair, with the '_x' suffix removed from the column names:
        output_columns = ['CENTRE', 'SUBJECT_ID', 'AE#_x', 'AETERM_x', 'AESTDAT_x', 'AESTTIM_x', 'AEONGO_x', 'AEENDAT_x', 'AEENTIM_x', 'AESEV_x']
        df4 = df3[output_columns].rename(columns = {name: name[:-2] for name in output_columns if name.endswith('_x')})

        # An AE that is flagged against several other AEs appears once per pair; keep a single row per Centre, Subject_ID
        # and AE#:
        df5 = df4.drop_duplicates(subset = ['CENTRE', 'SUBJECT_ID', 'AE#']).reset_index(drop = True)
df5

Unnamed: 0,CENTRE,SUBJECT_ID,AE#,AETERM,AESTDAT,AESTTIM,AEONGO,AEENDAT,AEENTIM,AESEV
0,999,S011,1,Increased BP,15-01-2022,10:00,N,15-01-2022,11:00,Mild (Grade 1)
1,999,S011,2,High Blood Pressure,15-01-2022,16:30,N,15-01-2022,23:00,Mild (Grade 1)
2,999,S011,3,Elevated Blood Pressure,16-01-2022,,N,17-01-2022,,Mild (Grade 1)
3,999,S011,4,Hypertension,17-01-2022,,Y,,,Mild (Grade 1)
4,999,S011,5,High BP,22-03-2023,,N,22-03-2023,,Mild (Grade 1)


In [6]:
# Lastly, export the output to an excel spreadsheet, without the dataframe index and with missing values written as blanks.
# The path is written with a forward slash: the previous '\D' was an invalid escape sequence (a SyntaxWarning on recent Python
# versions), and a forward slash is accepted as a separator on Windows as well as on other platforms:
df5.to_excel('Output/Duplicate & Overlapping AE Check Output.xlsx', sheet_name = 'Duplicate & Overlapping AEs', startrow = 0, startcol = 0, index = False, na_rep = '', header = True)