# Concomitant Procedures vs Adverse Event Reconciliation using GPT

In [1]:
# Import the necessary libraries, modules and functions:
import pandas as pd
import numpy as np
import openai
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity
from datetime import datetime
from getpass import getpass

# Prompt to enter the OpenAI API key. 'getpass' reads the key interactively without echoing it, so the key is not written into
# the notebook source. The key is stored on the 'openai' module, where the later 'get_embedding' calls presumably pick it up:
openai.api_key = getpass()

········


In [2]:
# Load the PR dataset as a dataframe, keeping only the columns needed for the reconciliation. All listed columns are read as
# strings. The path is written as a raw string so the Windows-style backslash is taken literally instead of starting an
# (invalid) escape sequence:
dtypes1 = {'CENTRE': str, 'SUBJECT_ID': str, 'FORM_OCCURENCE': str, 'PRTRT': str, 'PRINDC': str, 'PRINDC_CAT': str, 'PRDAT': str}
df1 = pd.read_csv(r'Input (EDC Datasets)\PR.csv', dtype = dtypes1)
df1 = df1[['CENTRE', 'SUBJECT_ID', 'FORM_OCCURENCE', 'PRTRT', 'PRINDC', 'PRINDC_CAT', 'PRDAT']]
df1 = df1.rename(columns = {'FORM_OCCURENCE': 'PR#'})

# Filter the PR dataframe so that it only contains PRs where the indication category is 'AE' and fill all empty PRINDC fields
# with a blank space so the 'get_embedding' function won't throw an error later on:
df1 = df1[df1['PRINDC_CAT'] == 'AE'].reset_index(drop = True)
df1['PRINDC'] = df1['PRINDC'].fillna(" ")

# Fill all blank PR dates with a dummy date so that the later date comparison functions won't throw an error:
df1['PRDAT'] = df1['PRDAT'].fillna('01/01/0001')

# Convert the PR date strings from slash-separated (e.g. 1/2/2023) to the zero-padded, dash-separated form (01-02-2023) that the
# later date comparison function parses. The order of the date parts is left unchanged. You may need to tweak this based on your
# specific scenario and input dataset:
df1['PRDAT'] = df1['PRDAT'].apply(lambda x: '-'.join([i.zfill(2) for i in x.split('/')]))

# Get the LLM vector embeddings for all PR indications. Vector embeddings are numerical representations of the underlying
# meaning (semantics) of natural language. This allows the program to perform computations on free text entries in the PR
# indication field. With the code below, only the isolated PR indication text is passed to 'get_embedding', one call per row,
# with no other context:
df1['PRINDC_Embedding'] = df1['PRINDC'].apply(lambda x: get_embedding(x, engine = 'text-embedding-ada-002'))

# Load the AE dataset as a dataframe, keeping only the columns needed for the reconciliation. All listed columns are read as
# strings. The path is written as a raw string so the Windows-style backslash is taken literally instead of starting an
# (invalid) escape sequence:
dtypes2 = {'CENTRE': str, 'SUBJECT_ID': str, 'FORM_OCCURENCE': str, 'AETERM': str, 'AESTDAT': str, 'AEONGO': str, 'AEENDAT': str, 'AEACNOTH_PROC': str}
df2 = pd.read_csv(r'Input (EDC Datasets)\AE.csv', dtype = dtypes2)
df2 = df2[['CENTRE', 'SUBJECT_ID', 'FORM_OCCURENCE', 'AETERM', 'AESTDAT', 'AEONGO', 'AEENDAT', 'AEACNOTH_PROC']]
df2 = df2.rename(columns = {'FORM_OCCURENCE': 'AE#'})

# Filter the AE dataframe so that it only contains AEs where other action taken is 'procedure' (coded as '1') and drop rows
# where the AETERM is empty. The flag is then relabelled as 'Y':
df2 = df2[df2['AEACNOTH_PROC'] == '1'].dropna(subset = ['AETERM']).reset_index(drop = True)
df2['AEACNOTH_PROC'] = 'Y'

# Fill all blank AE start and end dates to their theoretical extremes. Missing data edit checks in the EDC will generally avoid
# the need for this, however blank dates may still occur on occasion. This allows column-wise date comparisons to still compute
# later on without error if blank dates are present:
df2['AESTDAT'] = df2['AESTDAT'].fillna('01/01/0001')
df2['AEENDAT'] = df2['AEENDAT'].fillna('31/12/9999')

# Convert the AE date strings from slash-separated (e.g. 1/2/2023) to the zero-padded, dash-separated form (01-02-2023) that the
# later date comparison function parses. The order of the date parts is left unchanged. You may need to tweak this based on your
# specific scenario and input dataset formats:
df2['AESTDAT'] = df2['AESTDAT'].apply(lambda x: '-'.join([i.zfill(2) for i in x.split('/')]))
df2['AEENDAT'] = df2['AEENDAT'].apply(lambda x: '-'.join([i.zfill(2) for i in x.split('/')]))

# Get the LLM vector embeddings for all AE terms. Vector embeddings are numerical representations of the underlying meaning
# (semantics) of natural language. This allows the program to perform computations on free text entries in the AE term field.
# With the code below, only the isolated AE term text is passed to 'get_embedding', one call per row, with no other context:
df2['AETERM_Embedding'] = df2['AETERM'].apply(lambda x: get_embedding(x, engine = 'text-embedding-ada-002'))

# Pair every PR with every AE recorded for the same subject by left-joining the PR dataframe (left) to the AE dataframe (right)
# on Centre and Subject_ID. Because it is a left join, PRs whose subject has no qualifying AE are kept, with empty AE columns:
df3 = df1.merge(df2, how = 'left', on = ['CENTRE', 'SUBJECT_ID'])

# Score each PR-AE pair with the cosine similarity of the PR indication and AE term embedding vectors, a numeric measure of how
# semantically close the two pieces of text are. Rows without an AE (empty 'AE#') are given a score of 0:
df3['Similarity'] = df3.apply(
    lambda row: 0 if pd.isnull(row['AE#']) else cosine_similarity(row['PRINDC_Embedding'], row['AETERM_Embedding']),
    axis = 1,
)

# Flag the PR-AE pairs whose score is high enough to count as a semantic match. Tweak the similarity threshold to meet your
# specific needs:
df3['Similarity_Match'] = df3['Similarity'].ge(0.95)

In [3]:
# Define a function to check if a known date falls within a specified known date range. This function is used to check if the
# PR date falls within the timeframe of each AE.

def is_within_range(start_date_str: str, end_date_str: str, date_str: str, date_format: str = "%d-%m-%Y") -> bool:
    """Return True when ``date_str`` lies within the inclusive range ``start_date_str`` to ``end_date_str``.

    The check performed is ``start_date <= date <= end_date``.

    Args:
        start_date_str: First day of the range, formatted according to ``date_format``.
        end_date_str: Last day of the range, formatted according to ``date_format``.
        date_str: The date to test, formatted according to ``date_format``.
        date_format: ``datetime.strptime`` format shared by all three inputs. Defaults to
            dd-mm-yyyy (e.g. 01-01-2023).

    Returns:
        True if the date is inside the range (boundaries included). False if it is outside
        the range, or if any input cannot be parsed with ``date_format`` (including
        non-string values such as None or NaN).
    """
    try:
        date = datetime.strptime(date_str, date_format)
        start_date = datetime.strptime(start_date_str, date_format)
        end_date = datetime.strptime(end_date_str, date_format)
    except (TypeError, ValueError):
        # ValueError: the string does not match the format; TypeError: the value is not a string at all (e.g. a missing
        # value). Both are treated as "not within range" rather than aborting a row-wise apply.
        return False

    return start_date <= date <= end_date

In [4]:
# PRs that found no AE in the merge have 'NaN' in the AE date columns. Replace those with a dummy date so that every value
# handed to 'is_within_range' is a string and the function won't throw an error:
for ae_date_column in ('AESTDAT', 'AEENDAT'):
    df3[ae_date_column] = df3[ae_date_column].fillna('01-01-0001')

if not df3.empty:

    # For every PR-AE pair, record whether the PR date lies between the AE start and end dates (inclusive) by applying the
    # 'is_within_range' function row by row:
    df3['PRDAT_Within_Range'] = df3.apply(
        lambda row: is_within_range(
            start_date_str = row['AESTDAT'],
            end_date_str = row['AEENDAT'],
            date_str = row['PRDAT'],
        ),
        axis = 1,
    )

In [5]:
# Columns that together identify a single PR record within a subject. Every groupby below uses 'dropna = False' so that a PR
# with a blank value in one of these columns (e.g. a missing PRTRT) still forms its own group. With the pandas default
# ('dropna = True') such a PR would be left out of the groups and silently disappear from the issue report:
pr_key_columns = ['CENTRE', 'SUBJECT_ID', 'PR#', 'PRTRT', 'PRINDC']

# Default output, used when there are no PR-AE rows at all or when every PR has a fully reconciling AE:
df5 = pd.DataFrame({"No Issues Found": []})

if not df3.empty:

    # Determine if the PR indication and AE term semantically match AND the PR date is within the AE timeframe. If both of these
    # conditions are met, then the 'Entire_Match' column value is True, otherwise it is False:
    df3['Entire_Match'] = df3['Similarity_Match'] & df3['PRDAT_Within_Range']

    # Determine if each Subject-PR grouping has a fully reconciling AE present. If a reconciling AE is present, then the
    # 'Entire_Match_Grouped' column value is True on every row of that PR, otherwise it is False:
    df3['Entire_Match_Grouped'] = df3.groupby(pr_key_columns, dropna = False)['Entire_Match'].transform('any')

    # Filter the dataframe so that it contains only Subject-PRs that don't have a fully reconciling AE present:
    df4 = df3[~df3['Entire_Match_Grouped']].reset_index(drop = True)

    # Determine if the Subject-PRs that don't have a fully reconciling AE present, have at least one semantically matching AE
    # term present (i.e. partial reconciliation of AE term, but not dates). If a partially reconciling AE is present, then the
    # 'Similarity_Match_Grouped' column value is True, otherwise it is False:
    df4['Similarity_Match_Grouped'] = df4.groupby(pr_key_columns, dropna = False)['Similarity_Match'].transform('any')

    if not df4.empty:

        # Create a new column called 'ISSUE_DESCRIPTION' containing a descriptive message for the check's output which depends
        # upon the value in the 'Similarity_Match_Grouped' column:
        df4['ISSUE_DESCRIPTION'] = df4['Similarity_Match_Grouped'].map({
            True: "Matching event term ('Action Taken' = 'Procedure') found in AE, but dates inconsistent",
            False: "Matching event term ('Action Taken' = 'Procedure') not found in AE",
        })

        # Show only the unique Subject-PRs for which there are flagged reconciliation issues. The 'AE#' count is only a
        # placeholder aggregation used to collapse each group to one row, so it is dropped again straight away:
        df5 = df4.groupby(pr_key_columns + ['PRINDC_CAT', 'ISSUE_DESCRIPTION'], as_index = False, dropna = False).agg({'AE#': 'count'}).drop('AE#', axis = 1)

df5

Unnamed: 0,CENTRE,SUBJECT_ID,PR#,PRTRT,PRINDC,PRINDC_CAT,ISSUE_DESCRIPTION
0,999,S007,1,Valvuloplasty,Heart disease,AE,Matching event term ('Action Taken' = 'Procedu...


In [6]:
# Lastly, export the output to an excel spreadsheet. The path is written as a raw string so the Windows-style backslash is
# taken literally instead of starting an (invalid) escape sequence:
df5.to_excel(
    r'Output\PR vs AE Reconciliation Output.xlsx',
    sheet_name = 'PR vs AE Reconciliation',
    startrow = 0,
    startcol = 0,
    index = False,
    na_rep = '',
    header = True,
)