# Adverse Event vs Concomitant Medication Reconciliation using GPT

In [1]:
# Import the necessary libraries, modules and functions:
import pandas as pd
import numpy as np
import openai
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity
from datetime import datetime
from getpass import getpass

# Prompt to enter the OpenAI API key. getpass() reads the typed value without echoing it, so the key is not shown in the
# notebook output. The key is stored on the openai module, which the get_embedding calls made later on rely on:
openai.api_key = getpass()

········


In [2]:
# Load the AE dataset as a dataframe, reading every column needed for the reconciliation as a string. The path uses a forward
# slash, which works on Windows as well as POSIX systems and avoids the invalid '\A' escape sequence that a backslash would
# introduce in a normal (non-raw) string literal:
dtypes1 = {'AEYN': str, 'CENTRE': str, 'SUBJECT_ID': str, 'FORM_OCCURENCE': str, 'AETERM': str, 'AESTDAT': str, 'AEONGO': str, 'AEENDAT': str, 'AEACNOTH_DRUG': str}
df1 = pd.read_csv('Input (EDC Datasets)/AE.csv', dtype = dtypes1)

# Keep only the rows where an AE was reported (AEYN is 'Y') and other action taken is 'drug' (AEACNOTH_DRUG is '1'), show only
# the columns necessary for the reconciliation, and rename the form occurrence number to 'AE#':
ae_columns = ['CENTRE', 'SUBJECT_ID', 'FORM_OCCURENCE', 'AETERM', 'AESTDAT', 'AEONGO', 'AEENDAT', 'AEACNOTH_DRUG']
df1 = df1[(df1['AEYN'] == 'Y') & (df1['AEACNOTH_DRUG'] == '1')]
df1 = df1[ae_columns].rename(columns = {'FORM_OCCURENCE': 'AE#'}).reset_index(drop = True)
df1['AEACNOTH_DRUG'] = 'Y'

# Fill all empty AETERM fields with a blank space so the 'get_embedding' function won't throw an error later on:
df1['AETERM'] = df1['AETERM'].fillna(" ")

# Fill all blank AE start and end dates with their theoretical extremes, then convert the 'd/m/yyyy' strings to the zero-padded
# 'dd-mm-yyyy' format the later date comparison function accepts. Missing data edit checks in the EDC will generally avoid the
# need for the fill, however blank dates may still occur on occasion. The date parts are kept in their existing
# day-month-year order. You may need to tweak this based on your specific scenario and input dataset:
for date_column, blank_default in (('AESTDAT', '01/01/0001'), ('AEENDAT', '31/12/9999')):
    filled_dates = df1[date_column].fillna(blank_default)
    df1[date_column] = filled_dates.apply(lambda x: '-'.join(part.zfill(2) for part in x.split('/')))

# Get the LLM vector embeddings for all AE terms. Vector embeddings are numerical representations of the underlying meaning
# (semantics) of natural language. This allows the program to perform computations on free text entries in the AE term field.
# With the code below, only a list of isolated AE terms will be sent to OpenAI, with no other context:
df1['AETERM_Embedding'] = df1['AETERM'].apply(lambda x: get_embedding(x, engine = 'text-embedding-ada-002'))

# Load the CM dataset as a dataframe, reading the listed columns as strings. The path uses a forward slash, which works on
# Windows as well as POSIX systems and avoids the invalid '\C' escape sequence that a backslash would introduce in a normal
# (non-raw) string literal:
dtypes2 = {'CENTRE': str, 'SUBJECT_ID': str, 'FORM_OCCURENCE': str, 'CMTRT': str, 'CMINDC': str, 'CMINDC_CAT': str, 'CMONGO': str, 'CMSTDAT_d': str, 'CMSTDAT_m': str, 'CMSTDAT_y': str, 'CMENDAT_d': str, 'CMENDAT_m': str, 'CMENDAT_y': str}
df2 = pd.read_csv('Input (EDC Datasets)/CM.csv', dtype = dtypes2)

# Fill all blank CM start and end date parts with their theoretical extremes (earliest possible start, latest possible end),
# similar to AE above. Passing a column -> value mapping fills each date part column with its own default in one call:
cm_blank_date_defaults = {
    'CMSTDAT_d': '01',
    'CMSTDAT_m': '01',
    'CMSTDAT_y': '0001',
    'CMENDAT_d': '31',
    'CMENDAT_m': '12',
    'CMENDAT_y': '9999',
}
df2 = df2.fillna(value = cm_blank_date_defaults)

# Define and apply a function to pad CM day and month parts with a leading zero if not already present. This ensures the
# concatenated date format will be accepted by the later date comparison function. This may or may not be necessary for your
# specific scenario:
def pad_with_zero(x):
    """Return a single-character string left-padded with a zero to two characters.

    Missing values (as detected by pd.isnull) and strings of any other length
    are returned unchanged.
    """
    needs_padding = not pd.isnull(x) and len(x) == 1
    return x.zfill(2) if needs_padding else x
# Pad the day and month parts of both the CM start and end dates (year parts are left as they are):
for date_part_column in ('CMSTDAT_d', 'CMSTDAT_m', 'CMENDAT_d', 'CMENDAT_m'):
    df2[date_part_column] = df2[date_part_column].apply(pad_with_zero)

# If your CM dataset allows partially unknown dates, the raw EDC codelist value representing each unknown date part is swapped
# for the standard 'UNK' notation, so the unknown date format will be accepted by the later date comparison function. This
# assumes your EDC platform captures the day, month and year parts as separate fields, otherwise equivalent operations can be
# performed on already combined date strings using regex. The table maps each date part suffix to its (raw code, standard code)
# pair; the month entry maps 'UNK' to itself and so leaves the data unchanged unless you adjust the raw code:
unknown_part_codes = {
    '_d': ('UN', 'UNK'),
    '_m': ('UNK', 'UNK'),
    '_y': ('UKUK', 'UNK'),
}
for date_prefix in ('CMSTDAT', 'CMENDAT'):
    for part_suffix, (raw_code, standard_code) in unknown_part_codes.items():
        part_column = date_prefix + part_suffix
        df2[part_column] = df2[part_column].replace(raw_code, standard_code)

# Concatenate the CM day, month and year parts into a single 'dd-mm-yyyy' style date string. This may or may not be necessary,
# depending upon your EDC's method of capturing partially unknown dates. Any combined CMSTDAT/CMENDAT columns already present in
# the export are discarded first so they can be rebuilt from the parts; errors = 'ignore' keeps this step from failing with a
# KeyError when the export does not contain those combined columns:
df2 = df2.drop(columns = ['CMSTDAT', 'CMENDAT'], errors = 'ignore')
for combined_column in ('CMSTDAT', 'CMENDAT'):
    df2[combined_column] = df2[combined_column + '_d'] + '-' + df2[combined_column + '_m'] + '-' + df2[combined_column + '_y']

# Reduce the CM dataframe to the columns needed for the reconciliation and rename the form occurrence number to 'CM#':
cm_columns = ['CENTRE', 'SUBJECT_ID', 'FORM_OCCURENCE', 'CMTRT', 'CMINDC', 'CMINDC_CAT', 'CMSTDAT', 'CMONGO', 'CMENDAT']
df2 = df2[cm_columns].rename(columns = {'FORM_OCCURENCE': 'CM#'})

# Keep only the CMs where the indication category is 'AE', then drop the rows where the indication itself (CMINDC) is empty:
ae_indicated_cms = df2[df2['CMINDC_CAT'] == 'AE']
ae_indicated_cms = ae_indicated_cms.dropna(subset = ['CMINDC'])
df2 = ae_indicated_cms.reset_index(drop = True)

# Get the LLM vector embeddings for all CM indications. Vector embeddings are numerical representations of the underlying
# meaning (semantics) of natural language, which lets the program perform computations on free text entries in the CM
# indication field. Only the isolated CM indication text is passed to 'get_embedding', with no other context:
df2['CMINDC_Embedding'] = df2['CMINDC'].apply(lambda indication: get_embedding(indication, engine = 'text-embedding-ada-002'))

# Left join the AE dataframe (left) to the CM dataframe (right) on Centre and Subject_ID. Every AE is kept, and within each
# subject every AE is paired side-by-side with every CM; AEs whose subject has no CM rows get empty CM columns:
df3 = df1.merge(df2, how = 'left', on = ['CENTRE', 'SUBJECT_ID'])


def _ae_cm_similarity(pair):
    """Return the cosine similarity of one AE-CM pair's embeddings, or 0 when the row has no CM."""
    if pd.isnull(pair['CM#']):
        return 0
    return cosine_similarity(pair['AETERM_Embedding'], pair['CMINDC_Embedding'])


# Calculate the cosine similarity between the AE term and CM indication embedding vectors. This represents the difference in
# angle between the two vectors in n-dimensional space and numerically represents the semantic similarity between the two
# pieces of text:
df3['Similarity'] = df3.apply(_ae_cm_similarity, axis = 1)

# Flag all AE-CM pairs where the cosine similarity is sufficiently high to indicate a semantic match. Tweak the similarity
# threshold to meet your specific needs:
similarity_threshold = 0.95
df3['Similarity_Match'] = df3['Similarity'].ge(similarity_threshold)

In [3]:
# Define a function to check if a date, which may be partially unknown, falls within a specified known date range. This function
# is used to check if both the CM start and end dates fall within the timeframe of each AE. The range bounds must be fully known
# dates in dd-mm-yyyy (e.g. 01-01-2023) or dd-Mon-yyyy (e.g. 01-Jan-2023) format, while the date being checked must be in
# dd-mm-yyyy format with any unknown part written as 'UNK' (e.g. UNK-01-2023). For each unknown part, the function expands it to
# every value it could take (days 1-31, months 1-12, years spanning the range); known parts are kept as they are. It then
# iterates over all combinations of day, month and year in nested loops, building a datetime object from each combination
# (impossible calendar dates are skipped) and evaluating: start_date <= date <= end_date. As soon as one possible date is within
# range, the function returns True; if the loops finish without finding one, it returns False:

# Formats accepted for the (fully known) range bounds: numeric month or abbreviated month name.
_BOUND_DATE_FORMATS = ('%d-%m-%Y', '%d-%b-%Y')


def _parse_bound_date(date_text, label):
    """Parse a fully known range bound, trying each accepted format in turn.

    'label' ('start' or 'end') is only used to build the error message.
    Raises ValueError if the text matches none of the accepted formats.
    """
    for date_format in _BOUND_DATE_FORMATS:
        try:
            return datetime.strptime(date_text, date_format)
        except ValueError:
            continue
    raise ValueError(f"Invalid {label} date format: {date_text}")


def _month_candidates(month_part):
    """Return the month numbers a month part can stand for.

    'UNK' expands to all twelve months. A numeric part yields that single
    month. An abbreviated month name (e.g. 'Jan') is also accepted, so the
    checked date can use the same dd-Mon-yyyy style as the range bounds.
    Raises ValueError for anything else.
    """
    if month_part == 'UNK':
        return range(1, 13)
    try:
        return [int(month_part)]
    except ValueError:
        return [datetime.strptime(month_part, '%b').month]


def is_within_range(start_date_str, end_date_str, date_str):
    """Return True if a possibly partially unknown date can fall within a known date range.

    Parameters:
        start_date_str: lower bound of the range, as 'dd-mm-yyyy' or 'dd-Mon-yyyy'.
        end_date_str: upper bound of the range, in the same formats.
        date_str: the date to check, as day-month-year separated by '-', where
            any part may be 'UNK'. An unknown day is tried as 1-31, an unknown
            month as 1-12 and an unknown year as every year from the start
            year to the end year (inclusive).

    Returns:
        True as soon as one candidate date satisfies
        start_date <= date <= end_date (both bounds inclusive), otherwise False.
        Candidate combinations that are not real calendar dates (e.g. 31 February)
        are skipped, so a fully known but impossible date yields False.

    Raises:
        ValueError: if a bound matches neither accepted format, or if a known
            part of date_str cannot be interpreted.
    """
    start_date = _parse_bound_date(start_date_str, 'start')
    end_date = _parse_bound_date(end_date_str, 'end')

    date_parts = date_str.split('-')
    day_range = range(1, 32) if date_parts[0] == 'UNK' else [int(date_parts[0])]
    month_range = _month_candidates(date_parts[1])
    year_range = range(start_date.year, end_date.year + 1) if date_parts[2] == 'UNK' else [int(date_parts[2])]

    for day in day_range:
        for month in month_range:
            for year in year_range:
                # Only the construction can fail (impossible calendar date), so only it is guarded.
                try:
                    candidate = datetime(day=day, month=month, year=year)
                except ValueError:
                    continue
                if start_date <= candidate <= end_date:
                    return True

    return False

In [4]:
# After the left join, AEs without any CM have 'NaN' CM dates. Replace those with a dummy date so that the 'is_within_range'
# function won't throw an error when it is applied to every row:
for cm_date_column in ('CMSTDAT', 'CMENDAT'):
    df3[cm_date_column] = df3[cm_date_column].fillna('01-01-0001')

if not df3.empty:

    # For each CM date, determine if it is within (or possible to be within) the timeframe of the AE on the same row by
    # applying the 'is_within_range' function. The result is stored in the matching '<date>_Within_Range' column:
    range_flag_columns = {'CMSTDAT': 'CMSTDAT_Within_Range', 'CMENDAT': 'CMENDAT_Within_Range'}
    for cm_date_column, flag_column in range_flag_columns.items():
        df3[flag_column] = df3.apply(lambda pair, checked = cm_date_column: is_within_range(pair['AESTDAT'], pair['AEENDAT'], pair[checked]), axis=1)

In [5]:
# Columns that together identify a single AE within a subject:
ae_key_columns = ['CENTRE', 'SUBJECT_ID', 'AE#', 'AETERM']

# Default output, used when there is nothing to reconcile or every AE has a fully reconciling CM:
df5 = pd.DataFrame({"No Issues Found": []})

if not df3.empty:

    # Determine if the AE term and CM indication semantically match AND the CM start date is within the AE timeframe AND the CM
    # end date is within the AE timeframe. If all of these conditions are met, then the 'Entire_Match' column value is True,
    # otherwise it is False:
    df3['Entire_Match'] = df3['Similarity_Match'] & df3['CMSTDAT_Within_Range'] & df3['CMENDAT_Within_Range']

    # Determine if each Subject-AE grouping has a fully reconciling CM present. The built-in 'any' aggregation is broadcast
    # back to every row of the group: True if at least one AE-CM pair in the group is an entire match, otherwise False:
    df3['Entire_Match_Grouped'] = df3.groupby(ae_key_columns)['Entire_Match'].transform('any')

    # Filter the dataframe so that it contains only Subject-AEs that don't have a fully reconciling CM present. An explicit
    # comparison with False is used (rather than '~') so rows are only kept when the flag is exactly False:
    df4 = df3[df3['Entire_Match_Grouped'].eq(False)].reset_index(drop = True)

    # Determine if the Subject-AEs that don't have a fully reconciling CM present, have at least one semantically matching CM
    # indication present (i.e. partial reconciliation of indication, but not dates). If a partially reconciling CM is present,
    # then the 'Similarity_Match_Grouped' column value is True, otherwise it is False:
    df4['Similarity_Match_Grouped'] = df4.groupby(ae_key_columns)['Similarity_Match'].transform('any')

    if not df4.empty:

        # Create a new column called 'ISSUE_DESCRIPTION' containing a descriptive message for the check's output which depends
        # upon the value in the 'Similarity_Match_Grouped' column:
        issue_descriptions = {
            True: "Matching indication ('Category' = 'AE') found in CM, but dates inconsistent",
            False: "Matching indication ('Category' = 'AE') not found in CM",
        }
        df4['ISSUE_DESCRIPTION'] = df4['Similarity_Match_Grouped'].map(issue_descriptions)

        # Show only the unique Subject-AEs for which there are flagged reconciliation issues, one row per AE, sorted by the
        # output columns from left to right:
        output_columns = ae_key_columns + ['AEACNOTH_DRUG', 'ISSUE_DESCRIPTION']
        df5 = df4[output_columns].drop_duplicates().sort_values(output_columns).reset_index(drop = True)

df5

Unnamed: 0,CENTRE,SUBJECT_ID,AE#,AETERM,AEACNOTH_DRUG,ISSUE_DESCRIPTION
0,999,S001,1,Headache,Y,Matching indication ('Category' = 'AE') found ...
1,999,S001,3,Fractured Tibia,Y,Matching indication ('Category' = 'AE') not fo...
2,999,S001,5,Transaminitis,Y,Matching indication ('Category' = 'AE') not fo...
3,999,S002,2,Low mood,Y,Matching indication ('Category' = 'AE') not fo...
4,999,S002,3,Visual Disturbance,Y,Matching indication ('Category' = 'AE') not fo...
5,999,S002,4,Skin cancer,Y,Matching indication ('Category' = 'AE') found ...
6,999,S003,1,Myocardial Infaction,Y,Matching indication ('Category' = 'AE') not fo...
7,999,S003,3,Weight Gain,Y,Matching indication ('Category' = 'AE') found ...
8,999,S003,5,Hyperglycemia,Y,Matching indication ('Category' = 'AE') found ...
9,999,S006,1,Excessive sweating,Y,Matching indication ('Category' = 'AE') not fo...


In [6]:
# Lastly, export the output to an excel spreadsheet, without the dataframe index. The path uses a forward slash, which works on
# Windows as well as POSIX systems and avoids the invalid '\A' escape sequence that a backslash would introduce in a normal
# (non-raw) string literal. The 'Output' folder must already exist:
df5.to_excel('Output/AE vs CM Reconciliation Output.xlsx', sheet_name = 'AE vs CM Reconciliation', index = False)