# Duplicate & Overlapping Concomitant Medication Check using GPT

In [14]:
# Import the necessary libraries, modules and functions:
import pandas as pd
import numpy as np
import openai
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity
from datetime import datetime
from getpass import getpass

# Prompt for the OpenAI API key. getpass() reads the key from a hidden prompt, so it is not echoed to the screen, and the
# value is stored on the openai module (presumably where the later embedding calls pick it up):
openai.api_key = getpass()

········


In [15]:
# Load the CM dataset as a dataframe. Every column listed here is read as text so its values are kept exactly as exported
# rather than being converted to numbers:
dtypes1 = {
    'CENTRE': str,
    'SUBJECT_ID': str,
    'FORM_OCCURENCE': str,
    'CMTRT': str,
    'CMINDC': str,
    'CMSTDAT_d': str,
    'CMSTDAT_m': str,
    'CMSTDAT_y': str,
    'CMONGO': str,
    'CMENDAT_d': str,
    'CMENDAT_m': str,
    'CMENDAT_y': str,
    'CMDSTXT': str,
    'CMDOSFRM': str,
    'CMDOSFRQ': str,
    'CMROUTE': str,
}

# A forward slash is used as the path separator: it is accepted on Windows as well as on other platforms, and it avoids the
# invalid '\C' escape sequence that a backslash produces inside a normal (non-raw) string literal:
df1 = pd.read_csv('Input (EDC Datasets)/CM.csv', dtype=dtypes1)
df1 = df1.rename(columns={'FORM_OCCURENCE': 'CM#'})

# Fill all empty CMTRT fields with a blank space so the 'get_embedding' function won't throw an error later on:
df1['CMTRT'] = df1['CMTRT'].fillna(" ")

# Fill all blank CM start and end date parts to their theoretical extremes (earliest possible start, latest possible end).
# Missing data edit checks in the EDC will generally avoid the need for this, however blank dates may still occur on occasion.
# This allows column-wise date comparisons to still compute later on without error if blank dates are present.
# NOTE(review): a blank end day is filled with '31' whatever the month is, so a blank day combined with a known 30-day month or
# February would give an impossible date string - confirm the EDC edit checks rule this combination out.
blank_date_part_defaults = {
    'CMSTDAT_d': '01',
    'CMSTDAT_m': '01',
    'CMSTDAT_y': '0001',
    'CMENDAT_d': '31',
    'CMENDAT_m': '12',
    'CMENDAT_y': '9999',
}
for date_part_column, default_value in blank_date_part_defaults.items():
    df1[date_part_column] = df1[date_part_column].fillna(default_value)

# Define and apply a function to pad CM day and month parts with a leading zero if not already present. This ensures the
# concatenated date format will be accepted by the later date comparison function. This may or may not be necessary for your
# specific scenario:
def pad_with_zero(x):
    """Return a one-character date part left-padded with a zero.

    Null values, empty strings and strings of two or more characters are
    returned unchanged.
    """
    is_single_character = not pd.isnull(x) and len(x) == 1
    return x.zfill(2) if is_single_character else x
# Pad the day and month parts of both dates to two characters:
for date_part_column in ('CMSTDAT_d', 'CMSTDAT_m', 'CMENDAT_d', 'CMENDAT_m'):
    df1[date_part_column] = df1[date_part_column].apply(pad_with_zero)

# If your CM dataset allows partially unknown dates, replace the raw EDC codelist values representing the unknown date parts
# with standard 'UNK' notation. This ensures the unknown date format will be accepted by the later date comparison function.
# This assumes your EDC platform captures the day, month and year parts as separate fields, otherwise equivalent operations can
# be performed on already combined date strings using regex. The month code is already 'UNK' here, so its entry is a no-op
# that is kept only as a placeholder to edit if your codelist differs:
raw_unknown_codes = {'_d': 'UN', '_m': 'UNK', '_y': 'UKUK'}
for date_field in ('CMSTDAT', 'CMENDAT'):
    for part_suffix, raw_code in raw_unknown_codes.items():
        part_column = date_field + part_suffix
        df1[part_column] = df1[part_column].replace(raw_code, 'UNK')

# Concatenate the CM day, month and year parts into a single dd-mm-yyyy date string. This may or may not be necessary,
# depending upon your EDC's method of capturing partially unknown dates. Any pre-combined CMSTDAT/CMENDAT columns are dropped
# first; errors='ignore' lets this step run on extracts that do not contain them:
df1 = df1.drop(columns=['CMSTDAT', 'CMENDAT'], errors='ignore')
for date_field in ('CMSTDAT', 'CMENDAT'):
    df1[date_field] = df1[date_field + '_d'] + '-' + df1[date_field + '_m'] + '-' + df1[date_field + '_y']

# Replace all instances of CMDOSU, CMDOSFRM, CMDOSFRQ, and CMROUTE = 'OTHER' with the respective values found in the CMDOSU_OTH,
# CMDOSFRM_OTH, CMDOSFRQ_OTH, and CMROUTE_OTH fields. This will simplify the output:
for coded_column in ('CMDOSU', 'CMDOSFRM', 'CMDOSFRQ', 'CMROUTE'):
    is_other = df1[coded_column] == 'OTHER'
    df1.loc[is_other, coded_column] = df1.loc[is_other, coded_column + '_OTH']

# Append the dose unit to the dose value so they are conveniently displayed together. String concatenation with a missing unit
# yields a missing result, which would wipe out the dose value itself, so rows without a unit keep the dose value on its own:
dose_with_unit = df1['CMDSTXT'] + " " + df1['CMDOSU']
df1['CMDSTXT'] = dose_with_unit.where(df1['CMDOSU'].notna(), df1['CMDSTXT'])

# Keep only the columns needed for the check. An explicit copy is taken so that columns added to df1 afterwards are written to
# an independent dataframe rather than to a slice of the wider one:
df1 = df1[['CENTRE', 'SUBJECT_ID', 'CM#', 'CMTRT', 'CMINDC', 'CMSTDAT', 'CMONGO', 'CMENDAT', 'CMDSTXT', 'CMDOSFRM', 'CMDOSFRQ', 'CMROUTE']].copy()

# Embedding model used for the CM names, and the minimum cosine similarity at which two CM names are treated as a semantic
# match. Tweak the similarity threshold to meet your specific needs:
EMBEDDING_ENGINE = 'text-embedding-ada-002'
SIMILARITY_THRESHOLD = 0.9

# Get the LLM vector embeddings for all CM names (CMTRTs). Vector embeddings are numerical representations of the underlying
# meaning (semantics) of natural language. This allows the program to perform computations on free text entries in the CMTRT
# field. With the code below, only a list of isolated CM names will be sent to OpenAI, with no other context. Each distinct
# name is embedded once and the result is reused for every row carrying that name, so repeated names do not trigger
# repeated requests:
embedding_by_name = {
    cm_name: get_embedding(cm_name, engine=EMBEDDING_ENGINE)
    for cm_name in df1['CMTRT'].unique()
}
df1['CMTRT_Embedding'] = df1['CMTRT'].apply(embedding_by_name.get)

# Left join the CM dataframe to itself, using Centre and Subject_ID as keys. This provides all pairwise combinations of a
# subject's CMs side-by-side (columns of the first CM are suffixed _x, those of the second _y):
df2 = pd.merge(df1, df1, on=['CENTRE', 'SUBJECT_ID'], how='left')

# Identify all self-matched CMs and filter these rows out of the dataframe:
df2['Self_Match'] = df2['CM#_x'] == df2['CM#_y']
df2 = df2[~df2['Self_Match']].reset_index(drop=True)

# Calculate the cosine similarity between the CM name (CMTRT) embedding vectors. This represents the difference in angle between
# the CM name embedding vectors in n-dimensional space and numerically represents the semantic similarity between the two pieces
# of text. The values are built as an explicit float Series so that this step also works when df2 has no rows (for example when
# no subject has more than one CM), matching the empty-dataframe handling of the later steps:
df2['Similarity'] = pd.Series(
    [
        cosine_similarity(embedding_x, embedding_y)
        for embedding_x, embedding_y in zip(df2['CMTRT_Embedding_x'], df2['CMTRT_Embedding_y'])
    ],
    index=df2.index,
    dtype=float,
)

# Identify all CM pairs where the cosine similarity is sufficiently high to indicate a semantic match:
df2['Similarity_Match'] = df2['Similarity'] >= SIMILARITY_THRESHOLD

In [16]:
# Define a function to check if a date, which may be partially unknown, falls within a specified known date range. This function
# is used to check if the start and/or end dates of a given CM fall within the timeframe of its paired CM. It requires the input
# date strings to be in dd-mm-yyyy format (e.g. 01-01-2023). It first determines if the input date_str has any unknown parts,
# and then varies the unknown parts to encompass all possible values they could take, otherwise it keeps just the known part.
# The function then consecutively iterates over all combinations of day, month and year part ranges in a nested manner, combining
# them into single datetime objects which are then evaluated as either True or False according to: start_date <= date <= end_date.
# If at least one of the possible dates yields a True result (i.e. is within range), then the loop stops and the output of the
# function is True, otherwise the function continues until it reaches the end of its nested loops, where it will output False:

def is_within_range(start_date_str, end_date_str, date_str):
    """Tell whether a possibly partially unknown date can fall inside a known range.

    Args:
        start_date_str: Lower bound of the range, as dd-mm-yyyy or dd-Mon-yyyy.
        end_date_str: Upper bound of the range, in the same formats.
        date_str: Date to test, as three '-' separated parts (day, month,
            year). Any part may be the literal 'UNK'; the other parts must
            be integers.

    Returns:
        True if at least one concrete date consistent with date_str satisfies
        start_date <= date <= end_date, otherwise False.

    Raises:
        ValueError: If either bound matches none of the accepted formats, or
            if a known part of date_str is not an integer.
    """
    accepted_formats = ('%d-%m-%Y', '%d-%b-%Y')

    def _parse_bound(text):
        # Try each accepted format in turn; None signals that none matched.
        for candidate_format in accepted_formats:
            try:
                return datetime.strptime(text, candidate_format)
            except ValueError:
                continue
        return None

    start_date = _parse_bound(start_date_str)
    if start_date is None:
        raise ValueError(f"Invalid start date format: {start_date_str}")

    end_date = _parse_bound(end_date_str)
    if end_date is None:
        raise ValueError(f"Invalid end date format: {end_date_str}")

    # An unknown part ranges over every value it could take; a known part is
    # a single value. An unknown year is limited to the years of the range.
    parts = date_str.split('-')
    candidate_days = range(1, 32) if parts[0] == 'UNK' else [int(parts[0])]
    candidate_months = range(1, 13) if parts[1] == 'UNK' else [int(parts[1])]
    if parts[2] == 'UNK':
        candidate_years = range(start_date.year, end_date.year + 1)
    else:
        candidate_years = [int(parts[2])]

    for day in candidate_days:
        for month in candidate_months:
            for year in candidate_years:
                try:
                    candidate = datetime(year=year, month=month, day=day)
                except ValueError:
                    # Impossible calendar combination (e.g. 31 February).
                    continue
                if start_date <= candidate <= end_date:
                    return True

    return False

In [17]:
# For each CM in the pair (x, y), set all unknown CM date parts to their lowermost or uppermost possible extremes, depending on
# whether they represent a start date or end date. The results are stored in new '_reg' columns; the original date strings
# (which may still contain 'UNK') are left untouched.

def _latest_february_day(match):
    """Return 'dd-02-yyyy' with dd set to the last day of February of the matched year."""
    year_text = match.group(1)
    year = int(year_text)
    is_leap_year = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    return f"{29 if is_leap_year else 28}-02-{year_text}"

for cm_side in ('x', 'y'):

    # Start dates: every unknown part takes its earliest possible value (day 01, month 01, year 0001):
    df2[f'CMSTDAT_{cm_side}_reg'] = (
        df2[f'CMSTDAT_{cm_side}']
        .str.replace('^UNK-', '01-', regex=True)
        .str.replace('-UNK-', '-01-', regex=False)
        .str.replace('-UNK$', '-0001', regex=True)
    )

    # End dates: every unknown part takes its latest possible value. The month and year are resolved first (month 12, year
    # 9999) because the latest possible day depends on them: February ends on the 29th in leap years and the 28th otherwise,
    # and the remaining months end on the 30th or 31st. The plain '28' rule after the leap-year rule only applies to values
    # whose year is not purely numeric:
    df2[f'CMENDAT_{cm_side}_reg'] = (
        df2[f'CMENDAT_{cm_side}']
        .str.replace('^UNK-UNK-', '31-12-', regex=True)
        .str.replace('-UNK-', '-12-', regex=False)
        .str.replace('-UNK$', '-9999', regex=True)
        .str.replace(r'^UNK-02-(\d+)$', _latest_february_day, regex=True)
        .str.replace(r'^(UNK-)(02-)', r'28-\2', regex=True)
        .str.replace(r'^(UNK-)(04-|06-|09-|11-)', r'30-\2', regex=True)
        .str.replace(r'^(UNK-)(01-|03-|05-|07-|08-|10-|12-)', r'31-\2', regex=True)
    )

In [18]:
if not df2.empty:

    # Identify all CM pairs that temporally overlap with one another by applying the 'is_within_range' function. Each entry is
    # (flag column to create, CM whose regularised start/end dates form the range, date column being tested). Flags numbered 1
    # test the dates of CM x against the range of CM y; flags numbered 2 test the dates of CM y against the range of CM x:
    range_checks = [
        ('CMSTDAT1_Within_Range', 'y', 'CMSTDAT_x'),
        ('CMENDAT1_Within_Range', 'y', 'CMENDAT_x'),
        ('CMSTDAT2_Within_Range', 'x', 'CMSTDAT_y'),
        ('CMENDAT2_Within_Range', 'x', 'CMENDAT_y'),
    ]
    for flag_column, range_side, tested_column in range_checks:
        range_start_column = f'CMSTDAT_{range_side}_reg'
        range_end_column = f'CMENDAT_{range_side}_reg'
        df2[flag_column] = df2.apply(
            lambda pair: is_within_range(pair[range_start_column], pair[range_end_column], pair[tested_column]),
            axis = 1,
        )

    # Convert all x and y regularised date strings into date objects to allow for their ordinal comparison below:
    for reg_column in ('CMSTDAT_x_reg', 'CMENDAT_x_reg', 'CMSTDAT_y_reg', 'CMENDAT_y_reg'):
        df2[reg_column] = df2[reg_column].apply(lambda date_text: datetime.strptime(date_text, '%d-%m-%Y').date())

    # The 'is_within_range' function lets the unknown parts of the tested date take every possible value, but it looks at a
    # start date or an end date in isolation. It can therefore report a date as within range by choosing values that would
    # place a CM's end before its own start (or its start after its own end), which is not possible in reality. Whenever the
    # latest possible end of one CM is before the earliest possible start of the other, the flags that could have been raised
    # this way are reset to False:
    x_ends_before_y_starts = df2['CMENDAT_x_reg'] < df2['CMSTDAT_y_reg']
    y_ends_before_x_starts = df2['CMENDAT_y_reg'] < df2['CMSTDAT_x_reg']

    df2.loc[x_ends_before_y_starts, 'CMSTDAT1_Within_Range'] = False
    df2.loc[y_ends_before_x_starts, 'CMSTDAT2_Within_Range'] = False
    df2.loc[y_ends_before_x_starts, 'CMENDAT1_Within_Range'] = False
    df2.loc[x_ends_before_y_starts, 'CMENDAT2_Within_Range'] = False

In [19]:
# Start from the 'nothing to report' output; it is only replaced if at least one flagged CM is found below:
df5 = pd.DataFrame({"No Issues Found": []})

if not df2.empty:

    # Identify all CM pairs that semantically match AND temporally overlap with one another. A pair overlaps when a date of
    # CM x lies within the range of CM y, or a date of CM y lies within the range of CM x:
    x_date_within_y = df2['CMSTDAT1_Within_Range'] | df2['CMENDAT1_Within_Range']
    y_date_within_x = df2['CMSTDAT2_Within_Range'] | df2['CMENDAT2_Within_Range']
    df2['Similar_Overlap'] = df2['Similarity_Match'] & (x_date_within_y | y_date_within_x)

    # Filter the dataframe so it contains only the semantically matching AND temporally overlapping CMs:
    df3 = df2[df2['Similar_Overlap']].reset_index(drop = True)

    if not df3.empty:

        # Revert all fields that were originally blank back to blank strings, so they simply show as blank spaces on the
        # output. Fully blank dates are recognised by the placeholder extremes they were filled with on loading:
        df3['CMSTDAT_x'] = df3['CMSTDAT_x'].replace('01-01-0001', '')
        df3['CMENDAT_x'] = df3['CMENDAT_x'].replace('31-12-9999', '')
        for optional_column in ('CMINDC_x', 'CMONGO_x', 'CMDSTXT_x', 'CMDOSFRM_x', 'CMDOSFRQ_x', 'CMROUTE_x'):
            df3[optional_column] = df3[optional_column].fillna('')

        # Show only the CM x columns needed for the check output and strip their '_x' suffix:
        output_fields = ['CM#', 'CMTRT', 'CMINDC', 'CMSTDAT', 'CMONGO', 'CMENDAT', 'CMDSTXT', 'CMDOSFRM', 'CMDOSFRQ', 'CMROUTE']
        df4 = df3[['CENTRE', 'SUBJECT_ID'] + [f'{field}_x' for field in output_fields]]
        df4 = df4.rename(columns = {f'{field}_x': field for field in output_fields})

        # Drop all duplicate rows where the Centre, Subject_ID, and CM# column values are the same. These represent the same
        # CM appearing once for every other CM it was paired with:
        df5 = df4.drop_duplicates(subset = ['CENTRE', 'SUBJECT_ID', 'CM#']).reset_index(drop = True)

df5

Unnamed: 0,CENTRE,SUBJECT_ID,CM#,CMTRT,CMINDC,CMSTDAT,CMONGO,CMENDAT,CMDSTXT,CMDOSFRM,CMDOSFRQ,CMROUTE
0,999,S022,1,Panadol,Headache,15-01-2022,N,15-01-2022,500 mg,TABLET,QD,ORAL
1,999,S022,2,Panadol,Headache,15-01-2022,N,15-01-2022,500 mg,TABLET,QD,ORAL
2,999,S022,3,Panadol,Headache,16-01-2022,N,17-01-2022,500 mg,TABLET,QD,ORAL
3,999,S022,4,Panadol,Headache,17-01-2022,Y,,500 mg,TABLET,QD,ORAL
4,999,S022,5,Panadol,Headache,22-03-2023,N,22-03-2023,500 mg,TABLET,QD,ORAL


In [7]:
# Lastly, export the output to an Excel spreadsheet, without the dataframe index and with missing values written as empty
# cells. A forward slash is used as the path separator: it is accepted on Windows as well as on other platforms, and it avoids
# the invalid '\D' escape sequence that a backslash produces inside a normal (non-raw) string literal:
df5.to_excel(
    'Output/Duplicate & Overlapping CM Check Output.xlsx',
    sheet_name='Duplicate & Overlapping CMs',
    startrow=0,
    startcol=0,
    index=False,
    na_rep='',
    header=True,
)