# Duplicate & Overlapping Medical History Check using GPT

In [8]:
# Import the necessary libraries, modules and functions:
import pandas as pd
import numpy as np
import openai
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity
from datetime import datetime
from getpass import getpass

# Prompt to enter OpenAI API key:
# getpass() reads the key interactively without echoing the typed characters, so the key never has to be written into
# this source file. The value is stored on the openai module and used by the later embedding calls.
openai.api_key = getpass()

········


In [9]:
# Load the MH dataset as a dataframe. Every listed column is read as a string so that date parts and codelist values are
# kept exactly as exported rather than being coerced to numbers:
dtypes1 = {
    'CENTRE': str, 'SUBJECT_ID': str, 'FORM_OCCURENCE': str, 'MHTERM': str, 'MHCAT': str,
    'MHSTDAT_d': str, 'MHSTDAT_m': str, 'MHSTDAT_y': str, 'MHONGO': str,
    'MHENDAT_d': str, 'MHENDAT_m': str, 'MHENDAT_y': str,
}
# The path is a raw string: in a normal string literal '\M' is an invalid escape sequence, which newer Python versions
# warn about. The path read at runtime is unchanged.
df1 = pd.read_csv(r'Input (EDC Datasets)\MH.csv', dtype = dtypes1)
df1 = df1.rename(columns = {'FORM_OCCURENCE': 'MH#'})

# Fill all empty MHTERM fields with a blank space so the 'get_embedding' function won't throw an error later on:
df1['MHTERM'] = df1['MHTERM'].fillna(" ")

# Fill all blank MH start and end date parts with their theoretical extremes (earliest possible start, latest possible
# end). Missing data edit checks in the EDC will generally avoid the need for this, however blank dates may still occur
# on occasion. This allows column-wise date comparisons to still compute later on without error if blank dates are present:
_blank_date_part_defaults = {
    'MHSTDAT_d': '01',
    'MHSTDAT_m': '01',
    'MHSTDAT_y': '0001',
    'MHENDAT_d': '31',
    'MHENDAT_m': '12',
    'MHENDAT_y': '9999',
}
for _date_col, _default in _blank_date_part_defaults.items():
    df1[_date_col] = df1[_date_col].fillna(_default)

# Define and apply a function to pad MH day and month parts with a leading zero if not already present. This ensures the
# concatenated date format will be accepted by the later date comparison function. This may or may not be necessary for your
# specific scenario:
def pad_with_zero(x):
    """Return *x* left-padded with '0' to two characters when it is a single character.

    Null values and values of any other length are returned unchanged.
    """
    if not pd.isnull(x) and len(x) == 1:
        return x.zfill(2)
    return x
# Normalise the day and month parts of both the start and end dates to two characters:
for _part_col in ('MHSTDAT_d', 'MHSTDAT_m', 'MHENDAT_d', 'MHENDAT_m'):
    df1[_part_col] = df1[_part_col].apply(pad_with_zero)

# If your MH dataset allows partially unknown dates, map the raw EDC codelist value used for each unknown date part onto
# the standard 'UNK' notation expected by the later date comparison function. This assumes your EDC platform captures the
# day, month and year parts as separate fields; otherwise the equivalent substitutions can be done with regex on the
# already combined date strings. Each pair below is (column, raw code for "unknown" in that column) - adjust the raw codes
# to match your own codelists. The month code here is already 'UNK', so that mapping leaves the values unchanged:
_unknown_part_codes = (
    ('MHSTDAT_d', 'UN'),
    ('MHSTDAT_m', 'UNK'),
    ('MHSTDAT_y', 'UKUK'),
    ('MHENDAT_d', 'UN'),
    ('MHENDAT_m', 'UNK'),
    ('MHENDAT_y', 'UKUK'),
)
for _code_col, _raw_code in _unknown_part_codes:
    df1[_code_col] = df1[_code_col].replace(_raw_code, 'UNK')

# Concatenate the MH day, month and year parts into a single dd-mm-yyyy date string. This may or may not be necessary,
# depending upon your EDC's method of capturing partially unknown dates. Any pre-combined MHSTDAT / MHENDAT columns in the
# extract are discarded first; errors = 'ignore' means an extract that does not contain them no longer fails with a KeyError:
df1 = df1.drop(columns = ['MHSTDAT', 'MHENDAT'], errors = 'ignore')
df1['MHSTDAT'] = df1['MHSTDAT_d'] + '-' + df1['MHSTDAT_m'] + '-' + df1['MHSTDAT_y']
df1['MHENDAT'] = df1['MHENDAT_d'] + '-' + df1['MHENDAT_m'] + '-' + df1['MHENDAT_y']

# For MH items that are procedures, set the end date equal to the start date. This is only applicable if your MH EDC form /
# dataset captures prior procedures together with MH (rather than in PR), in which case the end date fields are dynamically
# hidden for procedures and would otherwise carry the filled-in extreme value. Also set MHONGO to 'N/A' for procedures:
_is_procedure = df1['MHCAT'] == 'PR'
df1.loc[_is_procedure, 'MHENDAT'] = df1.loc[_is_procedure, 'MHSTDAT']
df1.loc[_is_procedure, 'MHONGO'] = 'N/A'

# Keep only the columns needed for the check:
df1 = df1[['CENTRE', 'SUBJECT_ID', 'MH#', 'MHTERM', 'MHCAT', 'MHSTDAT', 'MHONGO', 'MHENDAT']]

# Get the LLM vector embeddings for all MH terms. Vector embeddings are numerical representations of the underlying meaning
# (semantics) of natural language. This allows the program to perform computations on free text entries in the MHTERM field.
# With the code below, only a list of isolated MH terms will be sent to OpenAI, with no other context. Each distinct term
# is embedded once and the result is reused for every row holding that term, so repeated terms do not trigger extra API calls:
_embedding_by_term = {
    term: get_embedding(term, engine = 'text-embedding-ada-002')
    for term in df1['MHTERM'].unique()
}
df1['MHTERM_Embedding'] = df1['MHTERM'].map(lambda term: _embedding_by_term[term])

# Left join the MH dataframe to itself, using Centre and Subject_ID as keys. This provides all pairwise combinations of a
# subject's MH side-by-side (columns from the left copy get the suffix _x, those from the right copy _y):
df2 = pd.merge(df1, df1, on = ['CENTRE', 'SUBJECT_ID'], how = 'left')

# Identify all self-matched MH terms (same MH# on both sides) and filter these rows out of the dataframe:
df2['Self_Match'] = df2['MH#_x'] == df2['MH#_y']
df2 = df2[~df2['Self_Match']].reset_index(drop = True)

# Calculate the cosine similarity between the two MH term embedding vectors of each pair. This numerically represents the
# semantic similarity between the two pieces of text. The values are built with a plain loop rather than a row-wise
# DataFrame.apply, because apply on an empty frame returns a DataFrame and the column assignment would then fail when no
# subject has more than one MH entry:
df2['Similarity'] = pd.Series(
    [
        cosine_similarity(embedding_x, embedding_y)
        for embedding_x, embedding_y in zip(df2['MHTERM_Embedding_x'], df2['MHTERM_Embedding_y'])
    ],
    index = df2.index,
    dtype = float,
)

# Identify all MH pairs where the cosine similarity is sufficiently high to indicate a semantic match. Tweak the similarity
# threshold to meet your specific needs:
SIMILARITY_THRESHOLD = 0.9
df2['Similarity_Match'] = df2['Similarity'] >= SIMILARITY_THRESHOLD

In [10]:
# Define a function to check if a date, which may be partially unknown, falls within a specified known date range. This function
# is used to check if the start and/or end dates of a given MH fall within the timeframe of its paired MH. It requires the input
# date strings to be in dd-mm-yyyy format (e.g. 01-01-2023). It first determines if the input date_str has any unknown parts,
# and then varies the unknown parts to encompass all possible values they could take, otherwise it keeps just the known part.
# The function then consecutively iterates over all combinations of day, month and year part ranges in a nested manner, combining
# them into single datetime objects which are then evaluated as either True or False according to: start_date <= date <= end_date.
# If at least one of the possible dates yields a True result (i.e. is within range), then the loop stops and the output of the
# function is True, otherwise the function continues until it reaches the end of its nested loops, where it will output False:

def _parse_known_date(value, label):
    """Parse a fully known date given as dd-mm-yyyy or dd-Mon-yyyy; *label* only names the bound in the error message."""
    for fmt in ('%d-%m-%Y', '%d-%b-%Y'):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    raise ValueError(f"Invalid {label} date format: {value}")


def _month_number(part):
    """Convert a month part given as a number ('03') or an abbreviated name ('Mar') to its month number."""
    try:
        return int(part)
    except ValueError:
        return datetime.strptime(part, '%b').month


def is_within_range(start_date_str, end_date_str, date_str):
    """Return True if *date_str* could fall within the closed range [start_date_str, end_date_str].

    Parameters:
        start_date_str, end_date_str: fully known range bounds, formatted dd-mm-yyyy or dd-Mon-yyyy.
        date_str: the date under test, formatted day-month-year, where any part may be the literal 'UNK'.
            The month may be numeric or an abbreviated name, matching the formats accepted for the bounds.

    An unknown day is tried as 1..31, an unknown month as 1..12 and an unknown year as every year spanned by
    the range. Combinations that are not real calendar dates (e.g. 31 February) are skipped. The function
    returns True as soon as one candidate date lies within the range, and False if none does.

    Raises:
        ValueError: if a bound cannot be parsed, if *date_str* does not have exactly three '-'-separated
            parts, or if a known part of *date_str* is not a valid day, month or year value.
    """
    start_date = _parse_known_date(start_date_str, 'start')
    end_date = _parse_known_date(end_date_str, 'end')

    date_parts = date_str.split('-')
    if len(date_parts) != 3:
        raise ValueError(f"Invalid date format: {date_str}")
    day_part, month_part, year_part = date_parts

    day_range = range(1, 32) if day_part == 'UNK' else [int(day_part)]
    month_range = range(1, 13) if month_part == 'UNK' else [_month_number(month_part)]
    # Years outside the range can never match, so an unknown year only needs to cover the years the range spans.
    year_range = range(start_date.year, end_date.year + 1) if year_part == 'UNK' else [int(year_part)]

    for day in day_range:
        for month in month_range:
            for year in year_range:
                try:
                    candidate = datetime(day=day, month=month, year=year)
                except ValueError:
                    # Not a real calendar date for this day/month/year combination; try the next one.
                    continue
                if start_date <= candidate <= end_date:
                    return True

    return False

In [11]:
# For each MH in the pair (x, y), set all unknown MH date parts to their lowermost or uppermost possible extremes, depending
# on whether they represent a start date or end date. The results go into the *_reg columns; the original columns keep
# their 'UNK' parts for the later 'is_within_range' checks.
import calendar


def _resolve_unknown_date(date_str, latest):
    """Replace every 'UNK' part of a day-month-year string with its extreme value.

    With latest = False the earliest possible date is produced (day 01, month 01, year 0001). With latest = True
    the latest possible date is produced (year 9999, month 12, and the last day of the resulting month). The last
    day comes from the calendar, so an unknown day in February resolves to 29 in leap years and 28 otherwise.
    """
    day, month, year = date_str.split('-')
    if year == 'UNK':
        year = '9999' if latest else '0001'
    if month == 'UNK':
        month = '12' if latest else '01'
    if day == 'UNK':
        if latest:
            # monthrange() returns (weekday of the 1st, number of days in the month).
            day = f"{calendar.monthrange(int(year), int(month))[1]:02d}"
        else:
            day = '01'
    return f"{day}-{month}-{year}"


for _side in ('x', 'y'):
    df2[f'MHSTDAT_{_side}_reg'] = df2[f'MHSTDAT_{_side}'].apply(lambda text: _resolve_unknown_date(text, latest = False))
    df2[f'MHENDAT_{_side}_reg'] = df2[f'MHENDAT_{_side}'].apply(lambda text: _resolve_unknown_date(text, latest = True))

In [12]:
if not df2.empty:

    # Identify all MH pairs that temporally overlap with one another by applying the 'is_within_range' function. Each entry
    # is (flag column to create, range start column, range end column, column holding the date under test). The '1' flags
    # test the dates of MH x against the range of MH y; the '2' flags test the dates of MH y against the range of MH x:
    _range_checks = (
        ('MHSTDAT1_Within_Range', 'MHSTDAT_y_reg', 'MHENDAT_y_reg', 'MHSTDAT_x'),
        ('MHENDAT1_Within_Range', 'MHSTDAT_y_reg', 'MHENDAT_y_reg', 'MHENDAT_x'),
        ('MHSTDAT2_Within_Range', 'MHSTDAT_x_reg', 'MHENDAT_x_reg', 'MHSTDAT_y'),
        ('MHENDAT2_Within_Range', 'MHSTDAT_x_reg', 'MHENDAT_x_reg', 'MHENDAT_y'),
    )
    for _flag_col, _range_start, _range_end, _tested in _range_checks:
        df2[_flag_col] = df2.apply(
            lambda row: is_within_range(row[_range_start], row[_range_end], row[_tested]), axis = 1
        )

    # Convert the regularised x and y date strings into date objects so they can be compared ordinally below:
    for _reg_col in ('MHSTDAT_x_reg', 'MHENDAT_x_reg', 'MHSTDAT_y_reg', 'MHENDAT_y_reg'):
        df2[_reg_col] = df2[_reg_col].apply(lambda text: datetime.strptime(text, '%d-%m-%Y').date())

    # 'is_within_range' varies the unknown parts of one date at a time, without regard to the other date of the same MH.
    # It can therefore report a start or end date as within range even though the two MH cannot overlap at all, i.e. when
    # the latest possible end of one MH is still before the earliest possible start of the other. Clear the flags in
    # exactly those cases:
    _x_ends_before_y_starts = df2['MHENDAT_x_reg'] < df2['MHSTDAT_y_reg']
    _y_ends_before_x_starts = df2['MHENDAT_y_reg'] < df2['MHSTDAT_x_reg']
    df2.loc[_x_ends_before_y_starts, 'MHSTDAT1_Within_Range'] = False
    df2.loc[_y_ends_before_x_starts, 'MHSTDAT2_Within_Range'] = False
    df2.loc[_y_ends_before_x_starts, 'MHENDAT1_Within_Range'] = False
    df2.loc[_x_ends_before_y_starts, 'MHENDAT2_Within_Range'] = False

In [13]:
# Start from the placeholder shown when there is nothing to report; it is replaced below only if flagged MH pairs exist:
df5 = pd.DataFrame({"No Issues Found": []})

if not df2.empty:

    # A pair temporally overlaps if any of the four start/end dates falls within the other MH's range. Flag the pairs that
    # both semantically match AND temporally overlap in the 'Similar_Overlap' column:
    _any_overlap = (
        df2['MHSTDAT1_Within_Range']
        | df2['MHENDAT1_Within_Range']
        | df2['MHSTDAT2_Within_Range']
        | df2['MHENDAT2_Within_Range']
    )
    df2['Similar_Overlap'] = df2['Similarity_Match'] & _any_overlap

    # Keep only the flagged pairs:
    df3 = df2[df2['Similar_Overlap'].eq(True)].reset_index(drop = True)

    if not df3.empty:

        # Dates that were built purely from the fill-in extremes were blank in the source data, so show them as blank
        # again on the output. Do the same for the other fields that may be empty:
        df3['MHSTDAT_x'] = df3['MHSTDAT_x'].replace('01-01-0001', '')
        df3['MHENDAT_x'] = df3['MHENDAT_x'].replace('31-12-9999', '')
        for _text_col in ('MHONGO_x', 'MHTERM_x', 'MHCAT_x'):
            df3[_text_col] = df3[_text_col].fillna('')

        # Report the x side of each pair only, under the original column names:
        _output_names = {
            'MH#_x': 'MH#',
            'MHTERM_x': 'MHTERM',
            'MHCAT_x': 'MHCAT',
            'MHSTDAT_x': 'MHSTDAT',
            'MHONGO_x': 'MHONGO',
            'MHENDAT_x': 'MHENDAT',
        }
        df4 = df3[['CENTRE', 'SUBJECT_ID'] + list(_output_names)]
        df4 = df4.rename(columns = _output_names)

        # The same MH appears once per pair it belongs to; keep one row per Centre, Subject_ID and MH#:
        df5 = df4.drop_duplicates(subset = ['CENTRE', 'SUBJECT_ID', 'MH#']).reset_index(drop = True)

df5

Unnamed: 0,CENTRE,SUBJECT_ID,MH#,MHTERM,MHCAT,MHSTDAT,MHONGO,MHENDAT
0,999,S002,2,Depression,MH,21-08-2003,Y,
1,999,S002,5,Anxiety,MH,24-11-2006,Y,
2,999,S022,1,Heart disease,MH,05-09-1998,Y,
3,999,S022,2,Cardiovascular disease,MH,15-03-2006,Y,
4,999,S022,3,Rhinoplasty,PR,20-05-2012,,20-05-2012
5,999,S022,4,Nose operation,PR,20-05-2012,,20-05-2012


In [7]:
# Lastly, export the output to an excel spreadsheet. The path is a raw string: in a normal string literal '\D' is an
# invalid escape sequence, which newer Python versions warn about. The path written at runtime is unchanged:
df5.to_excel(r'Output\Duplicate & Overlapping MH Check Output.xlsx', sheet_name = 'Duplicate & Overlapping MH', startrow = 0, startcol = 0, index = False, na_rep = '', header = True)