# Ongoing Medical History vs Adverse Event Similarity Check using GPT

In [1]:
# Import the necessary libraries, modules and functions:
import pandas as pd
import numpy as np
import openai
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity
from getpass import getpass

# Prompt for the OpenAI API key without echoing it to the screen. An explicit prompt is passed so the input box is labelled
# as an API key rather than with getpass's generic default prompt, and surrounding whitespace (easily picked up when the key
# is pasted) is stripped so that it does not end up as part of the stored key:
openai.api_key = getpass('OpenAI API key: ').strip()

········


In [2]:
# Load the MH dataset as a dataframe, reading the columns used by this check as text, then keep only those columns.
# NOTE: the path is written with a forward slash. Inside a normal (non-raw) string literal a backslash followed by 'M' is an
# invalid escape sequence (a SyntaxWarning on Python 3.12+), and a backslash is only a path separator on Windows. Forward
# slashes are accepted on Windows as well as macOS and Linux:
dtypes1 = {'CENTRE': str, 'SUBJECT_ID': str, 'FORM_OCCURENCE': str, 'MHTERM': str, 'MHONGO': str}
df1 = pd.read_csv('Input (EDC Datasets)/MH.csv', dtype = dtypes1)
df1 = df1[['CENTRE', 'SUBJECT_ID', 'FORM_OCCURENCE', 'MHTERM', 'MHONGO']]
df1 = df1.rename(columns = {'FORM_OCCURENCE': 'MH#'})

# Filter the MH dataframe to only show medical history items that are ongoing and drop rows where MHTERM is empty:
df1 = df1[df1['MHONGO'] == 'Y']
df1 = df1.dropna(subset = ['MHTERM']).reset_index(drop = True)

# Get the LLM vector embeddings for all MH terms. Vector embeddings are numerical representations of the underlying meaning
# (semantics) of natural language. This allows the program to perform computations on free text entries in the MH term field.
# Only the isolated MH terms are sent to OpenAI, with no other context. Each distinct term is embedded once and the result is
# then mapped back onto every row that uses it, so a term that appears on several rows costs a single API request:
mh_embeddings = {
    term: get_embedding(term, engine = 'text-embedding-ada-002')
    for term in df1['MHTERM'].unique()
}
df1['MHTERM_Embedding'] = df1['MHTERM'].map(mh_embeddings.get)

# Load the AE dataset as a dataframe, reading the columns used by this check as text, then keep only those columns.
# NOTE: the path is written with a forward slash. Inside a normal (non-raw) string literal a backslash followed by 'A' is an
# invalid escape sequence (a SyntaxWarning on Python 3.12+), and a backslash is only a path separator on Windows. Forward
# slashes are accepted on Windows as well as macOS and Linux:
dtypes2 = {'CENTRE': str, 'SUBJECT_ID': str, 'FORM_OCCURENCE': str, 'AETERM': str}
df2 = pd.read_csv('Input (EDC Datasets)/AE.csv', dtype = dtypes2)
df2 = df2[['CENTRE', 'SUBJECT_ID', 'FORM_OCCURENCE', 'AETERM']]
df2 = df2.rename(columns = {'FORM_OCCURENCE': 'AE#'})

# Drop rows from the AE dataframe where AETERM is empty:
df2 = df2.dropna(subset = ['AETERM']).reset_index(drop = True)

# Get the LLM vector embeddings for all AE terms. Vector embeddings are numerical representations of the underlying meaning
# (semantics) of natural language. This allows the program to perform computations on free text entries in the AE term field.
# Only the isolated AE terms are sent to OpenAI, with no other context. Each distinct term is embedded once and the result is
# then mapped back onto every row that uses it, so a term that appears on several rows costs a single API request:
ae_embeddings = {
    term: get_embedding(term, engine = 'text-embedding-ada-002')
    for term in df2['AETERM'].unique()
}
df2['AETERM_Embedding'] = df2['AETERM'].map(ae_embeddings.get)

# Minimum cosine similarity for an MH-AE pair to be reported as a semantic match. Tweak this value to meet your specific
# needs: raising it reports fewer, closer matches; lowering it reports more, looser matches:
SIMILARITY_THRESHOLD = 0.85

# Perform an inner join on the MH (left) and AE (right) dataframes using Centre and Subject_ID as the keys. This combines the
# MH and AE data into a single dataframe, providing all pairwise combinations of MHs and AEs side-by-side within only those
# subjects present in both dataframes:
df3 = pd.merge(df1, df2, on = ['CENTRE', 'SUBJECT_ID'], how = 'inner')

# Calculate the cosine similarity between the MH and AE term embedding vectors of every pair. Cosine similarity is the cosine
# of the angle between the two vectors in n-dimensional space, so a value closer to 1 means the two pieces of text are closer
# in meaning. The two embedding columns are walked in lockstep rather than via a row-wise DataFrame.apply, which avoids
# building a Series object for every row, and the explicit dtype keeps the column numeric even when there are no pairs:
df3['Similarity'] = pd.Series(
    [
        cosine_similarity(mh_vector, ae_vector)
        for mh_vector, ae_vector in zip(df3['MHTERM_Embedding'], df3['AETERM_Embedding'])
    ],
    index = df3.index,
    dtype = 'float64',
)

# Filter the dataframe for all MH-AE pairs where the cosine similarity is sufficiently high to indicate a semantic match:
df4 = df3[df3['Similarity'] >= SIMILARITY_THRESHOLD].reset_index(drop = True)

# Show only the necessary columns for the check output (this drops the embedding columns) and round the Similarity values to
# 2 decimal places:
df4 = df4[['CENTRE', 'SUBJECT_ID', 'MH#', 'MHTERM', 'MHONGO', 'AE#', 'AETERM', 'Similarity']]
df4['Similarity'] = df4['Similarity'].round(2)

# When nothing passed the threshold, replace the empty result with a single-column placeholder so that the displayed table
# (and any later export of df4) states explicitly that no similar terms were found:
if df4.empty:
    df4 = pd.DataFrame({"No Similar Terms Found": []})
df4

Unnamed: 0,CENTRE,SUBJECT_ID,MH#,MHTERM,MHONGO,AE#,AETERM,Similarity
0,999,S001,1,Hypertension,Y,2,High Blood Pressure,0.94
1,999,S001,2,headache.,Y,1,Headache,0.93
2,999,S001,4,Covid 19,Y,4,COVID-19,0.96
3,999,S002,1,Psoriasis,Y,1,Psoriasis,1.0
4,999,S002,1,Psoriasis,Y,4,Skin cancer,0.88
5,999,S002,2,Depression,Y,2,Low mood,0.88
6,999,S002,4,Melanoma,Y,4,Skin cancer,0.92
7,999,S003,3,Rheumatoid arthritis,Y,4,Joint pain,0.86
8,999,S003,4,Diabetes,Y,5,Hyperglycemia,0.91
9,999,S006,1,Hyperhydrosis,Y,1,Excessive sweating,0.94


In [3]:
# Lastly, export the output to an Excel spreadsheet, written from the top-left cell of the sheet with a header row and
# without the dataframe index.
# NOTE: the path is written with a forward slash. Inside a normal (non-raw) string literal a backslash followed by 'M' is an
# invalid escape sequence (a SyntaxWarning on Python 3.12+), and a backslash is only a path separator on Windows. Forward
# slashes are accepted on Windows as well as macOS and Linux. The 'Output' folder is expected to exist already:
df4.to_excel(
    'Output/MH vs AE Similarity Check Output.xlsx',
    sheet_name = 'MH vs AE Similarity Check',
    startrow = 0,
    startcol = 0,
    index = False,
    na_rep = '',
    header = True,
)