# MedDRA Autocoder using GPT

In [1]:
# Import the necessary libraries, modules and functions:
import pandas as pd
import numpy as np
import openai
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity
from getpass import getpass

# Prompt to enter the OpenAI API key. The input is masked and never echoed to the notebook output. An explicit prompt text is
# supplied because the default getpass prompt ("Password: ") does not tell the user which secret is being requested:
openai.api_key = getpass('Enter your OpenAI API key: ')

········


In [2]:
# Run below code only ONCE to obtain the MedDRA LLT vector embeddings for the first time OR if you wish to upgrade your existing
# MedDRA LLT embeddings file to a new MedDRA dictionary version. Vector embeddings are numerical representations of the underlying
# meaning (semantics) of natural language. They allow the program to perform computations between multiple pieces of unstructured
# text information, e.g. comparison of AETERMs (free-text) with LLTs. Use the raw llt.asc dictionary file provided under your
# organisation's MedDRA license. The last line of code saves the vector embeddings for all ~86,000 LLTs as arrays in a .parquet
# file which can be stored locally and then plugged-in (reused) each time you run the Autocoder program - This will limit the
# number of API calls made to the LLM embedding model to only the set of AE terms you wish to code:

#df1 = pd.read_csv('MedDRA Files\llt.asc', delimiter = '$', header = None, usecols = [0, 1, 2])
#df1.columns = ['LLT_Code', 'MedDRA_LLT', 'PT_Code']
#df1 = df1[['MedDRA_LLT', 'LLT_Code', 'PT_Code']]
#df1['LLT_Embedding'] = df1['MedDRA_LLT'].apply(lambda x: get_embedding(x, engine = 'text-embedding-ada-002'))
#df1.to_parquet('MedDRA Files\llt_embeddings.parquet', engine = 'pyarrow')

In [3]:
# Load the AE dataset as a dataframe, showing only the necessary columns. All identifier columns are read as strings so that
# values such as leading-zero centre numbers are preserved exactly as entered. The path is written as a raw string because
# '\A' is not a valid escape sequence and triggers a SyntaxWarning on recent Python versions; the resulting path is unchanged:
dtypes1 = {'CENTRE': str, 'SUBJECT_ID': str, 'FORM_OCCURENCE': str, 'AETERM': str}
df2 = pd.read_csv(r'Input (EDC Datasets)\AE - Coding Example.csv', dtype = dtypes1)
df2 = df2[['CENTRE', 'SUBJECT_ID', 'FORM_OCCURENCE', 'AETERM']]
df2 = df2.rename(columns = {'FORM_OCCURENCE': 'AE#'})

# Drop rows where the AETERM is empty to avoid coding blank AE terms. Terms consisting only of whitespace are treated as empty
# too, since they carry no text to code:
df2 = df2.dropna(subset = ['AETERM'])
df2 = df2[df2['AETERM'].str.strip() != '']

# Get the LLM vector embeddings for all AE terms. Vector embeddings are numerical representations of the underlying meaning
# (semantics) of natural language. This allows the program to perform computations on free text entries in the AE term field.
# With the code below, only a list of isolated AE terms will be sent to OpenAI, with no other context. Each distinct AE term is
# embedded once and the result is mapped back onto every row carrying that term, so repeated terms do not cost extra API calls:
unique_terms = df2['AETERM'].unique()
term_embeddings = {term: get_embedding(term, engine = 'text-embedding-ada-002') for term in unique_terms}
df2['AE_Embedding'] = df2['AETERM'].map(term_embeddings)

# Load the previously saved 'llt_embeddings.parquet' file as a dataframe. The path is written as a raw string because '\l' is
# not a valid escape sequence and triggers a SyntaxWarning on recent Python versions; the resulting path is unchanged:
df3 = pd.read_parquet(r'MedDRA Files\llt_embeddings.parquet', engine = 'pyarrow')

# Perform a cross join on the AE (left) and LLT (right) dataframes. This combines the AE and LLT data into a single dataframe,
# providing all pairwise combinations of AEs and LLTs side-by-side. A native cross join replaces the former constant 'Key'
# column with an outer join: it needs no helper column and yields an empty result (rather than LLT rows with missing AE data)
# when there are no AE terms to code:
df4 = pd.merge(df2, df3, how = 'cross')

# Calculate the cosine similarity between the AETERM and LLT embedding vectors. This represents the difference in angle between
# the AETERM and LLT embedding vectors in n-dimensional space and numerically represents the semantic similarity between the two
# pieces of text. The two embedding columns are iterated in lockstep instead of using a row-wise DataFrame.apply, which avoids
# building a Series object for every AE-LLT pair and also works when the dataframe is empty:
df4['Precise_Similarity'] = [
    cosine_similarity(ae_vector, llt_vector)
    for ae_vector, llt_vector in zip(df4['AE_Embedding'], df4['LLT_Embedding'])
]

# Create a copy of the 'Precise_Similarity' column where the new values are rounded to 3 decimal places and converted to strings.
# This is purely for aesthetic purposes to allow the later 'styled' dataframe to display the rounded similarity values correctly
# whilst still being able to precisely identify maximum similarity values from the 'Precise_Similarity' column:
df4['Similarity'] = df4['Precise_Similarity'].map('{:.3f}'.format)

# Show only the necessary columns for the program's output:
df4 = df4[['CENTRE', 'SUBJECT_ID', 'AE#', 'AETERM', 'MedDRA_LLT', 'Similarity', 'LLT_Code', 'PT_Code', 'Precise_Similarity']]

# Define the number of top matching LLTs you want the program to display for each AE:
top_llt_to_display = 5

# Columns that together identify a single AE record:
group_cols = ['CENTRE', 'SUBJECT_ID', 'AE#']

# Sort all AE-LLT pairs from highest to lowest similarity within each Centre-SubjectID-AE# grouping and then filter the
# dataframe to show only the top X matching LLTs within each group according to the value specified for 'top_llt_to_display'.
# A single multi-column sort replaces a per-group sort, giving the same ordering (groups ascending, similarity descending)
# without invoking a Python function for every group. Please note that df6 is a boolean series of same length as the dataframe
# and is used to filter the dataframe. 'dropna = False' keeps AEs whose CENTRE, SUBJECT_ID or AE# is missing; by default
# groupby would silently drop them from the output:
df5 = df4.sort_values(by = group_cols + ['Precise_Similarity'], ascending = [True, True, True, False]).reset_index(drop = True)
df6 = df5.groupby(group_cols, dropna = False).cumcount() < top_llt_to_display
df7 = df5[df6].reset_index(drop = True)

# Create a new column flagging AE-LLT pairs with the highest similarity within each Centre-SubjectID-AE# grouping. This column
# contains boolean 'True' values which are used to colour-highlight dataframe rows corresponding to the highest LLT match within
# each AE. The aggregation is named as the string 'max' because passing the builtin max function is deprecated in pandas:
df7['Max_Similarity'] = df7.groupby(group_cols, dropna = False)['Precise_Similarity'].transform('max') == df7['Precise_Similarity']

# Define a function to apply yellow highlighting to rows of a dataframe depending on the 'Max_Similarity' value:
def highlight_max_similarity(row):
    """Return one CSS string per cell of *row* for use with Styler.apply.

    Every cell gets a yellow background when the row's 'Max_Similarity'
    value is truthy; otherwise every cell gets an empty style string.
    """
    cell_style = 'background-color: yellow' if row['Max_Similarity'] else ''
    return [cell_style] * len(row)

# Build the styled view of the results. Rows holding the highest LLT match for each AE are highlighted in yellow, but only when
# 'top_llt_to_display' is greater than 1 (i.e. multiple LLT options are presented for each AE); with a single option per AE the
# highlight would mark every row and is skipped. In both cases the helper columns 'Precise_Similarity' and 'Max_Similarity' are
# hidden so they don't show on the final output:
df8 = df7.style
if top_llt_to_display > 1:
    df8 = df8.apply(highlight_max_similarity, axis = 1)
df8 = df8.hide(['Precise_Similarity', 'Max_Similarity'], axis = 1)

df8

Unnamed: 0,CENTRE,SUBJECT_ID,AE#,AETERM,MedDRA_LLT,Similarity,LLT_Code,PT_Code
0,999,S001,1,prickly feeling in skin,Pricking skin sensation,0.93,10036665,10033775
1,999,S001,1,prickly feeling in skin,Prick pain feeling,0.92,10036663,10033371
2,999,S001,1,prickly feeling in skin,Prickly heat,0.903,10036667,10027627
3,999,S001,1,prickly feeling in skin,Prick pain,0.9,10036662,10033371
4,999,S001,1,prickly feeling in skin,Prickling of hand,0.895,10036666,10033775
5,999,S001,2,hit by truck,Pedestrian in motor vehicle accident,0.885,10081207,10039203
6,999,S001,2,hit by truck,Traffic accident,0.883,10044332,10039203
7,999,S001,2,hit by truck,Road traffic accident,0.877,10039203,10039203
8,999,S001,2,hit by truck,Motor vehicle accident,0.874,10028008,10039203
9,999,S001,2,hit by truck,Bus accident,0.873,10006816,10039203


In [12]:
# Lastly, export the output to an excel spreadsheet. The helper columns used only for ranking and highlighting are excluded by
# name, so the export does not depend on them being the last two columns. The path is written as a raw string because '\M' is
# not a valid escape sequence and triggers a SyntaxWarning on recent Python versions; the resulting path is unchanged:
helper_columns = ['Precise_Similarity', 'Max_Similarity']
export_columns = [column for column in df8.columns if column not in helper_columns]
df8.to_excel(r'Output\MedDRA Autocoder Output.xlsx', sheet_name = 'Coded AE Terms', startrow = 0, startcol = 0, index = False, na_rep = '', header = True, columns = export_columns)