# eCRF Completion Guidelines Text Generator

In [1]:
# Import the necessary libraries and modules:
import pandas as pd
import numpy as np
import random
import re
import openai
import langchain
from langchain import OpenAI
from langchain import PromptTemplate
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_UNDERLINE, WD_LINE_SPACING
from docx.enum.style import WD_STYLE
from docx.shared import Inches
from docx.shared import Pt
from getpass import getpass

# Prompt to enter the OpenAI API key. getpass() is called without arguments, so its default prompt text is shown, and the
# value that is typed is stored on the openai module:
openai.api_key = getpass()

# Initialise the large language model with the key entered above. The temperature is set to 0.0, presumably so that the
# generated guideline text is as repeatable as possible between runs.
# NOTE(review): confirm that the 'text-davinci-003' model is still offered by the OpenAI API before running this cell:
llm = OpenAI(openai_api_key = openai.api_key, temperature = 0.0, model_name = 'text-davinci-003')

········


In [22]:
# Import the Data Dictionary and EDC configuration reports as dataframes. The paths are written as raw strings so that the
# Windows-style backslash separator is taken literally; the path values themselves are unchanged.
# With keep_default_na = False alone, no cell is parsed as NaN (blank cells stay as empty strings). Where na_values = ['']
# is also passed, only blank cells are parsed as NaN:
dhm = pd.read_csv(r'EDC Configuration Reports\DHM.csv', keep_default_na = False)
items = pd.read_csv(r'EDC Configuration Reports\Items Report.csv', keep_default_na = False, na_values = [''])
lists = pd.read_csv(r'EDC Configuration Reports\Lists Report.csv', keep_default_na = False)
formulas = pd.read_csv(r'EDC Configuration Reports\Formulas Report.csv', keep_default_na = False)
dependencies = pd.read_csv(r'EDC Configuration Reports\Dependencies Report.csv', keep_default_na = False)
forms_visits = pd.read_csv(r'EDC Configuration Reports\Forms & Visits Report.csv', keep_default_na = False, na_values = [''])
form_activations = pd.read_csv(r'EDC Configuration Reports\Form Activations.csv', keep_default_na = False)

# Split the 'Form Code (Ascending)' values ("<code> - <name>") into two new leading columns. Values that do not contain
# the ' - ' separator are copied unchanged into both columns:
def _form_name_part(form_label):
    # Everything after the first ' - ' separator.
    _, separator, remainder = form_label.partition(' - ')
    return remainder if separator else form_label


def _form_code_part(form_label):
    # Everything before the first ' - ' separator (the whole label when there is no separator).
    return form_label.partition(' - ')[0]


# The name is inserted first and the code second, both at position 0, so the code ends up as the first column:
items.insert(0, 'Specific Form Name', items['Form Code (Ascending)'].apply(_form_name_part))
items.insert(0, 'Specific Form Code', items['Form Code (Ascending)'].apply(_form_code_part))

# Create a new column called 'Parent Table Literal' which pulls the literal of the parent table for each item into it. This
# column is used later on to provide additional context for poorly labelled (i.e. overgeneralised) items that sit within a table.
# This additional context is injected into the prompt templates as necessary.
tables = items[items['Data Type'] == 'Table of items']
formcode_itemcode_literal_dict = tables.set_index(['Specific Form Code', 'Item Code'])['Literal'].to_dict()
# Look up each item's (form code, parent item code) pair directly. Items without a parent table get None, which the fillna
# below turns into an empty string. A plain lookup is used rather than building a Series per row, because the per-row
# Series approach produces a DataFrame and fails on assignment when no item in the report has a parent table:
items['Parent Table Literal'] = [formcode_itemcode_literal_dict.get(parent_key) for parent_key in zip(items['Specific Form Code'], items['Parent item code'])]
items['Parent Table Literal'] = items['Parent Table Literal'].fillna('')

# Keep only the rows that are needed for the guidelines:
#   1. items that are neither hidden nor read-only;
#   2. items whose data type is in the retained list below (unneeded data types such as headers, line feeds and tables
#      are not in the list and are therefore dropped);
#   3. items that have a literal, plus all sub-form rows regardless of their literal.
# The index is reset after every step. Finally, NaN values in 'Unit of measure' are replaced with empty strings, as the
# prompt template used later on will expect this:
is_visible = items['Hide Item'] == 'No'
is_editable = items['Read only'] == 'No'
items = items[is_visible & is_editable].reset_index(drop = True)

retained_data_types = [
    'Date Time', 'Search list', 'File', 'Signature', 'PDF Document', 'Video', 'Group of Booleans', 'Boolean',
    'Text memo', 'Date', 'List of values', 'Time', 'Drop-down list', 'Integer', 'Text', 'Sub-Form', 'Real',
]
items = items[items['Data Type'].isin(retained_data_types)].reset_index(drop = True)

has_literal = items['Literal'].notnull()
is_subform_row = items['Data Type'] == 'Sub-Form'
items = items[has_literal | is_subform_row].reset_index(drop = True)

items['Unit of measure'] = items['Unit of measure'].fillna('')

# Build a (form code, item code) composite key on both dataframes, then use it to copy the dhm 'Sequence' value of each
# item into a new items column called 'Item Order'. Items with no match in dhm get an order of 0:
dhm['CompositeKey'] = [(form_code, item_code) for form_code, item_code in zip(dhm['Form Code'], dhm['Item Code'])]
items['CompositeKey'] = [(form_code, item_code) for form_code, item_code in zip(items['Specific Form Code'], items['Item Code'])]
mapping_dict = {composite_key: sequence for composite_key, sequence in zip(dhm['CompositeKey'], dhm['Sequence'])}
items['Item Order'] = items['CompositeKey'].map(mapping_dict).fillna(0).astype(int)

# Create dictionaries of parent forms and their sub-forms, for both codes and full names:
# Parent-Subform CODE dictionary. Each form code that contains 'Sub-Form' rows is mapped to the list of sub-form codes
# found on those rows:
subforms = items[items['Data Type'] == 'Sub-Form']
parent_subform_code_dict = subforms.groupby('Specific Form Code')['Sub-Form Code'].apply(list).to_dict()

# Parent-Subform NAME dictionary:
# First build a sub-form code -> sub-form name lookup, using the first items row found for each sub-form code:
subform_list = subforms['Sub-Form Code'].tolist()
filtered_1 = items[items['Specific Form Code'].isin(subform_list)].copy()
filtered_1 = filtered_1.drop_duplicates(subset = 'Specific Form Code', keep = 'first')
subform_code_name_dict = dict(zip(filtered_1['Specific Form Code'], filtered_1['Specific Form Name']))
# Replace every sub-form code in the code dictionary with its name (keys are still parent form codes at this point).
# NOTE(review): the lookup raises KeyError if a sub-form code has no rows left in items - confirm this cannot happen:
parent_subform_name_dict = {key: [subform_code_name_dict[value] for value in values_list] for key, values_list in parent_subform_code_dict.items()}
# Build a form code -> form name lookup from the forms that are not used as a sub-form anywhere:
parent_forms = items[~items['Specific Form Code'].isin(subform_list)].copy()
parent_forms_unique = parent_forms.drop_duplicates(subset = 'Specific Form Code', keep = 'first')
parent_code_name_dict = dict(zip(parent_forms_unique['Specific Form Code'], parent_forms_unique['Specific Form Name']))
# Re-key the name dictionary by parent form name instead of parent form code.
# NOTE(review): the lookup raises KeyError if a form that embeds sub-forms is itself a sub-form - confirm nesting cannot occur:
parent_subform_name_dict = {parent_code_name_dict[key]: values_list for key, values_list in parent_subform_name_dict.items()}

# Add a new column to the items dataframe indicating if the item sits within a sub-form. The value is 'Yes' when the
# item's form code appears in any of the sub-form code lists of parent_subform_code_dict, otherwise 'No':
items['In Sub-Form?'] = items['Specific Form Code'].map(lambda x: 'Yes' if any(x in sublist for sublist in parent_subform_code_dict.values()) else 'No')

# Add new columns in the items dataframe called 'Parent Form Name' and 'Parent Form Code' that will capture the overarching
# category of each form in the database, regardless of whether it is a parent vs sub-form. This will simplify the guidelines.
# For a sub-form the first parent whose sub-form list contains it is used; for any other form its own name / code is kept:
items['Parent Form Name'] = items['Specific Form Name'].apply(lambda x: next((key for key, values in parent_subform_name_dict.items() if x in values), x))
items['Parent Form Code'] = items['Specific Form Code'].apply(lambda x: next((key for key, values in parent_subform_code_dict.items() if x in values), x))

# Add a new column to the items dataframe called 'Specific Form Order' to help sequentially sort all sub-forms (if they exist)
# within the parent form (including the parent form itself) in ascending order. This means that all parent form items will be
# listed first and then the sub-form items beneath that, in the order that the sub-forms are embedded in the parent form:
subform_code_sequence_dict = dict(zip(subforms['Sub-Form Code'], subforms['Sequence']))
# Items inside a sub-form take the 'Sequence' of the 'Sub-Form' row that embeds it; all other items default to 1:
items['Specific Form Order'] = items['Specific Form Code'].apply(lambda code: subform_code_sequence_dict.get(code, 1))
# The 'Sub-Form' rows themselves take their own 'Sequence' value and are placed on page 1:
items.loc[items['Data Type'] == 'Sub-Form', 'Specific Form Order'] = items['Sequence']
items.loc[items['Data Type'] == 'Sub-Form', 'Page'] = 1

# Create dictionaries to store form and item codes relating to boolean items. This allows all boolean items for which there is
# a parent 'Group of Booleans' item, to be identified and removed. The 'Item Order' values for the parent 'Group of Booleans'
# items are then updated based on the starting order position of their child booleans:
grouped_booleans = items[items['Data Type'] == 'Group of Booleans']
# Form code -> list of 'Group of Booleans' item codes on that form:
grouped_booleans_per_form = grouped_booleans.groupby('Specific Form Code')['Item Code'].agg(list).reset_index()
grouped_booleans_dict = dict(zip(grouped_booleans_per_form['Specific Form Code'], grouped_booleans_per_form['Item Code']))
# Flag every 'Boolean' row whose parent item is a 'Group of Booleans' item on the same form:
items['Group of Booleans Present?'] = items.apply(lambda row: 'Yes' if row['Data Type'] == 'Boolean' and row['Parent item code'] in grouped_booleans_dict.get(row['Specific Form Code'], []) else 'No', axis = 1)
group_of_booleans_present = items[items['Group of Booleans Present?'] == 'Yes']
# Lowest 'Item Order' among the child booleans of each (form code, group item code) pair:
order_position_for_group_of_booleans = group_of_booleans_present.groupby(['Specific Form Code', 'Parent item code'])['Item Order'].min().reset_index()
# Remove the flagged child booleans and drop the temporary flag column:
items = items[items['Group of Booleans Present?'] == 'No'].drop(columns = ['Group of Booleans Present?'])
order_position_dict = order_position_for_group_of_booleans.set_index(['Specific Form Code', 'Parent item code'])['Item Order'].to_dict()
# Give each 'Group of Booleans' row the lowest order of its removed children; every other row keeps its own 'Item Order':
items['Item Order'] = items.apply(lambda row: order_position_dict[(row['Specific Form Code'], row['Item Code'])] if row['Data Type'] == 'Group of Booleans' and (row['Specific Form Code'], row['Item Code']) in order_position_dict else row['Item Order'], axis = 1)

# Hierarchically sort the dataframe by 'Parent Form Name' -> 'Parent Form Code' -> 'Specific Form Order' -> 'Page' -> 'Item Order'.
# This provides the overall correct sequence of events down the dataframe:
items = items.sort_values(by = ['Parent Form Name', 'Parent Form Code', 'Specific Form Order', 'Page', 'Item Order'])

# Pull the 'Specific Form Name' value into the 'Literal' column for all rows where data type is 'Sub-Form'. It pulls the
# 'Specific Form Name' value from the row immediately beneath, as this reflects the form name that the items sitting within each
# sub-form belong to.
# NOTE(review): this relies on the sort above placing the first item of a sub-form directly below its 'Sub-Form' row, and a
# 'Sub-Form' row that ends up last in the dataframe receives NaN - confirm both cases are acceptable:
subform_mask = items['Data Type'] == 'Sub-Form'
items.loc[subform_mask, 'Literal'] = items['Specific Form Name'].shift(-1)

# Retain only the necessary columns and reorder them (helper columns such as 'CompositeKey' are dropped here):
items = items[['Parent Form Name', 'Specific Form Name', 'Parent Form Code', 'Specific Form Code', 'In Sub-Form?', 'Specific Form Order', 'Page', 'Item Order', 'Item Code', 'Literal', 'Parent Table Literal', 'Data Type', 'Unit of measure', 'List Code', 'Parent item code']]

# Create a new dataframe containing only unique list codes from 'lists' as single rows, where the list element literals for each
# list are grouped / aggregated laterally into a single string from the vertically stacked rows. Lists with more than three
# elements show only the first three followed by 'etc.', e.g. '[A | B | C | etc.]'; shorter lists show every element, e.g.
# '[A | B]'. Map the aggregated element literals into the 'items' dataframe using the list code as the common key:
aggregated_lists = lists.groupby('List Code (Ascending)')['Element - Literal'].agg(lambda x: '[' + ' | '.join(x[:3]) + ' | etc.]' if len(x) > 3 else '[' + ' | '.join(x) + ']').reset_index()
items['List Elements'] = items['List Code'].map(aggregated_lists.set_index('List Code (Ascending)')['Element - Literal'])

# Split the form code part out of the 'Item : Form Code' column in formulas into its own column called 'Form Code' and then create
# dictionaries for syntax, visit ID and formula range ('Range (Ascending)') with dual key of form code and item code, for
# mapping formula information into items.
# NOTE(review): to_dict() keeps only the last row for a repeated (form code, item code) key - confirm each item has one formula:
formulas['Form Code'] = formulas['Item : Form Code'].apply(lambda x: x.split(' - ')[0] if ' - ' in x else x)
formcode_itemcode_syntax_dict = formulas.set_index(['Form Code', 'Item : Item Code'])['Syntax'].to_dict()
formcode_itemcode_visitid_dict = formulas.set_index(['Form Code', 'Item : Item Code'])['Item : Visit ID'].to_dict()
formcode_itemcode_formulatype_dict = formulas.set_index(['Form Code', 'Item : Item Code'])['Range (Ascending)'].to_dict()

# Map the formula 'Syntax', 'Visit ID' and 'Range (Ascending)' values from the three dictionaries above into the items
# dataframe as new columns called 'Formula', 'Formula Visit' and 'Formula Type'. 'Sub-Form' rows, and items without a
# formula, are given NaN:
items['Formula'] = items.apply(lambda row: formcode_itemcode_syntax_dict.get((row['Specific Form Code'], row['Item Code']), np.nan) if row['Data Type'] != 'Sub-Form' else np.nan, axis = 1)
items['Formula Visit'] = items.apply(lambda row: formcode_itemcode_visitid_dict.get((row['Specific Form Code'], row['Item Code']), np.nan) if row['Data Type'] != 'Sub-Form' else np.nan, axis = 1)
items['Formula Type'] = items.apply(lambda row: formcode_itemcode_formulatype_dict.get((row['Specific Form Code'], row['Item Code']), np.nan) if row['Data Type'] != 'Sub-Form' else np.nan, axis = 1)

# Get all unique item codes in a list and add 'SYS_FORM_OCC' to end of the list to include this as a variable. Create a new
# column in the items dataframe called 'Formula Contains Variable?' which contains 'Yes' when the 'Formula' value contains at
# least one of the listed codes as a substring and also contains both '[' and ']', otherwise 'No'. Rows without a formula
# are not evaluated and are left as NaN.
# NOTE(review): the code match is a plain substring test, so a short item code can match inside a longer one - confirm this
# is acceptable:
item_codes_list = items['Item Code'].unique().tolist() + ['SYS_FORM_OCC']
items['Formula Contains Variable?'] = items.loc[~items['Formula'].isna()].apply(lambda row: 'Yes' if any(code in row['Formula'] for code in item_codes_list) and '[' in row['Formula'] and ']' in row['Formula'] else 'No', axis = 1)

# Define a function that extracts the third-to-last sub-string that lies between pipe symbols within all sets of enclosed square
# brackets in a string and output the result as a list. This function will be used to extract the form code part of each variable
# in the inter-form formulas:
def extract_third_to_last_substring(input_string):
    """Return the third-to-last pipe-separated segment of every bracketed group.

    Each innermost '[...]' group in input_string is split on '|'. Groups with
    fewer than three segments are skipped. The results are returned as a list,
    in the order the groups appear in the string.
    """
    bracket_bodies = re.findall(r'\[([^\[\]]*?)\]', input_string)
    segment_lists = (body.split('|') for body in bracket_bodies)
    return [segments[-3] for segments in segment_lists if len(segments) >= 3]

# Define a function that extracts the last sub-string that lies between the last pipe symbol and the closing square bracket (])
# within all sets of enclosed square brackets in a string and output the result as a list. This function will be used to extract
# the item code part of each variable in the inter-form formulas:
def extract_last_substring(input_string):
    """Return the text after the last pipe of every bracketed group that has at least four pipes.

    Only '[...]' groups made of at least five pipe-separated segments match the
    pattern; for each one, the final segment (between the last '|' and the
    closing ']') is collected. The results are returned as a list in order of
    appearance.

    NOTE(review): extract_third_to_last_substring accepts groups with as few as
    three segments, and the two results are zipped together for inter-form
    formulas. A group with three or four segments would therefore misalign the
    pairs - confirm that such groups cannot occur.
    """
    five_plus_segments = re.compile(r'\[([^|\[\]]*\|){4,}([^|\[\]]*)\]')
    return [found.group(2) for found in five_plus_segments.finditer(input_string)]

# Define a function that extracts the sub-strings that lie between all sets of enclosed square brackets in a string and outputs
# the result as a list. This function will be used to extract the item codes in the intra-form formulas:
def extract_substring_between_brackets(input_string):
    """Return the text found inside every '[...]' pair of input_string, as a list in order of appearance.

    The captured text runs from an opening '[' to the next ']' and may be empty.
    """
    bracket_pattern = re.compile(r'\[([^\]]*)\]')
    return [found.group(1) for found in bracket_pattern.finditer(input_string)]

# Add a new column to the items dataframe to store the concatenated form and item codes from the formulas. Do this for
# both inter and intra form formulas:
# Inter: pair the form code and item code extracted from each variable and join them together. Rows outside mask_1 are
# left as NaN:
mask_1 = (items['Formula Type'] == 'Inter') & (items['Formula Contains Variable?'] == 'Yes')
items['Unique Formula Items'] = items.loc[mask_1, 'Formula'].apply(lambda formula: [form + item for form, item in zip(extract_third_to_last_substring(formula), extract_last_substring(formula))])
# Intra: the variables hold only an item code, so take the text between each pair of square brackets:
mask_2 = (items['Formula Type'] == 'Intra') & (items['Formula Contains Variable?'] == 'Yes')
items.loc[mask_2, 'Unique Formula Items'] = items.loc[mask_2, 'Formula'].apply(extract_substring_between_brackets)
# Prepend the specific form code of the row to each list element, so the keys have the same shape as the inter-form ones:
items.loc[mask_2, 'Unique Formula Items'] = items[mask_2].apply(lambda row: [row['Specific Form Code'] + item for item in row['Unique Formula Items']], axis = 1)

# Build a lookup from '<Specific Form Code><Item Code>' (both converted to strings and joined without a separator) to the
# literal of that item:
form_item_keys = [str(form_code) + str(item_code) for form_code, item_code in zip(items['Specific Form Code'], items['Item Code'])]
unique_items_dict = dict(zip(form_item_keys, items['Literal']))
# Prepare a '<form code>SYS_FORM_OCC' entry for every form code, so that this variable can also be looked up; its literal
# is always 'Form Occurrence Number':
unique_form_codes = items['Specific Form Code'].unique()
sys_form_occ_dict = {}
for code in unique_form_codes:
    sys_form_occ_dict[code + 'SYS_FORM_OCC'] = 'Form Occurrence Number'
# Merge the SYS_FORM_OCC entries into unique_items_dict:
unique_items_dict.update(sys_form_occ_dict)

# Add a new column to the items dataframe to store the item literals from the formulas. The item literals are accessed via the
# unique_items_dict dictionary created earlier. If an element from the 'Unique Formula Items' list is not found in the
# unique_items_dict, it won't be included in the 'List of Item Literals' list. Only unique literals are retained, and rows
# with no list or with no literal found are assigned NaN.
def _formula_item_literals(formula_items):
    # Rows without a list of formula items (NaN) have no literals to look up.
    if not isinstance(formula_items, list):
        return np.nan
    found_literals = (unique_items_dict[item] for item in formula_items if item in unique_items_dict)
    # dict.fromkeys removes duplicates while keeping the order of first appearance, so the literals always reach the
    # prompt templates in the same order from one run to the next (a set would not guarantee that).
    unique_literals = list(dict.fromkeys(found_literals))
    return unique_literals if unique_literals else np.nan


items['List of Item Literals'] = items['Unique Formula Items'].apply(_formula_item_literals)

# Prepare the 'Formula Visit' column ready to be input to the prompt templates:
def _formula_visit_name(visit):
    # 'All / None' means no specific visit, so it becomes an empty string.
    if isinstance(visit, str) and visit == 'All / None':
        return ''
    # NaN and empty values are passed through unchanged.
    if pd.isna(visit) or visit == '':
        return visit
    # Otherwise keep only the text after the first ' - ' separator (the whole value when there is no separator).
    return visit.split(' - ', 1)[-1]


items['Formula Visit'] = items['Formula Visit'].apply(_formula_visit_name)

# Reload items as items2 to get the parent table codes for items within tables where the table itself is used in the dependency.
# This will allow dependency information to still show beside items that sit within tables in the guidelines. The path is a
# raw string so that the backslash separator is taken literally; the path value itself is unchanged:
items2 = pd.read_csv(r'EDC Configuration Reports\Items Report.csv', keep_default_na = False, na_values = [''])
items2['Extracted Form Code'] = items2['Form Code (Ascending)'].apply(lambda x: x.split(' - ')[0] if ' - ' in x else x)
items2['Form-Item Code Concatenated'] = items2['Extracted Form Code'] + items2['Item Code']
# Keep only the table containers that are neither hidden nor read-only, and list their concatenated form-item codes:
items4 = items2[(items2['Data Type'] == 'Table of items') & (items2['Hide Item'] == 'No') & (items2['Read only'] == 'No')]
tables_formitem_code_list = items4['Form-Item Code Concatenated'].tolist()

# Split the form code out of the 'Item : Form Code' column in the dependencies dataframe and join it to the item code in a new
# column called 'Concatenated Form-Item Code'. This will allow all form-items not present in items to be identified and removed.
# Lastly, filter only for dependency rows we are interested in:
dependencies['Specific Form Code'] = dependencies['Item : Form Code'].apply(lambda x: x.split(' - ')[0] if ' - ' in x else x)
dependencies['Concatenated Form-Item Code'] = dependencies['Specific Form Code'] + dependencies['Item : Item Code']
concatenated_formitem_codes_list = (items['Specific Form Code'] + items['Item Code']).tolist()
# Table containers are no longer rows in items, so their codes are appended to keep dependencies that target a table:
concatenated_formitem_codes_list_including_tables = concatenated_formitem_codes_list + tables_formitem_code_list
# Filter only for dependencies that are intra-form and visible or enabled, and that have their dependent items present in items:
dependencies = dependencies[(dependencies['Range (Ascending)'] == 'Intra') & (dependencies['Type'].isin(['Visible', 'Enabled'])) & (dependencies['Concatenated Form-Item Code'].isin(concatenated_formitem_codes_list_including_tables))]
# Remove the last element 'SYS_FORM_OCC' from the previously defined item_codes_list.
# NOTE(review): pop() changes the list in place, so running this cell a second time without rebuilding the list removes a
# real item code instead:
item_codes_list.pop()
# Filter only for dependencies where the syntax contains an item code from the item_codes_list (plain substring match):
dependencies = dependencies[dependencies['Syntax'].apply(lambda x: any(code in x for code in item_codes_list))]

# Define a function that extracts sub-strings that lie between all sets of enclosed square brackets in a string, excluding
# the presence (@) or absence (!) symbols, and outputs the result as a list. This function will be used to extract the item
# codes from the dependency syntax:
def extract_substring_between_brackets_excluding_presence_absence_symbols(input_string):
    """Return the unique item codes found between square brackets in a dependency syntax string.

    A single leading presence (@) or absence (!) symbol directly after the
    opening bracket is not part of the returned code. Duplicates are removed
    and the codes are returned in order of first appearance, so the first
    element is always the first item referenced in the syntax and the result
    is the same from one run to the next.
    """
    # The optional '!' / '@' sits outside the capture group, so findall returns only the code itself.
    pattern = r'\[[!@]?([^\]]*)\]'
    codes = re.findall(pattern, input_string)
    # dict.fromkeys de-duplicates while preserving insertion order (a set would not).
    return list(dict.fromkeys(codes))

# Add a new column to the dependencies dataframe called 'Dependency Syntax Items Concatenated' to store item codes extracted from the dependency
# syntax, each prefixed with the specific form code. The various form-page-item order values are then mapped from the items
# dataframe to the dependencies dataframe so the dependency numbering / footnoting system can follow a logical order:
dependencies['Dependency Syntax Items Concatenated'] = dependencies.apply(lambda row: [row['Specific Form Code'] + item for item in extract_substring_between_brackets_excluding_presence_absence_symbols(row['Syntax'])], axis = 1)
# The first element of each list is the item whose position decides where the dependency is ordered (NaN for an empty list):
dependencies['First Syntax Item Concatenated'] = dependencies['Dependency Syntax Items Concatenated'].apply(lambda x: x[0] if x else np.nan)
# Lookup table indexed by concatenated form-item code, holding the three ordering columns of items:
mapping_series = items.set_index(items['Specific Form Code'] + items['Item Code'])[['Specific Form Order', 'Page', 'Item Order']]
dependencies['Specific Form Order'] = dependencies['First Syntax Item Concatenated'].map(mapping_series['Specific Form Order'].get)
dependencies['Page'] = dependencies['First Syntax Item Concatenated'].map(mapping_series['Page'].get)
dependencies['Item Order'] = dependencies['First Syntax Item Concatenated'].map(mapping_series['Item Order'].get)
# Drop dependencies whose first syntax item could not be found in items, then store the ordering values as integers:
dependencies = dependencies.dropna(subset = ['Specific Form Order'])
dependencies[['Specific Form Order', 'Page', 'Item Order']] = dependencies[['Specific Form Order', 'Page', 'Item Order']].astype(int)

# Split the form name out of the 'Item : Form Code' column of dependencies and map to the corresponding parent form name using
# the previously created parent_subform_name_dict. Hierarchically sort the dependencies dataframe by 'Parent Form Name' ->
# 'Parent Form Code' -> 'Specific Form Order' -> 'Page' -> 'Item Order' to match the form sorting sequence in the items dataframe:
dependencies['Specific Form Name'] = dependencies['Item : Form Code'].apply(lambda x: x.split(' - ', 1)[1] if ' - ' in x else x)
dependencies['Parent Form Name'] = dependencies['Specific Form Name'].apply(lambda x: next((key for key, values in parent_subform_name_dict.items() if x in values), x))
# Specific form code -> parent form code, taken from the first items row of each specific form code:
specific_parent_code_dict = items.groupby('Specific Form Code')['Parent Form Code'].apply(lambda x: str(next(iter(x), ''))).to_dict()
dependencies['Parent Form Code'] = dependencies['Specific Form Code'].map(specific_parent_code_dict)
# mergesort is a stable sort, so rows that tie on all five keys keep their existing relative order:
dependencies = dependencies.sort_values(by = ['Parent Form Name', 'Parent Form Code', 'Specific Form Order', 'Page', 'Item Order'], kind = 'mergesort')

# Assign a sequential number from 1 to n within each 'Code' category (dependency name group) within each parent form group. The
# numbering resets to 1 again for each new parent form group, and rows that share the same 'Code' within a parent form share
# the same number. The number is then wrapped in parentheses, e.g. '(1)'. This numbering system will be used for the
# dependency footnote numbering system in the guidelines:
dependencies['Dependency Footnote'] = (dependencies.groupby('Parent Form Code')['Code'].transform(lambda x: pd.factorize(x)[0] + 1)).astype(str)
dependencies['Dependency Footnote'] = '(' + dependencies['Dependency Footnote'] + ')'

# Create dictionaries to map dependent items to their dependency footnotes, and independent items to their dependency footnotes
# and syntax:
# Dependent items: concatenated form-item code -> footnote.
# NOTE(review): to_dict() keeps only the last footnote when an item is the target of more than one dependency - confirm:
dependent_items_footnote_dict = dependencies.set_index('Concatenated Form-Item Code')['Dependency Footnote'].to_dict()
# Independent items: (specific form code, syntax, footnote) -> list of concatenated form-item codes used in that syntax:
independent_items_footnote_dict = dict(zip(zip(dependencies['Specific Form Code'], dependencies['Syntax'], dependencies['Dependency Footnote']), dependencies['Dependency Syntax Items Concatenated']))

# Create a new column in items called 'Dependent Item Footnote' which maps in the correct footnote from the dependent_items_footnote_dict
# dictionary. A second-pass mapping is then performed for all remaining NaN 'Dependent Item Footnote' values using specific form
# code concatenated with parent item code as the mapping key. This covers dependencies that reference table containers instead
# of individual items, to ensure items sitting within table containers are not missed:
items['Dependent Item Footnote'] = items.apply(lambda row: dependent_items_footnote_dict.get(row['Specific Form Code'] + row['Item Code'], np.nan), axis = 1)
# NaN parent item codes are replaced with empty strings so that the string concatenation in the second pass cannot fail:
items['Parent item code'] = items['Parent item code'].fillna('')
footnote_mask = items['Dependent Item Footnote'].isna()
items.loc[footnote_mask, 'Dependent Item Footnote'] = items[footnote_mask].apply(lambda row: dependent_items_footnote_dict.get(row['Specific Form Code'] + row['Parent item code'], np.nan), axis = 1)
# Append a trailing space to every footnote that was found; NaN values are left untouched:
items['Dependent Item Footnote'] = items['Dependent Item Footnote'].apply(lambda x: x + " " if pd.notna(x) else x)

# Add a new column called 'Response' that contains the sub-form section headings. For 'Sub-Form' rows this column is populated
# with the sub-form name (held in 'Literal') + the fixed string " Log - Add Occurrences or Complete the Pre-Filled Lines as
# Applicable:". All other rows are left as NaN here; this column will also store the first layer of LLM-generated entry
# statements for all of the other items:
items.loc[items['Data Type'] == 'Sub-Form', 'Response'] = items.loc[items['Data Type'] == 'Sub-Form', 'Literal'] + " Log - Add Occurrences or Complete the Pre-Filled Lines as Applicable:"

# Create new columns that pull in the independent item dependency syntaxes and independent item footnotes as lists for each item
# in the 'items' dataframe. An item receives the syntax (key[1]) and footnote (key[2]) of every dependency whose list of syntax
# items contains the item's concatenated form-item code. Both lists are built by iterating the same dictionary in the same
# order, so the nth syntax and the nth footnote belong to the same dependency. Empty lists are replaced with NaN:
items['Independent Item Dependency Syntax'] = items.apply(lambda row: [key[1] for key, value_list in independent_items_footnote_dict.items() if row['Specific Form Code'] + row['Item Code'] in value_list], axis = 1)
items.loc[items['Independent Item Dependency Syntax'].apply(len) == 0, 'Independent Item Dependency Syntax'] = np.nan
items['Independent Item Footnote'] = items.apply(lambda row: [key[2] for key, value_list in independent_items_footnote_dict.items() if row['Specific Form Code'] + row['Item Code'] in value_list], axis = 1)
items.loc[items['Independent Item Footnote'].apply(len) == 0, 'Independent Item Footnote'] = np.nan

# Create a new column that contains list 'value-literal' dictionaries for items / rows where there is a dependency syntax and
# list code present. Two variants of every list element are prepared to allow for an effective 'find and replace' procedure:
#   1. value and literal both wrapped in quotation marks, i.e. "value" -> "literal";
#   2. value prefixed with '=' and left unquoted, i.e. =value -> ="literal". This covers scenarios where the list values
#      are numerical and therefore appear unquoted in the syntax.
lists_updated_1 = lists.copy()
lists_updated_1['Element - Value'] = '"' + lists_updated_1['Element - Value'] + '"'
lists_updated_1['Element - Literal'] = '"' + lists_updated_1['Element - Literal'] + '"'
lists_updated_2 = lists.copy()
lists_updated_2['Element - Value'] = '=' + lists_updated_2['Element - Value']
lists_updated_2['Element - Literal'] = '="' + lists_updated_2['Element - Literal'] + '"'
# Stack both variants and build one {value: literal} dictionary per list code:
lists_updated_3 = pd.concat([lists_updated_1, lists_updated_2], ignore_index = True)
list_value_literal_dict = lists_updated_3.groupby('List Code (Ascending)')[['Element - Value', 'Element - Literal']].apply(lambda x: dict(zip(x['Element - Value'], x['Element - Literal']))).to_dict()
# Attach the dictionary only to rows that have both a dependency syntax and a list code:
lists_mask = items['Independent Item Dependency Syntax'].notna() & items['List Code'].notna()
items.loc[lists_mask, 'List Value-Literal Dictionary'] = items.loc[lists_mask, 'List Code'].map(list_value_literal_dict)

# Define a function that finds any substring keys from the 'List Value-Literal Dictionary' dictionaries and replaces them
# with the corresponding dictionary value string instead (i.e. the literal description). This effectively replaces the coded list
# values in the dependency syntax with the literal descriptions, allowing the syntax to be passed to the LLM via prompt template:
def replace_substrings(x, d):
    """Return a copy of the list of strings x with every occurrence of a key of d replaced by its value.

    x is a list of syntax strings and d maps coded list values to their literal
    descriptions. An empty dictionary leaves the strings unchanged.
    """
    if not d:
        return list(x)
    # Regex alternation takes the first alternative that matches, so the keys are tried longest first. Otherwise a key
    # such as '=1' would be replaced inside '=10' before the longer key is ever considered.
    keys_longest_first = sorted(d, key = len, reverse = True)
    pattern = re.compile('|'.join(map(re.escape, keys_longest_first)))
    # A function is used as the replacement so the literal text is inserted verbatim (no backslash / group expansion).
    return [pattern.sub(lambda m: d[m.group()], item) for item in x]
# Rows that have a value-literal dictionary: store the dependency syntax list with the coded list values replaced by their
# literal descriptions in a new column called 'Dependency Syntax with Literal':
mask1 = items['List Value-Literal Dictionary'].notna()
items.loc[mask1, 'Dependency Syntax with Literal'] = items.loc[mask1].apply(lambda row: replace_substrings(row['Independent Item Dependency Syntax'], row['List Value-Literal Dictionary']), axis = 1)
# Rows that have a dependency syntax list but no value-literal dictionary: copy the syntax list across unchanged:
mask2 = (items['Independent Item Dependency Syntax'].apply(type) == list) & (items['List Value-Literal Dictionary'].apply(type) != dict)
items.loc[mask2, 'Dependency Syntax with Literal'] = items.loc[mask2, 'Independent Item Dependency Syntax']

# Create 10 sets of 3 columns, each consisting of the nth extracted element of the 'Dependency Syntax with Literal' list, nth
# element of the 'Independent Item Footnote' list, and a newly initialised / blank response column ready for the LLM response to
# go into. This is done up to maximum of the 10th list element. The 10 responses will be laterally merged together later on.
def _nth_element_or_nan(values, position):
    """Return values[position] when values is a list that is long enough, otherwise NaN."""
    if isinstance(values, list) and len(values) > position:
        return values[position]
    return np.nan


# The columns are created in the order 'Syntax n', 'Footnote n', 'Response n' for n = 1 to 10. List positions are
# zero-based, hence the n - 1:
for n in range(1, 11):
    items[f'Syntax {n}'] = items['Dependency Syntax with Literal'].apply(_nth_element_or_nan, args = (n - 1,))
    items[f'Footnote {n}'] = items['Independent Item Footnote'].apply(_nth_element_or_nan, args = (n - 1,))
    items[f'Response {n}'] = ''

# Reset the index of the 'items' dataframe so that it counts sequentially from first to last row (the old index is discarded):
items = items.reset_index(drop = True)

# Save a copy of the 'items' dataframe as 'items_copy', as it may be needed later on if filtering is performed in the next cell.
# The copy keeps every form, whatever filtering is later applied to 'items':
items_copy = items.copy()

In [23]:
# Select the forms you want to generate guidelines text for by listing their parent form codes below. Only items whose
# 'Parent Form Code' is in the list are kept, and the index is reset. If you want to generate text for all forms in the
# database, skip running the code in this cell:
items = items[items['Parent Form Code'].isin(['DM', 'VS_HTWT'])].reset_index(drop = True)

# Prompt templates covering the various data type, list and unit scenarios:

In [24]:
# Scenario 1: Shorten response options in the 'List Elements' column that are > 90 characters to < 90 characters. The
# template takes a single input variable, 'list_elements'. Each backslash-continued line keeps its leading indentation inside
# the string, so the prompt text contains runs of spaces where the lines join.
# NOTE(review): the template refers to a '… etc.' ending, whereas the aggregated 'List Elements' strings end in ' | etc.]' -
# confirm the wording mismatch does not affect the responses:
template_1 = "Your task is to shorten a set of response options for a field in a clinical trial database to a total string \
            length of less than 90 characters. You'll be given the response options below, enclosed in square brackets. You \
            should shorten the length of any or all portions of text that lie between the first three | symbols, as necessary, \
            to get the overall string length of all response options combined to less than 90 characters (including the outer \
            square brackets). Any portions of text that you shorten must retain the same or very similar underlying meaning. \
            You must provide the response options back in the same structured format [X | Y | Z | … etc.] without changing the \
            '… etc.' part or removing the outer brackets or the | symbols. Your final response must be less than 90 characters \
            in total. Provide only the shortened response options in your response, nothing more.\n\n\
            \
            Response options: {list_elements}"
prompt_template_1 = PromptTemplate(input_variables = ['list_elements'], template = template_1)


# Scenario 2: Standard entry statement for 'Date Time', 'Text memo', 'Date', 'Time' and 'Search list' items.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_2 = (
    "Your task is to describe a field to be entered in a clinical trial database in the context of '{parent_form_name}'. "
    "You'll be given the field name below and you should respond with a one-sentence statement instructing the user to "
    "enter that field. For example, if the field name is 'Is the patient of child-bearing potential?', you would respond "
    "with something like 'Enter the child-bearing potential'. If additional context is available in relation to the field "
    "name, it will be shown below beside 'Additional context'. Provide only the one-sentence statement in your response, "
    "nothing more.\n\n"
    "Field name: '{literal}'\n\n"
    "Additional context: '{parent_table_literal}'"
)
prompt_template_2 = PromptTemplate(input_variables = ['parent_form_name', 'literal', 'parent_table_literal'], template = template_2)


# Scenario 3: Entry statement for list-based items 'Drop-down list' and 'List of values'.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_3 = (
    "Your task is to describe a field to be entered in a clinical trial database in the context of '{parent_form_name}'. "
    "You'll be given the field name and some example response options enclosed in square brackets. You should respond "
    "with a one-sentence statement instructing the user to enter that field. Place the example response options at the "
    "end of the sentence, exactly as shown below. For example, if the field name is 'Is the patient of child-bearing "
    "potential?' and the example response options are '[Yes | No | Unknown | … etc.]', you would respond with something "
    "like 'Enter the child-bearing potential [Yes | No | Unknown | … etc.]'. If additional context is available in "
    "relation to the field name, it will be shown below beside 'Additional context'. Provide only the one-sentence "
    "statement in your response, nothing more.\n\n"
    "Field name: '{literal}'\n\n"
    "Example response options: {list_elements}\n\n"
    "Additional context: '{parent_table_literal}'"
)
prompt_template_3 = PromptTemplate(input_variables = ['parent_form_name', 'literal', 'list_elements', 'parent_table_literal'], template = template_3)


# Scenario 4: Entry statement for potential unit-based items 'Integer', 'Text' and 'Real'.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_4 = (
    "Your task is to describe a field to be entered in a clinical trial database in the context of '{parent_form_name}'. "
    "You'll be given the field name and its corresponding unit of measure below. You should respond with a one-sentence "
    "statement instructing the user to enter that field in the units provided. For example, if the field name is 'What is "
    "the patient's systolic blood pressure?' and the unit of measure is 'mmHg', you would respond with something like "
    "'Enter the systolic blood pressure in mmHg'. If the unit of measure is not provided below, do not mention the unit "
    "in your response, simply instruct the user to enter the field (e.g. 'Enter the systolic blood pressure'). If "
    "additional context is available in relation to the field name, it will be shown below beside 'Additional context'. "
    "Provide only the one-sentence statement in your response, nothing more.\n\n"
    "Field name: '{literal}'\n\n"
    "Unit of measure: '{unit_of_measure}'\n\n"
    "Additional context: '{parent_table_literal}'"
)
prompt_template_4 = PromptTemplate(input_variables = ['parent_form_name', 'literal', 'unit_of_measure', 'parent_table_literal'], template = template_4)


# Scenario 5: Entry statement for individual 'Boolean' items.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_5 = (
    "Your task is to describe a checkbox option that can be ticked in a database. You'll be given the checkbox name "
    "below and you should respond with a one-sentence statement explaining that the checkbox can be ticked if it is "
    "applicable. For example, if the checkbox name is 'Other', you would respond with something like 'If applicable, the "
    "checkbox option 'Other' can be ticked'. If additional context is available in relation to the checkbox name, it will "
    "be shown below beside 'Additional context'. Provide only the one-sentence statement in your response, nothing more.\n\n"
    "Checkbox name: '{literal}'\n\n"
    "Additional context: '{parent_table_literal}'"
)
prompt_template_5 = PromptTemplate(input_variables = ['literal', 'parent_table_literal'], template = template_5)


# Scenario 6: Entry statement for 'Group of Booleans' items.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_6 = (
    "Your task is to describe a group of checkbox options that can be selected in a clinical trial database in the "
    "context of '{parent_form_name}'. You'll be given the checkbox group name below and you should respond with a "
    "one-sentence statement explaining that the user should select all options that apply. For example, if the checkbox "
    "group name is 'Which components did not perform?', you would respond with something like 'Select all components that "
    "did not perform' or 'Tick all options that apply for components that did not perform'. If additional context is "
    "available in relation to the checkbox group name, it will be shown below beside 'Additional context'. Provide only "
    "the one-sentence statement in your response, nothing more.\n\n"
    "Checkbox group name: '{literal}'\n\n"
    "Additional context: '{parent_table_literal}'"
)
prompt_template_6 = PromptTemplate(input_variables = ['parent_form_name', 'literal', 'parent_table_literal'], template = template_6)


# Scenario 7: Entry statement for 'File Upload' items.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_7 = (
    "Your task is to describe a file upload field in a clinical trial database that files should be loaded into. You'll "
    "be given the file upload field name below and you should respond with a one-sentence statement instructing the user "
    "to upload files into that field. For example, if the file upload field name is 'DICOM file upload for abdominal x-rays', "
    "you would respond with something like 'Upload the abdominal x-ray DICOM files into the file upload field'. If "
    "additional context is available in relation to the file upload field name, it will be shown below beside 'Additional "
    "context'. Provide only the one-sentence statement in your response, nothing more.\n\n"
    "File upload field name: '{literal}'\n\n"
    "Additional context: '{parent_table_literal}'"
)
prompt_template_7 = PromptTemplate(input_variables = ['literal', 'parent_table_literal'], template = template_7)


# Scenario 8: Entry statement for 'PDF Document' items.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_8 = (
    "Your task is to describe an embedded PDF document in a clinical trial database that should be read by the user. "
    "You'll be given the PDF document name below and you should respond with a one-sentence statement instructing the "
    "user to read the embedded PDF document. For example, if the PDF document name is 'XT001 Participant Information and "
    "Consent Form', you would respond with something like 'Read the embedded PDF document relating to XT001 Participant "
    "Information and Consent'. If additional context is available in relation to the PDF document name, it will be shown "
    "below beside 'Additional context'. Provide only the one-sentence statement in your response, nothing more.\n\n"
    "PDF document name: '{literal}'\n\n"
    "Additional context: '{parent_table_literal}'"
)
prompt_template_8 = PromptTemplate(input_variables = ['literal', 'parent_table_literal'], template = template_8)


# Scenario 9: Entry statement for 'Video' items.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_9 = (
    "Your task is to describe an embedded video in a clinical trial database that the user should watch. You'll be "
    "given the video name below and you should respond with a one-sentence statement instructing the user to watch the "
    "embedded video. For example, if the video name is 'XT001 Study Introduction and Training', you would respond with "
    "something like 'Watch the embedded video relating to XT001 Study Introduction and Training'. If additional context "
    "is available in relation to the video name, it will be shown below beside 'Additional context'. Provide only the "
    "one-sentence statement in your response, nothing more.\n\n"
    "Video name: '{literal}'\n\n"
    "Additional context: '{parent_table_literal}'"
)
prompt_template_9 = PromptTemplate(input_variables = ['literal', 'parent_table_literal'], template = template_9)


# Scenario 10: Entry statement for 'Signature' items.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_10 = (
    "Your task is to describe a signature field in a clinical trial database that the user should place their signature "
    "into. You'll be given the signature field name below and you should respond with a one-sentence statement instructing "
    "the user to place their signature into that field if they are authorised to do so. For example, if the signature field "
    "name is 'Consenting physician signature', you would respond with something like 'If authorised, sign in the consenting "
    "physician signature field'. If additional context is available in relation to the signature field name, it will be "
    "shown below beside 'Additional context'. Provide only the one-sentence statement in your response, nothing more.\n\n"
    "Signature field name: '{literal}'\n\n"
    "Additional context: '{parent_table_literal}'"
)
prompt_template_10 = PromptTemplate(input_variables = ['literal', 'parent_table_literal'], template = template_10)

# Prompt templates updating the entry statement for formula items:

In [25]:
# Scenario 11: Update the entry statement for formula items to reference automatic completion by the system based on fields
# identified in the formula syntax.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_11 = (
    "Your task is to change the wording of a sentence relating to the entry of a field in a clinical trial database. "
    "You'll be given the sentence below and you should transform it so that it instead informs the user that the field "
    "will be automatically completed by the system (rather than entered by the user) based on information stored in other "
    "fields of the database. The other fields controlling the automated completion by the system are listed below beside "
    "'Other fields'. If the automated completion by the system is only applicable to a certain visit, this will be shown "
    "below beside 'Visit'. If the visit is not provided below, do not mention anything about visit in your response. For "
    "example, if the sentence is 'Enter the patient's age in years at time of consent' and the other fields are ['Date of "
    "Birth', 'Date Patient Signed Main Study Consent'] and the visit is 'Screening', you would respond with something "
    "like 'The patient's age in years at time of consent will be automatically completed by the system at the Screening "
    "visit based on the date of birth and the date the patient signed the main study consent'. For the same example as "
    "above, but if visit is not provided, you would respond with something like 'The patient's age in years at time of "
    "consent will be automatically completed by the system based on the date of birth and the date the patient signed the "
    "main study consent'. Remember that you must include reference to the visit in your response if it is provided below. "
    "Provide only the single transformed sentence in your response, nothing more.\n\n"
    "Sentence: '{response}'\n\n"
    "Other fields: {list_of_item_literals}\n\n"
    "Visit: '{formula_visit}'"
)
prompt_template_11 = PromptTemplate(input_variables = ['response', 'list_of_item_literals', 'formula_visit'], template = template_11)


# Scenario 12: Update the entry statement for formula items to reference automatic completion by the system when fields are NOT
# identified in the formula syntax (i.e. generic statement).
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_12 = (
    "Your task is to change the wording of a sentence relating to the entry of a field in a clinical trial database. "
    "You'll be given the sentence below and you should transform it so that it instead informs the user that the field "
    "will be automatically completed by the system (rather than entered by the user). If the automated completion by the "
    "system is only applicable to a certain visit, this will be shown below beside 'Visit'. If the visit is not provided "
    "below, do not mention anything about visit in your response. For example, if the sentence is 'Enter the patient's "
    "age in years at time of consent' and the visit is 'Screening', you would respond with something like 'The patient's "
    "age in years at time of consent will be automatically completed by the system at the Screening visit'. For the same "
    "example as above, but if visit is not provided, you would respond with something like 'The patient's age in years "
    "at time of consent will be automatically completed by the system'. Remember that you must include reference to the "
    "visit in your response if it is provided below. Provide only the single transformed sentence in your response, "
    "nothing more.\n\n"
    "Sentence: '{response}'\n\n"
    "Visit: '{formula_visit}'"
)
prompt_template_12 = PromptTemplate(input_variables = ['response', 'formula_visit'], template = template_12)

# Prompt template for the dependency footnote statements:

In [26]:
# Scenario 13: Dependency footnote statement.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM. This matters most here: the worked examples quote
# dependency syntax verbatim, and a line break inside an example must not inject a run of spaces into that syntax
# (e.g. between 'Maybe' and its closing quote, or between '[LBORRES]' and '<=2.5'):
template_13 = (
    "Your task is to describe a field dependency in a clinical trial database. You'll be given the dependency syntax "
    "below which contains the conditional logic (rules) controlling the visibility of other fields in the database. You "
    "should respond with a one-sentence statement explaining how if a particular entry is made, the user should see items "
    "at '(x)'. The '(x)' is a footnote that is provided below beside 'Footnote'. You should focus your attention only on "
    "parts (expressions) in the dependency syntax that contain the 'Expression marker' provided below. The expression "
    "marker will always be found inside a closed set of square brackets in the dependency syntax. If there is an '@' or "
    "'!' symbol immediately preceding the expression marker in the dependency syntax, this signifies a boolean checkbox "
    "situation, where the '@' symbol represents a ticked state and the '!' symbol represents an unticked state. Here are "
    "some examples of how you would respond: If the dependency syntax is '\"[STDAT]\"==\"Not Done\" || \"[STDAT]\"==\"Maybe\" "
    "&& \"[PE_SUM]\"==\"No\" && [!PENA]', the footnote is '(1)', and the expression marker is 'STDAT', you would respond "
    "with something like 'If 'Not Done' or 'Maybe', see items at (1)'. If the dependency syntax is '[!SVNA] && [@PEOTH]', "
    "the footnote is '(2)', and the expression marker is 'SVNA', you would respond with something like 'If unticked, see "
    "items at (2)'. If the dependency syntax is '[!CEACN] && [@EGNA]', the footnote is '(4)', and the expression marker "
    "is 'EGNA', you would respond with something like 'If ticked, see items at (4)'. If the dependency syntax is "
    "'[LBORRES]<=2.5 || [LBORRES]>=7.1 && \"[LBPERF]\"==\"Yes\"', the footnote is '(1)', and the expression marker is 'LBORRES', you "
    "would respond with something like 'If less than or equal to 2.5 or greater than or equal to 7.1, see items at (1)'. "
    "If the dependency syntax is '\"[DEYN]\"!=\"Yes\" || \"[XDYN]\"==\"No\"', the footnote is '(3)', and the expression "
    "marker is 'DEYN', you would respond with something like 'If not equal to 'Yes', see items at (3)'. As per the examples "
    "provided, if the syntax expression you're looking at contains '==', do not use the words 'equal to' in your response, "
    "simply use the format 'If xxx, see items at (x)'. Provide only the one-sentence statement in your response, nothing "
    "more.\n\n"
    "Dependency syntax: '{syntax}'\n\n"
    "Footnote: '{footnote}'\n\n"
    "Expression marker: '{item_code}'"
)
# Wrap the Scenario 13 prompt text in a PromptTemplate exposing its three placeholders:
prompt_template_13 = PromptTemplate(
    template = template_13,
    input_variables = ['syntax', 'footnote', 'item_code'],
)

# Prompt template for the form-visit association statements:

In [27]:
# Scenario 14: Form-visit association statement.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_14 = (
    "Your task is to describe the association of visits (time points) to a form in a clinical trial database. You'll "
    "be given the form name and its associated visits below and you should respond with a one-sentence statement informing "
    "the user that the form appears in the EDC at those visits / time points. For example, if the form name is 'Demographics' "
    "and the associated visit is ['Screening'], you would respond with something like 'The Demographics form appears in the "
    "EDC at the Screening visit only'. If the form name is 'Mapping & Ablation Procedure' and the associated visits are "
    "['Day 1 - Index Procedure', 'Unscheduled Visit'], you would respond with something like 'The Mapping & Ablation "
    "Procedure form appears in the EDC at the Day 1 - Index Procedure and Unscheduled visits'. If any of the associated "
    "visit names contain the word 'visit' in them, omit this word when listing the visit names in your response, as you "
    "will mention the word 'visit' at the very end of the sentence anyway. Provide only the one-sentence statement in your "
    "response, nothing more.\n\n"
    "Form name: '{parent_form_name}'\n\n"
    "Associated visits: {form_visit_list}"
)
prompt_template_14 = PromptTemplate(input_variables = ['parent_form_name', 'form_visit_list'], template = template_14)

# Prompt template for the form visibility statements:

In [28]:
# Scenario 15: Form visibility statement.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_15 = (
    "Your task is to describe how the visibility of a form (it) is controlled by the completion of other forms in a "
    "clinical trial database. You'll be given the other form names below and you should respond with a one-sentence "
    "statement informing the user that its visibility is controlled by completion of those forms. For example, if the "
    "other form name is ['Informed Consent'], you would respond with something like 'Its visibility is controlled by "
    "completion of the Informed Consent form'. If the other form names are ['Study Visit', 'Eligibility Review Form'], "
    "you would respond with something like 'Its visibility is controlled by completion of the Study Visit and Eligibility "
    "Review forms'. If any of the other form names contain the word 'form' in them, omit this word when listing the form "
    "names in your response, as you will mention the word 'form' at the very end of the sentence anyway. Provide only the "
    "one-sentence statement in your response, nothing more.\n\n"
    "Other form names: {form_activation_list}"
)
prompt_template_15 = PromptTemplate(input_variables = ['form_activation_list'], template = template_15)

# Prompt template for the form summary description:

In [29]:
# Scenario 16: Form summary description.
# The prompt is built from adjacent string literals (rather than backslash continuations inside one literal) so that the
# code's indentation is not embedded in the text sent to the LLM:
template_16 = (
    "Your task is to succinctly summarise the purpose and importance of a form in a clinical trial database. You'll be "
    "given the form name and a list of fields that comprise the form below. Review the entire list of fields in the context "
    "of the form name to get an idea of what the form is about, what type of information it captures, and why it matters. "
    "You should respond with a concise three to four sentence paragraph (nothing more) broadly summarising the overall "
    "purpose of the form. Where possible, try to summarise the information in more general overarching terms, rather than "
    "listing off the exact verbatim details provided to you. Do not use the phrase 'clinical trial database' in your response. "
    "If referring to 'clinical trial', please always phrase it as 'the trial'. Provide only the concise three to four sentence "
    "paragraph in your response, nothing more.\n\n"
    "Form name: {parent_form_name}\n\n"
    "List of fields: {form_literals_list}"
)
prompt_template_16 = PromptTemplate(input_variables = ['parent_form_name', 'form_literals_list'], template = template_16)

In [30]:
# Shorten response options in the 'List Elements' column that are > 90 characters to < 90 characters. Only string values
# longer than 90 characters are sent to the LLM (prompt template 1); all other rows are left as they are.
# Rows are read from the frame as it stands now and the results are written to one fresh copy, so the frame being
# iterated is never modified and the dataframe is copied once rather than once per matching row:
rows_to_process = items
items = items.copy()
for index, row in rows_to_process.iterrows():
    list_elements = row['List Elements']
    if not (isinstance(list_elements, str) and len(list_elements) > 90):
        continue
    # Strip the completion before taking its last line, so trailing blank lines cannot yield an empty result:
    lines = llm(prompt_template_1.format(list_elements = list_elements)).strip().splitlines()
    if not lines:
        # The LLM returned nothing usable; keep the original (long) response options rather than blanking them out.
        continue
    response = lines[-1].strip().strip('"')
    items.at[index, 'List Elements'] = response
    print('\n' + response)

In [31]:
# Iterate through all items to retrieve the basic entry statement depending on the various data type / list / unit scenarios.
# The table below maps each group of data types to its prompt template (2-10) and to the 'items' columns that supply the
# template's variables, so that the correct request is made to the LLM for each item. Items whose data type is not listed
# are skipped:
entry_scenarios = [
    (['Date Time', 'Text memo', 'Date', 'Time', 'Search list'], prompt_template_2,
     {'parent_form_name': 'Parent Form Name', 'literal': 'Literal', 'parent_table_literal': 'Parent Table Literal'}),
    (['Drop-down list', 'List of values'], prompt_template_3,
     {'parent_form_name': 'Parent Form Name', 'literal': 'Literal', 'list_elements': 'List Elements', 'parent_table_literal': 'Parent Table Literal'}),
    (['Integer', 'Text', 'Real'], prompt_template_4,
     {'parent_form_name': 'Parent Form Name', 'literal': 'Literal', 'unit_of_measure': 'Unit of measure', 'parent_table_literal': 'Parent Table Literal'}),
    (['Boolean'], prompt_template_5,
     {'literal': 'Literal', 'parent_table_literal': 'Parent Table Literal'}),
    (['Group of Booleans'], prompt_template_6,
     {'parent_form_name': 'Parent Form Name', 'literal': 'Literal', 'parent_table_literal': 'Parent Table Literal'}),
    (['File'], prompt_template_7,
     {'literal': 'Literal', 'parent_table_literal': 'Parent Table Literal'}),
    (['PDF Document'], prompt_template_8,
     {'literal': 'Literal', 'parent_table_literal': 'Parent Table Literal'}),
    (['Video'], prompt_template_9,
     {'literal': 'Literal', 'parent_table_literal': 'Parent Table Literal'}),
    (['Signature'], prompt_template_10,
     {'literal': 'Literal', 'parent_table_literal': 'Parent Table Literal'}),
]

# Flatten the table into a direct 'Data Type' -> (template, variable-to-column mapping) lookup:
scenario_by_data_type = {
    data_type: (scenario_template, scenario_columns)
    for data_types, scenario_template, scenario_columns in entry_scenarios
    for data_type in data_types
}

# Rows are read from the frame as it stands now and the results are written to one fresh copy, so the frame being
# iterated is never modified and the dataframe is copied once rather than once per item:
rows_to_process = items
items = items.copy()
for index, row in rows_to_process.iterrows():
    scenario = scenario_by_data_type.get(row['Data Type'])
    if scenario is None:
        continue
    scenario_template, scenario_columns = scenario
    prompt = scenario_template.format(**{variable: row[column] for variable, column in scenario_columns.items()})
    # Strip the completion before taking its last line, so trailing blank lines cannot yield an empty result and an
    # empty completion gives an empty statement instead of an IndexError:
    lines = llm(prompt).strip().splitlines()
    response = lines[-1].strip().strip('"') if lines else ''
    items.at[index, 'Response'] = response
    print('\n' + response)


Enter the patient's date of birth.

Enter the age in years at the time of consent.

Enter the patient's sex [Male | Female].

Enter whether the patient is of child-bearing potential [Yes | No].

Enter the contraception method(s) used by this patient.

Enter the reason for non-child-bearing potential.

Enter the patient's ethnicity [Hispanic or Latino | Not Hispanic or Latino | Unknown | etc.].

Enter the patient's race [White | Aboriginal and Torres Strait Islander | Black or African American | etc.].

Enter the other race.

Enter whether height and weight measurements were performed for BMI calculation [Yes | No].

Enter the date of measurement.

Enter the time of measurement.

Enter the reason height and weight were not measured.

Enter the height in metres.

Enter the weight in kilograms.

Enter the BMI in kg/m2.


In [32]:
# Iterate through all items to update the entry statement for formula items. Prompt template 11 is used when the item has a
# list of item literals (the fields identified in the formula syntax); otherwise prompt template 12 (generic statement) is
# used when the item has a formula. Items matching neither case are skipped.
# Rows are read from the frame as it stands now and the results are written to one fresh copy, so the frame being
# iterated is never modified and the dataframe is copied once rather than once per formula item:
rows_to_process = items
items = items.copy()
for index, row in rows_to_process.iterrows():
    if isinstance(row['List of Item Literals'], list):
        prompt = prompt_template_11.format(response = row['Response'], list_of_item_literals = row['List of Item Literals'], formula_visit = row['Formula Visit'])
    elif pd.notna(row['Formula']):
        prompt = prompt_template_12.format(response = row['Response'], formula_visit = row['Formula Visit'])
    else:
        continue
    # Strip the completion before taking its last line, so trailing blank lines cannot yield an empty result and an
    # empty completion gives an empty statement instead of an IndexError:
    lines = llm(prompt).strip().splitlines()
    response = lines[-1].strip().strip('"') if lines else ''
    items.at[index, 'Response'] = response
    print('\n' + response)


The age in years at the time of consent will be automatically completed by the system based on the date the patient signed the main study consent and the date of birth.

The BMI in kg/m2 will be automatically completed by the system based on the height and weight.


In [33]:
# Iterate through all items that are present in one or more dependency syntaxes across all 10 available syntax-footnote columns
# and post the LLM response back to the corresponding response column. This covers all dependency footnote statements for each
# item, up to a maximum of 10 instances. These footnote statements link the independent items to their dependent items.
# Rows are read from the frame as it stands now and the results are written to one fresh copy, so the frame being
# iterated is never modified and the dataframe is copied once rather than once per footnote:
rows_to_process = items
items = items.copy()
for index, row in rows_to_process.iterrows():
    for slot in range(1, 11):
        syntax = row[f'Syntax {slot}']
        if not pd.notna(syntax):
            continue
        prompt = prompt_template_13.format(syntax = syntax, footnote = row[f'Footnote {slot}'], item_code = row['Item Code'])
        # Strip the completion before taking its last line, so trailing blank lines cannot yield an empty result and an
        # empty completion gives an empty statement instead of an IndexError:
        lines = llm(prompt).strip().splitlines()
        response = lines[-1].strip().strip('"') if lines else ''
        items.at[index, f'Response {slot}'] = response
        print('\n' + response)


If 'Female', see items at (1).

If 'Yes', see items at (2).

If 'No', see items at (3).

If 'Other', see items at (4).

If 'Yes', see items at (1).

If 'No', see items at (2).


In [34]:
# Build the merged guideline text for every item. Missing 'Dependent Item Footnote' values become empty strings and every
# non-empty 'Response 1' to 'Response 10' value gains a leading blank space, so the pieces are separated by a space once joined.
# The footnote, 'Response' and 'Response 1 to 10' columns are then joined left to right into 'Concatenated Responses':
items['Dependent Item Footnote'] = items['Dependent Item Footnote'].fillna('')
numbered_response_columns = ['Response ' + str(number) for number in range(1, 11)]
for response_column in numbered_response_columns:
    has_text = items[response_column].str.len() > 0
    items.loc[has_text, response_column] = ' ' + items.loc[has_text, response_column]
merged_text = items['Dependent Item Footnote'] + items['Response']
for response_column in numbered_response_columns:
    merged_text = merged_text + items[response_column]
items['Concatenated Responses'] = merged_text

# Insert page number headings for all 'Parent Form Code' groups (forms) that contain more than one page. This allows forms that
# contain more than one page to have page number headings placed above the items that sit on a given page in the guidelines.
# In order to achieve this, first group by 'Parent Form Code' and count the unique 'Page' categories within each group:
page_counts = items.groupby('Parent Form Code')['Page'].nunique()
# Then iterate over the 'Parent Form Code' groups with more than one unique 'Page' category:
for parent_form_code in page_counts[page_counts > 1].index:
    in_form = items['Parent Form Code'] == parent_form_code
    # Then iterate over the unique 'Page' categories within the 'Parent Form Code' group. Missing pages are skipped: they are
    # not counted by nunique() above and can never satisfy the equality test below:
    for page in items.loc[in_form, 'Page'].dropna().unique():
        # Find the first occurrence of each 'Page' category within the 'Parent Form Code' group. The mask is rebuilt on every
        # pass because 'items' grows by one row per inserted heading. The POSITION (not the index label) of the first match
        # is taken, so the 'iloc' slices below are correct whatever index 'items' currently carries:
        on_page = (items['Parent Form Code'] == parent_form_code) & (items['Page'] == page)
        first_position = int(np.flatnonzero(on_page.to_numpy())[0])
        # Insert a heading row (only 'Parent Form Code' and 'Page' populated) at the top of each 'Page' category change
        # within the 'Parent Form Code' group, and renumber the index from zero:
        heading_row = pd.DataFrame({'Parent Form Code': [parent_form_code], 'Page': [page]})
        items = pd.concat([items.iloc[:first_position], heading_row, items.iloc[first_position:]], ignore_index = True)

# Add a new column to 'items' called 'Section Indicator' to identify the various headings and section types in the guidelines.
# Rows with no 'Parent Form Name' are tagged 'Page Heading', take their form name from the row immediately beneath and receive
# the text 'Page <n>:' in the 'Concatenated Responses' column. Rows whose 'Data Type' is 'Sub-Form' are tagged 'Sub-Form
# Heading'. This column will eventually contain the indicators for other headings and sections as well:
items['Section Indicator'] = ''
is_page_heading = items['Parent Form Name'].isna()
items.loc[is_page_heading, 'Section Indicator'] = 'Page Heading'
items['Parent Form Name'] = items['Parent Form Name'].fillna(items['Parent Form Name'].shift(-1))
page_numbers = items.loc[is_page_heading, 'Page'].astype(str)
items.loc[is_page_heading, 'Concatenated Responses'] = 'Page ' + page_numbers + ':'
is_sub_form = items['Data Type'] == 'Sub-Form'
items.loc[is_sub_form, 'Section Indicator'] = 'Sub-Form Heading'

# Insert a blank row at the top of each form group ('Parent Form Code' category) ready for the form-visit association
# descriptions to go into. The blank rows are recognised by their missing 'Parent Form Name', tagged with the section indicator
# 'Visit Association Description', and then given the 'Parent Form Code' and 'Parent Form Name' of the row immediately beneath:
code_change_mask = items['Parent Form Code'].ne(items['Parent Form Code'].shift())
code_change_indices = code_change_mask[code_change_mask].index.tolist()
blank_row = pd.DataFrame(index = [0], columns = items.columns)
for rows_already_inserted, group_start in enumerate(code_change_indices):
    # Every earlier insertion has pushed the remaining group starts down by one row:
    insert_at = group_start + rows_already_inserted
    items = pd.concat([items.iloc[:insert_at], blank_row, items.iloc[insert_at:]], ignore_index = True)
is_inserted_row = items['Parent Form Name'].isna()
items.loc[is_inserted_row, 'Section Indicator'] = 'Visit Association Description'
for form_column in ['Parent Form Code', 'Parent Form Name']:
    items[form_column] = items[form_column].fillna(items[form_column].shift(-1))

# Forward fill the visit name column in the 'forms_visits' dataframe, build a dictionary that maps each form code to the list
# of visit names recorded against it, and map those lists into a 'Form-Visit List' column of 'items' for the 'Visit Association
# Description' rows only:
forms_visits['Visit name'] = forms_visits['Visit name'].ffill()
forms_visits_dict = {form_code: list(visit_names) for form_code, visit_names in forms_visits.groupby('Form code')['Visit name']}
is_visit_description = items['Section Indicator'] == 'Visit Association Description'
items.loc[is_visit_description, 'Form-Visit List'] = items.loc[is_visit_description, 'Parent Form Code'].map(forms_visits_dict)

In [35]:
# Iterate through all rows where 'Section Indicator' is equal to 'Visit Association Description' and post the LLM's form-visit
# association descriptions into the 'Concatenated Responses' column. Only the last non-blank line of each completion is kept,
# with surrounding whitespace and double quotes removed.
# 'items' is copied once up front instead of once per generated response:
items = items.copy()
visit_rows = items.loc[items['Section Indicator'] == 'Visit Association Description', ['Parent Form Name', 'Form-Visit List']]
for index, row in visit_rows.iterrows():
    # Strip the completion first so that trailing blank lines cannot hide the real answer:
    completion = llm(prompt_template_14.format(parent_form_name = row['Parent Form Name'], form_visit_list = row['Form-Visit List'])).strip()
    # An empty completion gives an empty response rather than an IndexError from '[-1]':
    response = completion.splitlines()[-1].strip().strip('"') if completion else ''
    items.at[index, 'Concatenated Responses'] = response
    print('\n' + response)


The Demographics form appears in the EDC at the Screening visit only.

The Height, Weight & BMI form appears in the EDC at the Screening and Unscheduled visits.


In [36]:
# Prepare to pull in the form activation information from the 'form_activations' dataframe.
# Firstly, extract the form code parts of the form activation syntax (as lists) into a new column called 'Syntax Form Codes'.
# NOTE(review): 'extract_third_to_last_substring' is defined elsewhere in the notebook; it is assumed to return a list per row:
form_activations['Syntax Form Codes'] = form_activations['Syntax'].apply(extract_third_to_last_substring)

# Create a form code - form name dictionary from the original 'items_copy' dataframe to map form codes to their full names and
# then create a new 'Syntax Form Names' column which stores the full form name lists. Codes with no entry in the dictionary
# are kept as they are:
form_code_name_dict = items_copy.set_index('Parent Form Code')['Parent Form Name'].to_dict()
form_activations['Syntax Form Names'] = form_activations['Syntax Form Codes'].apply(lambda x: [form_code_name_dict.get(item, item) for item in x])

# Group the unique form code categories and concatenate all of the 'Syntax Form Names' lists associated with each category into
# a single list. This creates a new dataframe that is used to create 'form_activations_dict'. Duplicates are removed with
# dict.fromkeys(), which keeps the first-seen order; a set would order the names differently from one Python session to the
# next and so change the prompt sent to the LLM between otherwise identical runs:
grouped_form_codes = form_activations.groupby('Form Code')['Syntax Form Names'].agg('sum').reset_index()
form_activations_dict = {key: list(dict.fromkeys(value)) for key, value in zip(grouped_form_codes['Form Code'], grouped_form_codes['Syntax Form Names'])}

# Map in the form name lists that control form visibility into a new 'Form-Activation List' column of 'items', only for rows
# where 'Section Indicator' equals 'Visit Association Description'. These lists will be passed to the LLM to generate the form
# visibility statements:
is_visit_description = items['Section Indicator'] == 'Visit Association Description'
items.loc[is_visit_description, 'Form-Activation List'] = items.loc[is_visit_description, 'Parent Form Code'].map(form_activations_dict)

# Initialise an empty column in 'items' called 'Form Visibility Response' ready for the LLM form visibility response to go into:
items['Form Visibility Response'] = ''

In [37]:
# Iterate through all rows where the 'Form-Activation List' column contains a list and post the LLM's form visibility response
# into the 'Form Visibility Response' column. Only the last non-blank line of each completion is kept, with surrounding
# whitespace and double quotes removed.
# 'items' is copied once up front instead of once per generated response:
items = items.copy()
has_activation_list = items['Form-Activation List'].apply(lambda value: isinstance(value, list)).astype(bool)
for index, form_activation_list in items.loc[has_activation_list, 'Form-Activation List'].items():
    # Strip the completion first so that trailing blank lines cannot hide the real answer:
    completion = llm(prompt_template_15.format(form_activation_list = form_activation_list)).strip()
    # An empty completion gives an empty response rather than an IndexError from '[-1]':
    response = completion.splitlines()[-1].strip().strip('"') if completion else ''
    items.at[index, 'Form Visibility Response'] = response
    print('\n' + response)


Its visibility is controlled by completion of the Study Visit form.

Its visibility is controlled by completion of the Study Visit and Eligibility Review forms.


In [38]:
# Prefix every non-empty 'Form Visibility Response' string with a blank space, then join 'Concatenated Responses' and
# 'Form Visibility Response' (in that order) into a new column called 'Final Concatenated Responses':
has_visibility_text = items['Form Visibility Response'].str.len() > 0
padded_visibility_text = ' ' + items.loc[has_visibility_text, 'Form Visibility Response']
items.loc[has_visibility_text, 'Form Visibility Response'] = padded_visibility_text
items['Final Concatenated Responses'] = items['Concatenated Responses'] + items['Form Visibility Response']

# Insert a blank row at the top of each form group ('Parent Form Code' category) ready for the form summary to go into. The
# blank rows are recognised by their missing 'Parent Form Name', tagged with the section indicator 'Form Summary', and then
# given the 'Parent Form Code' and 'Parent Form Name' of the row immediately beneath:
code_change_mask = items['Parent Form Code'].ne(items['Parent Form Code'].shift())
code_change_indices = code_change_mask[code_change_mask].index.tolist()
blank_row = pd.DataFrame(index = [0], columns = items.columns)
for rows_already_inserted, group_start in enumerate(code_change_indices):
    # Every earlier insertion has pushed the remaining group starts down by one row:
    insert_at = group_start + rows_already_inserted
    items = pd.concat([items.iloc[:insert_at], blank_row, items.iloc[insert_at:]], ignore_index = True)
is_inserted_row = items['Parent Form Name'].isna()
items.loc[is_inserted_row, 'Section Indicator'] = 'Form Summary'
for form_column in ['Parent Form Code', 'Parent Form Name']:
    items[form_column] = items[form_column].fillna(items[form_column].shift(-1))

# Collect, per parent form code, the list of non-missing 'Literal' strings (i.e. field descriptions) found in 'items'. Lists
# longer than 50 entries are cut down to a random sample of 50; shorter lists are kept whole. The result is turned into
# 'formcode_literals_dict', keyed by parent form code:
literals_by_form = items.dropna(subset = ['Literal']).groupby('Parent Form Code')['Literal']
grouped_formcode_literals = literals_by_form.agg(list).reset_index()
grouped_formcode_literals['Literal'] = grouped_formcode_literals.apply(lambda form_row: random.sample(form_row['Literal'], 50) if len(form_row['Literal']) > 50 else form_row['Literal'], axis = 1)
formcode_literals_dict = dict(zip(grouped_formcode_literals['Parent Form Code'], grouped_formcode_literals['Literal']))

# Use the 'formcode_literals_dict' to map the literal field description lists into a new column of 'items' called 'Form Literals
# List'. Map in the lists only for rows where 'Section Indicator' equals 'Form Summary':
is_form_summary = items['Section Indicator'] == 'Form Summary'
items.loc[is_form_summary, 'Form Literals List'] = items.loc[is_form_summary, 'Parent Form Code'].map(formcode_literals_dict)

In [39]:
# Iterate through all rows where 'Section Indicator' is equal to 'Form Summary' and post the LLM's form summary responses into
# the 'Final Concatenated Responses' column. Only the last non-blank line of each completion is kept, with surrounding
# whitespace and double quotes removed.
# 'items' is copied once up front instead of once per generated response:
items = items.copy()
summary_rows = items.loc[items['Section Indicator'] == 'Form Summary', ['Parent Form Name', 'Form Literals List']]
for index, row in summary_rows.iterrows():
    # Strip the completion first so that trailing blank lines cannot hide the real answer:
    completion = llm(prompt_template_16.format(parent_form_name = row['Parent Form Name'], form_literals_list = row['Form Literals List'])).strip()
    # An empty completion gives an empty response rather than an IndexError from '[-1]':
    response = completion.splitlines()[-1].strip().strip('"') if completion else ''
    items.at[index, 'Final Concatenated Responses'] = response
    print('\n' + response)


The Demographics form captures important information about the patient's age, sex, reproductive status, ethnicity, and race. This information is essential for understanding the patient's background and potential risk factors that may affect the trial results. It also helps to ensure that the trial is conducted in an ethical and equitable manner.

The Height, Weight & BMI form captures important information about a participant's physical measurements, such as their height, weight and body mass index (BMI). This data is essential for understanding the participant's health and progress throughout the trial. The form also records any reasons why measurements were not taken, as well as the date and time of the measurements.


In [40]:
# Insert a blank row at the top of each form group ('Parent Form Code' category) ready for the form heading to go into. The
# blank rows are recognised by their missing 'Parent Form Name', tagged with the section indicator 'Form Heading', and then
# given the 'Parent Form Code' and 'Parent Form Name' of the row immediately beneath. Lastly, the 'Parent Form Name' value
# followed by a colon (:) is placed in the 'Final Concatenated Responses' column for all 'Form Heading' rows:
code_change_mask = items['Parent Form Code'].ne(items['Parent Form Code'].shift())
code_change_indices = code_change_mask[code_change_mask].index.tolist()
blank_row = pd.DataFrame(index = [0], columns = items.columns)
for rows_already_inserted, group_start in enumerate(code_change_indices):
    # Every earlier insertion has pushed the remaining group starts down by one row:
    insert_at = group_start + rows_already_inserted
    items = pd.concat([items.iloc[:insert_at], blank_row, items.iloc[insert_at:]], ignore_index = True)
is_inserted_row = items['Parent Form Name'].isna()
items.loc[is_inserted_row, 'Section Indicator'] = 'Form Heading'
for form_column in ['Parent Form Code', 'Parent Form Name']:
    items[form_column] = items[form_column].fillna(items[form_column].shift(-1))
is_form_heading = items['Section Indicator'] == 'Form Heading'
items.loc[is_form_heading, 'Final Concatenated Responses'] = items.loc[is_form_heading, 'Parent Form Name'] + ':'

# Label every row whose 'Section Indicator' is still an empty string: 'Dependent Item' when its 'Dependent Item Footnote' is
# anything other than an empty string, otherwise 'Independent Item':
is_unlabelled = items['Section Indicator'] == ''
has_footnote = items['Dependent Item Footnote'] != ''
items.loc[is_unlabelled & has_footnote, 'Section Indicator'] = 'Dependent Item'
items.loc[is_unlabelled & ~has_footnote, 'Section Indicator'] = 'Independent Item'

In [41]:
# Compile the 'Final Concatenated Responses' column into a formatted Word document and export - Now you have your AI-generated
# eCRF Completion Guidelines Text!
guidelines = Document()

# Document-wide defaults: Calibri, justified, single line spacing, 0 pt before and 3 pt after each paragraph:
normal_style = guidelines.styles['Normal']
normal_style.font.name = 'Calibri'
normal_style.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
normal_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.SINGLE
normal_style.paragraph_format.space_before = Pt(0)
normal_style.paragraph_format.space_after = Pt(3)

# 0.9 inch margins on every side of the first section:
first_section = guidelines.sections[0]
first_section.left_margin = Inches(0.9)
first_section.right_margin = Inches(0.9)
first_section.top_margin = Inches(0.9)
first_section.bottom_margin = Inches(0.9)

# Add one paragraph per row of 'items' and format it according to the row's 'Section Indicator'. The two columns are walked in
# lockstep so that the formatting does not depend on the index labels of 'items'. Run-level formatting is applied to the first
# run only when one exists, because an empty string produces a paragraph with no runs:
for section_indicator, string in zip(items['Section Indicator'], items['Final Concatenated Responses']):
    paragraph = guidelines.add_paragraph(string)

    if section_indicator == 'Form Heading':
        paragraph.style = 'Heading 1'
        paragraph.paragraph_format.space_before = Pt(20)
        paragraph.paragraph_format.space_after = Pt(12)
        for run in paragraph.runs[:1]:
            run.underline = WD_UNDERLINE.SINGLE

    elif section_indicator in ('Form Summary', 'Visit Association Description'):
        paragraph.paragraph_format.space_after = Pt(13)

    elif section_indicator == 'Page Heading':
        paragraph.paragraph_format.space_before = Pt(13)
        for run in paragraph.runs[:1]:
            run.underline = WD_UNDERLINE.SINGLE
            run.italic = True

    elif section_indicator == 'Sub-Form Heading':
        for run in paragraph.runs[:1]:
            run.underline = WD_UNDERLINE.SINGLE
            run.italic = True

    elif section_indicator == 'Dependent Item':
        paragraph.paragraph_format.left_indent = Pt(20)

# Remove the space above the very first paragraph of the document:
if guidelines.paragraphs:
    first_heading = guidelines.paragraphs[0]
    first_heading.paragraph_format.space_before = Pt(0)

# Raw string: '\e' is not a valid escape sequence, so the plain literal only worked by accident (with a warning on newer
# Python versions). The resulting path is unchanged:
guidelines.save(r'Output\eCRF Completion Guidelines Text.docx')

In [30]:
# Run the code below to export 'items' to Excel if needed:
# items.to_excel('Output\Items.xlsx', sheet_name = 'Items', startrow = 0, startcol = 0, index = False, na_rep = '', header = True)