In [37]:
## Load essential packages ##

%reset -f

import os
import openai
import pandas as pd

In [38]:
## Load the OpenAI API key from the OPENAI_API_KEY environment variable.
## os.getenv returns None when the variable is unset, so api_key may be None after this cell.

openai.api_key = os.getenv("OPENAI_API_KEY")

In [39]:
## Define the System Role for the main LLM agent and the new category creator

NewCategories = ''   # text listing the categories created at run time; spliced into both prompts
NUMBEROFCATS = 30    # running category counter; the prompts below list categories [1]..[29]

def ConstructSystemRole(NewCategories,NUMBEROFCATS):
    """Build the system prompt for the agent that categorizes survey responses.

    Parameters
    ----------
    NewCategories : str
        Text listing any categories created at run time. It replaces the
        ``NEWCATEGORIES`` placeholder that follows category [29]; pass ``''``
        when there are none.
    NUMBEROFCATS : int
        Not used when building the prompt; kept so existing callers that pass
        it keep working.

    Returns
    -------
    str
        The complete system prompt with the placeholder substituted.
    """

    SystemRole = """You are an assistant helping to categorize responses from caregivers that have decided to not vaccinate their children.  
The survey that was given to these caregivers contained pre-defined categories for why they have decided to not vaccinate their children.  
The following are the categories: 
[1] Vaccination site too far 
[2] Vaccination schedule not known 
[3] mother too busy 
[4] Family problems, including maternal illness 
[5] Sick child, not sent
[6] Sick child, sent but not vaccinated
[7] Long wait
[8] Rumors
[9] Don't believe in vaccination
[10] Fear of side effects
[11] Site and/or time of vaccination not known
[12] Ignore the need for vaccination  / unaware / uninformed
[13] Ignores need to return for 2nd or 3rd dose
[14] Bad ideas about contraindications
[15] Inappropriate timing of vaccination
[16] Absent vaccinator
[17] Vaccine not available
[18] Vaccination session canceled
[19] High cost of vaccination or SMC session
[20] Sick child, brought in but did not receive vaccination
[21] Religious belief
[22] Negative attitude of the spouse, father or guardian of the child towards vaccination
[23] Provider on strike
[24] COVID-19 lockdown
[25] Travel or displacement
[26] War, armed conflict, ethnic conflict
[27] Fear of COVID vaccine
[28] Don't know the reason
[29] Vaccine not mentioned on card
NEWCATEGORIES

Your role as the assistant is to assess the unstructured responses from the survey of parents.
Categorize the response into one of the categories, provide the category number in the format Category, []
If there are multiple reasons in the response and they fit multiple categories, provide the numbers in a list Category, [],[] 
In the following line of the output provide the reasoning for the decision including if a new category needs to be created in the format Reason, []
Only output the Category and Reason.  
    
Here are two examples:
    
user:  I could not find the time.
assistant: Category, 3 
Reason, The mother was not able to find the time and was therefore too busy.

user:  Mother is traveling.
assistant: Category, 25
Reason, The mother is traveling therefore category 25 since it indicates travel or displacement."""

    # NOTE(review): the categorization loop in this notebook looks for the literal
    # answer `Category, NEW`, but nothing in this prompt tells the model to emit
    # NEW -- confirm whether an explicit instruction should be added.

    # Substitute the placeholder in the prompt built above. The placeholder must be
    # replaced on SystemRole itself; no other prompt variable exists in this scope.
    SystemRole = SystemRole.replace('NEWCATEGORIES',NewCategories)

    return SystemRole

def ConstructNewCategoryDescription(NewCategories):
    """Build the system prompt for the agent that names a new category.

    Parameters
    ----------
    NewCategories : str
        Text listing the categories already created at run time. It replaces
        the ``NEWCATEGORIES`` placeholder that follows category [29]; pass
        ``''`` when there are none.

    Returns
    -------
    str
        The complete prompt with the placeholder substituted.
    """
    # NOTE(review): entries [12] and [21] below are worded differently from the
    # list in ConstructSystemRole ("... / unaware / uninformed", "Religious
    # belief") -- confirm which wording is intended and align the two lists.
    role_text = ''' Your role is to help create a concise category description that is outside of the following categories:
[1] Vaccination site too far 
[2] Vaccination schedule not known 
[3] mother too busy 
[4] Family problems, including maternal illness 
[5] Sick child, not sent
[6] Sick child, sent but not vaccinated
[7] Long wait
[8] Rumors
[9] Don't believe in vaccination
[10] Fear of side effects
[11] Site and/or time of vaccination not known
[12] Ignore the need for vaccination
[13] Ignores need to return for 2nd or 3rd dose
[14] Bad ideas about contraindications
[15] Inappropriate timing of vaccination
[16] Absent vaccinator
[17] Vaccine not available
[18] Vaccination session canceled
[19] High cost of vaccination or SMC session
[20] Sick child, brought in but did not receive vaccination
[21] Religious censorship
[22] Negative attitude of the spouse, father or guardian of the child towards vaccination
[23] Provider on strike
[24] COVID-19 lockdown
[25] Travel or displacement
[26] War, armed conflict, ethnic conflict
[27] Fear of COVID vaccine
[28] Don't know the reason
[29] Vaccine not mentioned on card
NEWCATEGORIES

You will be given the actual response and the assistant reasoning.  
Provide a descriptive title for the new category that is descriptive of the reason.  
Only give the title in the output.  '''

    return role_text.replace('NEWCATEGORIES',NewCategories)


# Other cells of this notebook call this builder as ConstructSummarizeRole;
# expose that name too so those calls resolve to the same function.
ConstructSummarizeRole = ConstructNewCategoryDescription

In [40]:
# Different models available via openai:  gpt-3.5-turbo, gpt-4, gpt-4-0314

# Build both system prompts once, using the current (initially empty) list of
# run-time categories.
SystemRole = ConstructSystemRole(NewCategories,NUMBEROFCATS)
# ConstructNewCategoryDescription builds the prompt for the category-naming agent.
SummarizeRole = ConstructNewCategoryDescription(NewCategories)

def request_completion(prompt,SystemRole):
    """Send one chat request and return the text of the first reply.

    Parameters
    ----------
    prompt : str
        Content of the user message.
    SystemRole : str
        Content of the system message that precedes the user message.

    Returns
    -------
    The ``content`` field of the first choice in the API response.
    """
    # Conversation sent to the model: the system instructions, then the user text.
    chat_messages = [
        {"role": "system", "content": SystemRole},
        {"role": "user", "content": prompt},
    ]

    # Sampling settings, passed through unchanged on every request.
    sampling_options = dict(
        temperature=1,
        max_tokens=2000,
        top_p=0.5,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )

    completion = openai.ChatCompletion.create(
        model='gpt-4-0125-preview',
        messages=chat_messages,
        **sampling_options,
    )

    first_choice = completion['choices'][0]
    return first_choice['message']['content']

In [None]:
# Import Response Data 

import csv
from io import StringIO

# Load the "Clean" CSV into a DataFrame, decoding it as ISO-8859-1.
RealResponses = pd.read_csv('ecv_reasons_novx 1Clean.csv',encoding='ISO-8859-1') 

# Read the other CSV as text; errors='ignore' silently drops any bytes that are
# not valid UTF-8.
# NOTE(review): the file above is decoded as ISO-8859-1 -- confirm this one really
# is UTF-8, otherwise accented characters are lost here.
with open('ecv_reasons_novx 1.csv', 'r', encoding='utf-8', errors='ignore') as file:
    content = file.read()

# Parse the in-memory text into one dict per CSV row, keyed by the header names.
content_io = StringIO(content)
reader = csv.DictReader(content_io)
rows = [row for row in reader]

# Spot-check a single parsed row.
print(rows[6]['reason'])

In [46]:
#Create a function that helps parse and store the output of the model in a Dictionary

# Keys used to pre-fill the record template before the categorization loop runs
keys_of_interest = ["Line Number","Category","NewCategory","Reason"]

def FillDictionary(forms_list, data_string):
    """Parse "key, value" lines into a dict, append it to forms_list and return it.

    Each line of data_string is cut at its first comma: the text before it
    becomes the key and the text after it the value, both stripped of
    surrounding whitespace. A line without a comma is reported with a printed
    warning and otherwise ignored. The dict appended to forms_list is the same
    object that is returned.
    """
    parsed = {}

    for raw_line in data_string.split('\n'):
        key_text, comma, value_text = raw_line.partition(',')
        if not comma:
            # No separator on this line: warn and move on.
            print(f"Warning: Line does not contain expected format: '{raw_line}'")
            continue
        parsed[key_text.strip()] = value_text.strip()

    forms_list.append(parsed)
    return parsed

In [None]:
# Run each response through the model for categorization, when a new category is flagged, create a  new category.

import ast

# Template record: every key of interest mapped to None. It is replaced by the
# parsed model answer on each iteration, so it only survives if the loop never runs.
data_dict = {key: None for key in keys_of_interest}

forms_list = []  # one parsed dictionary per processed response


# Prototype on the first 400 responses, but never index past the end of the data.
MAX_RESPONSES = 400

for index in range(min(MAX_RESPONSES, len(rows))):

    response = str(rows[index]['reason'])
    assistantResponse = request_completion(response,SystemRole)

    # Prepend the row index so it is parsed into the record like any other
    # "key, value" line of the model output.
    assistantResponse = 'Line Number, '+str(index)+' \n' + assistantResponse
    data_dict = FillDictionary(forms_list,assistantResponse)

    try:
        category_value = data_dict['Category']

        # This is the adaptive topic identification part of the workflow.  If NEW, describe a new category and update the role descriptions

        if category_value == 'NEW':
            # Use one number for both the stored record and the prompt label;
            # the counter is advanced only after both have been written.
            new_cat_number = NUMBEROFCATS

            # The system message must be the prompt *text*, built with the
            # categories known so far (not the builder function itself).
            newCatDescription = request_completion(
                'User: '+response+'\n Assistant: '+data_dict['Reason'],
                ConstructNewCategoryDescription(NewCategories))

            # data_dict is the same object FillDictionary appended to forms_list,
            # so this update is visible in the exported table.
            data_dict['Category'] = new_cat_number
            NewCategories = NewCategories + '\n' + '['+str(new_cat_number)+'] '+ newCatDescription
            NUMBEROFCATS += 1

            # Rebuild both prompts so later responses can use the new category.
            SystemRole = ConstructSystemRole(NewCategories,NUMBEROFCATS)
            SummarizeRole = ConstructNewCategoryDescription(NewCategories)
            print('here')
            print(newCatDescription)

        print(str(index))
        print(str(category_value))

    except KeyError as missing_key:
        # The model answer had no "Category, ..." (or "Reason, ...") line; keep
        # going so one malformed answer does not abort the whole run.
        print(f"Response {index}: model output has no {missing_key} line.")
    except ValueError:
        # Handle cases where conversion to int or evaluation fails
        print("Category value is not a single number or a list of numbers.")
    except SyntaxError:
        # Handle cases where the string cannot be evaluated
        print("Category value format is incorrect.")

# Convert the list of per-response dictionaries to a pandas DataFrame
df = pd.DataFrame(forms_list)

# Name of the Excel file that receives the categorized responses
excel_filename = 'TestRunRoyCategoriesTemp1.xlsx'

# Export the DataFrame to an Excel file
df.to_excel(excel_filename, index=False, engine='openpyxl')
print(f'Data saved to {excel_filename}')

In [None]:
# This is to do some matching with Roy's benchmarks.  

import pandas as pd
total_matches = 0
skipped_rows = 0  # rows where either category is not a single integer

# One record per row where the model and the benchmark disagree
data = []

# Row i of df was produced from rows[i], so compare position by position, and
# only over positions that exist in both (at most the 400 prototype rows).
rows_to_compare = min(400, len(df), len(rows))

for index in range(rows_to_compare):

    try:
        # Both values must convert to int to be comparable
        category_value = int(df.iloc[index]['Category'])
        current_value = int(rows[index]['roy_categorized'])
    except (ValueError, TypeError):
        # Non-integer data (e.g. several categories, missing value) or a type
        # that cannot be converted (like NoneType): count it and move on.
        skipped_rows += 1
        continue

    if current_value == category_value:
        total_matches += 1
    else:
        data.append({'GPT4 Category': category_value, 'Roy Category': current_value,'Translation':rows[index]['translation'],'GPT4 Reason':df.iloc[index]['Reason']})

newdf = pd.DataFrame(data)
# Name of the Excel file that receives the disagreements
excel_filename = 'Differences.xlsx'

# Export the DataFrame to an Excel file
newdf.to_excel(excel_filename, index=False, engine='openpyxl')
print(f'Data saved to {excel_filename}')

# Report how many rows were left out so the match count can be read in context
print(f'{skipped_rows} of {rows_to_compare} rows skipped (category not a single integer)')
print(total_matches)