In [9]:
import json
import os
import random

import cohere
import pandas as pd
from cohere import ClassifyExample
# Read the Cohere API key from the environment rather than embedding it in the
# notebook: a key stored in source is readable by anyone who can see the file.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and revoked/rotated.
cohereKey = os.environ.get("CO_API_KEY")
if not cohereKey:
    raise RuntimeError(
        "Cohere API key not found: set the CO_API_KEY environment variable before running this notebook."
    )
co = cohere.Client(cohereKey)

In [10]:
def replace_not_numbered(row):
    """Return the question identifier to use for one row.

    Args:
        row: A mapping-like row (e.g. a pandas Series) that provides the keys
            "Question number\\n (Questionnaire file)" and
            "Variable Label\\n (Data files)".

    Returns:
        The row's variable label when its question number is exactly the
        string "not numbered"; otherwise the question number unchanged.
    """
    question_number = row["Question number\n (Questionnaire file)"]
    return row["Variable Label\n (Data files)"] if question_number == "not numbered" else question_number

In [11]:
# Load the "Wave 1" sheet of the survey overview workbook.
survey_questions = pd.read_excel('Survey Questions Overview.xlsx', sheet_name='Wave 1', engine='openpyxl')

# Forward fill every column except "Values" and "Value labels", so that
# question-level information (e.g. the ADICO Category) is carried down to all
# rows that follow the row where it was filled in.
columns_to_ffill = [col for col in survey_questions.columns if col not in ["Values", "Value labels"]]
survey_questions[columns_to_ffill] = survey_questions[columns_to_ffill].ffill()

# Replace the placeholder "not numbered" with the row's variable label.
survey_questions["Question number\n (Questionnaire file)"] = survey_questions.apply(replace_not_numbered, axis=1)

# Index every row as "<Variable Label>_<Values>". Missing "Values" are rendered
# as the text "nan" by astype(str).
survey_questions = survey_questions.set_index(
    survey_questions["Variable Label\n (Data files)"] + "_" + survey_questions["Values"].astype(str)
)

# Remove the "ID" variable row (label "ID" with no value). errors="ignore"
# keeps the cell runnable when the sheet does not contain that row.
survey_questions = survey_questions.drop("ID_nan", errors="ignore")

question_answers_list = []


def combine_description_and_labels(group, question_answers_list, qnum, questions=None):
    """Build the combined question text for one variable-label group.

    The text is the group's own "Description". When the group belongs to a
    question that spans several rows and the question's first row carries a
    different description, that first description is put in front, separated
    by a single space.

    Args:
        group: DataFrame with the rows of one "Variable Label\\n (Data files)"
            value. Must contain the columns
            "Question number\\n (Questionnaire file)" and "Description".
        question_answers_list: List of combined texts collected so far. It is
            not modified; a new list is returned.
        qnum: Unused. Kept so existing callers keep working.
        questions: DataFrame holding all survey rows, used to look up the
            first row of the group's question. Defaults to the module-level
            ``survey_questions``.

    Returns:
        A new list: ``question_answers_list`` followed by the combined text
        repeated once per row of ``group``.
    """
    question_col = "Question number\n (Questionnaire file)"
    if questions is None:
        questions = survey_questions

    group_description = str(group["Description"].iloc[0])
    # All rows that share this group's question number.
    question_subset = questions[questions[question_col] == group[question_col].iloc[0]]

    combined_string = group_description
    # Only a question with more than one row can have a separate parent
    # description in its first row.
    if len(question_subset) > 1:
        first_description = str(question_subset["Description"].iloc[0])
        if first_description != group_description:
            # Separate the two descriptions so their words do not run together.
            combined_string = first_description + " " + group_description

    return question_answers_list + [combined_string] * group.shape[0]

# Build one combined text per "Variable Label\n (Data files)" value and map it
# back onto the rows by label. Mapping by key (instead of assigning an
# accumulated list by position) keeps every row aligned with its own label even
# if the rows of one label are not adjacent in the sheet.
label_column = "Variable Label\n (Data files)"
combined_by_label = {}
for label, label_rows in survey_questions.groupby(label_column, sort=False):
    question_number = label_rows["Question number\n (Questionnaire file)"].iloc[0]
    # The function returns the text once per row of the group; one copy is enough.
    combined_by_label[label] = combine_description_and_labels(label_rows, [], question_number)[0]

survey_questions["question_answers_combined"] = survey_questions[label_column].map(combined_by_label)
# Keep the list form available under its original name.
question_answers_list = survey_questions["question_answers_combined"].tolist()


In [12]:
# Build the few-shot examples for the classifier: up to EXAMPLES_PER_LABEL
# randomly chosen rows for every ADICO Category present in survey_questions.
EXAMPLES_PER_LABEL = 5
examples = []

# dropna(): a missing category would otherwise select no rows and make
# sample() fail on an empty frame.
for label in survey_questions['ADICO Category'].dropna().unique():
    label_rows = survey_questions[survey_questions['ADICO Category'] == label]
    # Never request more rows than the category has; sample() raises otherwise.
    sampled_rows = label_rows.sample(min(EXAMPLES_PER_LABEL, len(label_rows)))
    for text in sampled_rows['question_answers_combined']:
        examples.append(ClassifyExample(text=text, label=label))

In [13]:
# The frame has several rows per question (one per value label), so collapse it
# to the distinct (text, category, variable label) combinations and keep a
# Series of ADICO Category indexed by the combined question text.
mapping_columns = ['question_answers_combined', 'ADICO Category', "Variable Label\n (Data files)"]
unique_question_rows = survey_questions[mapping_columns].drop_duplicates()
question_adico_mapping = unique_question_rows.set_index('question_answers_combined')['ADICO Category']

# Collect the question texts per ADICO component. A plain substring test also
# matches the combined categories ('Attribute/Condition', 'Aim/Condition'),
# so those questions appear in both of their component lists.
attributes = [text for text, category in question_adico_mapping.items() if 'Attribute' in str(category)]
conditions = [text for text, category in question_adico_mapping.items() if 'Condition' in str(category)]
aims = [text for text, category in question_adico_mapping.items() if 'Aim' in str(category)]

In [14]:
# For now we make a random selection of questions with assigned ADICO components,
# but later we can provide the model with a predetermined selection.
SAMPLES_PER_COMPONENT = 3
# random.sample raises ValueError when asked for more items than are available,
# so cap the request at the size of each list.
selected_attributes = random.sample(attributes, min(SAMPLES_PER_COMPONENT, len(attributes)))
selected_conditions = random.sample(conditions, min(SAMPLES_PER_COMPONENT, len(conditions)))
selected_aims = random.sample(aims, min(SAMPLES_PER_COMPONENT, len(aims)))

In [15]:
# Classify the randomly selected Attribute questions using the few-shot examples
# and print each input next to its predicted label.
sampleinput = selected_attributes
response = co.classify(inputs=sampleinput, examples=examples)

for result in response.classifications:
    print(f"Text: {result.input}")
    print(f"Classification: {result.prediction}")

Text: What category best describes your current home or accommodation?
Classification: Potential Shared Strategy 
Text: Employment status
Classification: Attribute
Text: Zipcode or postal code
Classification: Attribute


In [16]:
# Try the same few-shot classifier on five randomly drawn ESS question texts.
ESSQuestionData = pd.read_csv('ESSQuestionData.csv')
ess_question_texts = ESSQuestionData['questiontext']
ESS_Sample = [question for question in ess_question_texts.sample(5)]

response = co.classify(inputs=ESS_Sample, examples=examples)

for result in response.classifications:
    print(f"Text: {result.input}")
    print(f"Classification: {result.prediction}")

Text: End of section G
Classification: Attribute
Text: Too tired after work to enjoy things like doing at home, how often
Classification: Condition
Text: Ever belonging to particular religion or denomination
Classification: Attribute
Text: Partner's highest level of education, North Macedonia
Classification: Attribute
Text: Highest level of education, France
Classification: Attribute


In [17]:
# User prompt: the task followed by the selected Aim and Condition questions,
# one per line.
prompt = "Generate an Institutional shared strategy statement using all of the following sets of questions and answers containing Aims and Conditions:\n\n"
prompt += "Aims:\n" + "\n".join(selected_aims) + "\n\n"
prompt += "Conditions:\n" + "\n".join(selected_conditions) + "\n\n"

# System message: explains the model's role and the required output format.
# The example must itself be valid JSON (a list of statement objects), because
# the reply is parsed with json.loads and read through the "statements" key.
systemMessage = "You are a helpful assistant that converts questions and answers from surveys into Institutional Behaviour Statements. You always output a list of statements that follow the following JSON structure: \n\n"
systemMessage += '{"statements":[{"Attribute":"Households in the Netherlands", "Aim":"do {Aim}", "Condition":"if {Condition(s)}"}]}\n\n'
systemMessage += "Your output will be read using the following: converted_questions_json = json.loads(response.text) and converted_questions = pd.DataFrame(converted_questions_json['statements'])."

# Background on institutional (ADICO) statements.
# NOTE(review): contentMessage is built here, but the co.chat call in this
# notebook only passes systemMessage and prompt - confirm whether this context
# should be sent to the model as well.
contentMessage = "An institutional statement refers to a structured representation of institutions using specific elements such as Attribute, Deontic, Aim, Condition, and Or Else (ADICO). These statements are used to define and understand the impacts, actions, and conditions associated with institutional rules, norms, and shared strategies within social systems.\n\n"
contentMessage += "Attribute: Who or what the impacts of the institution apply to, e.g. “Households in Indonesia” or “police officers”.\n\n"
contentMessage += "Aim: The definition of the impact that is applied or the action performed, e.g. “reinforce their foundations bi-yearly” or “close the door”.\n\n"
contentMessage += "Condition: the conditions that need to be satisfied in order for the aim to occur, e.g. “if they live with more than four people” or “if the alarm goes off”\n\n"
contentMessage += "Combining all these elements produces the following Institutional shared strategy: “Police officers (Attribute) lock the doors of their vehicle (Aim) if they leave the vehicle (Condition)”\n\n"


In [19]:
# Send the prompt to the Cohere "command" chat model, with the system message
# as preamble, and show the raw response object.
chat_arguments = {
    "preamble": systemMessage,
    "message": prompt,
    "model": "command",
}
response = co.chat(**chat_arguments)
print(response)

TooManyRequestsError: status_code: 429, body: {'message': "You are using a Trial key, which is limited to 1000 API calls / month. You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.com/api-keys'. Contact us on 'https://discord.gg/XW44jPfYJu' or email us at support@cohere.com with any questions"}

In [None]:
# Strip the Markdown code-fence markers from the reply so that only the JSON
# text is left, then parse it.
reply_text = response.text
for fence_marker in ('```json\n', '```'):
    reply_text = reply_text.replace(fence_marker, '')
converted_questions_json = json.loads(reply_text)

# One dataframe row per entry of the "statements" list.
converted_questions = pd.DataFrame(converted_questions_json["statements"])

# Display the dataframe
converted_questions

Unnamed: 0,Attribute,Aim,Condition
0,Households affected by the 2020 Jakarta Flood,do regularly read information about flooding a...,if they believe that implementing non-structur...
1,Households affected by the 2020 Jakarta Flood,are more likely to consider implementing non-s...,if they perceive the damage to their house fro...
2,Households affected by the 2020 Jakarta Flood,are willing to purchase sandbags or other wate...,if they believe that this measure is affordabl...


In [None]:
def TaskInterpreter(response):
    """Ask the Cohere chat model to label a message as "Fix", "Write", or "Run".

    Args:
        response: The message text to categorize; it is sent to the model as
            the chat message.

    Returns:
        The text of the model's reply, exactly as returned by the API.
    """
    # Instructions for the model: the task, the meaning of each option, and the
    # required single-word output style.
    preamble = """
    ## Task & Context
    You read responses from llm agents and determine the type of task. You only output one of following options: "Fix", "Write", or "Run".

    ## Definitions
    Write: The message indicates that a script should be written
    Fix: The message indicates that a change should be made to the script
    Run: The message contains a python script that should be run and tested

    ## Style Guide
    You can only output a single word.
    """

    chat_reply = co.chat(message=response, preamble=preamble, model="command")
    return chat_reply.text