In [9]:
import cohere
from cohere import ClassifyExample
import pandas as pd
import random
import json
from config import cohereKey
# Cohere client used by the classify/chat calls below; the API key is imported from the local config module.
co = cohere.Client(cohereKey)

In [10]:
# Row-wise helper: substitute the variable label for questions marked "not numbered"
def replace_not_numbered(row):
    """Return the row's question number, or its variable label when the number is "not numbered".

    Args:
        row: Mapping-like row (e.g. a pandas Series) holding the
            "Question number\\n (Questionnaire file)" and
            "Variable Label\\n (Data files)" entries.

    Returns:
        The question number, unless it equals "not numbered", in which case
        the variable label is returned.
    """
    question_number = row["Question number\n (Questionnaire file)"]
    if question_number == "not numbered":
        return row["Variable Label\n (Data files)"]
    return question_number

In [11]:
# Load the "Wave 1" sheet of the survey question overview
survey_questions = pd.read_excel('Survey Questions Overview.xlsx', sheet_name='Wave 1', engine='openpyxl')

# Clean the survey questions dataframe to make it more usable for mapping.
# Every column except "Values" and "Value labels" is forward filled, so rows left blank
# (e.g. for ADICO Category) inherit the last non-null value above them.
value_columns = ["Values", "Value labels"]
columns_to_ffill = [col for col in survey_questions.columns if col not in value_columns]
survey_questions[columns_to_ffill] = survey_questions[columns_to_ffill].ffill()

# Questions marked "not numbered" get their variable label as question number instead
survey_questions["Question number\n (Questionnaire file)"] = survey_questions.apply(replace_not_numbered, axis=1)

# Index each row by {value of "Variable Label\n (Data files)"} + "_" + {str(value of "Values")},
# then remove the row keyed "ID_nan" (variable label "ID" with a missing "Values" entry).
row_keys = survey_questions["Variable Label\n (Data files)"] + "_" + survey_questions["Values"].astype(str)
survey_questions = survey_questions.set_index(row_keys).drop("ID_nan")

# Accumulates one combined question text per row of survey_questions
question_answers_list = []


def combine_description_and_labels(group, question_answers_list, qnum):
    """Build the combined question text for one variable and append it once per row of `group`.

    The text is the group's first "Description". When the question number of the
    group occurs on more than one row of the module-level `survey_questions` frame
    and the first "Description" of those rows differs from the group's own, that
    first description is put in front (directly, without a separator).

    Args:
        group: DataFrame with the rows of a single variable label.
        question_answers_list: List of combined strings collected so far.
        qnum: Question number of the group; accepted for interface
            compatibility but not read by this function.

    Returns:
        A new list: `question_answers_list` followed by the combined string
        repeated `group.shape[0]` times. The input list is not mutated.
    """
    question_column = "Question number\n (Questionnaire file)"
    own_description = group["Description"].iloc[0]

    # All rows of the overview that share this group's question number
    same_question = survey_questions[survey_questions[question_column] == group[question_column].iloc[0]]
    # True when that question number appears on more than one row
    has_repeats = same_question.duplicated(subset=[question_column], keep="first").any()

    prefix = ""
    if has_repeats:
        lead_description = same_question["Description"].iloc[0]
        if lead_description != own_description:
            # Not the first instance of the question: start with the first instance's description
            prefix = lead_description

    # Value labels are deliberately not appended (that variant was disabled in the original code)
    combined_string = prefix + str(own_description)
    return question_answers_list + [combined_string] * group.shape[0]

# Walk the variables in their order of first appearance (sort=False) and collect one
# combined string per row, then store the result as a new column.
for _, variable_rows in survey_questions.groupby("Variable Label\n (Data files)", sort=False):
    current_question = variable_rows["Question number\n (Questionnaire file)"].iloc[0]
    question_answers_list = combine_description_and_labels(variable_rows, question_answers_list, current_question)
survey_questions["question_answers_combined"] = question_answers_list


In [12]:
# Build the few-shot examples for the classifier: up to 5 randomly drawn rows per ADICO category
examples = []

for label in survey_questions['ADICO Category'].unique():
    label_rows = survey_questions[survey_questions['ADICO Category'] == label]
    # Cap the draw at the number of available rows: DataFrame.sample raises a ValueError
    # when asked for more rows than the frame holds (without replacement).
    for _, row in label_rows.sample(min(5, len(label_rows))).iterrows():
        examples.append(ClassifyExample(text=row['question_answers_combined'], label=label))

In [13]:
# The dataset has several rows per question (one per value label), so collapse it to the
# distinct (text, category, variable) combinations and keep a Series of category keyed by text.
# The variable label only takes part in the de-duplication; texts can therefore repeat in the index.
mapping_columns = ['question_answers_combined', 'ADICO Category', "Variable Label\n (Data files)"]
distinct_rows = survey_questions[mapping_columns].drop_duplicates()
question_adico_mapping = distinct_rows.set_index('question_answers_combined')['ADICO Category']

# Split the question texts by ADICO component using substring matches on the category label.
# Combined labels such as 'Attribute/Condition' or 'Aim/Condition' match both of their components.
labelled_texts = [(text, str(category)) for text, category in question_adico_mapping.items()]
attributes = [text for text, category in labelled_texts if 'Attribute' in category]
conditions = [text for text, category in labelled_texts if 'Condition' in category]
aims = [text for text, category in labelled_texts if 'Aim' in category]

In [14]:
#For now we make a random selection of questions with assigned ADICO components but later we can provide the model with a predetermined selection
# Each draw is capped at the pool size: random.sample raises ValueError when asked for
# more items than the list contains.
selected_attributes = random.sample(attributes, min(3, len(attributes)))
selected_conditions = random.sample(conditions, min(3, len(conditions)))
selected_aims = random.sample(aims, min(3, len(aims)))

In [15]:
# Classify the randomly selected attribute questions with the few-shot examples and print each prediction
sampleinput = selected_attributes
response = co.classify(examples=examples, inputs=sampleinput)

for item in response.classifications:
    print(f"Text: {item.input}\nClassification: {item.prediction}")

Text: What category best describes your current home or accommodation?
Classification: Potential Shared Strategy 
Text: Employment status
Classification: Attribute
Text: Zipcode or postal code
Classification: Attribute


In [16]:
#Try for ESS Questions
# Draw five random ESS question texts and classify them with the same few-shot examples
ESSQuestionData = pd.read_csv('ESSQuestionData.csv')
ESS_Sample = ESSQuestionData['questiontext'].sample(5).tolist()

response = co.classify(examples=examples, inputs=ESS_Sample)

for item in response.classifications:
    print(f"Text: {item.input}\nClassification: {item.prediction}")

Text: End of section G
Classification: Attribute
Text: Too tired after work to enjoy things like doing at home, how often
Classification: Condition
Text: Ever belonging to particular religion or denomination
Classification: Attribute
Text: Partner's highest level of education, North Macedonia
Classification: Attribute
Text: Highest level of education, France
Classification: Attribute


In [17]:
# Constructing the prompt: the task description followed by the selected aim and condition questions
prompt = "Generate an Institutional shared strategy statement using all of the following sets of questions and answers containing Aims and Conditions:\n\n"
prompt += "Aims:\n" + "\n".join(selected_aims) + "\n\n"
prompt += "Conditions:\n" + "\n".join(selected_conditions) + "\n\n"

# System message: explains the model's role and the JSON layout its answer must follow
systemMessage = "You are a helpful assistant that converts questions and answers from surveys into Institutional Behaviour Statements. You always output a list of statements that follow the following JSON structure: \n\n"
# The template has to be well-formed JSON itself (a list of objects under "statements"),
# because the reply is parsed with json.loads and read through its "statements" key.
systemMessage += '{"statements": [{"Attribute": "Households in the Netherlands", "Aim": "do {Aim}", "Condition": "if {Condition(s)}"}]}\n\n'
# Describe the parsing the way this notebook performs it (response.text -> json.loads -> DataFrame)
systemMessage += "Your output will be read using the following: converted_questions_json = json.loads(response.text) and converted_questions = pd.DataFrame(converted_questions_json['statements'])."

# Background information on ADICO institutional statements and their components
contentMessage = "An institutional statement refers to a structured representation of institutions using specific elements such as Attribute, Deontic, Aim, Condition, and Or Else (ADICO). These statements are used to define and understand the impacts, actions, and conditions associated with institutional rules, norms, and shared strategies within social systems.\n\n"
contentMessage += "Attribute: Who or what the impacts of the institution apply to, e.g. “Households in Indonesia” or “police officers”.\n\n"
contentMessage += "Aim: The definition of the impact that is applied or the action performed, e.g. “reinforce their foundations bi-yearly” or “close the door”.\n\n"
contentMessage += "Condition: the conditions that need to be satisfied in order for the aim to occur, e.g. “if they live with more than four people” or “if the alarm goes off”\n\n"
contentMessage += "Combining all these elements produces the following Institutional shared strategy: “Police officers (Attribute) lock the doors of their vehicle (Aim) if they leave the vehicle (Condition)”'\n\n"


In [19]:
# Ask the chat model for the statements. The ADICO background (contentMessage) is prepended
# to the task prompt so the model actually receives that context; the system message is
# passed as the preamble.
response = co.chat(
    preamble=systemMessage,
    message=contentMessage + prompt,
    model="command",
)
print(response)

TooManyRequestsError: status_code: 429, body: {'message': "You are using a Trial key, which is limited to 1000 API calls / month. You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.com/api-keys'. Contact us on 'https://discord.gg/XW44jPfYJu' or email us at support@cohere.com with any questions"}

In [None]:
# Remove the Markdown code fences that may surround the JSON in the reply, then parse it.
# The opening '```json\n' fence is removed before the bare '```' fence.
raw_reply = response.text
for fence in ('```json\n', '```'):
    raw_reply = raw_reply.replace(fence, '')
converted_questions_json = json.loads(raw_reply)

# One row per statement found under the "statements" key
converted_questions = pd.DataFrame(converted_questions_json["statements"])

# Display dataframe
converted_questions

Unnamed: 0,Attribute,Aim,Condition
0,Households affected by the 2020 Jakarta Flood,do regularly read information about flooding a...,if they believe that implementing non-structur...
1,Households affected by the 2020 Jakarta Flood,are more likely to consider implementing non-s...,if they perceive the damage to their house fro...
2,Households affected by the 2020 Jakarta Flood,are willing to purchase sandbags or other wate...,if they believe that this measure is affordabl...


In [None]:
#Code to Check the current prompt and categorize it
def TaskInterpreter(response):
    """Ask the Cohere chat model which kind of task a message describes.

    Args:
        response: Text of the message to categorise; sent as the chat message.

    Returns:
        The text of the model's reply. The preamble instructs the model to
        answer with one of "Fix", "Write" or "Run".
    """
    # preamble containing instructions about the task and the desired style for the output.
    preamble = """
    ## Task & Context
    You read responses from llm agents and determine the type of task. You only output one of following options: "Fix", "Write", or "Run".

    ## Definitions
    Write: The message indicates that a script should be written
    Fix: The message indicates that a change should be made to the script
    Run: The message contains a python script that should be run and tested

    ## Style Guide
    You can only output a single word.
    """

    # Uses the module-level `co` client; the reply gets its own name instead of rebinding the parameter
    reply = co.chat(model="command", preamble=preamble, message=response)
    return reply.text

ESS Tests:

In [None]:
import cohere
from cohere import ClassifyExample
import pandas as pd
import random
import json
from config import cohereKey
# NOTE(review): same client construction as in the first cell of this notebook; presumably repeated so this section can run on its own — confirm it is still needed.
co = cohere.Client(cohereKey)

In [None]:
# Load the ESS question overview from CSV and display it
Survey_Overview_path = 'ESSQuestionData.csv'  # Update this path accordingly
Survey_Overview = pd.read_csv(filepath_or_buffer=Survey_Overview_path)

Survey_Overview

In [None]:
# Load the additional labeled examples provided as training data; the "Unnamed: 0" column is dropped
training_data_path = 'adico_training_data.csv'
training_data = pd.read_csv(training_data_path).drop("Unnamed: 0", axis=1)

# Display the first few rows of the dataframe to understand its structure and labels
# training_data.head(), training_data.columns

# Keep at most 20 randomly drawn rows per distinct question text. The per-group samples are
# collected first and concatenated once: growing a frame with pd.concat inside the loop
# copies it on every iteration and, when seeded with an empty frame, degrades the dtypes.
per_question_samples = [
    question_rows.sample(n=min(20, len(question_rows)))
    for _, question_rows in training_data.groupby('question_answers_combined', sort=False)
]
# pd.concat rejects an empty list, so fall back to an empty frame with the same columns
training_data_resampled = (
    pd.concat(per_question_samples) if per_question_samples else training_data.iloc[0:0].copy()
)

In [None]:
# Show the labels present before merging
print(training_data['ADICO_Category'].unique())

# Collapse the fine-grained labels into the two classes used from here on:
# everything attribute- or strategy-like becomes 'Condition', 'Aim/Condition' becomes 'Aim'.
# (The trailing space in 'Potential Shared Strategy ' matches the label as stored in the data.)
label_merges = {
    "Attribute": "Condition",
    "Attribute/Condition": "Condition",
    "Potential Shared Strategy ": "Condition",
    "Aim/Condition": "Aim",
}
for old_label, new_label in label_merges.items():
    training_data.loc[training_data['ADICO_Category'] == old_label, 'ADICO_Category'] = new_label

['Attribute' 'Attribute/Condition' 'Condition' 'Aim/Condition'
 'Potential Shared Strategy ']


In [None]:
# Build the few-shot examples from the training data: up to 120 randomly drawn rows per label
examples = []

for label in training_data['ADICO_Category'].unique():
    label_rows = training_data[training_data['ADICO_Category'] == label]
    sampled_rows = label_rows.sample(min(120, len(label_rows)))
    examples.extend(
        ClassifyExample(text=text, label=label)
        for text in sampled_rows['question_answers_combined']
    )

In [None]:
# Quick check of the classifier on two placeholder inputs; the raw response is displayed
response = co.classify(examples=examples, inputs=["text, hello", "other option"])
response



In [None]:
classified_items = response.classifications

# (prediction, confidence) pair for every classified input
predictions = [(entry.prediction, entry.confidence) for entry in classified_items]


def _label_confidences(entry):
    """Map every candidate label of one classification to its confidence."""
    return {name: detail.confidence for name, detail in entry.labels.items()}


# Full detail per input, including the confidence of each possible label
detailed_predictions = [
    dict(
        input=entry.input,
        prediction=entry.prediction,
        confidence=entry.confidence,
        label_confidences=_label_confidences(entry),
    )
    for entry in classified_items
]

In [None]:
# Show the (prediction, confidence) pairs extracted from the classify response
predictions

[('Condition', 0.98777497), ('Condition', 0.99989146)]

In [None]:
questions = Survey_Overview['question_answers_combined'].tolist()
text_count = len(questions)

# Classify the questions in batches of `batch_size` texts per API call
batch_size = 60
Survey_Overview['ADICO_Category'] = ""
category_position = Survey_Overview.columns.get_loc('ADICO_Category')

for start in range(0, text_count, batch_size):
    # The last batch may be shorter; never run past the end of the list
    stop = min(start + batch_size, text_count)

    response = co.classify(examples=examples, inputs=questions[start:stop])

    # Write the predicted label of each text into the matching rows (by position)
    Survey_Overview.iloc[start:stop, category_position] = [
        item.prediction for item in response.classifications
    ]


In [None]:
# Save the updated DataFrame to a new CSV file
Survey_Overview_path = 'ESSQuestionData_withADICO.csv'  # Update this path accordingly
Survey_Overview.to_csv(Survey_Overview_path, index=False)

Survey_Overview.sample(10)

Unnamed: 0,id,questiontext,responseoptions,question_answers_combined,ADICO_Category
309,uempli,"Doing last 7 days: unemployed, not actively lo...",0: Not marked; 1: Marked,"Doing last 7 days: unemployed, not actively lo...",Condition
392,edlvfdch,"Father's highest level of education, Switzerland",1: Incompleted primary school; 2: Primary scho...,"Father's highest level of education, Switzerla...",Condition
137,rlgdeais,Religion or denomination belonging to in the p...,1: Kaþólsku kirkjunni; 2: Þjóðkirkjunni; 3: Fr...,Religion or denomination belonging to in the p...,Condition
416,edulvlmb,Mother's highest level of education,"0: Not completed ISCED level 1; 113: ISCED 1, ...",Mother's highest level of education0: Not comp...,Condition
100,imsmetn,Allow many/few immigrants of same race/ethnic ...,1: Allow many to come and live here; 2: Allow ...,Allow many/few immigrants of same race/ethnic ...,Condition
595,vdtpdkre,Respondent's experience of technical problems:...,0: Not marked; 1: Marked,Respondent's experience of technical problems:...,Condition
212,wpestopc,In country the will of the people cannot be st...,0: Does not apply at all; 1: 1; 2: 2; 3: 3; 4:...,In country the will of the people cannot be st...,Condition
609,inwde,End of interview,,End of interview,Condition
575,hapnwc19,Things happened since start of COVID-19: not i...,0: Not marked; 1: Marked,Things happened since start of COVID-19: not i...,Condition
518,mcwrkhom,Online/mobile communication makes it easy to w...,0: Not at all; 1: 1; 2: 2; 3: 3; 4: 4; 5: 5; 6...,Online/mobile communication makes it easy to w...,Condition


In [None]:
# Reload the classified questions from the CSV written earlier
Survey_Overview_path = 'ESSQuestionData_withADICO.csv'  # Update this path accordingly
Survey_Overview_ADICO = pd.read_csv(filepath_or_buffer=Survey_Overview_path)

In [None]:
Survey_Overview_ADICO_path = 'ESSQuestionData_withADICO.csv'  # Update this path accordingly

# Reload the classified questions and key the rows by question id
Survey_Overview_ADICO = pd.read_csv(Survey_Overview_ADICO_path).set_index('id')


def _questions_labelled(component):
    """Return the combined question texts whose ADICO_Category contains `component` (missing categories never match)."""
    category_matches = Survey_Overview_ADICO['ADICO_Category'].str.contains(component, na=False)
    return Survey_Overview_ADICO.loc[category_matches, 'question_answers_combined']


# Split the questions into Attributes, Conditions and Aims for clarity in analysis
Attributes = _questions_labelled('Attribute')
Conditions = _questions_labelled('Condition')
Aims = _questions_labelled('Aim')
# Attcons = list(set([Attcon for Attcon in Survey_Overview_ADICO[Survey_Overview_ADICO['ADICO_Category'].isin(["Attribute", "Condition",'Aim/Condition', "Attribute/Condition"])]['question_answers_combined']]))

# chosen_aims = ["R06a_media_freq",  "R2_implementation_NM4", "R2_implementation_NM5", "R2_implementation_SM3"]

In [None]:
# Preview ten random questions whose category is exactly "Aim"
is_aim = Survey_Overview_ADICO['ADICO_Category'] == "Aim"
Survey_Overview_ADICO.loc[is_aim].sample(10)

Unnamed: 0_level_0,questiontext,responseoptions,question_answers_combined,ADICO_Category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
contplt,Contacted politician or government official la...,1: Yes; 2: No; 7: Refusal*; 8: Don't know*; 9:...,Contacted politician or government official la...,Aim
rlgblg,Belonging to particular religion or denomination,1: Yes; 2: No; 7: Refusal*; 8: Don't know*; 9:...,Belonging to particular religion or denominati...,Aim
prtcldgr,"Which party feel closer to, Greece",1: ΝΔ; 2: ΣΥΡΙΖΑ; 3: ΚΙΝ.ΑΛ.; 4: ΚΚΕ; 5: Ελλην...,"Which party feel closer to, Greece1: ΝΔ; 2: ΣΥ...",Aim
manspeak,"Speak with line manager about work in person, ...",1: Several times a day; 2: Once a day; 3: Seve...,"Speak with line manager about work in person, ...",Aim
iplylfr,Important to be loyal to friends and devote to...,1: Very much like me; 2: Like me; 3: Somewhat ...,Important to be loyal to friends and devote to...,Aim
colcom,Communicate with colleagues about work via tex...,1: Several times a day; 2: Once a day; 3: Seve...,Communicate with colleagues about work via tex...,Aim
mcmsinf,Online/mobile communication exposes people to ...,0: Not at all; 1: 1; 2: 2; 3: 3; 4: 4; 5: 5; 6...,Online/mobile communication exposes people to ...,Aim
chpldm,Best for democracy: government changes policie...,1: Government should change its policies; 2: G...,Best for democracy: government changes policie...,Aim
trstep,Trust in the European Parliament,0: No trust at all; 1: 1; 2: 2; 3: 3; 4: 4; 5:...,Trust in the European Parliament0: No trust at...,Aim
prtclhhu,"Which party feel closer to, Hungary",1: DK (Demokratikus Koalíció); 2: Párbeszéd (P...,"Which party feel closer to, Hungary1: DK (Demo...",Aim


In [None]:
# NOTE(review): Survey_Responses is not assigned in any cell visible in this notebook, so this
# display relies on state created elsewhere — confirm where it is loaded before a fresh run.
Survey_Responses

Unnamed: 0,name,essround,edition,proddate,idno,cntry,dweight,pspwght,pweight,anweight,...,vinwe,inwde,jinws,jinwe,inwtm,mode,domain,prob,stratum,psu
0,ESS10e03_2,10,3.2,02.11.2023,10038,BE,0.882220,0.972276,0.718075,0.698167,...,2022-09-01 17:47:00,2022-09-01 17:47:00,2022-09-01 17:47:00,2022-09-01 17:47:00,36.0,1,1.0,0.000397,188,2596
1,ESS10e03_2,10,3.2,02.11.2023,10053,BE,1.047643,0.888635,0.718075,0.638107,...,2022-04-08 11:07:00,2022-04-08 11:10:00,2022-04-08 11:07:00,2022-04-08 11:10:00,54.0,2,2.0,0.000334,194,2206
2,ESS10e03_2,10,3.2,02.11.2023,10055,BE,1.087741,0.722811,0.718075,0.519033,...,2022-05-20 11:08:00,2022-05-20 11:10:00,2022-05-20 11:08:00,2022-05-20 11:10:00,77.0,1,2.0,0.000322,198,2114
3,ESS10e03_2,10,3.2,02.11.2023,10062,BE,0.909910,1.005565,0.718075,0.722072,...,2022-05-22 13:58:00,2022-05-22 13:59:00,2022-05-22 13:58:00,2022-05-22 13:59:00,55.0,1,1.0,0.000385,150,2645
4,ESS10e03_2,10,3.2,02.11.2023,10064,BE,0.918949,0.638705,0.718075,0.458639,...,2022-05-18 11:44:00,2022-05-18 11:45:00,2022-05-18 11:44:00,2022-05-18 11:45:00,55.0,1,1.0,0.000381,149,2313
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37606,ESS10e03_2,10,3.2,02.11.2023,27808,SK,0.515714,0.339385,0.323800,0.109893,...,2021-06-08 14:28:34,2021-06-08 14:30:41,2021-06-08 14:29:01,2021-06-08 14:31:44,70.0,1,1.0,0.001522,2610,27206
37607,ESS10e03_2,10,3.2,02.11.2023,27826,SK,0.297974,0.196093,0.323800,0.063495,...,2021-08-02 10:33:27,2021-08-02 10:36:27,2021-08-02 10:35:22,2021-08-02 10:37:34,45.0,1,2.0,0.002635,2610,27217
37608,ESS10e03_2,10,3.2,02.11.2023,27834,SK,0.965931,0.857000,0.323800,0.277497,...,2021-06-26 20:52:15,2021-06-26 20:53:05,2021-06-26 20:52:27,2021-06-26 20:54:32,33.0,1,1.0,0.000813,2631,27134
37609,ESS10e03_2,10,3.2,02.11.2023,27846,SK,0.854279,0.624287,0.323800,0.202144,...,2021-07-21 14:14:41,2021-07-21 14:17:31,2021-07-21 14:16:38,2021-07-21 14:18:38,43.0,1,1.0,0.000919,2638,27183


In [None]:
# Preview ten random question texts whose ADICO category contains "Condition"
Conditions.sample(10)

id
rlgdemk     Religion or denomination belonging to in the p...
cttresa     The courts treat everyone the same0: Not at al...
eiscedm     Mother's highest level of education, ES - ISCE...
mcinter     Online/mobile communication makes work and per...
vdtpnare    Respondent's experience of technical problems:...
lrscale     Placement on left right scale0: Left; 1: 1; 2:...
testii8     How likely, large numbers of people limit ener...
hmsfmlsh    Ashamed if close family member gay or lesbian1...
manhlp      Line manager gives work-related help, how like...
edumbgb2    Mother's highest level of education, United Ki...
Name: question_answers_combined, dtype: object