In [84]:
from bertopic import BERTopic
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=9)

In [2]:
# Define a function to replace "not numbered" with the value in "Variable Label"
def replace_not_numbered(row):
    if row["Question number\n (Questionnaire file)"] == "not numbered":
        return row["Variable Label\n (Data files)"]
    else:
        return row["Question number\n (Questionnaire file)"]

In [67]:
survey_questions  = pd.read_excel('Survey Questions Overview.xlsx', sheet_name='Wave 1', engine='openpyxl')
# Clean the survey questions dataframe to make it more usable for mapping
# Fill forward non-null ADICO Category values to apply them to all relevant rows, Specify the columns to forward fill excluding "Values" and "Value labels"
columns_to_ffill = [col for col in survey_questions.columns if col not in ["Values", "Value labels"]]

# Forward fill the specified columns
survey_questions[columns_to_ffill] = survey_questions[columns_to_ffill].ffill()


# Apply the function to replace "not numbered" with the value in "Variable Label"
survey_questions["Question number\n (Questionnaire file)"] = survey_questions.apply(replace_not_numbered, axis=1)


# Set the index to {value of "Question number\n (Questionnaire file)"} + "_" + {str(value of "Values")}
survey_questions.set_index(survey_questions["Variable Label\n (Data files)"] + "_" + survey_questions["Values"].astype(str), inplace=True)
survey_questions.drop("ID_nan", inplace=True)
survey_questions.drop_duplicates(inplace=True)
# Drop rows with NaN values in the "Values" column
survey_questions.dropna(subset=["Values"], inplace=True)


question_answers_list = []
# Define a function to create the combined string
def combine_description_and_labels(group, question_answers_list):
    question_subset = survey_questions[survey_questions["Question number\n (Questionnaire file)"] == group["Question number\n (Questionnaire file)"].iloc[0]]
    # Check if it's the first row instance with the current "Question number\n (Questionnaire file)" column value
    first_instance_index = question_subset[question_subset.duplicated(subset=["Question number\n (Questionnaire file)"], keep="first")].index
    first_description = ""
    combined_string = ""
    if first_instance_index.size != 0:
        first_description = question_subset["Description"].iloc[0]
        for i in group.index:  
            if first_description != group["Description"].loc[i]:
                # If not the first instance, start with the first instance's "Description" column value
                combined_string = first_description
            # Concatenate the current row's "Description" and all "Value labels" values
            combined_string += str(group["Description"].loc[i] + " Response: " + str(group["Value labels"].loc[i]))
            question_answers_list = question_answers_list + [combined_string]  # Extend the list with the combined strings
    
    else:question_answers_list = question_answers_list + [""]  # Extend the list with the no answer

    return question_answers_list

# Group by "Variable Label\n (Data files)" and apply the function to create the combined string
for group in survey_questions.groupby("Variable Label\n (Data files)",sort=False):
    question_answers_list = combine_description_and_labels(group[1], question_answers_list)
survey_questions["question_answers_combined"] = question_answers_list


# Since the dataset has multiple rows per question for different value labels, we'll create a unique mapping
# Create the new mapping dictionary
question_adico_mapping = survey_questions[['question_answers_combined','ADICO Category',"Variable Label\n (Data files)"]].drop_duplicates().set_index('question_answers_combined')

# Filter out questions that are categorized as Attributes, Conditions, or Aims for clarity in analysis
Attcons = survey_questions[survey_questions['ADICO Category'].isin(["Attribute", "Condition",'Aim/Condition', "Attribute/Condition"])]

In [85]:
survey_data = pd.read_csv("dataverse_files\Wave1\SCALAR_Coastal_Longitudinal_Study_Wave_1_NL.csv").set_index("ID")

# Convert the numerical values in survey_questions to integers
Attcons["Values"] = Attcons["Values"].astype(int)

# Create a mapping dictionary
mapping_dict = dict(zip(zip(Attcons["Variable Label\n (Data files)"], Attcons["Values"]), Attcons["Value labels"]))

for column in survey_data.columns:
    if column in Attcons["Variable Label\n (Data files)"].unique():
        try:
            # Map numerical values to string labels using the created dictionary
            survey_data[column] = survey_data[column].astype(int)  # Ensure Q0_gender is of integer type
            survey_data[column] = survey_data[column].map(lambda x: mapping_dict.get((column, x)))
        except: "not suitable column"

In [86]:
survey_data_list = []
for column in survey_data.columns:
    try:
        survey_data_list += (survey_questions[survey_questions['Variable Label\n (Data files)'] == column]['Description'].iat[0] + " Given answer: " + survey_data[column].astype(str)).to_list() 
    except: "not str"

# Fit the BERTopic model
topics, _ = topic_model.fit_transform(survey_data_list,)

In [87]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,86,-1_option_describes_select_valves,"[option, describes, select, valves, pipes, ant...",[Please select the option that describes your ...
1,0,1759,0_given_answer_flood_water,"[given, answer, flood, water, andor, measure, ...",[Raising the level of the ground floor above t...
2,1,614,1_household_power_answer_given,"[household, power, answer, given, savings, sup...",[How does your current TOTAL household savings...
3,2,380,2_yes_given_answer_organization,"[yes, given, answer, organization, organizatio...","[Are you self-employed? Given answer: Yes, No ..."
4,3,160,3_media_reasons_social_general,"[media, reasons, social, general, wechat, weib...","[From social media (i.e. Facebook, Instagram, ..."
5,4,80,4_climate_change_risks_following,"[climate, change, risks, following, global, ac...",[There is a lot of discussion about global cli...
6,5,61,5_antibackflow_valves_pipes_installing,"[antibackflow, valves, pipes, installing, stru...",[Installing anti-backflow valves on pipes Give...
7,6,40,6_identify_gender_age_female,"[identify, gender, age, female, male, 5564, 65...",[What gender do you identify with? Given answe...
8,7,20,7_feeling_city_attachment_area,"[feeling, city, attachment, area, given, answe...",[Feeling of attachment to the city/ area Given...


In [89]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

for Topic in topic_model.get_topic_info()['Topic']:
    # A selected topic representation
    # 'god jesus atheists atheism belief atheist believe exist beliefs existence'
    sequence_to_classify =  " ".join([word for word, _ in topic_model.get_topic(Topic)])

    # Our set of potential topic labels
    candidate_labels = [
    'Spatial: Where, Location or Direction',
    'Temporal: When, Point in time or Time Frame',
    'Procedural: Why, How, Activity or Topical Realm'
]
    display(classifier(sequence_to_classify, candidate_labels))

{'sequence': 'option describes select valves pipes antibackflow higher valuable elevated floors',
 'labels': ['Procedural: Why, How, Activity or Topical Realm',
  'Spatial: Where, Location or Direction',
  'Temporal: When, Point in time or Time Frame'],
 'scores': [0.4057141840457916, 0.30766525864601135, 0.2866205871105194]}

{'sequence': 'given answer flood water andor measure intend implement level floor',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Procedural: Why, How, Activity or Topical Realm',
  'Spatial: Where, Location or Direction'],
 'scores': [0.37263762950897217, 0.31447991728782654, 0.3128824830055237]}

{'sequence': 'household power answer given savings support current income spare buying',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Procedural: Why, How, Activity or Topical Realm',
  'Spatial: Where, Location or Direction'],
 'scores': [0.4157051146030426, 0.3279232978820801, 0.2563716769218445]}

{'sequence': 'yes given answer organization organizations club groups parent insurance single',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Procedural: Why, How, Activity or Topical Realm',
  'Spatial: Where, Location or Direction'],
 'scores': [0.4288536608219147, 0.3226414620876312, 0.2485048919916153]}

{'sequence': 'media reasons social general wechat weibo facebook instagram given answer',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Spatial: Where, Location or Direction',
  'Procedural: Why, How, Activity or Topical Realm'],
 'scores': [0.4300457239151001, 0.30918386578559875, 0.26077038049697876]}

 'labels': ['Temporal: When, Point in time or Time Frame',
  'Procedural: Why, How, Activity or Topical Realm',
  'Spatial: Where, Location or Direction'],
 'scores': [0.4572080075740814, 0.2816258370876312, 0.26116621494293213]}

{'sequence': 'antibackflow valves pipes installing structural implement intend measure given answer',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Procedural: Why, How, Activity or Topical Realm',
  'Spatial: Where, Location or Direction'],
 'scores': [0.40084969997406006, 0.3003448247909546, 0.2988055348396301]}

{'sequence': 'identify gender age female male 5564 65 4554 given answer',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Procedural: Why, How, Activity or Topical Realm',
  'Spatial: Where, Location or Direction'],
 'scores': [0.5957229137420654, 0.21319955587387085, 0.1910775750875473]}

{'sequence': 'feeling city attachment area given answer    ',
 'labels': ['Spatial: Where, Location or Direction',
  'Temporal: When, Point in time or Time Frame',
  'Procedural: Why, How, Activity or Topical Realm'],
 'scores': [0.41320911049842834, 0.4000754654407501, 0.18671546876430511]}

In [95]:
from transformers import pipeline
import numpy as np

# Initialize the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define the candidate labels
candidate_labels = [
    'Spatial: Where, Location or Direction',
    'Temporal: When, Point in time or Time Frame',
    'Procedural: Why, How, Activity or Topical Realm'
]

# Initialize an empty list to store the predicted labels
predicted_labels = []

# Loop through each question in the 'Description' column of Attcons
for question in Attcons['question_answers_combined'].sample(n=10):
    # Classify the question using zero-shot classification
    classification_result = classifier(question, candidate_labels)
    display(classification_result)
#     # Get the label with the highest score
#     highest_score_index = np.argmax(classification_result["scores"])
#     predicted_label = classification_result["labels"][highest_score_index]
    
#     # Append the predicted label to the list
#     predicted_labels.append(predicted_label)

# # Add the predicted labels as a new column to the Attcons DataFrame
# Attcons["Predicted_Label"] = predicted_labels

{'sequence': 'Keeping a working flashlight and/or a battery-operated radio and/or emergency kit in a convenient locationInstalling a refuge zone, or an opening in the roof of your home or apartment Response: 3',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Spatial: Where, Location or Direction',
  'Procedural: Why, How, Activity or Topical Realm'],
 'scores': [0.3889363408088684, 0.35511139035224915, 0.25595226883888245]}

{'sequence': 'Employer typeIndustry type Response: Restaurant services',
 'labels': ['Procedural: Why, How, Activity or Topical Realm',
  'Temporal: When, Point in time or Time Frame',
  'Spatial: Where, Location or Direction'],
 'scores': [0.49895039200782776, 0.2899942994117737, 0.21105524897575378]}

{'sequence': 'Are you self-employed? Response: No',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Procedural: Why, How, Activity or Topical Realm',
  'Spatial: Where, Location or Direction'],
 'scores': [0.5666679739952087, 0.2737065851688385, 0.15962545573711395]}

{'sequence': 'Raising the level of the ground floor above the most likely flood levelInstalling a pump and/or one or more system(s) to drain flood water Response: I intend to implement this structural measure in the next 6 months',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Procedural: Why, How, Activity or Topical Realm',
  'Spatial: Where, Location or Direction'],
 'scores': [0.819290816783905, 0.0928700789809227, 0.08783909678459167]}

{'sequence': "No oneDon't know Response: Yes",
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Spatial: Where, Location or Direction',
  'Procedural: Why, How, Activity or Topical Realm'],
 'scores': [0.5193719267845154, 0.26384010910987854, 0.21678794920444489]}

{'sequence': 'Raising the level of the ground floor above the most likely flood levelFixing water barriers (e.g. water-proof basement windows) Response: I intend to implement this structural measure in the future, after 2 years',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Spatial: Where, Location or Direction',
  'Procedural: Why, How, Activity or Topical Realm'],
 'scores': [0.8373029828071594, 0.09192800521850586, 0.07076893746852875]}

{'sequence': 'Raising the level of the ground floor above the most likely flood levelInstalling a pump and/or one or more system(s) to drain flood water Response: 3',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Spatial: Where, Location or Direction',
  'Procedural: Why, How, Activity or Topical Realm'],
 'scores': [0.5732258558273315, 0.2272205799818039, 0.19955359399318695]}

{'sequence': 'Employer typeIndustry type Response: Construction',
 'labels': ['Procedural: Why, How, Activity or Topical Realm',
  'Temporal: When, Point in time or Time Frame',
  'Spatial: Where, Location or Direction'],
 'scores': [0.5424149036407471, 0.2619480788707733, 0.19563700258731842]}

{'sequence': 'Keeping a working flashlight and/or a battery-operated radio and/or emergency kit in a convenient locationAsking someone (local government, Civil Defense, etc.) for information about what to do in case of emergency Response: 5 - Very expensive',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Procedural: Why, How, Activity or Topical Realm',
  'Spatial: Where, Location or Direction'],
 'scores': [0.49666303396224976, 0.2744276225566864, 0.22890931367874146]}

{'sequence': 'Keeping a working flashlight and/or a battery-operated radio and/or emergency kit in a convenient locationBuying a spare power generator to power your home Response: 1 - I am unable',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Spatial: Where, Location or Direction',
  'Procedural: Why, How, Activity or Topical Realm'],
 'scores': [0.43018895387649536, 0.32772648334503174, 0.24208462238311768]}

In [94]:
classification_result

{'sequence': 'Age',
 'labels': ['Temporal: When, Point in time or Time Frame',
  'Spatial: Where, Location or Direction',
  'Procedural: Why, How, Activity or Topical Realm'],
 'scores': [0.5577120184898376, 0.24851831793785095, 0.19376972317695618]}

In [77]:
import numpy as np
import pandas as pd
import spacy

# Load pre-trained word embeddings model (you can use any suitable model)
nlp = spacy.load("en_core_web_md")

# Define the categories
categories = [
    'Spatial: Where, Location or Direction',
    'Temporal: When, Point in time or Time Frame',
    'Procedural: Why, How, Activity or Topical Realm'
]

# Calculate the average word embedding for each category
category_embeddings = {}
for category in categories:
    category_doc = nlp(category)
    category_embedding = np.mean([token.vector for token in category_doc if token.has_vector], axis=0)
    category_embeddings[category] = category_embedding

# Process each survey question and calculate its average word embedding
word_embeddings = []
for question in Attcons['question_answers_combined']:
    doc = nlp(question)
    question_embedding = np.mean([token.vector for token in doc if token.has_vector], axis=0)
    word_embeddings.append(question_embedding)

# Calculate the similarity between each question embedding and each category embedding
similarities = np.array([[np.dot(question_embedding, category_embedding) /
                          (np.linalg.norm(question_embedding) * np.linalg.norm(category_embedding))
                          for category_embedding in category_embeddings.values()]
                         for question_embedding in word_embeddings])

# Assign each question to the category with the highest similarity score
question_categories = [categories[np.argmax(similarity)] for similarity in similarities]

In [78]:
Attcons['category'] = question_categories
# question_categories
Attcons

Unnamed: 0,ADICO Category,Question number\n (Questionnaire file),Variable Label\n (Data files),Description,Values,Value labels,question_answers_combined,category
Q0_age_1.0,Attribute,Q0_age,Q0_age,Age,1.0,16-24,Age Response: 16-24,"Temporal: When, Point in time or Time Frame"
Q0_age_2.0,Attribute,Q0_age,Q0_age,Age,2.0,25-34,Age Response: 16-24Age Response: 25-34,"Spatial: Where, Location or Direction"
Q0_age_3.0,Attribute,Q0_age,Q0_age,Age,3.0,35-44,Age Response: 16-24Age Response: 25-34Age Resp...,"Spatial: Where, Location or Direction"
Q0_age_4.0,Attribute,Q0_age,Q0_age,Age,4.0,45-54,Age Response: 16-24Age Response: 25-34Age Resp...,"Spatial: Where, Location or Direction"
Q0_age_5.0,Attribute,Q0_age,Q0_age,Age,5.0,55-64,Age Response: 16-24Age Response: 25-34Age Resp...,"Spatial: Where, Location or Direction"
...,...,...,...,...,...,...,...,...
Q60a_parent_0.0,Condition,Q60a,Q60a_parent,Are you the parent or guardian of any children...,0.0,No,Are you the parent or guardian of any children...,"Spatial: Where, Location or Direction"
Q60a_parent_1.0,Condition,Q60a,Q60a_parent,Are you the parent or guardian of any children...,1.0,Yes,Are you the parent or guardian of any children...,"Spatial: Where, Location or Direction"
Q61_single_parent_0.0,Condition,Q61,Q61_single_parent,Are you a single parent?,0.0,No,Are you a single parent? Response: No,"Procedural: Why, How, Activity or Topical Realm"
Q61_single_parent_1.0,Condition,Q61,Q61_single_parent,Are you a single parent?,1.0,Yes,Are you a single parent? Response: NoAre you a...,"Procedural: Why, How, Activity or Topical Realm"
