In [1]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm






All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [2]:
# Define a function to replace "not numbered" with the value in "Variable Label"
def replace_not_numbered(row):
    if row["Question number\n (Questionnaire file)"] == "not numbered":
        return row["Variable Label\n (Data files)"]
    else:
        return row["Question number\n (Questionnaire file)"]

In [3]:
survey_questions  = pd.read_excel('Survey Questions Overview.xlsx', sheet_name='Wave 1', engine='openpyxl')
# Clean the survey questions dataframe to make it more usable for mapping
# Fill forward non-null ADICO Category values to apply them to all relevant rows, Specify the columns to forward fill excluding "Values" and "Value labels"
columns_to_ffill = [col for col in survey_questions.columns if col not in ["Values", "Value labels"]]

# Forward fill the specified columns
survey_questions[columns_to_ffill] = survey_questions[columns_to_ffill].ffill()


# Apply the function to replace "not numbered" with the value in "Variable Label"
survey_questions["Question number\n (Questionnaire file)"] = survey_questions.apply(replace_not_numbered, axis=1)


# Set the index to {value of "Question number\n (Questionnaire file)"} + "_" + {str(value of "Values")}
survey_questions.set_index(survey_questions["Variable Label\n (Data files)"] + "_" + survey_questions["Values"].astype(str), inplace=True)
survey_questions.drop("ID_nan", inplace=True)

question_answers_list = []
# Define a function to create the combined string
def combine_description_and_labels(group, question_answers_list, qnum):
    question_subset = survey_questions[survey_questions["Question number\n (Questionnaire file)"] == group["Question number\n (Questionnaire file)"].iloc[0]]
    # Check if it's the first row instance with the current "Question number\n (Questionnaire file)" column value
    first_instance_index = question_subset[question_subset.duplicated(subset=["Question number\n (Questionnaire file)"], keep="first")].index
    combined_string = ""
    if first_instance_index.size != 0:
        first_description = question_subset["Description"].iloc[0]
        if first_description != group["Description"].iloc[0]:
         # If not the first instance, start with the first instance's "Description" column value
            combined_string += first_description
    # Concatenate the current row's "Description" and all "Value labels" values
    combined_string += str(group["Description"].iloc[0])# + " " + "; ".join(group["Value labels"].astype(str)))
    question_answers_list = question_answers_list + [combined_string] * group.shape[0]  # Extend the list with the combined strings
    return question_answers_list

# Group by "Variable Label\n (Data files)" and apply the function to create the combined string
for group in survey_questions.groupby("Variable Label\n (Data files)",sort=False):
    question_answers_list = combine_description_and_labels(group[1], question_answers_list, group[1]["Question number\n (Questionnaire file)"].iloc[0])
survey_questions["question_answers_combined"] = question_answers_list


# Since the dataset has multiple rows per question for different value labels, we'll create a unique mapping
# Create the new mapping dictionary
question_adico_mapping = survey_questions[['question_answers_combined','ADICO Category',"Variable Label\n (Data files)"]].drop_duplicates().set_index('question_answers_combined')['ADICO Category']


# Filter out questions that are categorized as Attributes, Conditions, or Aims for clarity in analysis
attributes = [k for k, v in question_adico_mapping.items() if 'Attribute' in str(v) or 'Attribute/Condition' in str(v)]
conditions = [k for k, v in question_adico_mapping.items() if 'Condition' in str(v) or 'Aim/Condition' in str(v) or 'Attribute/Condition' in str(v)]
aims = [k for k, v in question_adico_mapping.items() if 'Aim' in str(v) or 'Aim/Condition' in str(v)]

In [9]:
survey_questions.at['Q4_home_size_US_98.0', 'question_answers_combined']

"How many square meters is your accommodation? If you don't know for sure, please provide your best estimation.How many square feet is your accommodation? If you don't know for sure, please provide your best estimation."

In [4]:
# Load the model and tokenizer
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def categorizeADICO(question):
    prompt = f"Task: Categorization. Instructions: Categorize whether the survey question represents an 'Aim', a 'Condition' or 'Aim/Condition'. Survey Question: {question} Categories: - Aim: Actions performed or intended by the responder. - Condition: External factors influencing the responder's decision."

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    outputs = model.generate(input_ids, max_new_tokens=20, )
    return tokenizer.decode(outputs[0],skip_special_tokens=True)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
aim_quest = survey_questions[survey_questions['ADICO Category'].apply(lambda x: "Aim" in x)].head(20).copy()
# Sample questions
aim_quest['model_adico'] = aim_quest['question_answers_combined'].apply(categorizeADICO)
aim_quest

Unnamed: 0,ADICO Category,Question number\n (Questionnaire file),Variable Label\n (Data files),Description,Values,Value labels,question_answers_combined,model_adico
Q8_move_out_1.0,Aim/Condition,Q8,Q8_move_out,How much longer are you planning on staying in...,1.0,I am looking to move in 1 year or less,How much longer are you planning on staying in...,Aim
Q8_move_out_2.0,Aim/Condition,Q8,Q8_move_out,How much longer are you planning on staying in...,2.0,1-5 years,How much longer are you planning on staying in...,Aim
Q8_move_out_3.0,Aim/Condition,Q8,Q8_move_out,How much longer are you planning on staying in...,3.0,5-10 years,How much longer are you planning on staying in...,Aim
Q8_move_out_4.0,Aim/Condition,Q8,Q8_move_out,How much longer are you planning on staying in...,4.0,More than 10 years,How much longer are you planning on staying in...,Aim
Q8_move_out_98.0,Aim/Condition,Q8,Q8_move_out,How much longer are you planning on staying in...,98.0,Don’t know,How much longer are you planning on staying in...,Aim
Q9_easy_leave_1.0,Aim/Condition,Q9,Q9_easy_leave,How easy or difficult would it be to leave the...,1.0,1 - It would be difficult to leave this area,How easy or difficult would it be to leave the...,Aim
Q9_easy_leave_2.0,Aim/Condition,Q9,Q9_easy_leave,How easy or difficult would it be to leave the...,2.0,2,How easy or difficult would it be to leave the...,Aim
Q9_easy_leave_3.0,Aim/Condition,Q9,Q9_easy_leave,How easy or difficult would it be to leave the...,3.0,3,How easy or difficult would it be to leave the...,Aim
Q9_easy_leave_4.0,Aim/Condition,Q9,Q9_easy_leave,How easy or difficult would it be to leave the...,4.0,4,How easy or difficult would it be to leave the...,Aim
Q9_easy_leave_5.0,Aim/Condition,Q9,Q9_easy_leave,How easy or difficult would it be to leave the...,5.0,5 - I could leave this area very easily,How easy or difficult would it be to leave the...,Aim


In [None]:
#Rephrazing
# Load the model and tokenizer
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def intoIG(question):
    prompt = f"Task: Rephrashing. Instructions: Convert the following Survey Question into an descriptive phrase. Phrase Types: e.g. 'reinforce their foundations bi-yearly' or  'if they live with more than four people'. Survey Question: {question} "

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    outputs = model.generate(input_ids, max_new_tokens=100, )
    return tokenizer.decode(outputs[0],skip_special_tokens=True)


In [None]:
aim_quest = survey_questions[survey_questions['ADICO Category'].apply(lambda x: "Aim" in x)].head(20).copy()
# Sample questions
aim_quest['IGVersion'] = aim_quest['question_answers_combined'].apply(intoIG)
aim_quest

In [None]:
aim_quest.at['Q8_move_out_1.0','IGVersion']