## Generate Responses to the Survey

In [39]:
import pandas as pd
import random
import io
import numpy as np

# Seed both generators so the generated dataset is reproducible.
# All sampling in this notebook goes through NumPy's global RNG
# (np.random.rand / np.random.choice), which random.seed() does not affect.
random.seed(123)
np.random.seed(123)

In [40]:
# How many synthetic survey responses to generate.
N = 1000

# Accumulator for the generated rows (one dict per response).
data = []

# Column names, in the order they are written to the output table.
COLUMNS = [
    "Year_of_study",
    "Field_of_study",
    "Ever_used_platform",
    "Time_investing",
    "Main_platform_type",
    "Confidence_in_terms",
    "Purchasing_power_q",
    "Single_stock_risk_q",
    "Inflation_time_value_q",
    "Knowledge_improved",
    "Confidence_interpreting_data",
    "Features_used_most",
    "Regretted_misinterpretation",
    "Frequency_based_on_app_data",
    "Greater_influence",
    "Apps_oversimplify",
    "Likely_to_use_new_features",
    "Data_viz_helps",
]

# Field-of-study categories and the probability of sampling each one.
base_fields = [
    "Business / Economics",
    "Engineering/ Science",
    "Humanities / Arts",
    "Other",
]
base_field_probs = [0.35, 0.25, 0.25, 0.15]

# Q6: confidence with financial terms.
q_confidence_terms = [
    "Not confident",
    "Somewhat confident",
    "Confident",
    "Very confident",
]

# Q7-Q9: knowledge questions, each stored as (answer options, index of the
# correct option within that list).
q_purchasing = (
    ["It increases", "It stays the same", "It decreases", "Not sure"],
    2,
)
q_risk = (
    [
        "Less risky",
        "As risky",
        "More risky than investing in a diversified fund",
        "Not sure",
    ],
    2,
)
q_inflation = (
    ["£100 today", "£110 in one year", "No difference", "Not sure"],
    1,
)

# Q11 and Q18 (digital literacy): answer scales.
q_confidence_data = [
    "Not confident",
    "Somewhat confident",
    "Confident",
    "Very confident",
]
q_viz_helps = [
    "Strongly disagree",
    "Disagree",
    "Neutral",
    "Agree",
    "Strongly agree",
]

# Q14: frequency scale.
q_frequency = ["Never", "Rarely", "Sometimes", "Often", "Always"]

# Q15: what has the greater influence on decisions.
q_influence = [
    "Data and analytics",
    "Social media trends",
    "Advice from others",
    "Personal decisions",
]

# Q17: likelihood scale.
q_likely = ["Very unlikely", "Unlikely", "Neutral", "Likely", "Very likely"]



In [41]:
def get_answer(question_tuple, prob_correct):
    """Simulate one respondent's answer to a knowledge question.

    Args:
        question_tuple: Pair ``(options, correct_idx)`` holding the list of
            answer options and the index of the correct one.
        prob_correct: Probability of returning the correct option.

    Returns:
        The correct option when a uniform draw falls below ``prob_correct``;
        otherwise one of the remaining options, picked by ``np.random.choice``
        with its default (uniform) weighting.
    """
    choices, right_idx = question_tuple

    # Guard clause: the respondent answers correctly.
    if np.random.rand() < prob_correct:
        return choices[right_idx]

    # Otherwise pick among every option except the correct one.
    distractors = [
        choice for position, choice in enumerate(choices) if position != right_idx
    ]
    return np.random.choice(distractors)

In [42]:
# Generate N synthetic survey responses, one dict per respondent.
#
# The answers are sampled so that the planned relationships exist in the data:
#   * Field_of_study drives the chance of answering the knowledge questions
#     correctly, of having used a platform, and the platform type.
#   * Time_investing drives Confidence_in_terms and Knowledge_improved.
#   * The knowledge score (number of correct answers, 0-3) drives the
#     data-confidence and behaviour questions.
#
# Start from an empty list so that re-running this cell replaces the previous
# responses instead of appending another N rows to them.
data = []

# The three knowledge questions, paired with the column each answer goes in.
knowledge_questions = (
    ("Purchasing_power_q", q_purchasing),
    ("Single_stock_risk_q", q_risk),
    ("Inflation_time_value_q", q_inflation),
)

for _ in range(N):
    row = {}

    # Independent variables (Section 1)
    row["Year_of_study"] = np.random.choice(
        ["1st year", "2nd year", "3rd year", "Postgraduate"],
        p=[0.2, 0.3, 0.3, 0.2],
    )
    row["Field_of_study"] = np.random.choice(base_fields, p=base_field_probs)

    # H1: Field_of_study -> financial literacy score
    if row["Field_of_study"] == "Business / Economics":
        prob_correct = 0.7  # 70% chance per question
    elif row["Field_of_study"] == "Engineering/ Science":
        prob_correct = 0.5  # 50% chance per question
    else:  # "Humanities / Arts" or "Other"
        prob_correct = 0.3  # 30% chance per question

    # Answer each knowledge question and count the correct ones. The correct
    # option is read from the question tuple so it is defined in one place.
    fin_score = 0
    for column, question in knowledge_questions:
        row[column] = get_answer(question, prob_correct)
        options, correct_idx = question
        if row[column] == options[correct_idx]:
            fin_score += 1

    # Field_of_study -> platform use: Business / Economics students have an
    # 80% chance of having used a platform, everyone else 50%.
    if row["Field_of_study"] == "Business / Economics":
        prob_use_platform = 0.8
    else:
        prob_use_platform = 0.5
    row["Ever_used_platform"] = "Yes" if np.random.rand() < prob_use_platform else "No"

    if row["Ever_used_platform"] == "Yes":
        # Probabilities are for ["Trading app", "Robo-advisor", "Other"].
        if row["Field_of_study"] == "Business / Economics":
            platform_probs = [0.8, 0.15, 0.05]  # 80% chance of a trading app
        elif row["Field_of_study"] == "Humanities / Arts":
            platform_probs = [0.3, 0.6, 0.1]  # 60% chance of a robo-advisor
        else:
            platform_probs = [0.6, 0.3, 0.1]  # science / other: 60% trading app

        row["Main_platform_type"] = np.random.choice(
            ["Trading app", "Robo-advisor", "Other"],
            p=platform_probs,
        )
        row["Time_investing"] = np.random.choice(
            ["Less than 6 months", "6-12 months", "1-2 years", "Over 2 years"],
            p=[0.3, 0.3, 0.25, 0.15],
        )

        # Time_investing -> Confidence_in_terms & Knowledge_improved
        if row["Time_investing"] == "Over 2 years":
            prob_confidence_terms = [0.05, 0.1, 0.35, 0.5]  # confidence likely high
            prob_knowledge_improved = [0.05, 0.15, 0.3, 0.5]  # improvement likely high
        elif row["Time_investing"] == "1-2 years":
            prob_confidence_terms = [0.1, 0.2, 0.4, 0.3]  # confidence likely medium
            prob_knowledge_improved = [0.1, 0.2, 0.4, 0.3]  # improvement likely medium
        else:  # less than 1 year ("Less than 6 months" or "6-12 months")
            prob_confidence_terms = [0.2, 0.5, 0.2, 0.1]  # confidence likely low
            prob_knowledge_improved = [0.2, 0.5, 0.2, 0.1]  # improvement likely low

        row["Confidence_in_terms"] = np.random.choice(
            q_confidence_terms, p=prob_confidence_terms
        )
        row["Knowledge_improved"] = np.random.choice(
            ["No", "Slightly", "Moderately", "Significantly"],
            p=prob_knowledge_improved,
        )
    else:
        # Not an investor: platform-specific questions get a placeholder.
        row["Main_platform_type"] = "N/A - Not an investor"
        row["Time_investing"] = "N/A - Not an investor"
        row["Knowledge_improved"] = "N/A - Not an investor"
        row["Confidence_in_terms"] = np.random.choice(
            q_confidence_terms, p=[0.4, 0.5, 0.05, 0.05]  # low confidence
        )

    # Knowledge -> behaviour
    if fin_score >= 2:
        # High score: more confident with data, more data-driven.
        prob_confidence_data = [0.05, 0.15, 0.4, 0.4]  # likely confident interpreting data
        prob_viz_helps = [0.01, 0.04, 0.1, 0.35, 0.5]  # likely to agree visualisation helps
        prob_q14 = [0.05, 0.1, 0.3, 0.35, 0.2]  # often bases decisions on app data
        prob_q15 = [0.7, 0.1, 0.1, 0.1]  # mostly influenced by data and analytics
        prob_q17 = [0.05, 0.05, 0.1, 0.4, 0.4]  # likely to use new features
    else:
        # Low score: less confident with data, more influenced by others.
        prob_confidence_data = [0.3, 0.4, 0.2, 0.1]  # likely less confident interpreting data
        prob_viz_helps = [0.1, 0.2, 0.3, 0.3, 0.1]  # less likely to agree visualisation helps
        prob_q14 = [0.2, 0.4, 0.3, 0.05, 0.05]  # rarely bases decisions on app data
        prob_q15 = [0.1, 0.3, 0.3, 0.3]  # more influenced by others / social media
        prob_q17 = [0.2, 0.3, 0.3, 0.1, 0.1]  # unlikely to use new features

    # NOTE(review): the questions below are answered for every respondent,
    # including those with Ever_used_platform == "No" — confirm that this is
    # intended for the app-related questions.
    row["Confidence_interpreting_data"] = np.random.choice(
        q_confidence_data, p=prob_confidence_data
    )
    row["Data_viz_helps"] = np.random.choice(q_viz_helps, p=prob_viz_helps)
    row["Frequency_based_on_app_data"] = np.random.choice(q_frequency, p=prob_q14)
    row["Greater_influence"] = np.random.choice(q_influence, p=prob_q15)
    row["Likely_to_use_new_features"] = np.random.choice(q_likely, p=prob_q17)

    # Remaining questions are sampled independently of everything else, as
    # they are expected to have low correlation with the other variables.
    row["Features_used_most"] = np.random.choice(
        ["Portfolio analytics", "Stock performance charts", "Market news", "AI or data insights", "Other"],
        p=[0.3, 0.35, 0.2, 0.1, 0.05],
    )
    row["Regretted_misinterpretation"] = np.random.choice(
        ["Yes", "No", "Not sure"],
        p=[0.2, 0.7, 0.1],
    )
    row["Apps_oversimplify"] = np.random.choice(
        ["Strongly disagree", "Disagree", "Neutral", "Agree", "Strongly agree"],
        p=[0.05, 0.15, 0.2, 0.35, 0.25],
    )

    data.append(row)

In [43]:
# Assemble the generated rows into a table with the columns in COLUMNS order,
# then write it to disk without the index column.
output_path = '../data/survey_responses.csv'
df = pd.DataFrame(data, columns=COLUMNS)
df.to_csv(output_path, index=False)

# Report how many responses were written.
print(f"Generated {len(df)} survey responses.")

Generated 1000 survey responses.
