## Generate Responses to the Survey

In [1]:
import os
import random

import pandas as pd

In [2]:
# define survey structure + add weights to the questions so some are more likely
SURVEY_QUESTIONS = [
    # Section 1: Background
    ("Year_of_study", ["1st year", "2nd year", "3rd year", "Postgraduate"], [0.2, 0.3, 0.3, 0.2]),
    ("Field_of_study", ["Business / Economics", "Engineering/ Science", "Humanities / Arts", "Other"], [0.35, 0.25, 0.25, 0.15]),
    ("Ever_used_platform", ["Yes", "No"], [0.65, 0.35]),
    # Q4 and Q5 are CONDITIONAL on Q3 being "Yes"
    ("Time_investing", ["Less than 6 months", "6-12 months", "1-2 years", "Over 2 years"], [0.3, 0.3, 0.25, 0.15]),
    ("Main_platform_type", ["Trading app", "Robo-advisor", "Other"], [0.6, 0.3, 0.1]),
    
    # Section 2: Financial Literacy
    ("Confidence_in_terms", ["Not confident", "Somewhat confident", "Confident", "Very confident"], [0.1, 0.3, 0.4, 0.2]),
    ("Purchasing_power_q", ["It increases", "It stays the same", "It decreases", "Not sure"], [0.1, 0.1, 0.6, 0.2]),
    ("Single_stock_risk_q", ["Less risky", "As risky", "More risky than investing in a diversified fund", "Not sure"], [0.05, 0.1, 0.65, 0.2]),
    ("Inflation_time_value_q", ["£100 today", "£110 in one year", "No difference", "Not sure"], [0.2, 0.5, 0.1, 0.2]),
    # Q10 is conditionally relevant to Q3
    ("Knowledge_improved", ["No", "Slightly", "Moderately", "Significantly"], [0.15, 0.3, 0.35, 0.2]),

    # Section 3: Digital Literacy
    ("Confidence_interpreting_data", ["Not confident", "Somewhat confident", "Confident", "Very confident"], [0.05, 0.2, 0.45, 0.3]),
    ("Features_used_most", ["Portfolio analytics", "Stock performance charts", "Market news", "AI or data insights", "Other"], [0.3, 0.35, 0.2, 0.1, 0.05]),
    ("Regretted_misinterpretation", ["Yes", "No", "Not sure"], [0.2, 0.7, 0.1]),

    # Section 4: Investment Behaviour & Attitudes
    ("Frequency_based_on_app_data", ["Never", "Rarely", "Sometimes", "Often", "Always"], [0.05, 0.2, 0.4, 0.25, 0.1]),
    ("Greater_influence", ["Data and analytics", "Social media trends", "Advice from others", "Personal decisions"], [0.4, 0.15, 0.25, 0.2]),
    ("Apps_oversimplify", ["Strongly disagree", "Disagree", "Neutral", "Agree", "Strongly agree"], [0.05, 0.15, 0.2, 0.35, 0.25]),
    ("Likely_to_use_new_features", ["Very unlikely", "Unlikely", "Neutral", "Likely", "Very likely"], [0.05, 0.1, 0.2, 0.4, 0.25]),
    ("Data_viz_helps", ["Strongly disagree", "Disagree", "Neutral", "Agree", "Strongly agree"], [0.01, 0.04, 0.1, 0.45, 0.4])
]

# define a function to create 1 response
def generate_response():
    response = {}
    
    # generate response to q3 first
    q3_col, q3_options, q3_weights = next(q for q in SURVEY_QUESTIONS if q[0] == "Ever_used_platform")
    investor_status = random.choices(q3_options, weights=q3_weights, k=1)[0] # randomly set investor status
    response[q3_col] = investor_status # add that response to the response dictionary
    
    # Iterate through all other questions
    for column_name, options, weights in SURVEY_QUESTIONS:
        
        # skip q3 as already filled
        if column_name == "Q3_Ever_used_platform":
            continue

        # set conditiona questions
        is_conditional = column_name in ["Time_investing", "Main_platform_type", "Knowledge_improved"]
        
        # if the question is conditional and response to q3 is no make answers N/A
        if is_conditional and investor_status == "No":
            response[column_name] = "N/A - Not an investor"
        else:
            # Generate a random choice using weights
            response[column_name] = random.choices(options, weights=weights, k=1)[0]
            
    return response

# Generate the Dataset and Export
def generate_dataset(num_responses, output_filename="../data/survey_responses.csv"): # set the output location
    
    print(f"Generating {num_responses} synthetic survey responses...")
    
    # create a list of the survey dictionary
    data = [generate_response() for _ in range(num_responses)]
    
    # convert the list into a dataframe
    column_order = [q[0] for q in SURVEY_QUESTIONS] # set the coloumn order so it is in survey order
    df = pd.DataFrame(data).reindex(columns=column_order)
    
    # Save to CSV
    df.to_csv(output_filename, index=False)
    
    print(f"Dataset successfully generated and saved to {output_filename}")
    print("\nFirst 5 rows of the generated data:")
    print(df.head())



In [3]:
NUMBER_OF_RESPONSES = 1000

generate_dataset(NUMBER_OF_RESPONSES)

Generating 1000 synthetic survey responses...
Dataset successfully generated and saved to ../data/survey_responses.csv

First 5 rows of the generated data:
  Year_of_study        Field_of_study Ever_used_platform  \
0      2nd year     Humanities / Arts                Yes   
1      2nd year  Engineering/ Science                 No   
2      3rd year  Engineering/ Science                 No   
3      1st year  Engineering/ Science                 No   
4      3rd year  Business / Economics                 No   

          Time_investing     Main_platform_type Confidence_in_terms  \
0              1-2 years            Trading app      Very confident   
1            6-12 months            Trading app       Not confident   
2              1-2 years            Trading app  Somewhat confident   
3     Less than 6 months                  Other           Confident   
4  N/A - Not an investor  N/A - Not an investor  Somewhat confident   

  Purchasing_power_q                              Single