## Generate Responses to the Survey

In [1]:
import pandas as pd
import numpy as np

np.random.seed(123)  # seed NumPy's global RNG so that the generated data is reproducible

In [None]:
N = 1000  # how many synthetic survey responses to generate
data = []  # collects one dict per respondent

# Column order of the final dataset, grouped by survey section
COLUMNS = [
    # Section 1: Background
    "Year_of_study",
    "Field_of_study",
    "Ever_used_platform",
    "Time_investing",
    "Main_platform_type",

    # Section 2: Financial Literacy
    "Confidence_in_terms",
    "Return_in_30_years",
    "Purchasing_power_q",
    "Single_stock_risk_q",
    "Risk_Reward",
    "Inflation_time_value_q",
    "Diversification_purpose_q",
    "Inflation_rate_q",
    "Knowledge_improved",

    # Section 3: Digital Literacy
    "Confidence_interpreting_data",
    "Features_used_most",
    "Regretted_misinterpretation",

    # Section 4: Investment Behaviour & Attitudes
    "Frequency_based_on_app_data",
    "Greater_influence",
    "Apps_oversimplify",
    "Likely_to_use_new_features",
    "Data_viz_helps",
]

# Field of study: the possible values and the probability of each one
base_fields = [
    "Business / Economics",
    "Engineering/ Science",
    "Humanities / Arts",
    "Other",
]
base_field_probs = [0.35, 0.25, 0.25, 0.15]

# Financial literacy questions.
# Each one is stored as (answer options, index of the correct option).

# Q7: If you invest £100 per month into a fund that returns 10% annually, after 30 years...
q_return_30_years = (
    [
        "Capital (Your Investment)",
        "Compound interest/growth",
        "Will be equal",
        "Not Sure",
    ],
    1,  # correct: Compound interest/growth
)

# Q8: If prices increase by 3% but your savings earn 1% interest, what happens to your purchasing power?
q_purchasing = (
    [
        "It increases",
        "It stays the same",
        "It decreases",
        "Not sure",
    ],
    2,  # correct: It decreases
)

# Q9: Investing in a single company's stock is generally:
q_risk = (
    [
        "Less risky",
        "As risky",
        "More risky than investing in a diversified fund",
        "Not sure",
    ],
    2,  # correct: More risky than investing in a diversified fund
)

# Q10: Which statement best describes the relationship between risk and reward in investing?
q_risk_reward = (
    [
        "Higher risk generally leads to lower potential reward",
        "Higher risk generally leads to higher potential reward",
        "Risk and reward are not related",
        "Not sure",
    ],
    1,  # correct: Higher risk generally leads to higher potential reward
)

# Q11: If you were offered £100 today or £110 in one year, which is generally better if inflation is 5%?
q_inflation = (
    [
        "£100 today",
        "£110 in one year",
        "No difference",
        "Not sure",
    ],
    1,  # correct: £110 in one year
)

# Q12: What is the primary purpose of diversification in a stock portfolio?
q_diversification = (
    [
        "To guarantee higher overall returns",
        "To minimize fees and expenses",
        "To reduce the overall risk of the portfolio",
        "Not sure",
    ],
    2,  # correct: To reduce the overall risk of the portfolio
)

# Q13: If the current UK inflation rate is 10%, how much interest would a savings account need to pay to preserve your purchasing power?
q_inflation_rate = (
    [
        "Exactly 0% interest",
        "Less than 10% interest",
        "Exactly 10% interest",
        "Not sure",
    ],
    2,  # correct: Exactly 10% interest
)


# Answer scales for the remaining (non-scored) questions
q_confidence_terms = ["Not confident", "Somewhat confident", "Confident", "Very confident"]
q_confidence_data = list(q_confidence_terms)  # same four-point confidence scale
q_viz_helps = ["Strongly disagree", "Disagree", "Neutral", "Agree", "Strongly agree"]
q_frequency = ["Never", "Rarely", "Sometimes", "Often", "Always"]
q_influence = [
    "Data and analytics",
    "Social media trends",
    "Advice from others",
    "Personal decisions",
]
q_simplify = list(q_viz_helps)  # same five-point agreement scale
q_likely = ["Very unlikely", "Unlikely", "Neutral", "Likely", "Very likely"]

In [None]:
def get_answer(question, prob_correct):
    """Simulate one respondent's answer to a multiple-choice question.

    Args:
        question: ``(options, correct_idx)`` pair, i.e. the list of answer
            strings and the position of the correct one in that list.
        prob_correct: Probability that the correct option is returned.

    Returns:
        The correct option when a uniform random draw falls below
        ``prob_correct``; otherwise one of the other options, picked
        with ``np.random.choice``.
    """
    choices, right_pos = question

    # A single uniform draw decides whether the respondent gets it right.
    answered_correctly = np.random.rand() < prob_correct
    if answered_correctly:
        return choices[right_pos]

    # Otherwise any of the incorrect options will do.
    distractors = [text for pos, text in enumerate(choices) if pos != right_pos]
    return np.random.choice(distractors)

In [None]:
# Start from an empty list so that re-running this cell regenerates the dataset
# instead of appending another N rows to the output of a previous run.
# NOTE: the RNG is not re-seeded here, so a re-run without re-seeding first
# produces a different sample of the same size.
data = []

# Financial literacy questions paired with the column that stores each answer,
# listed in the order in which the answers are drawn.
fls_questions = [
    ("Return_in_30_years", q_return_30_years),
    ("Purchasing_power_q", q_purchasing),
    ("Single_stock_risk_q", q_risk),
    ("Risk_Reward", q_risk_reward),
    ("Inflation_time_value_q", q_inflation),
    ("Diversification_purpose_q", q_diversification),
    ("Inflation_rate_q", q_inflation_rate),
]

for _ in range(N):
    row = {}  # answers of this respondent, keyed by column name

    # set year and field of study
    row["Year_of_study"] = np.random.choice(
        ["1st year", "2nd year", "3rd year", "Postgraduate"],
        p=[0.2, 0.3, 0.3, 0.2],
    )
    row["Field_of_study"] = np.random.choice(base_fields, p=base_field_probs)

    # probability of answering a literacy question correctly, by field
    if row["Field_of_study"] == "Business / Economics":
        prob_correct = 0.7
    elif row["Field_of_study"] == "Engineering/ Science":
        prob_correct = 0.5
    else:
        prob_correct = 0.3

    # Draw the seven financial literacy answers (they depend only on the field
    # of study) and count how many are correct: fin_score ranges from 0 to 7.
    fin_score = 0
    for column, question in fls_questions:
        row[column] = get_answer(question, prob_correct)
        options, correct_idx = question
        if row[column] == options[correct_idx]:
            fin_score += 1

    # probability of having used an investment platform:
    # 0.8 for Business / Economics students, 0.5 for everyone else
    if row["Field_of_study"] == "Business / Economics":
        prob_use_platform = 0.8
    else:
        prob_use_platform = 0.5

    # decide whether they have used an investment platform
    if np.random.rand() < prob_use_platform:
        row["Ever_used_platform"] = "Yes"
    else:
        row["Ever_used_platform"] = "No"

    # distribution over q_confidence_terms, shifted towards higher confidence
    # as the financial literacy score increases
    if fin_score <= 1:
        prob_confidence_terms = [0.5, 0.3, 0.1, 0.1]  # low score
    elif fin_score <= 4:
        prob_confidence_terms = [0.1, 0.4, 0.3, 0.2]  # medium score
    else:
        prob_confidence_terms = [0.05, 0.1, 0.35, 0.5]  # high score

    # NOTE: "Confidence_in_terms" is drawn inside both branches below (rather
    # than once after them) to keep the order of random draws, and therefore
    # the seeded output, unchanged.
    if row["Ever_used_platform"] == "Yes":  # the respondent invests

        # platform type probabilities depend on the field of study
        if row["Field_of_study"] == "Business / Economics":
            platform_probs = [0.8, 0.15, 0.05]
        elif row["Field_of_study"] == "Humanities / Arts":
            platform_probs = [0.3, 0.6, 0.1]
        else:
            platform_probs = [0.6, 0.3, 0.1]

        # draw the platform they mainly use
        row["Main_platform_type"] = np.random.choice(
            ["Trading app", "Robo-advisor", "Other"],
            p=platform_probs,
        )

        # draw the investment experience
        row["Time_investing"] = np.random.choice(
            ["Less than 6 months", "6-12 months", "1-2 years", "Over 2 years"],
            p=[0.3, 0.3, 0.25, 0.15],
        )

        # longer investment experience makes improved knowledge more likely
        if row["Time_investing"] == "Over 2 years":
            prob_knowledge_improved = [0.05, 0.1, 0.25, 0.6]
        elif row["Time_investing"] == "1-2 years":
            prob_knowledge_improved = [0.1, 0.2, 0.4, 0.3]
        else:
            prob_knowledge_improved = [0.2, 0.5, 0.2, 0.1]

        # confidence in financial terms, driven by the literacy score
        row["Confidence_in_terms"] = np.random.choice(q_confidence_terms, p=prob_confidence_terms)

        # whether their knowledge improved, driven by investment experience
        row["Knowledge_improved"] = np.random.choice(
            ["No", "Slightly", "Moderately", "Significantly"],
            p=prob_knowledge_improved,
        )

    else:  # not an investor
        # these answers only make sense for someone who uses a platform
        row["Main_platform_type"] = "N/A - Not an investor"
        row["Time_investing"] = "N/A - Not an investor"
        row["Knowledge_improved"] = "N/A - Not an investor"

        # confidence in financial terms, driven by the literacy score
        row["Confidence_in_terms"] = np.random.choice(q_confidence_terms, p=prob_confidence_terms)

    # distribution over q_confidence_data, again driven by the literacy score
    if fin_score >= 5:
        prob_confidence_data = [0.05, 0.15, 0.4, 0.4]
    elif fin_score >= 2:
        prob_confidence_data = [0.1, 0.3, 0.4, 0.2]
    else:
        prob_confidence_data = [0.3, 0.4, 0.2, 0.1]

    row["Confidence_interpreting_data"] = np.random.choice(q_confidence_data, p=prob_confidence_data)

    # ------------------------------------------------------------------------------------------------------

    # Behaviour questions generation

    # The influence score aggregates how strongly the other variables push the
    # respondent towards good investment behaviours (higher = better).
    influence_score = 0

    # confidence in financial terms
    if row["Confidence_in_terms"] == "Very confident":
        influence_score += 3.0
    elif row["Confidence_in_terms"] == "Confident":
        influence_score += 1.5

    # confidence interpreting data
    if row["Confidence_interpreting_data"] == "Very confident":
        influence_score += 2.5
    elif row["Confidence_interpreting_data"] == "Confident":
        influence_score += 1.0

    # financial literacy score
    if fin_score >= 5:
        influence_score += 2.0
    elif fin_score >= 2:
        influence_score += 0.5

    # Extra bonus for an already high score: it pushes these respondents
    # further towards the top tier below, where "Data and analytics" is by far
    # the most likely "Greater_influence" answer.
    if influence_score >= 6:
        influence_score += 0.5

    # long investment experience raises the score
    if row["Time_investing"] == "Over 2 years":
        influence_score += 1.0

    # studying Business / Economics raises the score
    if row["Field_of_study"] == "Business / Economics":
        influence_score += 1.0

    # answer distributions for the behaviour questions, by influence tier
    if influence_score >= 7.5:

        # Strongest investment behaviours
        prob_q18 = [0.0, 0.05, 0.1, 0.4, 0.45]       # frequency of acting on app data
        prob_q21 = [0.0, 0.05, 0.1, 0.4, 0.45]       # likelihood of using new features
        prob_q22 = [0.0, 0.0, 0.05, 0.35, 0.6]       # agreement that data visualisation helps
        prob_q20 = [0.6, 0.3, 0.1, 0.0, 0.0]         # agreement that apps oversimplify (mostly disagree)
        prob_q19 = [0.9, 0.03, 0.02, 0.05]           # greater influence (mostly data and analytics)

    elif influence_score >= 4.0:

        # Moderate investment behaviours
        prob_q18 = [0.1, 0.2, 0.4, 0.2, 0.1]
        prob_q21 = [0.1, 0.2, 0.3, 0.2, 0.2]
        prob_q22 = [0.05, 0.1, 0.2, 0.35, 0.3]
        prob_q20 = [0.1, 0.2, 0.3, 0.25, 0.15]
        prob_q19 = [0.5, 0.2, 0.1, 0.2]

    else:

        # Lowest investment behaviours
        prob_q18 = [0.2, 0.4, 0.3, 0.05, 0.05]
        prob_q21 = [0.2, 0.3, 0.3, 0.1, 0.1]
        prob_q22 = [0.1, 0.2, 0.3, 0.3, 0.1]
        prob_q20 = [0.05, 0.1, 0.1, 0.4, 0.35]
        prob_q19 = [0.1, 0.3, 0.3, 0.3]

    # generate the behavioural variables
    row["Data_viz_helps"] = np.random.choice(q_viz_helps, p=prob_q22)
    row["Frequency_based_on_app_data"] = np.random.choice(q_frequency, p=prob_q18)
    row["Greater_influence"] = np.random.choice(q_influence, p=prob_q19)
    row["Likely_to_use_new_features"] = np.random.choice(q_likely, p=prob_q21)
    row["Apps_oversimplify"] = np.random.choice(q_simplify, p=prob_q20)

    # fill the remaining columns, independently of the other answers
    row["Features_used_most"] = np.random.choice(
        ["Portfolio analytics", "Stock performance charts", "Market news", "AI or data insights", "Other"],
        p=[0.3, 0.35, 0.2, 0.1, 0.05],
    )
    row["Regretted_misinterpretation"] = np.random.choice(["Yes", "No", "Not sure"], p=[0.2, 0.7, 0.1])

    data.append(row)  # add the finished row to the dataset

In [5]:
# Build the full DataFrame (columns in survey order) and save the dataset
from pathlib import Path

output_path = Path('../data/survey_responses.csv')
# to_csv does not create missing directories, so make sure the target folder
# exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)

df = pd.DataFrame(data, columns=COLUMNS)
df.to_csv(output_path, index=False)

print(f"Generated {len(df)} survey responses.")

Generated 1000 survey responses.


## Reference

- https://www.datacamp.com/tutorial/synthetic-data-generation