Imports and Configuration

In [10]:
import pandas as pd
import numpy as np
import random

# --- Configuration ---
NUM_STUDENTS = 10000
OUTPUT_CSV_FILE = "synthetic_student_data.csv"

# Seed both random number generators this notebook draws from (the stdlib
# `random` module and NumPy's legacy global generator) so that
# Restart Kernel -> Run All reproduces the same synthetic dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Define possible scales based on survey info
LIKERT_SCALE_1_7 = list(range(1, 8))  # e.g., Strongly Disagree to Strongly Agree
K6_SCALE_1_5 = list(range(1, 6))      # Frequency scale for K6 questions
LANGUAGE_SCALE = [0, 1]               # 0: English only, 1: Other/Mixed
PWI_SCALE = list(range(0, 11))        # 0-10 scale for PWI Wellbeing

# Possible school activities
SCHOOL_ACTIVITIES = [
    'Debate Club', 'Sports Team', 'Music Band', 'Art Club',
    'Chess Club', 'Volunteering', 'Drama Club', 'Science Club',
    'Student Government', 'Photography Club'
]

Generate Student IDs

In [11]:
def generate_student_ids(n):
    """Return the student IDs for students 1 through n.

    Each ID is the letter "S" followed by the student's 1-based number,
    zero-padded to at least five digits ("S00001", "S00002", ...).
    An ``n`` of zero or less yields an empty list.
    """
    ids = []
    for number in range(1, n + 1):
        ids.append("S%05d" % number)
    return ids

Generate Network Links

In [12]:
def generate_network_links(student_id, all_ids, max_links=5):
    """Pick random peers for one network (peer-nomination) question.

    Args:
        student_id: ID of the responding student; it never appears in the
            result.
        all_ids: Candidate IDs, normally the full roster including
            ``student_id``. Assumed to contain no duplicate IDs.
        max_links: Inclusive upper bound on the number of peers chosen.

    Returns:
        A ", "-separated string of distinct peer IDs, or an empty string
        when no peer is chosen. The count is uniform on 0..max_links, capped
        at the number of available peers.
    """
    # random.sample needs a sequence; accept any iterable as before.
    roster = all_ids if isinstance(all_ids, (list, tuple)) else list(all_ids)

    num_links = random.randint(0, max_links)

    # Draw one spare candidate from the whole roster and discard the student
    # if they were drawn. This yields the same uniform choice of peers as
    # sampling from "roster minus student", without copying the entire roster
    # on every call (which made dataset generation quadratic in roster size).
    candidates = random.sample(roster, min(num_links + 1, len(roster)))
    selected_peers = [pid for pid in candidates if pid != student_id][:num_links]

    return ", ".join(selected_peers)  # Store as comma-separated string

Generate Activities

In [13]:
def generate_activities(max_activities=3):
    """Pick a random selection of school activities for one student.

    Args:
        max_activities: Inclusive upper bound on how many activities are
            chosen. Values larger than the number of entries in
            SCHOOL_ACTIVITIES are capped at that number.

    Returns:
        A ", "-separated string of distinct names drawn from
        SCHOOL_ACTIVITIES, or an empty string when none is chosen.
    """
    num_activities = random.randint(0, max_activities)
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request the same way
    # generate_network_links caps its peer count.
    num_activities = min(num_activities, len(SCHOOL_ACTIVITIES))
    selected_activities = random.sample(SCHOOL_ACTIVITIES, num_activities)
    return ", ".join(selected_activities)

Generate Synthetic Data

In [14]:
def generate_synthetic_data(num_students):
    """Build a DataFrame of randomly generated survey answers, one row per student.

    Every row holds the student ID, a synthesized academic performance score,
    independently drawn answers to the scale questions, comma-separated peer
    IDs for each network question, and a comma-separated activity list.

    Args:
        num_students: Number of student rows to generate.

    Returns:
        pandas.DataFrame with one row per generated student.
    """
    student_ids = generate_student_ids(num_students)

    # Question groups, each listed in the order its answers are drawn.
    manbox_columns = [f"Manbox5_{q}" for q in range(1, 6)]
    k6_columns = [f"k6_{q}" for q in range(1, 7)]
    likert_columns = [
        "isolated", "WomenDifferent", "COVID", "criticises", "MenBetterSTEM",
        "Intelligence1", "Intelligence2", "Soft", "opinion", "Nerds",
        "comfortable", "future", "bullying",
    ]
    # Network question -> maximum number of peers that may be named.
    network_limits = {
        "Friends": 7,
        "Influential": 5,
        "Feedback": 4,
        "MoreTime": 6,
        "Advice": 4,
        "Disrespect": 3,
    }

    # Column layout of each stored record (differs from the draw order).
    column_order = [
        "StudentID", "Academic_Performance",
        "isolated", "WomenDifferent", "language", "COVID", "criticises",
        "MenBetterSTEM", "pwi_wellbeing", "Intelligence1", "Intelligence2",
        "Soft", "opinion", "Nerds", "comfortable", "future", "bullying",
        *network_limits, "SchoolActivityNet",
        *manbox_columns, *k6_columns,
    ]

    print(f"Generating data for {num_students} students...")

    records = []
    for sid in student_ids:
        answers = {"StudentID": sid}

        # Synthesized score: normal around 70 (sd 15), rounded, clamped to 0-100.
        raw_score = round(np.random.normal(70, 15))
        answers["Academic_Performance"] = max(0, min(100, raw_score))

        # Scale questions.
        for col in manbox_columns:
            answers[col] = random.choice(LIKERT_SCALE_1_7)
        for col in k6_columns:
            answers[col] = random.choice(K6_SCALE_1_5)
        for col in likert_columns:
            answers[col] = random.choice(LIKERT_SCALE_1_7)

        # Language is weighted 80/20 towards the first option; wellbeing is uniform.
        answers["language"] = random.choices(LANGUAGE_SCALE, weights=[0.8, 0.2], k=1)[0]
        answers["pwi_wellbeing"] = random.choice(PWI_SCALE)

        # Network questions, then activities.
        for col, limit in network_limits.items():
            answers[col] = generate_network_links(sid, student_ids, max_links=limit)
        answers["SchoolActivityNet"] = generate_activities(max_activities=3)

        records.append({col: answers[col] for col in column_order})

    return pd.DataFrame(records)

Derived Fields Calculation

In [15]:
def calculate_derived_fields(df):
    """Add the composite survey scores to ``df`` and return it.

    The frame is modified in place (the returned object is the same frame).
    Columns added, in order: Manbox5_overall, Masculinity_contrained,
    GrowthMindset, k6_overall, School_support_engage6, School_support_engage.
    """
    def reversed_item(column):
        # Flip an item around 8, so an answer x becomes 8 - x.
        return 8.0 - df[column]

    manbox_items = [f"Manbox5_{i}" for i in range(1, 6)]
    masculinity_items = ['Soft', 'WomenDifferent', 'Nerds', 'MenBetterSTEM']
    k6_items = [f"k6_{i}" for i in range(1, 7)]
    engage_items = ['criticises', 'comfortable', 'bullying', 'future']

    # Row-wise means of the Manbox and masculinity items.
    df['Manbox5_overall'] = df[manbox_items].mean(axis=1)
    df['Masculinity_contrained'] = df[masculinity_items].mean(axis=1)

    # Average of the two reversed intelligence items.
    df['GrowthMindset'] = (
        reversed_item('Intelligence1') + reversed_item('Intelligence2')
    ) / 2.0

    # K6 total is a sum, not a mean.
    df['k6_overall'] = df[k6_items].sum(axis=1)

    # Six-item score: two reversed items plus the four engagement items.
    engage6_total = reversed_item('isolated') + reversed_item('opinion')
    for item in engage_items:
        engage6_total = engage6_total + df[item]
    df['School_support_engage6'] = engage6_total / 6.0

    # Four-item score: plain mean of the engagement items.
    df['School_support_engage'] = df[engage_items].mean(axis=1)

    return df

Save to CSV

In [16]:
def save_to_csv(df, output_file):
    """Write ``df`` to ``output_file`` as CSV, without the index column.

    Prints one progress message before the write and one after it.
    """
    start_message = f"Saving data to {output_file}..."
    print(start_message)

    df.to_csv(output_file, index=False)

    print("Data saved successfully.")

Execution

In [17]:
# Build the raw survey responses, then append the composite scores.
df = generate_synthetic_data(NUM_STUDENTS)
df = calculate_derived_fields(df)

# Persist the finished dataset.
save_to_csv(df, OUTPUT_CSV_FILE)

# Report what was produced: record count, the first rows, and a column summary.
print(
    "Synthetic data generation complete.",
    f"Generated {len(df)} student records.",
    df.head(),
    sep="\n",
)
print("\nColumn Info:")
df.info()

Generating data for 10000 students...
Saving data to synthetic_student_data.csv...
Data saved successfully.
Synthetic data generation complete.
Generated 10000 student records.
  StudentID  Academic_Performance  isolated  WomenDifferent  language  COVID  \
0    S00001                    77         7               7         0      2   
1    S00002                    69         1               6         0      3   
2    S00003                    83         7               7         1      1   
3    S00004                    67         6               7         1      4   
4    S00005                    51         4               4         0      4   

   criticises  MenBetterSTEM  pwi_wellbeing  Intelligence1  ...  k6_3  k6_4  \
0           6              7              1              4  ...     2     4   
1           7              6              5              4  ...     3     3   
2           7              4              2              5  ...     1     1   
3           1             