## Quiz 1 synthetic dataset generation

In [None]:
import json
from collections import defaultdict

In [None]:
# Load Quiz 1 performance data.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's locale default.
with open("/content/simulated_quiz_1_performance.json", "r", encoding="utf-8") as file:
    quiz_1_data = json.load(file)

# Load question bank (same explicit encoding as above).
with open("/content/questions.json", "r", encoding="utf-8") as file:
    questions = json.load(file)

In [None]:
# Index the question bank by question ID so that a question's details can be
# fetched directly from its ID. If two entries share an ID, the later one wins.
question_lookup = {
    question["question_id"]: question
    for question in questions
}

In [None]:
from collections import defaultdict

def calculate_features(quiz_1_data, questions):
    """
    Build one feature row per (user, answered question) from Quiz 1 data.

    Args:
        quiz_1_data: Iterable of user records. Each record is a dict with a
            "user_id" and an "answers" list; every answer must carry a
            "question_id" and an "is_correct" flag.
        questions: Iterable of question dicts, each providing "question_id",
            "category", "difficulty" and "importance_weight".

    Returns:
        A list of dicts, one per answer, with the keys "user_id",
        "question_id", "category", "difficulty", "difficulty_numeric",
        "overall_accuracy", "category_performance", "importance_weight"
        and "is_correct". Users with no answers contribute no rows.

    Raises:
        KeyError: If a record is missing a required key or an answer refers
            to a question_id that is absent from ``questions``.
    """
    # Numerical encoding of the difficulty label; unknown labels map to 0.
    difficulty_mapping = {"Easy": 1, "Medium": 2, "Hard": 3}

    # Lookup for question details
    question_lookup = {q["question_id"]: q for q in questions}

    user_features = []

    for user_data in quiz_1_data:
        user_id = user_data["user_id"]
        answers = user_data["answers"]

        # Overall accuracy across every answer the user gave.
        total_questions = len(answers)
        correct_answers = sum(1 for ans in answers if ans["is_correct"])
        overall_accuracy = correct_answers / total_questions if total_questions > 0 else 0

        # Per-category tallies. The category is taken from the question bank
        # (not from the answer record) so the tallies use exactly the same
        # key as the lookup and the emitted "category" field below.
        category_correct = defaultdict(int)
        category_total = defaultdict(int)

        for ans in answers:
            category = question_lookup[ans["question_id"]]["category"]
            category_total[category] += 1
            if ans["is_correct"]:
                category_correct[category] += 1

        # Generating feature vectors for each question answered by the user
        for ans in answers:
            question_id = ans["question_id"]
            question_details = question_lookup[question_id]
            category = question_details["category"]
            difficulty = question_details["difficulty"]

            # Every category seen here was counted at least once in the
            # tally pass above, so the denominator is never zero.
            category_perf = category_correct[category] / category_total[category]

            user_features.append({
                "user_id": user_id,
                "question_id": question_id,
                "category": category,
                "difficulty": difficulty,
                "difficulty_numeric": difficulty_mapping.get(difficulty, 0),
                "overall_accuracy": overall_accuracy,
                "category_performance": category_perf,
                "importance_weight": question_details["importance_weight"],
                "is_correct": ans["is_correct"],
            })

    return user_features

In [None]:
# Build the per-answer feature rows from the loaded Quiz 1 results and the
# question bank.
features = calculate_features(
    quiz_1_data=quiz_1_data,
    questions=questions,
)

In [None]:
# Persist the generated feature rows as pretty-printed JSON (4-space indent).
output_file = "quiz_1_features.json"
with open(output_file, "w") as file:
    file.write(json.dumps(features, indent=4))
