# PRIMMDebug Log Data Analysis Notebook
This notebook displays all of the analysis of the log data that took place in the PRIMMDebug initial research paper.

The log data was collected from five schools between December 2024-February 2025. It is divided into the following sections:
1. **Summary statistics:** An overview of the scale of the data and a breakdown of the participants.
2. **Correctness:**
3. 

All you need to do is run the notebooks in order and the statistics that appear in the paper will be displayed. If there are any issues, please report them in the Issues section of the GitHub repository.

Before we run anything else, let's first import all of the necessary libraries and data.

In [1]:
from analysis.log_data_analysis.classes.stage_log import StageLog
from analysis.log_data_analysis.classes.exercise_log import ExerciseLog
from analysis.log_data_analysis.classes.student_id import StudentId
from analysis.log_data_analysis.classes.exercise_classes.exercise import Exercise
from analysis.log_data_analysis.classes.processors.exercise_log_processor import ExerciseLogProcessor
from analysis.log_data_analysis.classes.processors.stage_log_processor import StageLogProcessor

from analysis.log_data_analysis.loading_services.fetch_logs_from_file import fetch_data_from_json
from analysis.log_data_analysis.loading_services.parse_logs import *

from analysis.log_data_analysis.testing_service.docker_interface import DockerInterface
from analysis.log_data_analysis.testing_service.test_report import TestReport

from analysis.log_data_analysis.constants import *
from analysis.log_data_analysis.notebook_utils import *

import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
from statistics import median
from math import isnan
import datetime
from collections import defaultdict
from pandas import DataFrame, Index, read_csv, cut
import numpy as np
import scipy.stats as stats


print("Loading logs")
EXERCISES: list[Exercise] = parse_exercises(fetch_data_from_json("data/exercises"))
STAGE_LOGS: list[StageLog] = parse_stage_logs(fetch_data_from_json("data/stage_logs"))
EXERCISE_LOGS: list[ExerciseLog] = parse_exercise_logs(STAGE_LOGS, fetch_data_from_json("data/exercise_logs"))
STUDENT_IDS: list[StudentId] = parse_student_ids(fetch_data_from_json("data/student_ids"), EXERCISE_LOGS)

#Initiate Docker container
run_tests: bool = True
if run_tests:
    print("Adding test harness data")
    docker_interface: DockerInterface = DockerInterface.get_instance()
    docker_interface.create_docker_container()
    test_reports: list[TestReport] = []

    for exercise_log in EXERCISE_LOGS:
        exercise_test_report: TestReport = ExerciseLogProcessor.test_final_program(exercise_log, docker_interface)
        if exercise_test_report is not None:
            test_reports.append(exercise_test_report)
            exercise_log._test_report = exercise_test_report

    docker_interface.close_docker_container()

print("Adding session data")
EXERCISE_LOGS_PER_SESSION: dict[int, list[ExerciseLog]] = {}
for exercise_log in EXERCISE_LOGS:
    exercise_log = ExerciseLogProcessor.add_correctness_to_test_stages(exercise_log)
    session_number: int = exercise_log.session
    EXERCISE_LOGS_PER_SESSION[session_number] = EXERCISE_LOGS_PER_SESSION.get(session_number, []) + [exercise_log]

Loading logs
Adding test harness data
Docker image built
Docker container created
Adding session data


## 1. Summary Statistics

### Log Data Summary
This data displays the following summary statistics to give information into the scale of the data we collected. We report below on:
- Number of exercises
  - Per student
  - Per session
- Number of PRIMMDebug stages
  - Per challenge attempt

In [2]:
print(f"Number of attempted PRIMMDebug challenges: {len(EXERCISE_LOGS)}")
print(f"Number of completed PRIMMDebug stages: {len(STAGE_LOGS)}")
total_time: float = sum([ExerciseLogProcessor.get_time_on_exercise(exercise_log) for exercise_log in EXERCISE_LOGS])
print(f"Total time on PRIMMDebug challenges: {datetime.timedelta(seconds=total_time)}\n")

print(f"Number of students who consented to taking part in the study: {11+16+25+26+16}")
print(f"Total number of students who attempted at least one PRIMMDebug challenge: {len(STUDENT_IDS)}\n") #TODO: Clean number of student IDs to those who only attempted one exercise

#Number of attempts at each PRIMMDebug challenge
challenge_attempts: dict[str, int] = {}
for exercise_log in EXERCISE_LOGS:
    challenge_attempts[exercise_log.exercise_name] = challenge_attempts.get(exercise_log.exercise_name, 0) + 1
challenge_attempts = dict(sorted(challenge_attempts.items(), key=lambda item: item[1], reverse=True)) #Sort by frequency
px.bar(
    x = challenge_attempts.keys(),
    y = challenge_attempts.values(),
    labels = {"x": "Challenge Name", "y": "Frequency"},
    title=f"Number of attempts for each PRIMMDebug challenge (n={len(EXERCISE_LOGS)})"
).show()

#Number of challenges attempted by each student
challenges_per_student: dict[str, int] = {}
for exercise in EXERCISE_LOGS:
    student_id: str = exercise.student_id
    challenges_per_student[student_id] = challenges_per_student.get(student_id, 0) + 1
px.histogram(
    challenges_per_student.values(),
    marginal="box",
    labels={"value": "Number of challenges", "count": "Frequency"},
    title=f"Number of attempted PRIMMDebug challenges, broken down by student (n={len(challenges_per_student)})"
).show()

# Number of stages per PRIMMDebug challenge attempt
stages_per_challenge_attempt: list[int] = [len(exercise.stage_logs) for exercise in EXERCISE_LOGS]
px.histogram(
    stages_per_challenge_attempt,
    marginal="box",
    labels={"value": "Number of stages", "count": "Frequency"},
    title=f"Number of stages per PRIMMDebug challenge attempt (n={len(stages_per_challenge_attempt)})"
).show()

# Number of challenge attempts per session
attempts_per_session: dict[int, int] = {}
for session, logs in EXERCISE_LOGS_PER_SESSION.items():
    attempts_per_session[session] = len(logs)
px.bar(
    x=attempts_per_session.keys(),
    y=attempts_per_session.values(),
    labels={"x": "Session", "y": "Frequency"},
    title=f"Number of PRIMMDebug challenges in each session (n={sum(attempts_per_session.values())})"
).show()

# Final stage of PRIMMDebug challenge attempts
challenge_end_stages: dict[str, int] = dict(Counter([ExerciseLogProcessor.get_last_stage(exercise_log).stage_name.name for exercise_log in EXERCISE_LOGS]))
challenge_end_stages = {stage.name: challenge_end_stages.get(stage.name, 0) for stage in DebuggingStage if stage.name not in ["completed_test", "exit"]}
px.bar(
    x=list(challenge_end_stages.keys()),
    y=list(challenge_end_stages.values()),
    labels={"x": "PRIMMDebug stage", "y": "Frequency"},
    title=f"Final stage of PRIMMDebug challenge attempts (n={len(EXERCISE_LOGS)})"
).show()

Number of attempted PRIMMDebug challenges: 377
Number of completed PRIMMDebug stages: 4273
Total time on PRIMMDebug challenges: 2 days, 1:48:42.253000

Number of students who consented to taking part in the study: 94
Total number of students who attempted at least one PRIMMDebug challenge: 69



### Student Demographics

Number of students:
- By gender
- By year group
- By school


In [3]:
print(f"Number of participating students: {len(STUDENT_IDS)}") #TODO: Implement as csv file to save
school_split_fig = px.bar(x = get_school_split().keys(), y = get_school_split().values(), labels={"x": "School", "y": "Frequency"})
school_split_fig.show()

Number of participating students: 69


### 2. Time Taken

We now present the distribution of times taken for each PRIMMDebug challenge and stage.

In [4]:
#Time taken per PRIMMDebug challenge attempt
time_per_challenge_attempt: list[float] = [ExerciseLogProcessor.get_time_on_exercise(exercise) for exercise in EXERCISE_LOGS if hasattr(exercise,"end_time")]

skewness = stats.skew(time_per_challenge_attempt, nan_policy='omit')
kurtosis = stats.kurtosis(time_per_challenge_attempt, nan_policy='omit')
sd = np.nanstd(time_per_challenge_attempt)

px.histogram(
    time_per_challenge_attempt, marginal="box",
    labels={"value": "Time taken (seconds)", "count": "Count"},
    title=f"Time taken per PRIMMDebug challenge (skewness={skewness:.2f}, kurtosis={kurtosis:.2f}, std={sd:.2f})"
).show()

#Time series for time per PRIMMDebug challenge attempt
median_time_per_challenge_per_session: dict[int, float] = {}
for session_id, logs in EXERCISE_LOGS_PER_SESSION.items():
    median_time_per_challenge = median([ExerciseLogProcessor.get_time_on_exercise(log) for log in logs if hasattr(log, "end_time")])
    median_time_per_challenge_per_session[session_id] = median_time_per_challenge

median_time_per_challenge_per_session = dict(sorted(median_time_per_challenge_per_session.items()))
px.line(
    x=median_time_per_challenge_per_session.keys(),
    y=median_time_per_challenge_per_session.values(),
    labels={"x": "Session", "y": "Median time (seconds)"},
    title="Median time spent on each PRIMMDebug challenge per session"
).show()

#Time taken per stage log
time_per_stage: list[float] = [StageLogProcessor.get_time_on_stage(stage) for stage in STAGE_LOGS if StageLogProcessor.get_time_on_stage(stage) is not None]
skewness_stage = stats.skew(time_per_stage, nan_policy='omit')
kurtosis_stage = stats.kurtosis(time_per_stage, nan_policy='omit')
std_stage = np.nanstd(time_per_stage)

px.histogram(
    time_per_stage,
    marginal="box",
    labels={"value": "Time taken (seconds)", "count": "Count"},
    title=f"Time taken per PRIMMDebug stage (skewness={skewness_stage:.2f}, kurtosis={kurtosis_stage:.2f}, std={std_stage:.2f})"
).show()

#### Time taken per PRIMMDebug stage
This contains more of the interesting data relating to each stage of the PRIMMDebug process, including:
- Time taken per PRIMMDebug stage
  - As a histogram
  - As a bar chart
- How this varies over number of sessions

In [5]:
#Time taken for each stage of PRIMMDebug (TODO: Add confidence intervals)
time_by_primmdebug_stage = {"stage": [], "time": []}

for stage in STAGE_LOGS:
    if stage.stage_name != DebuggingStage.exit:
        time_by_primmdebug_stage["stage"].append(stage.stage_name.value)
        time_by_primmdebug_stage["time"].append(StageLogProcessor.get_time_on_stage(stage))
px.histogram(time_by_primmdebug_stage, x="time", color="stage", nbins=50, marginal="box", labels={"value": "Time taken (seconds)", "count": "Count"}, title="Time taken per PRIMMDebug stage").show()

# Group times by stage
stage_times = defaultdict(list)
for stage, time in zip(time_by_primmdebug_stage["stage"], time_by_primmdebug_stage["time"]):
    if time is not None:
        stage_times[stage].append(time)

median_time_by_primmdebug_stage = {"stage": [], "time": []}
for stage in DebuggingStage:
    stage_name = stage.value
    if stage_name in stage_times and stage_times[stage_name]:
        median_time_by_primmdebug_stage["stage"].append(stage_name)
        median_time_by_primmdebug_stage["time"].append(median(stage_times[stage_name]))

# Print skewness for each stage in median_time_by_primmdebug_stage
for stage, median_time in zip(median_time_by_primmdebug_stage["stage"], median_time_by_primmdebug_stage["time"]):
    stage_times_list = stage_times[stage]
    stage_skewness = stats.skew(stage_times_list, nan_policy='omit')
    print(f"Skewness of times spent on the {stage} stage: {stage_skewness:.2f}")

px.bar(
    median_time_by_primmdebug_stage,
    x="stage",
    y="time",
    labels={"stage": "PRIMMDebug stage", "time": "Median time on stage (seconds)"},
    height=550,
    width=700
).update_layout(font=dict(size=14)).show()

#Time series for time per PRIMMDebug stage
median_time_per_stage_per_session = {"session": [], "stage": [], "median_time": []}
# Iterate through each session and calculate median time for each stage
for session_id, logs in EXERCISE_LOGS_PER_SESSION.items():
    stage_times = {stage: [] for stage in DebuggingStage if stage != DebuggingStage.exit}
    for log in logs:
        for stage_log in log.stage_logs:
            if stage_log.stage_name != DebuggingStage.exit:
                time_on_stage = StageLogProcessor.get_time_on_stage(stage_log)
                if time_on_stage is not None:
                    stage_times[stage_log.stage_name].append(time_on_stage)
    for stage, times in stage_times.items():
        if times:
            median_time_per_stage_per_session["session"].append(session_id)
            median_time_per_stage_per_session["stage"].append(stage.value)
            median_time_per_stage_per_session["median_time"].append(median(times))

median_time_per_stage_per_session_df = DataFrame(median_time_per_stage_per_session).sort_values(by="session")
px.line(
    median_time_per_stage_per_session_df,
    x="session",
    y="median_time",
    color="stage",
    labels={"session": "Session", "median_time": "Median time (seconds)", "stage": "PRIMMDebug stage"},
    title="Median time spent on each PRIMMDebug stage per session"
).show()

Skewness of times spent on the predict stage: 5.64
Skewness of times spent on the run stage: 10.89
Skewness of times spent on the spot_the_defect stage: 3.76
Skewness of times spent on the inspect_the_code stage: 4.21
Skewness of times spent on the find_the_error stage: 12.50
Skewness of times spent on the fix_the_error stage: 8.05
Skewness of times spent on the test stage: 15.95
Skewness of times spent on the modify stage: 3.68


## 3. Correctness and Outcomes of Exercise Attempts
These findings relate to any metrics of success that can be measured, which are described below:
- **Successful** challenge attempts: Where the final snapshot of a students' challenge attempt passes all the test harnesses.
- Challenge attempts where the final snapshot of a challenge attempt successfully executes (runs without raising error messages). This is different to whether programs passed the test or not; a snapshot could run without raising errors but still be logically correct
- **"Entirely completed"** challenge attempts: Where the student has reached the final *Make* stage of PRIMM. This required them to self-report their success at the *Test* stage.
- Number of *Find the Error* stages where students correctly responded (this was required for all but one of the PRIMMDebug challenges).

In [6]:
successful_attempts: int = len([test_report for test_report in test_reports if test_report.n_successful_tests == test_report.n_total_tests])
print(f"Number of attempts where final program passes test harnesses: {display_percentage_string(successful_attempts, len(test_reports))}")

final_program_states: list[bool] = [ExerciseLogProcessor.is_final_program_erroneous(exercise) for exercise in EXERCISE_LOGS]
number_successful_final_program_states: list[bool] = len([final_program_state for final_program_state in final_program_states if final_program_state])
print(f"Proportion of PRIMMDebug challenges where last program run successfully executed: {display_percentage_string(number_successful_final_program_states, len(EXERCISE_LOGS))}")

number_completed_exercises: int = len([exercise_log for exercise_log in EXERCISE_LOGS if ExerciseLogProcessor.get_last_stage(exercise_log).stage_name == DebuggingStage.modify])
print(f"Number of entirely completed PRIMMDebug challenges (where students reached the Make stage): {display_percentage_string(number_completed_exercises, len(EXERCISE_LOGS))}\n")

find_error_stages_with_correct_field: list[StageLog] = [stage_log for stage_log in STAGE_LOGS if stage_log.stage_name == DebuggingStage.find_error and stage_log.correct is not None]
correct_find_error_stages: int = len([stage_log for stage_log in find_error_stages_with_correct_field if stage_log.correct])
print(f"Number of find the error stages where the correct response was entered (for challenges where students had to pinpoint a line): {display_percentage_string(correct_find_error_stages, len(find_error_stages_with_correct_field))}")

first_find_error_stages_with_correct_field: list[StageLog] = [ExerciseLogProcessor.get_first_find_the_error_stage(exercise_log) for exercise_log in EXERCISE_LOGS if ExerciseLogProcessor.get_first_find_the_error_stage(exercise_log) is not None and ExerciseLogProcessor.get_first_find_the_error_stage(exercise_log).correct is not None]
correct_first_find_error_stages: int = len([stage_log for stage_log in first_find_error_stages_with_correct_field if stage_log.correct])
print(f"Number of first-time find the error stages where the correct response was entered (for challenges where students had to pinpoint a line): {display_percentage_string(correct_first_find_error_stages, len(first_find_error_stages_with_correct_field))}")

Number of attempts where final program passes test harnesses: 157/372 (42.20%)
Proportion of PRIMMDebug challenges where last program run successfully executed: 274/377 (72.68%)
Number of entirely completed PRIMMDebug challenges (where students reached the Make stage): 53/377 (14.06%)

Number of find the error stages where the correct response was entered (for challenges where students had to pinpoint a line): 364/545 (66.79%)
Number of first-time find the error stages where the correct response was entered (for challenges where students had to pinpoint a line): 255/308 (82.79%)


### Differences in correctness
These figures give more context as to how some of these measures change over time.

In [7]:
#Change in correctness over time
correctness_per_session = {"session": [], "correctness": [], "total": [], "percent_correct": []}

for session_id, logs in EXERCISE_LOGS_PER_SESSION.items():
    logs_with_test_report: list[ExerciseLog] = [log for log in logs if hasattr(log, "test_report") and log.test_report is not None]
    correctness_per_session["session"].append(session_id)
    n_correct_exercises: int = len([log for log in logs_with_test_report if log.test_report.n_successful_tests == log.test_report.n_total_tests])
    correctness_per_session["percent_correct"].append((n_correct_exercises / len(logs_with_test_report) * 100) if logs_with_test_report else 0)
    correctness_per_session["correctness"].append(n_correct_exercises)
    correctness_per_session["total"].append(len(logs_with_test_report))

# Order the dictionary by session_id
correctness_per_session_df: DataFrame = DataFrame(correctness_per_session).sort_values(by="session")
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=correctness_per_session_df["session"],
    y=correctness_per_session_df["total"],
    mode='lines',
    name='Total',
    fill='tozeroy'
))
fig.add_trace(go.Scatter(
    x=correctness_per_session_df["session"],
    y=correctness_per_session_df["correctness"],
    mode='lines',
    name='Correctness',
    fill='tozeroy'
))
fig.update_layout(
    title="Change in Correctness and Total Attempts Over Sessions",
    xaxis_title="Session",
    yaxis_title="Count",
    legend_title="Metric"
)
fig.show()

px.line(
    correctness_per_session_df,
    x="session",
    y="percent_correct",
    labels={"session": "Session", "percent_correct": "Percent Correct"},
    title="Change in correctness over sessions"
).show()

#Change in correct first time Find the Error stage by session?
find_the_error_correctness_per_session = {"session": [], "correctness": [], "total": [], "percent_correct": []}

for session_id, logs in EXERCISE_LOGS_PER_SESSION.items():
    find_the_error_correctness_per_session["session"].append(session_id)
    n_correct: int = len([log for log in logs if ExerciseLogProcessor.does_find_the_error_stage_have_correct_response(log)])
    find_the_error_correctness_per_session["correctness"].append(n_correct)
    n_exercises: int = len([log for log in logs if ExerciseLogProcessor.get_first_find_the_error_stage(log) is not None])
    find_the_error_correctness_per_session["total"].append(n_exercises)
    find_the_error_correctness_per_session["percent_correct"].append((n_correct / n_exercises * 100) if n_exercises > 0 else 0)


find_the_error_correctness_per_session_df = DataFrame(find_the_error_correctness_per_session).sort_values(by="session")
px.line(
    find_the_error_correctness_per_session_df,
    x="session",
    y="percent_correct",
    labels={"session": "Session", "percent_correct": "Percent Correct"},
    title="Change in correctness of first Find the Error stage over sessions (correct response entered)"
).show()


## 4. Engagement with Certain PRIMMDebug Stages
Some more measures providing some context into how much students engaged with certain restricted or unrestricted parts of the tool.

### Students' runtime behaviour in the *Inspect The Code* and *Test* stages

In [8]:
n_runs_inspect_the_code: list[int] = [StageLogProcessor.get_number_of_runs(stage_log) for stage_log in STAGE_LOGS if stage_log.stage_name == DebuggingStage.inspect_code]
print("Number of inspect the code stages:", len(n_runs_inspect_the_code))
n_runs_test: list[int] = [StageLogProcessor.get_number_of_runs(stage_log) for stage_log in STAGE_LOGS if stage_log.stage_name == DebuggingStage.test]
print("Number of test stages:", len(n_runs_test))

n_non_zero_runs_inspect_the_code: list[int] = [StageLogProcessor.get_number_of_runs(stage_log) for stage_log in STAGE_LOGS if stage_log.stage_name == DebuggingStage.inspect_code and StageLogProcessor.get_number_of_runs(stage_log) > 0]
n_non_zero_of_runs_test: list[int] = [StageLogProcessor.get_number_of_runs(stage_log) for stage_log in STAGE_LOGS if stage_log.stage_name == DebuggingStage.test and StageLogProcessor.get_number_of_runs(stage_log) > 0]

df_runs = DataFrame({
    "Number of Runs": n_runs_inspect_the_code + n_runs_test,
    "Stage": (["Inspect the Code"] * len(n_runs_inspect_the_code)) + (["Test"] * len(n_runs_test))
})
df_non_zero_runs = DataFrame({
    "Number of Runs": n_non_zero_runs_inspect_the_code + n_non_zero_of_runs_test,
    "Stage": (["Inspect the Code"] * len(n_non_zero_runs_inspect_the_code)) + (["Test"] * len(n_non_zero_of_runs_test))
})

px.histogram(
    df_non_zero_runs,
    x="Number of Runs",
    color="Stage",
    marginal="box",
    barmode="overlay",
    labels={"Number of Runs": "Number of runs", "count": "Frequency"},
    title="Number of runs for Inspect the Code and Test stages"
).show()

labels = ['0', '1', '2', '3', '4', '5+']
df_runs['Runs Grouped'] = cut(df_runs['Number of Runs'], bins=[-0.1, 0.9, 1.9, 2.9, 3.9, 4.9, float('inf')], labels=labels, right=True)
px.bar(
    df_runs.groupby(['Stage', 'Runs Grouped'], observed=True).size().reset_index(name='Count'),
    x='Runs Grouped',
    y='Count',
    color='Stage',
    barmode='group',
    labels={"Runs Grouped": "Number of Runs (Grouped)", "Count": "Frequency"},
    title="Number of runs for Inspect the Code and Test stages (Grouped)"
).show()

time_between_runs_inspect_the_code: list[float] = [time for stage_log in STAGE_LOGS if stage_log.stage_name == DebuggingStage.inspect_code for time in StageLogProcessor.get_time_between_runs(stage_log) if StageLogProcessor.get_time_between_runs(stage_log)]
time_between_runs_test: list[float] = [time for stage_log in STAGE_LOGS if stage_log.stage_name == DebuggingStage.test for time in StageLogProcessor.get_time_between_runs(stage_log) if StageLogProcessor.get_time_between_runs(stage_log)]
df_time_between_runs = DataFrame({
    "Time Between Runs (seconds)": time_between_runs_inspect_the_code + time_between_runs_test,
    "Stage": ([f"Inspect the Code (n={len(time_between_runs_inspect_the_code)})"] * len(time_between_runs_inspect_the_code)) + ([f"Test (n={len(time_between_runs_test)})"] * len(time_between_runs_test))
})

px.histogram(
    df_time_between_runs,
    x="Time Between Runs (seconds)",
    color="Stage",
    marginal="box",
    barmode="overlay",
    labels={"Time Between Runs (seconds)": "Time between runs (seconds)", "count": "Frequency"},
    title="Time between runs for Inspect the Code and Test stages)"
).show() #TODO: Check potential errors as graphs indicate discrepancy

Number of inspect the code stages: 425
Number of test stages: 464


In places where students had the choice of which stage to go to, where did they go?

In [9]:
# Create a DataFrame to track succeeding stages after incorrect Find the Error or Test stage
df_succeeding_stages = DataFrame(columns=["Stage", "Succeeding Stage"])

for exercise_log in EXERCISE_LOGS:
    incorrect_find_the_error_stages: list[StageLog] = [stage_log for stage_log in exercise_log.stage_logs if stage_log.stage_name == DebuggingStage.find_error and not stage_log.correct]
    incorrect_test_stages: list[StageLog] = [stage_log for stage_log in exercise_log.stage_logs if stage_log.stage_name == DebuggingStage.test and not stage_log.correct]
    for stage_log in incorrect_find_the_error_stages:
        succeeding_stage: StageLog = ExerciseLogProcessor.get_succeeding_stage(exercise_log, stage_log)
        if succeeding_stage is not None:
            df_succeeding_stages.loc[len(df_succeeding_stages)] = [DebuggingStage.find_error.name, succeeding_stage.stage_name.name]
    for stage_log in incorrect_test_stages:
        succeeding_stage: StageLog = ExerciseLogProcessor.get_succeeding_stage(exercise_log, stage_log)
        if succeeding_stage is not None:
                        df_succeeding_stages.loc[len(df_succeeding_stages)] = [DebuggingStage.test.name, succeeding_stage.stage_name.name]


px.bar(
    df_succeeding_stages,
    color="Stage",
    barmode="group",
    labels={"count": "Frequency", "value": "Succeeding Stage"},
    title="Succeeding stage after incorrect Find the Error or Test stage"
).show()


For the *Test* stages where students reported their code as correct, what was the actual correctness of students' changes?

In [10]:
n_self_reported_correct_test_stages: int = 0
n_actual_correct_test_stages: int = 0

docker_interface: DockerInterface = DockerInterface.get_instance()
docker_interface.create_docker_container()

for exercise_log in EXERCISE_LOGS:
    correct_test_stages: list[StageLog] = [stage_log for stage_log in exercise_log.stage_logs if stage_log.stage_name == DebuggingStage.test and stage_log.correct]
    for stage_log in correct_test_stages:
        if stage_log.program_logs is not None and len(stage_log.program_logs) > 0:
            n_self_reported_correct_test_stages += 1
            last_snapshot_in_stage: str = stage_log.program_logs[-1].snapshot
            test_report: TestReport = docker_interface.test_student_program(last_snapshot_in_stage, exercise_log.student_id, exercise_log.exercise_name)
            if test_report.n_successful_tests == test_report.n_total_tests:
                n_actual_correct_test_stages += 1

docker_interface.close_docker_container()
print(f"Number of actual correct Test stages for students who self-reported correctness on the Test stage: {display_percentage_string(n_actual_correct_test_stages, n_self_reported_correct_test_stages)} ")

Docker image built
Docker container created
Number of actual correct Test stages for students who self-reported correctness on the Test stage: 38/50 (76.00%) 


Students who reported being successful vs. unsuccessful in the test stage.\
Note that this was not properly logged due to an error in the logging system for the tool. Therefore, we have inferred students' response where possible, but this lowers the sample size.

In [11]:
correct_test_stages: list[StageLog] = [stage_log for stage_log in STAGE_LOGS if stage_log.stage_name == DebuggingStage.test and stage_log.correct]
incorrect_test_stages: list[StageLog] = [stage_log for stage_log in STAGE_LOGS if stage_log.stage_name == DebuggingStage.test and not stage_log.correct]
print(f"Number of correct Test stages: {len(correct_test_stages)}")
correct_test_stages_with_zero_runs: list[StageLog] = [stage_log for stage_log in correct_test_stages if StageLogProcessor.get_number_of_runs(stage_log) == 0]
print(f"Number of correct Test stages with zero runs: {len(correct_test_stages_with_zero_runs)}")
print(f"Number of incorrect Test stages: {len(incorrect_test_stages)}")

Number of correct Test stages: 75
Number of correct Test stages with zero runs: 25
Number of incorrect Test stages: 389


## 5. Correlation Between Log Data and Survey Responses
Based on students' survey responses (see `survey_analysis_results.ipynb`), we wanted to investigate whether students' log data was associated with these survey responses, which is what this cells investigates. First, we prepare the following data in a `DataFrame` for each student who attempted at least one exercise and responded to the survey (n=45):
- Survey responses:
  - Mean response to the survey items related to the restrictive aspects of the tool.
  - Response to the utility of the underlying SIFFT process.
- Log data:
  - Number of attempted challenges.
  - Mean time per challenge.
  - Mean time per stage.
  - Number of successful challenges.

In [15]:
survey_responses: DataFrame = read_csv("data/survey_responses_numeric.csv")
print(f"Number of students who completed the survey: {len(survey_responses)}")

survey_response_log_data_mapping: DataFrame = DataFrame(columns=["Perceived restrictive features utility", "Perceived SIFFT utility", "Number of attempted challenges", "Number of successful challenges", "Mean time per challenge", "Mean time per stage"], index=Index([], name="student_id"))
survey_responses = survey_responses[["student_id","Q2_1", "Q2_2", "Q2_3", "Q2_4", "Q5"]] #Only keep relevant columns for analysis
for row in survey_responses.itertuples():
    student_id: str = row.student_id
    student_exercise_logs: list[ExerciseLog] = [exercise_log for exercise_log in EXERCISE_LOGS if exercise_log.student_id == student_id]
    if len(student_exercise_logs) > 0:
        #Calcuate survey response metrics
        restrictive_feature_responses: float = np.mean([row.Q2_1, row.Q2_2, row.Q2_3, row.Q2_4])
        sifft_utility_response: float = row.Q5

        #Calculate log data metrics
        n_attempted_challenges: int = len(student_exercise_logs)
        total_time_on_challenges: float = sum([ExerciseLogProcessor.get_time_on_exercise(exercise_log) for exercise_log in student_exercise_logs])
        mean_time_per_challenge: float = total_time_on_challenges / n_attempted_challenges if n_attempted_challenges > 0 else 0
        student_stage_logs: list[StageLog] = [stage_log for exercise_log in student_exercise_logs for stage_log in exercise_log.stage_logs]
        mean_time_per_stage: float = np.mean([StageLogProcessor.get_time_on_stage(stage_log) for stage_log in student_stage_logs if StageLogProcessor.get_time_on_stage(stage_log) is not None])
        n_successful_challenges: int = len([exercise_log for exercise_log in student_exercise_logs if exercise_log.test_report is not None and exercise_log.test_report.n_successful_tests == exercise_log.test_report.n_total_tests])
        
        survey_response_log_data_mapping.loc[student_id] = [
            restrictive_feature_responses,
            sifft_utility_response,
            n_attempted_challenges,
            mean_time_per_challenge,
            mean_time_per_stage,
            n_successful_challenges
        ]

Number of students who completed the survey: 45


Before plotting a correlation matrix, we need to decide which correlation coefficient to use, which is influenced by whether the data in the `DataFrame` is parametric or non-parametric. To find this out, we perform Shapiro-Wilk and Kolmogorov-Smirnov tests for each column.

In [16]:
for column in survey_response_log_data_mapping.columns:
    shapiro_wilk_value: stats.shapiro = stats.shapiro(survey_response_log_data_mapping[column].dropna())
    print(f"Shapiro-Wilk value for {column}: {shapiro_wilk_value.statistic:.4f}, p-value: {shapiro_wilk_value.pvalue:.4f}")
    kolmogorov_smirnov_value: stats.kstest = stats.kstest(survey_response_log_data_mapping[column].dropna(), 'norm')
    print(f"Kolmogorov-Smirnov value for {column}: {kolmogorov_smirnov_value.statistic:.4f}, p-value: {kolmogorov_smirnov_value.pvalue:.4f}\n")


Shapiro-Wilk value for Perceived restrictive features utility: 0.9518, p-value: 0.0874
Kolmogorov-Smirnov value for Perceived restrictive features utility: 0.8413, p-value: 0.0000

Shapiro-Wilk value for Perceived SIFFT utility: 0.8430, p-value: 0.0001
Kolmogorov-Smirnov value for Perceived SIFFT utility: 0.8413, p-value: 0.0000

Shapiro-Wilk value for Number of attempted challenges: 0.8634, p-value: 0.0002
Kolmogorov-Smirnov value for Number of attempted challenges: 0.8522, p-value: 0.0000

Shapiro-Wilk value for Number of successful challenges: 0.9133, p-value: 0.0048
Kolmogorov-Smirnov value for Number of successful challenges: 1.0000, p-value: 0.0000

Shapiro-Wilk value for Mean time per challenge: 0.8828, p-value: 0.0006
Kolmogorov-Smirnov value for Mean time per challenge: 1.0000, p-value: 0.0000

Shapiro-Wilk value for Mean time per stage: 0.8492, p-value: 0.0001
Kolmogorov-Smirnov value for Mean time per stage: 0.5163, p-value: 0.0000



### Correlation Coefficients
Almost all of the data is not normally distributed, so a non-parametric correlation measure is required. Most of this data is ordinal in practice, some of which may contain lots of ties, we use Kendall's tau correlation

In [17]:
import plotly.graph_objects as go

correlation_matrix: DataFrame = DataFrame(columns=["Perceived restrictive features utility", "Perceived SIFFT utility"])
correlation_matrix_text: DataFrame = DataFrame(columns=["Perceived restrictive features utility", "Perceived SIFFT utility"])
for row in ["Number of attempted challenges", "Number of successful challenges", "Mean time per challenge", "Mean time per stage"]:
    correlations_restrictive_features: stats.kendalltau = stats.kendalltau(survey_response_log_data_mapping["Perceived restrictive features utility"], survey_response_log_data_mapping[row])
    correlations_sifft_utility: stats.kendalltau = stats.kendalltau(survey_response_log_data_mapping["Perceived SIFFT utility"], survey_response_log_data_mapping[row])
    correlation_matrix.loc[row] = [
        correlations_restrictive_features.correlation,
        correlations_sifft_utility.correlation
    ]
    correlation_matrix_text.loc[row] = [
        f"{correlations_restrictive_features.correlation:.2f} (p={correlations_restrictive_features.pvalue:.3f})",
        f"{correlations_sifft_utility.correlation:.2f} (p={correlations_sifft_utility.pvalue:.3f})"
    ]

def add_line_breaks(label, every=3):
    words = label.split()
    return '<br>'.join([' '.join(words[i:i+every]) for i in range(0, len(words), every)])

go.Figure(
    data=go.Heatmap(
        z=correlation_matrix.values,
        x=[add_line_breaks(label, every=2) for label in correlation_matrix.columns],
        y=[add_line_breaks(label) for label in correlation_matrix.index],
        text=correlation_matrix_text.values,
        texttemplate="%{text}",
        zmin=-0.5, zmax=0.5,
    ),
    layout=dict(height=550, width=700, font=dict(size=16))
).show()