![Logo Pyrates LLM](../assets/pyratesllm_logo_500.png)

# **Notebook#04 : Analysis on axis 3**
## [Evaluation of feedback content]

## 1/ Imports

In [None]:
# Internal
import sys
sys.path.append("../src")
import students_constants as stu_const
import interaction_constants as int_const
import tests_constants  as tes_const
import session_date_constants as ses_const

# External
import pandas as pd
import xml.etree.ElementTree as ET
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats



## 2/ Data importation

In [None]:
interaction_data = pd.read_pickle("../data/interim/interaction_data.pkl")
pre_test_data = pd.read_pickle("../data/interim/pre_test_data.pkl")
post_test_data = pd.read_pickle("../data/interim/post_test_data.pkl")

## 3/ Students' Perception of assistant helps (Q3.1) [B/C]

What is student perception of the assistant helps depending of groups ?

In [None]:
# Columns of questions to analyze (from _QA to _QE)
question_cols = tes_const.ASSISTANT_NUM_QUESTIONS

# Mapping of question keys to plain text question
QUESTION_MAPPING = {
    tes_const.QA_KEY: "Did you find the assistant help useful to progress in the game?",
    tes_const.QB_KEY: "Did you find the assistant help useful to learn Python programming?",
    tes_const.QC_KEY: "Would you like to have more help from a digital assistant in the future?",
    tes_const.QD_KEY: "Did you find the assistant help easy to understand?",
    tes_const.QE_KEY: "Did you find that the assistant help was correct (without errors)?"
}

group_col=tes_const.GROUP_ID_KEY
# Filter only groups B and C
df_filtered = post_test_data[post_test_data[group_col].isin([tes_const.GROUP_B, tes_const.GROUP_C])].copy()

# Dictionary to store results
stats_dict = {}

# Loop over each question
for q in question_cols:
    # Get data for both groups
    calls_B = df_filtered[df_filtered[group_col] == tes_const.GROUP_B][q].dropna()
    calls_C = df_filtered[df_filtered[group_col] == tes_const.GROUP_C][q].dropna()
    
    # Descriptive statistics
    mean_B, std_B = calls_B.mean(), calls_B.std()
    mean_C, std_C = calls_C.mean(), calls_C.std()
    n_B, n_C = len(calls_B), len(calls_C)
    
    print(f"\n######## {q} : {QUESTION_MAPPING[q]} ########")
    print("=== DESCRIPTIVE STATISTICS ===")
    print(f"Group B (n={n_B}): M = {mean_B:.2f}, SD = {std_B:.2f}")
    print(f"Group C (n={n_C}): M = {mean_C:.2f}, SD = {std_C:.2f}")
    
    # Normality tests (Shapiro-Wilk)
    shapiro_B = stats.shapiro(calls_B) if n_B >= 3 else None
    shapiro_C = stats.shapiro(calls_C) if n_C >= 3 else None
    
    # TODO WARNING : alpha is temporary set here to 0.06 because some results are almost signifiant
    alpha = 0.06
    print("\n=== NORMALITY TESTS (Shapiro-Wilk) ===")
    if shapiro_B and shapiro_C:
        print(f"Group B: p-value = {shapiro_B.pvalue:.4f} -> {'normally distributed' if shapiro_B.pvalue >= alpha else 'not normally distributed'}")
        print(f"Group C: p-value = {shapiro_C.pvalue:.4f} -> {'normally distributed' if shapiro_C.pvalue >= alpha else 'not normally distributed'}")
        normal_B, normal_C = shapiro_B.pvalue >= alpha, shapiro_C.pvalue >= alpha
    else:
        # Not enough data for Shapiro test
        print("Not enough data for Shapiro-Wilk test")
        normal_B, normal_C = False, False
    
    # Levene test for equal variances
    if n_B > 1 and n_C > 1:
        levene_p = stats.levene(calls_B, calls_C).pvalue
        equal_var = levene_p >= alpha
        print(f"\n=== HOMOGENEITY OF VARIANCES (Levene) ===")
        print(f"Levene p-value = {levene_p:.4f} -> {'variances are similar' if equal_var else 'variances differ'}")
    else:
        equal_var = False
        print("\nNot enough data for Levene test")
    
    # Choose statistical test
    print(f"\n=== STATISTICAL TEST ===")
    if normal_B and normal_C and equal_var:
        print("Using independent t-test (parametric)")
        stat, p_value = stats.ttest_ind(calls_B, calls_C, equal_var=True)
        # Cohen's d
        pooled_std = np.sqrt(((n_B - 1)*std_B**2 + (n_C - 1)*std_C**2) / (n_B + n_C - 2))
        effect_size = (mean_B - mean_C) / pooled_std
        effect_type = "Cohen's d"

    else:
        print("Using Mann-Whitney U test (non-parametric)")
        stat, p_value = stats.mannwhitneyu(calls_B, calls_C, alternative='two-sided')
        # Effect size r = Z / sqrt(N)
        n_total = n_B + n_C
        z_score = stats.norm.ppf(1 - p_value/2)  # convert two-sided p-value to Z
        effect_size = abs(z_score) / np.sqrt(n_total)
        effect_type = "r"

    print(f"Test statistic = {stat:.4f}")
    print(f"P-value = {p_value:.4f}")
    print(f"Effect size ({effect_type}) = {effect_size:.4f}")
    
    # Interpret effect size
    if effect_type == "Cohen's d":
        if abs(effect_size) < 0.2:
            label = "negligible"
        elif abs(effect_size) < 0.5:
            label = "small"
        elif abs(effect_size) < 0.8:
            label = "medium"
        else:
            label = "large"
    else:  # r
        if effect_size < 0.1:
            label = "negligible"
        elif effect_size < 0.3:
            label = "small"
        elif effect_size < 0.5:
            label = "medium"
        else:
            label = "large"
    
    print(f"Effect size interpretation: {label}")

    if p_value < alpha:
        print(f"SIGNIFICANT difference: p = {p_value:.4f} < {alpha}")
    else:
        print(f"No significant difference: p = {p_value:.4f} >= {alpha}")
    
    # Store results in dictionary
    stats_dict[q] = {
        "mean_B": mean_B,
        "std_B": std_B,
        "mean_C": mean_C,
        "std_C": std_C,
        "significant": p_value < alpha
    }


In [None]:
# Mapping of question keys to English question text
QUESTION_DISPLAY = {
    tes_const.QA_KEY: "Useful to progress in the game?",
    tes_const.QB_KEY: "Useful to learn Python programming?",
    tes_const.QC_KEY: "Digital assistant in the future?",
    tes_const.QD_KEY: "Help easy to understand?",
    tes_const.QE_KEY: "Help was correct (without errors)?"
}

# Use previously computed stats_dict
summary_list = []
for q in question_cols:
    stats_q = stats_dict[q]  # fetch stored results
    
    summary_list.append({
        'question': q,
        'question_text': QUESTION_DISPLAY[q],
        'mean_B': stats_q['mean_B'],
        'std_B': stats_q['std_B'],
        'mean_C': stats_q['mean_C'],
        'std_C': stats_q['std_C'],
        'significant': stats_q['significant']
    })

summary_df = pd.DataFrame(summary_list)

# Parameters for side-by-side bar chart
x = np.arange(len(summary_df))
width = 0.40

# Create figure and axes
fig, ax = plt.subplots(figsize=(8, 5))

# Plot bars with error bars
bars_B = ax.bar(x - width/2, summary_df['mean_B'], width, yerr=summary_df['std_B'], capsize=5, label='Group B', color='skyblue')
bars_C = ax.bar(x + width/2, summary_df['mean_C'], width, yerr=summary_df['std_C'], capsize=5, label='Group C', color='salmon')

# Set x-axis labels
ax.set_xticks(x)
ax.set_xticklabels(summary_df['question_text'], rotation=30, ha='right', fontsize=10)

# Add axis labels and title
ax.set_xlabel("Question", fontsize=12)
ax.set_ylabel("Mean score", fontsize=12)
# Add horizontal dashed line at 50%
ax.axhline(y=50, color='red', linestyle='--', linewidth=1.5, label='50% reference')

ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.set_ylim(0, 100)

# Annotate significance and values above bars
for i, row in summary_df.iterrows():
    # Position for significance star
    y_star = max(row['mean_B'], row['mean_C']) + max(row['std_B'], row['std_C']) * 0.1
    if row['significant']:
        ax.text(i, y_star, '*', ha='center', va='bottom', fontsize=16, color='black')

    # Add values above each bar (just above the bar, ignoring SD)
    offset = 1.4  # small vertical margin above the bar
    dx_b = 0.17   # horizontal shift (can adjust if needed)
    dx_c = -0.17  # horizontal shift (can adjust if needed)
    ax.text(i - width/2 - dx_b, row['mean_B'] + offset, f"{row['mean_B']:.1f}", ha='center', va='bottom', fontsize=10, color='black')
    ax.text(i + width/2 - dx_c, row['mean_C'] + offset, f"{row['mean_C']:.1f}", ha='center', va='bottom', fontsize=10, color='black')

# Add legend
ax.legend()

# Adjust layout, save figure, and display
plt.tight_layout()
plt.savefig("../outputs/assistance_perception_BC.png", dpi=300, bbox_inches="tight")
plt.show()


## 4/ Student perception of game resources and assistance [A]

What is student perception of game resources and assistance in group A ?

In [None]:
# Columns of questions to analyze: QI and QJ
question_cols = tes_const.HELP_QUESTIONS

# Mapping of question keys to English question text
QUESTION_MAPPING = {
    tes_const.QI_KEY: "Did you find that the available resources in the game (instructions + programming memo) were sufficient to progress?",
    tes_const.QJ_KEY: "Would you have liked to receive more help in the game?"
}
df_A = post_test_data[post_test_data[tes_const.GROUP_ID_KEY] == tes_const.GROUP_A].copy()

summary_list = []
for q in question_cols:
    values = df_A[q].dropna()
    mean = values.mean()
    std = values.std()
    n = len(values)
    summary_list.append({
        'question_key': q,
        'question_text': QUESTION_MAPPING[q],
        'mean': mean,
        'std': std,
        'n': n
    })

group_A_summary = pd.DataFrame(summary_list)

# Display summary
for _, row in group_A_summary.iterrows():
    print(f"\n######## {row['question_key']} : {QUESTION_MAPPING[row['question_key']]} ########")
    print(f"Group A (n={row['n']}): M = {row['mean']:.2f}, SD = {row['std']:.2f}\n")


## 5/ Feedback typology [C]

### 5.1/ Characteristics combination

What are the most common combinations of characteristics for the feedback in group C?

The feedback in group C contains within it a characterization expressed in XML format (`<feedback_caractéristiques>`).
Here is an example of feedback:
```xml
<feedback>
    <feedback_message>
        Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt.
    <feedback_message>
    <feedback_caractéristiques>
        <combination>technical</combination>
        <combination>error_pointed</combination>
    </feedback_caractéristiques>
</feedback>
```

In [None]:
# Filter only group C interactions
interaction_C = interaction_data[interaction_data[int_const.GROUP_ID_DATA_KEY] == int_const.GROUP_C].copy()

# Further filter: only feedback received from the assistant
feedback_C = interaction_C[
    (interaction_C[int_const.ACTION_DATA_KEY] == int_const.RECEIVED_ACTION) &
    (interaction_C[int_const.OBJECT_DATA_KEY] == int_const.ASSISTANT_HELP_OBJECT)
].copy()

# Function to extract combinations from XML feedback
def extract_combinations(feedback_xml):
    if not feedback_xml or pd.isna(feedback_xml):
        return []
    try:
        root = ET.fromstring(str(feedback_xml))
        combis = [elem.text for elem in root.findall('.//feedback_caractéristiques/combination')]
        return combis
    except ET.ParseError:
        return []

# Apply the function: store a list of combinations per row
feedback_C['combinations'] = feedback_C[int_const.CODE_DATA_KEY].apply(extract_combinations)

# Count occurrences ignoring the order of elements
all_combinations_tuples = [tuple(sorted(c)) for c in feedback_C['combinations']]
combination_counts = Counter(all_combinations_tuples)

# Convert to DataFrame and calculate percentage
total = sum(combination_counts.values())
combination_df = pd.DataFrame(
    [(list(k), v, v / total * 100) for k, v in combination_counts.items()],
    columns=['combination_list', 'count', 'percentage']
).sort_values(by='count', ascending=False)

print(combination_df)


In [None]:
top_k = 8
top_df = combination_df.head(top_k).copy()
top_labels = ['\n'.join(c) for c in top_df['combination_list']]
top_percentages = top_df['percentage'].values

# Parameters for single-group bar chart
x = np.arange(len(top_labels))
width = 0.75

# Create figure and axes
fig, ax = plt.subplots(figsize=(8, 5))

# Plot bars
bars = ax.bar(x, top_percentages, width, label='Combinations')

# Set x-axis labels
ax.set_xticks(x)
ax.set_xticklabels(top_labels, rotation=30, ha='right', fontsize=10)

# Axis labels and title
ax.set_xlabel("Top characteristics combination", fontsize=12)
ax.set_ylabel("Percentage (%)", fontsize=12)
ax.set_ylim(0, max(top_percentages)*1.08)
ax.grid(axis='y', linestyle='--', alpha=0.7)



# Add percentage labels on top of each bar
for bar in bars:
    height = bar.get_height()
    ax.annotate(f'{height:.1f}%',
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom', fontsize=10)

# Add legend
ax.legend()

# Layout adjustment, save figure, display
plt.tight_layout()
plt.savefig("../outputs/feedback_char_combination_C.png", dpi=300, bbox_inches="tight")
plt.show()

### 5.2/ Characteristics type

What are the most common characteristics for the feedback in group C?

In [None]:
# Filter only group C interactions
interaction_C = interaction_data[
    interaction_data[int_const.GROUP_ID_DATA_KEY] == int_const.GROUP_C
].copy()

# Further filter: only feedback received from the assistant
feedback_C = interaction_C[
    (interaction_C[int_const.ACTION_DATA_KEY] == int_const.RECEIVED_ACTION) &
    (interaction_C[int_const.OBJECT_DATA_KEY] == int_const.ASSISTANT_HELP_OBJECT)
].copy()

# Function to extract individual characteristics from XML feedback
def extract_characteristics(feedback_xml):
    if not feedback_xml or pd.isna(feedback_xml):
        return []
    try:
        root = ET.fromstring(str(feedback_xml))
        # Extract each individual characteristic
        chars = [elem.text for elem in root.findall('.//feedback_caractéristiques/combination')]
        return chars
    except ET.ParseError:
        return []

# Extract list of characteristics (not combinations)
feedback_C['characteristics'] = feedback_C[int_const.CODE_DATA_KEY].apply(extract_characteristics)

# Flatten the list of lists
flat_chars = [c for sublist in feedback_C['characteristics'] for c in sublist]

# Count occurrences
char_counts = Counter(flat_chars)

# Convert to DataFrame + compute percentages
total = sum(char_counts.values())

char_df = pd.DataFrame(
    [(k, v, v / total * 100) for k, v in char_counts.items()],
    columns=['characteristic', 'count', 'percentage']
).sort_values(by='count', ascending=False)

print(char_df)


In [None]:
# Labels and percentages
labels = char_df['characteristic'].tolist()
percentages = char_df['percentage'].tolist()

# X positions
x = np.arange(len(labels))
width = 0.75  # bar width

# Create figure
fig, ax = plt.subplots(figsize=(8, 5))

# Plot bars
bars = ax.bar(x, percentages, width, label='Characteristics')

# Set x-axis labels
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=30, ha='right', fontsize=10)

# Axis labels and title
ax.set_xlabel("Feedback characteristics", fontsize=12)
ax.set_ylabel("Percentage (%)", fontsize=12)
ax.set_ylim(0, max(percentages) * 1.1)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Add percentage labels
for bar in bars:
    height = bar.get_height()
    ax.annotate(
        f'{height:.1f}%',
        xy=(bar.get_x() + bar.get_width() / 2, height),
        xytext=(0, 3),
        textcoords="offset points",
        ha='center', va='bottom', fontsize=10
    )

# Legend
ax.legend()

# Layout + save + show
plt.tight_layout()
plt.savefig("../outputs/feedback_char_C.png", dpi=300, bbox_inches="tight")
plt.show()
