In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import math

# --- Paths ---
base_path = "/proj/berzelius-2024-90/users/x_liumi/Qwen2.5-VL/pedagogy"
xlsx_path = os.path.join(base_path, "Higher Education Pedagogy Course Student Survey(1-49).xlsx")

# --- Load Data ---
df = pd.read_excel(xlsx_path)


# --- Identify columns correctly ---
def _find_column(frame, *keywords):
    """Return the first column of *frame* whose header contains any keyword.

    Headers are stringified and lower-cased before the substring test, so a
    non-string header cannot crash the lookup. Columns are scanned in their
    DataFrame order and the first match wins.

    Raises:
        LookupError: if no header contains any of the keywords; the message
            lists the available headers to make the mismatch easy to spot.
    """
    for col in frame.columns:
        header = str(col).lower()
        if any(keyword in header for keyword in keywords):
            return col
    raise LookupError(
        f"No column containing any of {keywords!r}; "
        f"available columns: {list(frame.columns)}"
    )


age_col = _find_column(df, "age group")
# NOTE: "term" is a plain substring test, so a header such as "long-term ..."
# would match as well if it appears before the intended column.
term_col = _find_column(df, "term", "semester")

# ===============================
# 1. AGE GROUP BAR PLOT
# ===============================
# The output location does not depend on the plot, so resolve it up front.
age_plot_path = os.path.join(base_path, "age_group_distribution.png")

# Number of respondents per age group, ordered by the group label.
age_counts = df[age_col].value_counts().sort_index()

age_fig, age_ax = plt.subplots(figsize=(8, 5))
bars = age_ax.bar(
    age_counts.index,
    age_counts.values,
    color='cornflowerblue',
    edgecolor='black',
)
age_ax.set_title("Distribution of Age Groups", fontsize=14)
age_ax.set_xlabel("Age Group")
age_ax.set_ylabel("Number of Students")

# Write each bar's count centred slightly above its top edge.
for rect in bars:
    height = rect.get_height()
    center_x = rect.get_x() + rect.get_width() / 2
    age_ax.text(center_x, height + 0.3, int(height),
                ha='center', va='bottom', fontsize=10)

age_fig.tight_layout()
age_fig.savefig(age_plot_path, dpi=300)
plt.close(age_fig)
print(f"✅ Saved age group plot to: {age_plot_path}")

# ===============================
# 2. TERM / YEAR OF STUDY PIE CHART
# ===============================

# Label returned for missing, irrelevant, or unparseable answers.
_UNSPECIFIED_YEAR = "Unspecified/Other"

# Standalone integers only: "term 3" matches, while "3rd" or "ht2024" do not,
# because \b requires a non-word character on both sides of the digits.
_STANDALONE_INT_RE = re.compile(r'\b\d+\b')

# Phrases marking answers that do not refer to one particular term.
_NOT_A_SINGLE_TERM = ("several years", "not in a particular term")

# Term numbers outside 1..20 are ignored (e.g. calendar years such as 2024).
_MAX_TERM = 20
# Later years are folded into the last bucket to keep the chart readable.
_MAX_YEAR = 5

# English / Swedish ordinals -> year label. Entries are tested in insertion
# order with plain substring matching, so earlier entries win when several
# occur in one answer.
_ORDINAL_TO_YEAR = {
    "first": "Year 1", "första": "Year 1", "1st": "Year 1",
    "second": "Year 2", "andra": "Year 2", "2nd": "Year 2",
    "third": "Year 3", "tredje": "Year 3", "3rd": "Year 3",
    "fourth": "Year 4", "fjärde": "Year 4", "4th": "Year 4",
    "fifth": "Year 5", "femte": "Year 5", "5th": "Year 5",
}


def parse_year(raw):
    """Map a free-text term/semester answer to a study-year label.

    Args:
        raw: One survey cell; may be a string, a number, or a missing value.

    Returns:
        "Year 1" .. "Year 5", or "Unspecified/Other" when the answer is
        missing, matches one of the known "not a single term" phrases, or
        contains neither a usable term number nor a known ordinal word.

    Resolution order:
        1. Missing values and the known irrelevant phrases -> unspecified.
        2. The first standalone integer within 1..20 is read as a term number
           and converted with two terms per year, capped at Year 5. Integers
           outside that range (such as a calendar year written before the
           term) are skipped instead of discarding the whole answer.
        3. English / Swedish ordinal words and "1st".."5th" are looked up.
    """
    if pd.isna(raw):
        return _UNSPECIFIED_YEAR
    s = str(raw).lower().strip()

    # Ignore irrelevant long answers
    if any(phrase in s for phrase in _NOT_A_SINGLE_TERM):
        return _UNSPECIFIED_YEAR

    # Numeric term info: always treated as a term number (2 terms = 1 year).
    for token in _STANDALONE_INT_RE.findall(s):
        term = int(token)
        if 1 <= term <= _MAX_TERM:
            return f"Year {min(math.ceil(term / 2), _MAX_YEAR)}"

    # NOTE(review): ordinals are mapped straight to a year, whereas digits
    # above are halved as term numbers, so "3rd" gives Year 3 while "3" gives
    # Year 2. Left unchanged here; confirm against the raw answers which
    # reading is intended.
    for word, label in _ORDINAL_TO_YEAR.items():
        if word in s:
            return label

    # Otherwise it's unknown or unparseable
    return _UNSPECIFIED_YEAR

# Apply parser to all responses
df["Parsed Year"] = df[term_col].apply(parse_year)

# Count all categories (including Unspecified)
year_counts = df["Parsed Year"].value_counts().sort_index()

# Check total count sanity: the workbook is expected to hold exactly this many
# responses. Raised explicitly so the check also runs under `python -O`.
EXPECTED_RESPONSES = 49
total_responses = len(df)
if total_responses != EXPECTED_RESPONSES:
    raise ValueError(f"Expected {EXPECTED_RESPONSES} responses, got {total_responses}")

# --- Pie chart with correct total-based percentages ---
fig, ax = plt.subplots(figsize=(7, 7))

# Percentages relative to total (including Unspecified). parse_year always
# returns a label, so the category counts add up to total_responses and these
# values agree with the percentages matplotlib prints on the wedges.
percentages = [count / total_responses * 100 for count in year_counts.values]

# Only the wedge handles are needed afterwards (for the legend).
wedges, _label_texts, _pct_texts = ax.pie(
    year_counts,
    labels=None,
    startangle=90,
    colors=plt.cm.Set3.colors,
    autopct=lambda p: f'{p:.1f}%' if p >= 2 else '',  # hide tiny labels under 2%
    textprops={'fontsize': 10, 'color': 'black'}
)

legend_labels = [
    f"{label} ({count}, {p:.1f}%)"
    for label, count, p in zip(year_counts.index, year_counts.values, percentages)
]
ax.legend(
    wedges,
    legend_labels,
    title=f"Study Year (n={total_responses})",
    loc="center left",
    bbox_to_anchor=(1, 0.5)
)
ax.set_title("Distribution by Year of Study", fontsize=14)
fig.tight_layout()

year_plot_path = os.path.join(base_path, "year_of_study_distribution.png")
# bbox_inches="tight" keeps the legend, which sits outside the axes, in the image.
fig.savefig(year_plot_path, dpi=300, bbox_inches="tight")
plt.close(fig)
print(f"✅ Saved year-of-study plot with correct total percentages to: {year_plot_path}")

# --- Summary printout ---
print(f"\nParsed year distribution (n={total_responses} total):")
print(year_counts)
print(f"\nPercentages (relative to total {total_responses}):")
for label, count, p in zip(year_counts.index, year_counts.values, percentages):
    print(f"{label:20s}  {count:2d}  ({p:4.1f}%)")


✅ Saved age group plot to: /proj/berzelius-2024-90/users/x_liumi/Qwen2.5-VL/pedagogy/age_group_distribution.png
✅ Saved year-of-study plot with correct total percentages to: /proj/berzelius-2024-90/users/x_liumi/Qwen2.5-VL/pedagogy/year_of_study_distribution.png

Parsed year distribution (n=49 total):
Parsed Year
Unspecified/Other     7
Year 1                7
Year 2               14
Year 3               10
Year 4                4
Year 5                7
Name: count, dtype: int64

Percentages (relative to total 49):
Unspecified/Other      7  (14.3%)
Year 1                 7  (14.3%)
Year 2                14  (28.6%)
Year 3                10  (20.4%)
Year 4                 4  ( 8.2%)
Year 5                 7  (14.3%)
