In [None]:
import os
import re
import requests
import pandas as pd
import openai as client
import numpy as np
import json
import matplotlib.pyplot as plt
from fpdf import FPDF
import seaborn as sns
from xhtml2pdf import pisa
from PyPDF2 import PdfMerger
import re
import csv
from typing import List
from utils import print_question_data
from utils import print_first_5_students
from utils import print_single_value_in_table
import matplotlib
matplotlib.use('Agg')  # Non-interactive backend
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import matplotlib.font_manager as fm
import numpy as np
from utils import print_single_value_in_table
from utils import err_box_red
from utils import pretty_print_results

In [14]:
def generate_individual_student_report(csv_path, student_name, output_folder):
    """
    Build a PDF performance report for one student from a class CSV file.

    Args:
        csv_path: Path of the CSV file to read; it must contain a
            'Student Names' column.
        student_name: Value of 'Student Names' identifying the student.
        output_folder: Directory the PDF is written to (created if missing).

    Returns:
        The path of the generated PDF, as returned by _generate_student_pdf.

    Raises:
        ValueError: If no row has 'Student Names' equal to student_name.
    """
    os.makedirs(output_folder, exist_ok=True)

    raw_df = pd.read_csv(csv_path)

    # Map each subject to its test/topic columns.
    subjects_info = _identify_subjects_and_tests(raw_df.columns)
    subjects = list(subjects_info)

    # Numeric copy for calculations: anything non-numeric (e.g. 'AB') becomes
    # NaN, while raw_df keeps the original values for display.
    numeric_df = raw_df.copy()
    score_columns = [
        col
        for subject_data in subjects_info.values()
        for col in subject_data['test_columns']
    ]
    for col in score_columns:
        numeric_df[col] = pd.to_numeric(numeric_df[col], errors='coerce')

    # Class-wide aggregates are computed before looking up the student.
    aggregated_scores = _compute_aggregated_scores(numeric_df, subjects_info)
    class_stats = _compute_class_stats(aggregated_scores, subjects)

    # The first matching row is used when a name occurs more than once.
    display_rows = raw_df[raw_df['Student Names'] == student_name]
    if display_rows.empty:
        raise ValueError(f"Student '{student_name}' not found")
    student = display_rows.iloc[0]

    numeric_rows = numeric_df[numeric_df['Student Names'] == student_name]
    student_calc = numeric_rows.iloc[0]
    student_aggregated = _compute_student_aggregated_scores(student_calc, subjects_info)

    return _generate_student_pdf(
        student, student_calc, student_aggregated, subjects_info, subjects,
        class_stats, output_folder, aggregated_scores
    )


def _identify_subjects_and_tests(columns):
    """
    Identify subjects and their corresponding test columns from dataframe columns.
    
    Returns:
        dict: {subject_name: {'test_columns': [col1, col2, ...], 'topic_columns': [...]}}
    """
    subjects_info = {}
    
    # Filter out non-subject columns
    excluded_cols = {'Student Names', 'Attendance', "Teacher's Remarks"}
    
    for col in columns:
        if col in excluded_cols:
            continue
            
        # Check if it's a topic column
        if 'Topics' in col:
            continue
            
        # Extract base subject name (remove _Test1, _Test2, etc.)
        if '_Test' in col:
            base_subject = col.split('_Test')[0]
        else:
            base_subject = col
            
        if base_subject not in subjects_info:
            subjects_info[base_subject] = {
                'test_columns': [],
                'topic_columns': []
            }
        
        # Add to test columns if it's a test column
        if '_Test' in col or col == base_subject:
            subjects_info[base_subject]['test_columns'].append(col)
    
    # Now find topic columns for each subject
    for col in columns:
        if 'Topics' in col:
            # Try to match with subjects
            for subject in subjects_info.keys():
                if subject in col:
                    subjects_info[subject]['topic_columns'].append(col)
                    break
    
    return subjects_info


def _compute_aggregated_scores(df_calc, subjects_info):
    """
    Compute aggregated scores for each student and subject.
    Uses average of all tests for a subject, handling AB/NaN values properly.
    
    Returns:
        DataFrame with columns: Student Names, Subject1_Avg, Subject2_Avg, etc.
    """
    result_data = {'Student Names': df_calc['Student Names']}
    
    for subject, info in subjects_info.items():
        test_cols = info['test_columns']
        
        if len(test_cols) == 1:
            # Single test - just use that column
            result_data[f"{subject}_Avg"] = df_calc[test_cols[0]]
        else:
            # Multiple tests - compute average of available scores
            test_data = df_calc[test_cols]
            # Compute row-wise mean, ignoring NaN values
            result_data[f"{subject}_Avg"] = test_data.mean(axis=1, skipna=True)
    
    return pd.DataFrame(result_data)


def _compute_student_aggregated_scores(student_calc, subjects_info):
    """
    Compute aggregated scores for a single student.
    
    Returns:
        dict: {subject: aggregated_score}
    """
    student_scores = {}
    
    for subject, info in subjects_info.items():
        test_cols = info['test_columns']
        
        if len(test_cols) == 1:
            student_scores[subject] = student_calc[test_cols[0]]
        else:
            # Compute average of available test scores
            test_scores = [student_calc[col] for col in test_cols if pd.notna(student_calc[col])]
            if test_scores:
                student_scores[subject] = sum(test_scores) / len(test_scores)
            else:
                student_scores[subject] = float('nan')  # All tests were AB/NaN
    
    return student_scores


def _compute_class_stats(aggregated_scores, subjects):
    """
    Compute class statistics using aggregated scores.
    """
    class_stats = {}
    
    for subject in subjects:
        col_name = f"{subject}_Avg"
        if col_name in aggregated_scores.columns:
            class_stats[subject] = {
                'highest': aggregated_scores[col_name].max(),
                'lowest': aggregated_scores[col_name].min(),
                'average': aggregated_scores[col_name].mean()
            }
    
    return class_stats


def _generate_student_pdf(student, student_calc, student_aggregated, subjects_info,
                         subjects, class_stats, output_folder, aggregated_scores):
    """
    Internal helper to build the PDF for one student with compact layout.
    Handles multiple tests per subject and improved strongest/weakest analysis.

    Args:
        student: Row with the student's original (display) values; must
            contain 'Student Names'. 'Attendance', "Teacher's Remarks" and
            topic columns are used when present.
        student_calc: Numeric version of the same row (not used here; kept
            for interface compatibility).
        student_aggregated: {subject: aggregated score}; NaN means the
            student was absent for every test of that subject.
        subjects_info: {subject: {'test_columns': [...], 'topic_columns': [...]}}.
        subjects: Subject names, in display order.
        class_stats: {subject: {'highest', 'lowest', 'average'}} for the chart.
        output_folder: Directory the PDF is written to.
        aggregated_scores: Class-wide table of '<subject>_Avg' columns.

    Returns:
        Path of the written PDF: '<output_folder>/<name with _ for spaces>_report.pdf'.

    Note:
        ATTENDANCE_DAYS is read as a global and is not defined in this cell;
        it must be defined elsewhere in the notebook before this runs.
    """
    pdf = FPDF()
    pdf.add_page()

    # Use Times as it's closer to the "math" font in the template
    pdf.set_font('Times', 'B', 14)

    # Header - more compact
    name = student['Student Names']
    pdf.cell(0, 8, "Student Performance Report", ln=1, align='C')
    pdf.set_font('Times', '', 10)

    # Attendance with less spacing (if available)
    if 'Attendance' in student.index:
        att = student['Attendance']
        # A missing or non-numeric attendance value is reported as 0% instead
        # of aborting the whole report.
        try:
            att_days = int(att) if pd.notna(att) else None
        except (TypeError, ValueError):
            att_days = None
        att_pct = (att_days / ATTENDANCE_DAYS) * 100 if att_days is not None else 0
        pdf.cell(0, 6, f"Attendance: {att} / {ATTENDANCE_DAYS} ({att_pct:.1f}%)", ln=1, align='C')

    # Minimal spacing before chart
    pdf.ln(2)

    # Comparison chart - increased height
    chart_path = create_comparison_chart(student, student_aggregated, subjects, class_stats, subjects_info)
    try:
        pdf.image(chart_path, x=20, w=170, h=75)
    finally:
        # The chart is a temporary file: remove it even if embedding fails.
        if os.path.exists(chart_path):
            os.remove(chart_path)

    # Compact spacing
    pdf.ln(2)

    # SECTION: Subject Analysis with underlined header
    pdf.set_font('Times', 'B', 12)
    pdf.cell(0, 8, "Subject Analysis", ln=1)
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())
    pdf.ln(1)

    pdf.set_font('Times', '', 10)

    # Format subject analysis showing individual tests and average.
    # pct_map collects {subject: average percentage} for subjects with a score.
    pct_map = {}

    for subject in subjects:
        info = subjects_info[subject]
        test_cols = info['test_columns']

        # Get subject average score
        avg_score = student_aggregated.get(subject)

        if pd.isna(avg_score):
            pdf.cell(0, 6, f"{subject}: All tests absent", ln=1)
        else:
            # Show individual test scores (from the original, un-coerced row)
            test_details = []
            for col in test_cols:
                raw_score = student[col] if col in student.index else 'AB'
                if raw_score == 'AB' or pd.isna(raw_score):
                    test_details.append("AB")
                else:
                    test_details.append(f"{raw_score}")

            # Assuming max score is 100 for percentage calculation
            # You may need to adjust this based on your scoring system
            max_score = 100
            avg_pct = (avg_score / max_score) * 100
            pct_map[subject] = avg_pct

            if len(test_cols) == 1:
                pdf.cell(0, 6, f"{subject}: {test_details[0]}/{max_score} ({avg_pct:.1f}%)", ln=1)
            else:
                tests_str = " | ".join([f"T{i+1}: {score}" for i, score in enumerate(test_details)])
                pdf.cell(0, 6, f"{subject}: {tests_str} | Avg: {avg_score:.1f}/{max_score} ({avg_pct:.1f}%)", ln=1)

    # SECTION: Performance Highlights with improved logic
    pdf.ln(2)
    pdf.set_font('Times', 'B', 12)
    pdf.cell(0, 8, "Performance Highlights", ln=1)
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())
    pdf.ln(1)

    pdf.set_font('Times', '', 10)

    # Only show highlights if student has taken at least one test
    if pct_map:
        performance_analysis = _analyze_student_performance(pct_map, aggregated_scores, subjects, name)

        # Display strong subjects
        if performance_analysis['strong_subjects']:
            strong_subjects_str = ", ".join([f"{subj} ({pct:.1f}%)"
                                             for subj, pct in performance_analysis['strong_subjects']])
            pdf.cell(0, 6, f"Strong Subjects: {strong_subjects_str}", ln=1)

        # Display subjects needing improvement
        if performance_analysis['improvement_subjects']:
            pdf.cell(0, 6, "Subjects Needing Improvement:", ln=1)
            for subj, pct, reason in performance_analysis['improvement_subjects']:
                pdf.cell(0, 6, f"- {subj} ({pct:.1f}%) - {reason}", ln=1)

        # Overall performance summary
        pdf.cell(0, 6, f"Overall Average: {performance_analysis['overall_avg']:.1f}%", ln=1)

        if performance_analysis['consistency_note']:
            pdf.cell(0, 6, performance_analysis['consistency_note'], ln=1)
    else:
        pdf.cell(0, 6, "No test scores available for performance analysis", ln=1)

    # SECTION: Teacher's Remarks (if available)
    if "Teacher's Remarks" in student.index:
        pdf.ln(2)
        pdf.set_font('Times', 'B', 12)
        pdf.cell(0, 8, "Teacher's Remarks", ln=1)

        # Convert to text first: a non-string cell (e.g. a number) has no
        # .strip() and would otherwise raise AttributeError.
        remarks = student["Teacher's Remarks"]
        remarks_text = "" if pd.isna(remarks) else str(remarks)
        has_remarks = remarks_text.strip() != ""

        if has_remarks:
            pdf.line(10, pdf.get_y(), 200, pdf.get_y())
            pdf.ln(1)
            pdf.set_font('Times', '', 10)
            pdf.multi_cell(0, 5, remarks_text)
        else:
            pdf.ln(1)
            pdf.set_font('Times', '', 10)
            pdf.cell(0, 5, "No remarks from teacher", ln=1)

    # SECTION: Topics Covered - consolidate from all tests.
    # Only rendered when more than 20 units remain above y=270 on the page.
    remaining_height = 270 - pdf.get_y()

    if remaining_height > 20:
        pdf.ln(2)
        pdf.set_font('Times', 'B', 12)
        pdf.cell(0, 8, "Topics Covered", ln=1)
        pdf.line(10, pdf.get_y(), 200, pdf.get_y())
        pdf.ln(1)

        pdf.set_font('Times', '', 10)
        for subject in subjects:
            info = subjects_info[subject]
            topic_cols = info['topic_columns']

            if topic_cols:
                all_topics = set()
                for col in topic_cols:
                    if col in student.index and pd.notna(student[col]):
                        # Split topics by comma and add to set to avoid duplicates
                        topics = [t.strip() for t in str(student[col]).split(',')]
                        all_topics.update(topics)

                if all_topics:
                    topics_text = f"{subject}: {', '.join(sorted(all_topics))}"
                    # Keep each subject's topic line to at most 100 characters.
                    if len(topics_text) > 100:
                        topics_text = topics_text[:97] + "..."
                    pdf.multi_cell(0, 5, topics_text)

    # Save
    safe_name = name.replace(' ', '_')
    path = os.path.join(output_folder, f"{safe_name}_report.pdf")
    pdf.output(path)
    return path


def _analyze_student_performance(pct_map, aggregated_scores, subjects, student_name):
    """
    Improved performance analysis logic that considers:
    1. Relative performance vs class
    2. Absolute performance thresholds
    3. Consistency across subjects
    """
    analysis = {
        'strong_subjects': [],
        'improvement_subjects': [],
        'overall_avg': 0,
        'consistency_note': ''
    }
    
    if not pct_map:
        return analysis
    
    # Calculate overall average
    analysis['overall_avg'] = sum(pct_map.values()) / len(pct_map)
    
    # Calculate class percentiles for each subject
    subject_percentiles = {}
    for subject in subjects:
        if subject in pct_map:
            col_name = f"{subject}_Avg"
            if col_name in aggregated_scores.columns:
                all_scores = aggregated_scores[col_name].dropna()
                if not all_scores.empty:
                    student_score = pct_map[subject] * 100 / 100  # Convert back to raw score
                    percentile = (all_scores < student_score).sum() / len(all_scores) * 100
                    subject_percentiles[subject] = percentile
    
    # Identify strong subjects (above 80% OR top 25% of class)
    for subject, pct in pct_map.items():
        percentile = subject_percentiles.get(subject, 0)
        if pct >= 80 or percentile >= 75:
            analysis['strong_subjects'].append((subject, pct))
    
    # Sort strong subjects by percentage
    analysis['strong_subjects'].sort(key=lambda x: x[1], reverse=True)
    
    # Identify subjects needing improvement
    for subject, pct in pct_map.items():
        percentile = subject_percentiles.get(subject, 0)
        reasons = []
        
        if pct < 60:
            reasons.append("below 60%")
        elif pct < 75 and percentile < 50:
            reasons.append("below class median")
        elif percentile < 25:
            reasons.append("bottom 25% of class")
        
        if reasons:
            analysis['improvement_subjects'].append((subject, pct, " & ".join(reasons)))
    
    # Sort improvement subjects by percentage (lowest first)
    analysis['improvement_subjects'].sort(key=lambda x: x[1])
    
    # Consistency analysis
    if len(pct_map) > 1:
        scores = list(pct_map.values())
        score_range = max(scores) - min(scores)
        if score_range < 10:
            analysis['consistency_note'] = "Performance is very consistent across subjects"
        elif score_range > 30:
            analysis['consistency_note'] = "Performance varies significantly across subjects"
    
    return analysis

def create_comparison_chart(student, student_aggregated, subjects, class_stats, subjects_info):
    """
    Build and save a matplotlib chart comparing this student's aggregated scores
    against class high/low/average in each subject, with percentages over all bars.

    Args:
        student: Row containing 'Student Names' (used for title, legend and
            file name).
        student_aggregated: {subject: aggregated score}; NaN marks a subject
            the student was absent for (drawn as a zero-height "Absent" bar).
        subjects: Subject names, in x-axis order.
        class_stats: {subject: {'average', 'highest', 'lowest'}}. A NaN
            statistic is drawn as a zero-height bar labelled "N/A".
        subjects_info: Unused; kept for interface compatibility.

    Returns:
        Name of the PNG written to the current working directory:
        'temp_chart_<name with _ for spaces>.png'.

    Side effects:
        Overrides the global matplotlib rcParams 'font.family' and
        'font.serif'.
    """
    # Local imports duplicate the notebook's import cell; they keep the
    # function usable on its own.
    import matplotlib
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.patches import Patch

    # Set Times font family explicitly
    matplotlib.rcParams['font.family'] = 'Times New Roman'
    matplotlib.rcParams['font.serif'] = ['Times New Roman', 'Times', 'DejaVu Serif', 'serif']

    # Prepare data arrays
    marks = []
    absent_subjects = []
    for s in subjects:
        agg = student_aggregated.get(s)
        if pd.isna(agg):
            marks.append(0)
            absent_subjects.append(s)
        else:
            marks.append(agg)

    average = [class_stats[s]['average'] for s in subjects]
    highest = [class_stats[s]['highest'] for s in subjects]
    lowest  = [class_stats[s]['lowest']  for s in subjects]

    def _plottable(values):
        # NaN (e.g. a subject for which the whole class was absent) is drawn
        # as a zero-height bar.
        return [0 if pd.isna(v) else v for v in values]

    # Determine max_score for y-offset of labels. NaN is filtered out first:
    # max() with NaN is order-dependent and a NaN axis limit makes ylim raise.
    finite_values = [v for v in (*average, *highest, *lowest, *marks) if pd.notna(v)]
    max_score = max(finite_values + [100])

    # Colors
    student_color = '#4570B7'
    avg_color     = '#9FA7B2'
    high_color    = '#97D077'
    low_color     = '#F08B7E'
    absent_color  = '#E8E8E8'

    # Figure setup
    fig = plt.figure(figsize=(7.5, 5.0))
    try:
        x = np.arange(len(subjects))
        width = 0.18

        # Plot bars: lowest | average | highest | student, left to right
        avg_bars     = plt.bar(x,           _plottable(average), width, color=avg_color,    edgecolor='white', linewidth=0.5, label='Class Average', zorder=1)
        high_bars    = plt.bar(x + width,   _plottable(highest), width, color=high_color,   edgecolor='white', linewidth=0.5, label='Class Highest', zorder=1)
        low_bars     = plt.bar(x - width,   _plottable(lowest),  width, color=low_color,    edgecolor='white', linewidth=0.5, label='Class Lowest', zorder=1)
        student_bars = plt.bar(
            x + 2*width,
            marks,
            width,
            color=[absent_color if s in absent_subjects else student_color for s in subjects],
            edgecolor='white',
            linewidth=1.0,
            label=student['Student Names'],
            zorder=2
        )

        # Annotate every bar with a percentage, "Absent" or "N/A"
        bar_sets = [
            (avg_bars,     average),
            (high_bars,    highest),
            (low_bars,     lowest),
            (student_bars, marks),
        ]
        for bars, values in bar_sets:
            for idx, rect in enumerate(bars):
                height = rect.get_height()
                if bars is student_bars and subjects[idx] in absent_subjects:
                    # Student bar for a subject the student was absent for
                    label = "Absent"
                elif pd.isna(values[idx]):
                    # No class data for this statistic
                    label = "N/A"
                else:
                    # value is out of 100, so it's directly percentage
                    label = f"{values[idx]:.1f}%"
                plt.text(
                    rect.get_x() + rect.get_width() / 2,
                    height + max_score * 0.01,  # small offset above bar
                    label,
                    ha='center',
                    va='bottom',
                    fontsize=8,
                    family='Times New Roman'
                )

        # Axes & titles
        plt.xlabel('Subjects', fontsize=10, family='Times New Roman')
        plt.ylabel('Average Score', fontsize=10, family='Times New Roman')
        plt.suptitle(f"{student['Student Names']}'s Performance",
                     fontsize=12, family='Times New Roman', y=0.98)
        plt.title("Comparison with class statistics (test averages)",
                  fontsize=9, family='Times New Roman', pad=10)

        # Bars span x-1.5w .. x+2.5w, so the group is centred at x + w/2
        plt.xticks(x + width/2, subjects, fontsize=10, family='Times New Roman')
        plt.ylim(0, max_score * 1.1)
        plt.grid(axis='y', linestyle='--', alpha=0.2, zorder=0)

        # Legend
        legend_elems = [
            Patch(facecolor=student_color, edgecolor='white', label=student['Student Names']),
            Patch(facecolor=avg_color,     edgecolor='white', label='Class Average'),
            Patch(facecolor=high_color,    edgecolor='white', label='Class Highest'),
            Patch(facecolor=low_color,     edgecolor='white', label='Class Lowest'),
        ]
        if absent_subjects:
            legend_elems.append(Patch(facecolor=absent_color, edgecolor='white', label='Absent'))
        # The size goes into `prop`: legend() ignores `fontsize` whenever
        # `prop` is given.
        plt.legend(handles=legend_elems, loc='upper center', bbox_to_anchor=(0.5, -0.12),
                   framealpha=0.7, edgecolor='#CCCCCC',
                   ncol=min(5, len(legend_elems)),
                   prop={'family': 'Times New Roman', 'size': 9})

        # Clean up
        ax = plt.gca()
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        for lbl in ax.get_xticklabels() + ax.get_yticklabels():
            lbl.set_fontname('Times New Roman')

        plt.tight_layout(rect=[0, 0.1, 1, 0.97])

        # Save
        fname = f"temp_chart_{student['Student Names'].replace(' ', '_')}.png"
        plt.savefig(fname, dpi=200, bbox_inches='tight', facecolor='white')
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)
    return fname






In [15]:
# Generate the PDF report for one student; the cell output is the PDF path.
generate_individual_student_report(
    csv_path='./student_data_out.csv',
    student_name='Alice',
    output_folder='./reports/',
)

'./reports/Alice_report.pdf'