<h1>Import Libs</h1>

In [7]:

import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import os
import requests
from utils import print_question_data
from utils import print_first_5_students
from utils import print_single_value_in_table
import openai as client
from utils import print_single_value_in_table
from utils import err_box_red
from utils import pretty_print_results


<h1>Constants</h1>

In [None]:

# Input CSV with raw student marks, and the augmented CSV written by add_topics_to_csv().
FILE_ORIGIN = "./student_data.csv"
FILE_DESTINATION = "./student_data_final.csv"

# One entry per test: (test name, subjects, comma-separated chapters per subject).
# The subjects list and the chapters list are parallel: chapters[i] belongs to subjects[i].
TESTS_AND_CHAPTERS_FOR_SUBJECTS =  [
        ("Test1", ["English", "Maths"], ["A,B", "C,D"]),
        ("Test2", ["English", "Maths"], ["B,C", "D,F"])
    ]

API_URL = 'http://localhost:3000/'

# SECURITY (FIXME): a bearer token is committed in this notebook. Prefer setting the
# AUTH_TOKEN environment variable; the literal below is kept only as a backward-compatible
# fallback and should be revoked/removed once the environment variable is in use.
AUTH_TOKEN = os.environ.get(
    "AUTH_TOKEN",
    "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6MywiZW1haWxJZCI6ImhpdGFuc2h1c2hhaDVAZ21haWwuY29tIiwiaWF0IjoxNzQ2NzE5OTI4LCJleHAiOjE3NDY4MDYzMjh9.oofQw4zUkKWcGXvYyJjdK0Mp1y25dlxVSsTRizGEBPE",
)

# Endpoints are joined onto the base with exactly one '/', whether or not API_URL ends with
# a slash (previously three of them produced 'http://localhost:3000//...').
GET_QUESTIONS_FOR_TOPICS = API_URL.rstrip('/') + '/question/get-questions-for-chapters'
SAVE_STUDENT_COST_PER_WORKSHEET = API_URL.rstrip('/') + '/student-stat-analysis/save-student-cost-per-worksheet'
DOWNLOAD_FROM_S3_LINK =  API_URL.rstrip('/') + '/student-stat-analysis/download-worksheet-from-s3-link'
GET_WORKSHEET_HTML = API_URL.rstrip('/') + '/analysis/getWorksheetHTML'

TOTAL_COST = 0
CHAPTERS = ['A', 'B', 'C', 'D', 'E', 'F']

<h1> Read Data</h1>

In [None]:
def read_data(file_path):
    """
    Load student data from disk, picking the pandas reader by file extension.

    Args:
        file_path: Path to a '.csv', '.xls' or '.xlsx' file (extension is
            matched case-insensitively).

    Returns:
        For CSV: the result of `pd.read_csv(file_path)`.
        For Excel: the result of `pd.read_excel(file_path, sheet_name=None)`,
        i.e. every sheet is requested rather than only the first one.

    Raises:
        ValueError: If the extension is none of the supported ones.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.csv':
        return pd.read_csv(file_path)

    if suffix in ('.xls', '.xlsx'):
        # sheet_name=None asks pandas for all sheets instead of just the first.
        return pd.read_excel(file_path, sheet_name=None)

    raise ValueError("Unsupported file extension")

<h1>Add topics to csv Data</h1>

### Demo Input → Expected Output

```python
# Setup for this demo:
TESTS_AND_CHAPTERS_FOR_SUBJECTS = [
    ("Test1", ["English","Maths"], ["A,B","C,D"]),
    ("Test2", ["English","Maths"], ["B,C","E,F"])
]
FILE_ORIGIN      = "input.csv"
FILE_DESTINATION = "output.csv"

# Contents of `input.csv`:
# Student Names,English_Test1,English_Test2,Maths_Test1,Maths_Test2,Attendance
# Alice,85,88,90,92,12
# Bob,78,82,88,85,23
# Charlie,92,94,76,78,24

add_topics_to_csv()

# After running, `output.csv` will include extra columns:
#   English Topics Test1, Maths Topics Test1,
#   English Topics Test2, Maths Topics Test2,
#   English All Topics,    Maths All Topics
#
# And sample rows become:
# Student Names,English_Test1,English_Test2,Maths_Test1,Maths_Test2,Attendance,English Topics Test1,Maths Topics Test1,English Topics Test2,Maths Topics Test2,English All Topics,Maths All Topics
# Alice,85,88,90,92,12,"A,B","C,D","B,C","E,F","A, B, C","C, D, E, F"
# Bob,78,82,88,85,23,"A,B","C,D","B,C","E,F","A, B, C","C, D, E, F"
# Charlie,92,94,76,78,24,"A,B","C,D","B,C","E,F","A, B, C","C, D, E, F"


In [None]:
import csv
from typing import List, Tuple, Dict

def add_topics_to_csv(input_csv_path=None, output_csv_path=None, tests=None):
    """
    Copy a student CSV and append per-test and aggregated topic columns.

    Args:
        input_csv_path: CSV to read. Defaults to the notebook constant FILE_ORIGIN.
        output_csv_path: CSV to write. Defaults to the notebook constant FILE_DESTINATION.
        tests: a list of (test_name, subjects, topic_values), e.g.:
            [
              ("Test1", ["English","Maths"], ["A,B","C,D"]),
              ("Test2", ["English","Maths"], ["B,C","E,F"])
            ]
            `subjects` and `topic_values` are parallel lists. Defaults to the
            notebook constant TESTS_AND_CHAPTERS_FOR_SUBJECTS.

    This will add columns:
      English Topics Test1, Maths Topics Test1,
      English Topics Test2, Maths Topics Test2,
      English All Topics,  Maths All Topics

    Every row receives the same values, because topics are defined per test,
    not per student. "<Subject> All Topics" is the sorted, de-duplicated union
    of that subject's topics across all tests, joined with ", ".

    Raises:
        ValueError: If a test's subjects/topic_values lengths differ, or the
            input file has no header row.
    """
    # Fall back to the notebook-level configuration when called without arguments.
    if input_csv_path is None:
        input_csv_path = FILE_ORIGIN
    if output_csv_path is None:
        output_csv_path = FILE_DESTINATION
    if tests is None:
        tests = TESTS_AND_CHAPTERS_FOR_SUBJECTS

    # 1) Validate
    for test_name, subjects, chapters in tests:
        if len(subjects) != len(chapters):
            raise ValueError(f"subjects vs topic_values length mismatch in {test_name}")

    # 2) Collect all unique subjects, keeping first-seen order
    all_subjects: List[str] = []
    for _, subjects, _ in tests:
        for subj in subjects:
            if subj not in all_subjects:
                all_subjects.append(subj)

    # 3) Build per-test lookup maps: test name -> {subject: "topic,topic"}
    test_topic_maps: Dict[str, Dict[str, str]] = {
        test_name: dict(zip(subjects, chapters))
        for test_name, subjects, chapters in tests
    }

    # 4) The added values do not depend on the row, so compute them once.
    #    Insertion order of this dict is also the order of the new header columns.
    extra_values: Dict[str, str] = {}
    for test_name, topics_map in test_topic_maps.items():
        for subj in all_subjects:
            if subj in topics_map:
                extra_values[f"{subj} Topics {test_name}"] = topics_map[subj]
    for subj in all_subjects:
        tokens = {
            tok.strip()
            for topics_map in test_topic_maps.values()
            if subj in topics_map
            for tok in topics_map[subj].split(",")
        }
        tokens.discard("")  # ignore empty pieces such as the one in "A,,B"
        extra_values[f"{subj} All Topics"] = ", ".join(sorted(tokens))

    # 5) Read the header first so that a bad input never creates/truncates the output.
    with open(input_csv_path, newline="", encoding="utf-8") as fin:
        reader = csv.DictReader(fin)
        if reader.fieldnames is None:
            raise ValueError(f"{input_csv_path} is empty; expected a CSV header row")

        # Skip columns that already exist (e.g. when re-running on an augmented
        # file) so the header never contains duplicates; their values are refreshed.
        fieldnames = list(reader.fieldnames)
        fieldnames += [col for col in extra_values if col not in fieldnames]

        with open(output_csv_path, "w", newline="", encoding="utf-8") as fout:
            writer = csv.DictWriter(fout, fieldnames=fieldnames)
            writer.writeheader()

            # 6) Process each row
            for row in reader:
                row.update(extra_values)
                writer.writerow(row)


<h1>Validate Data</h1>

### Demo Input → Expected Output

```python
# Example setup
import pandas as pd
from your_module import validate_data  # adjust import as needed

df = pd.DataFrame({
    'Student Names': ['Alice', 'Bob', 'Charlie'],
    'English_Test1': ['85', '78', '92'],
    'English_Test2': ['88', '82', '94'],
    'Maths_Test1': ['90', '88', '76'],
    'Maths_Test2': ['92', '85', '78'],
    'English Topics Test1': ['A,B', 'A,B', 'A,B'],
    'Maths Topics Test1': ['C,D', 'C,D', 'C,D'],
    'English Topics Test2': ['B,C', 'B,C', 'B,C'],
    'Maths Topics Test2': ['D,F', 'D,F', 'D,F'],
    'English All Topics': ['A, B, C', 'A, B, C', 'A, B, C'],
    'Maths All Topics': ['C, D, F', 'C, D, F', 'C, D, F'],
    'Attendance': [12, 23, 24]
})

clean_df = validate_data(df)

# Console output:
# Number of students loaded: 3
# 
#   Student Names  English_Test1  English_Test2  Maths_Test1  Maths_Test2  English Topics Test1  Maths Topics Test1  English Topics Test2  Maths Topics Test2  English All Topics  Maths All Topics  Attendance
#0         Alice              85              88           90           92                 A, B                C, D                 B, C                D, F            A, B, C          C, D, F          12
#1           Bob              78              82           88           85                 A, B                C, D                 B, C                D, F            A, B, C          C, D, F          23
#2       Charlie              92              94           76           78                 A, B                C, D                 B, C                D, F            A, B, C          C, D, F          24


In [None]:
import pandas as pd
import re

def _student_label(value):
    """Return a printable student name, using '<Unknown>' for blank or missing names."""
    if isinstance(value, str):
        return value or '<Unknown>'
    if value is None or pd.isna(value):
        # NaN is truthy, so a plain `value or '<Unknown>'` would print 'nan' here.
        return '<Unknown>'
    return value


def _normalize_mark(value):
    """
    Clean one raw mark cell.

    Returns a (cleaned, status) pair where status is one of:
      'ok'      -> cleaned is an int (whole numbers) or float
      'absent'  -> cell already said AB/ab/Ab; cleaned is 'AB'
      'missing' -> cell was empty/NaN; cleaned is 'AB'
      'invalid' -> cell could not be read as a finite number; cleaned is 'AB'
    """
    if isinstance(value, str):
        text = value.strip()
        if text == '':
            return 'AB', 'missing'
        if text.upper() == 'AB':
            return 'AB', 'absent'
    elif value is None or pd.isna(value):
        return 'AB', 'missing'

    try:
        num = float(value)
    except (TypeError, ValueError, OverflowError):
        return 'AB', 'invalid'

    # float() happily parses 'nan' and 'inf'; neither is a usable mark.
    if num != num or num in (float('inf'), float('-inf')):
        return 'AB', 'invalid'
    return (int(num) if num.is_integer() else num), 'ok'


def validate_data(df: pd.DataFrame, test_mark_cols=None) -> pd.DataFrame:
    """
    Cleans and validates a student score DataFrame.

    Steps:
    - Strips whitespace from string cells (non-string cells in mixed columns are left untouched)
    - Normalizes topic strings like "A,B" to "A, B"
    - Detects missing names or marks
    - Standardizes absent marks as 'AB'
    - Converts valid marks to int/float; unreadable or non-finite marks become 'AB'
    - Replaces missing or non-numeric attendance with 0 (and reports each case)
    - Prints a summary of absentees and preview of data

    Args:
        df (pd.DataFrame): Input student data. Must contain 'Student Names' and 'Attendance'.
            The input frame is not modified.
        test_mark_cols (list[str], optional): List of test score columns to validate.
            If None, every column named exactly '<Subject>_Test<N>' is used.

    Returns:
        pd.DataFrame: Cleaned and validated dataframe.

    Raises:
        KeyError: If a requested mark column, 'Student Names' or 'Attendance' is absent.
    """
    df = df.copy()  # Avoid modifying the original DataFrame

    # 0. Remove leading/trailing spaces from string cells.
    #    Done per value: the `.str` accessor would turn every non-string cell of a
    #    mixed column (e.g. numeric marks next to 'AB') into NaN.
    for col in df.select_dtypes(include='object'):
        df[col] = df[col].map(lambda v: v.strip() if isinstance(v, str) else v)

    # 1. Normalize topic columns — ensure "A, B, C" format
    topic_cols = [c for c in df.columns if re.search(r'topics', c, re.IGNORECASE)]
    for col in topic_cols:
        df[col] = df[col].apply(
            lambda x: ', '.join(p.strip() for p in x.split(',')) if isinstance(x, str) else x
        )

    # 2. Check for missing (NaN or blank) student names and report
    names = df['Student Names']
    missing_name_mask = names.isna() | (names == '')
    if missing_name_mask.any():
        missing = df[missing_name_mask].index.tolist()
        print(f"Missing Student Names in rows: {missing}")
    labels = [_student_label(n) for n in names.tolist()]

    # 3. Identify test mark columns: use passed ones or detect those named exactly
    #    '<Subject>_Test<N>' (fullmatch, so 'English_Test1_notes' is not mistaken for marks)
    if test_mark_cols is None:
        test_mark_cols = [
            c for c in df.columns if isinstance(c, str) and re.fullmatch(r'.+_Test\d+', c)
        ]

    # Ensure the specified test mark columns exist in DataFrame
    missing_marks = [c for c in test_mark_cols if c not in df.columns]
    if missing_marks:
        raise KeyError(f"Expected mark columns not found: {missing_marks}")

    # 4. Validate each test mark column
    for col in test_mark_cols:
        cleaned = []
        missing_msgs = []
        invalid_msgs = []
        for raw, label in zip(df[col].tolist(), labels):
            value, status = _normalize_mark(raw)
            if status == 'missing':
                missing_msgs.append(f"Missing {col} for {label}; marking as absent ('AB')")
            elif status == 'invalid':
                invalid_msgs.append(f"Invalid {col} value '{raw}' for {label}; marking as absent")
            cleaned.append(value)

        # Report missing cells first, then invalid ones.
        for message in missing_msgs + invalid_msgs:
            print(message)

        # Rebuild the column as objects so 'AB' can sit next to numbers regardless of
        # the column's original dtype, then let pandas pick a numeric dtype when no
        # 'AB' is present.
        df[col] = pd.Series(cleaned, index=df.index, dtype=object).infer_objects()

    # 5. Handle Attendance column
    if 'Attendance' not in df.columns:
        raise KeyError("Expected column 'Attendance' not found")

    attendance_raw = df['Attendance']
    attendance_num = pd.to_numeric(attendance_raw, errors='coerce')
    for raw, num, label in zip(attendance_raw.tolist(), attendance_num.tolist(), labels):
        if pd.notna(num):
            continue
        if isinstance(raw, str) and raw != '':
            # Non-numeric text would otherwise be zeroed without any trace.
            print(f"Invalid Attendance value '{raw}' for {label}; setting to 0")
        else:
            print(f"Attendance missing for {label}; setting to 0")

    # Ensure attendance is numeric integers
    df['Attendance'] = attendance_num.fillna(0).astype(int)

    # 6. Summary statistics
    print(f"\nNumber of students loaded: {len(df)}")
    absent_summary = {
        col: (df[col] == 'AB').sum() for col in test_mark_cols if (df[col] == 'AB').any()
    }
    if absent_summary:
        print("\nStudents marked as absent:")
        for col, count in absent_summary.items():
            print(f"  {col}: {count}")

    # 7. Print first 5 rows for review
    print(df.head(5).to_string(index=False))

    return df

<h1>Classify Students' Strong and Weak Topics</h1>

### Demo Input → Expected Output

```python
import pandas as pd
from your_module import classify_students_by_topic  # adjust import as needed

# Input DataFrame
df = pd.DataFrame({
    'Student Names': ['Alice', 'Bob', 'Charlie'],
    'English_Test1': [85, 78, 92],
    'English_Test2': [88, 82, 94],
    'Maths_Test1': [90, 88, 76],
    'Maths_Test2': [92, 85, 78],
    'English Topics Test1': ['A, B', 'A, B', 'A, B'],
    'Maths Topics Test1': ['C, D', 'C, D', 'C, D'],
    'English Topics Test2': ['B, C', 'B, C', 'B, C'],
    'Maths Topics Test2': ['D, F', 'D, F', 'D, F'],
    'English All Topics': ['A, B, C', 'A, B, C', 'A, B, C'],
    'Maths All Topics': ['C, D, F', 'C, D, F', 'C, D, F'],
    'Attendance': [12, 23, 24],
    "Teacher's Remarks": ['', '', '']
})

# Run classification
results = classify_students_by_topic(df)
print(results)

# Expected Output:
# [
#     {
#         'name': 'Alice',
#         'attendance': 12,
#         'remarks': '',
#         'strong_topics':   ['A', 'B', 'C', 'D', 'F'],
#         'weak_topics':     [],
#         'practice_topics': [],
#         'topic_details': [
#             {'topic':'A','avg_pct':85.0,'num_tests':1},
#             {'topic':'B','avg_pct':86.5,'num_tests':2},
#             {'topic':'C','avg_pct':89.0,'num_tests':2},
#             {'topic':'D','avg_pct':91.0,'num_tests':2},
#             {'topic':'F','avg_pct':92.0,'num_tests':1},
#         ],
#         'test_details': [
#             {'test_col':'English_Test1','subject':'English','raw':85,'pct':85.0,'topics':['A','B']},
#             {'test_col':'English_Test2','subject':'English','raw':88,'pct':88.0,'topics':['B','C']},
#             {'test_col':'Maths_Test1','subject':'Maths','raw':90,'pct':90.0,'topics':['C','D']},
#             {'test_col':'Maths_Test2','subject':'Maths','raw':92,'pct':92.0,'topics':['D','F']},
#         ]
#     },
#     { ... },  # Bob's dict
#     { ... }   # Charlie's dict
# ]


In [None]:
import re
import numpy as np
import pandas as pd

def classify_students_by_topic(
    df: pd.DataFrame,
    max_score: float = 100.0,
    strong_thresh: float = 85.0,
    weak_thresh: float = 70.0
) -> list[dict]:
    """
    Classify students into strong/weak/practice *topics* based on multiple tests.

    Logic & reasoning:
      1. We may have multiple tests per subject, each covering overlapping topics.
      2. For each student-topic pair, we collect all test percentages in which that topic appeared.
      3. We compute the *average percentage* for that topic (absent/unreadable marks are ignored).
      4. We apply *fixed thresholds* (85% for strong, 70% for weak) rather than class-level percentiles—
         because topic-level data can be sparse and unevenly distributed.
      5. Topics ≥ strong_thresh → strong; ≤ weak_thresh → weak; otherwise → practice.
         A topic with no valid marks at all is listed in topic_details but not classified.

    Args:
        df: Input DataFrame containing:
            - "Student Names", one or more "<Subject>_Test<N>" columns,
            - corresponding "<Subject> Topics Test<N>" columns,
            - "Attendance" and "Teacher's Remarks" (both optional).
        max_score: Maximum possible raw score per test. Must be positive.
        strong_thresh: A topic whose average percentage is at or above this is 'strong'.
        weak_thresh: A topic whose average percentage is at or below this is 'weak'.

    Returns:
        A list of per-student dicts with keys:
          - name, attendance, remarks
          - strong_topics, weak_topics, practice_topics
          - topic_details: list of { topic, avg_pct, num_tests }
          - test_details: list of { test_col, subject, raw, pct, topics }

    Raises:
        ValueError: If max_score is not positive.
        KeyError: If a test column has no matching topics column.
    """
    if max_score <= 0:
        raise ValueError("max_score must be a positive number")

    results = []

    # 1) Map every test-score column to (subject, topics-column name),
    #    e.g. 'English_Test1' -> ('English', 'English Topics Test1').
    #    fullmatch keeps columns such as 'English_Test1_notes' from being treated as tests.
    test_to_topics = {}
    for col in df.columns:
        if not isinstance(col, str):
            continue
        matched = re.fullmatch(r'(.+)_Test(\d+)', col)
        if matched is None:
            continue
        subj, num = matched.groups()
        tcol = f"{subj} Topics Test{num}"
        if tcol not in df.columns:
            raise KeyError(f"Missing topics column for {col}: expected '{tcol}'")
        test_to_topics[col] = (subj, tcol)

    for _, row in df.iterrows():
        name       = row['Student Names']
        attendance = row.get('Attendance')
        remarks    = row.get("Teacher's Remarks", "")
        # An empty remarks cell is NaN after a CSV/Excel load; report it as "".
        if not isinstance(remarks, str) and pd.isna(remarks):
            remarks = ""

        # Will collect raw details for debugging/reporting
        test_details  = []
        # topic_scores accumulates all pct values per topic (insertion-ordered)
        topic_scores  = {}

        # 2) Loop through each test, parse marks and topics
        for tc, (subj, tcol) in test_to_topics.items():
            raw = row[tc]
            # a) Handle absent
            if isinstance(raw, str) and raw.strip().upper() == 'AB':
                pct = np.nan
            else:
                raw_num = pd.to_numeric(raw, errors='coerce')
                pct     = (float(raw_num) * 100.0 / max_score) if pd.notna(raw_num) else np.nan

            # b) Parse topics list for this test: drop empty tokens ("A,,B") and
            #    repeats ("A, A") so a test counts at most once per topic.
            topics = []
            tstr = row.get(tcol, "")
            if isinstance(tstr, str):
                topics = list(dict.fromkeys(
                    t.strip() for t in tstr.split(',') if t.strip()
                ))

            # c) Record test detail
            #    (helps trace exactly which tests contributed to each topic)
            test_details.append({
                'test_col': tc,
                'subject':  subj,
                'raw':      raw,
                'pct':      pct,
                'topics':   topics
            })

            # d) Append pct to each topic's list
            for topic in topics:
                topic_scores.setdefault(topic, []).append(pct)

        # 3) Compute average pct per topic & classify
        strong_topics   = []
        weak_topics     = []
        practice_topics = []
        topic_details   = []

        for topic, pcts in topic_scores.items():
            # ignore NaNs when averaging
            valid = [p for p in pcts if pd.notna(p)]
            avg_pct = float(np.nan) if not valid else sum(valid) / len(valid)

            # classify based on fixed thresholds
            if pd.notna(avg_pct):
                if avg_pct >= strong_thresh:
                    strong_topics.append(topic)
                elif avg_pct <= weak_thresh:
                    weak_topics.append(topic)
                else:
                    practice_topics.append(topic)

            topic_details.append({
                'topic':     topic,
                'avg_pct':   avg_pct,
                'num_tests': len(valid)
            })

        # 4) Assemble result for this student
        results.append({
            'name':             name,
            'attendance':       attendance,
            'remarks':          remarks,
            'strong_topics':    strong_topics,
            'weak_topics':      weak_topics,
            'practice_topics':  practice_topics,
            'topic_details':    topic_details,
            'test_details':     test_details
        })

    return results

<h1>Call the API to get questions for all chapters covered in the examination</h1>

### Demo Input → Expected Output

```python
import requests
from your_module import fetch_questions_for_topics  # adjust import as needed

# Setup for demo:
AUTH_TOKEN = "Bearer your_token_here"
API_URL    = "https://api.yoursite.com/getQuestionsForChapters"
chapters   = ["Algebra Basics", "Calculus I", "Geometry Fundamentals"]

# Run the function
results = fetch_questions_for_topics()
print(results)

# Expected Output (example structure):
# [
#   {
#     "Algebra Basics": [
#       "What is the solution to x + 5 = 12?",
#       "Describe the properties of a linear equation.",
#       {"questionText": "Solve for y: 2y = 14", "options": [{"key":"A","option":"y=6"},{"key":"B","option":"y=7"}, …]},
#       … up to 10 questions total …
#     ]
#   },
#   {
#     "Calculus I": [
#       "Explain the concept of a derivative.",
#       {"questionText": "Find d/dx of x² + 3x", "options":[…]},
#       … 
#     ]
#   },
#   {
#     "Geometry Fundamentals": [
#       "What defines a right triangle?",
#       {"questionText":"Which angle is opposite the hypotenuse?", "options":[…]},
#       … 
#     ]
#   }
# ]


In [None]:
def fetch_questions_for_topics(chapters=None, timeout=30):
    """
    Fetch sample questions for a set of chapters from the questions API.

    Args:
        chapters: Iterable of chapter names to request. Defaults to the
            notebook constant CHAPTERS.
        timeout: Seconds to wait for the server before `requests` gives up,
            so a dead server cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # The previous `chapters = sorted(chapters)` read a local name before it was
    # assigned and therefore always raised UnboundLocalError.
    chapter_list = sorted(CHAPTERS if chapters is None else chapters)

    # Build headers & payload
    headers = {
        'Authorization': AUTH_TOKEN,
        'Content-Type': 'application/json'
    }
    payload = {'chapters': chapter_list}

    # Fire the GET (or POST if you prefer) with JSON body, against the dedicated
    # endpoint rather than the bare API base URL.
    response = requests.get(
        GET_QUESTIONS_FOR_TOPICS, headers=headers, json=payload, timeout=timeout
    )
    response.raise_for_status()
    return response.json()

<h1>Create System Prompt and User Prompt</h1>

In [None]:
# Prompt text describing the model's role and its rules for performance analysis,
# insight generation, practice-question generation and parent-facing communication.
# Presumably passed as the system message of the chat request — the call site is
# outside this cell.
# NOTE(review): this text tells the model strong = >75% and weak = <65%, while
# classify_students_by_topic defaults to strong_thresh=85.0 / weak_thresh=70.0 —
# confirm which thresholds are intended so the two do not contradict each other.
SYSTEM_PROMPT = """You are an advanced educational insights generator and personalized learning advisor with expertise in analyzing student academic performance across multiple tests and subjects.

Your primary responsibilities include:

## Performance Analysis:
- Analyze student performance data from multiple tests across different subjects
- Focus primarily on test_details array as it provides the most accurate representation of student performance
- Identify performance trends across multiple tests in the same subject
- Compare performance across different subjects to identify relative strengths and weaknesses
- Use topic_details for supplementary insights about topic-wise average performance

## Insight Generation:
- Generate comprehensive subject-wise analysis showing performance trends
- Identify strong topics (>75% average performance) and weak topics (<65% average performance)
- Prioritize focus areas based on consistent poor performance across multiple tests
- Provide specific, actionable improvement strategies for each weak area
- Consider the number of tests taken per topic when making assessments

## Question Generation Guidelines:
- Generate 6-8 practice questions ONLY for topics identified as weak or needing improvement
- Focus on topics that appear in priority_focus_areas
- Create a balanced mix of difficulty levels: 2-3 easy, 2-3 medium, 2-3 hard questions per topic
- For Math subjects: Generate questions similar to provided sample questions with appropriate difficulty progression
- For English Grammar: Create questions following the style and pattern of provided samples
- For English Literature/Stories: Use questions directly from provided samples when available
- For Social Studies: Use questions directly from provided samples when available
- Exclude questions that require images or visual elements
- Ensure questions are grade-appropriate and align with curriculum standards

## Parent Communication:
- Write in simple, clear English that Indian parents can easily understand
- Address the student by name throughout for personalization
- Use a supportive, encouraging tone while being honest about areas needing improvement
- Provide specific, practical advice that parents can implement at home
- Include references to attendance and teacher remarks when relevant
- Focus on growth mindset and positive reinforcement
- Avoid overly technical educational jargon

## Key Principles:
- Prioritize insights from test_details over other data sources
- Be specific about which tests showed improvement or decline
- Provide context for performance (e.g., "improved from 65% in Test 1 to 78% in Test 2")
- Address the student using male/female pronouns when gender-specific language is needed and if you are unaware just use "the student"
- Use the provided sample questions as a guide for generating new questions
- Ensure all generated questions are relevant to the identified weak topics
- Maintain a positive, constructive tone throughout the analysis
- Focus on actionable steps parents can take to support their child's learning
- Avoid making assumptions about the student's abilities or background
- Maintain an encouraging, growth-focused approach throughout all content
- Ensure all recommendations are actionable and realistic for home implementation

Remember: Your goal is to help parents understand exactly where their child stands academically and provide them with clear, practical steps to support their child's improvement at home."""


<h1>Create User Prompt</h1>

In [None]:
def create_user_prompt(user_data, questions_for_topics=None):
    """
    Build the user-role prompt for one student.

    Args:
        user_data: The student's performance data; it is interpolated into the
            prompt with its default string formatting.
        questions_for_topics: Sample questions to embed as style/difficulty
            reference. When omitted, the notebook-level variable
            `questions_for_topics_asked_in_examination` is used, which must
            already be defined at call time (NameError otherwise).

    Returns:
        The prompt text as a single string.
    """
    # Passing the questions explicitly removes the dependency on a global that
    # only exists after some other cell has run; the fallback keeps old call
    # sites working unchanged.
    if questions_for_topics is None:
        questions_for_topics = questions_for_topics_asked_in_examination

    prompt = f"""
    You are provided with comprehensive student performance data below:
    
    **Student Data:**
    {user_data}
    
    **Your Tasks:**
    
    1. **Analyze Performance Trends:**
       - Focus primarily on the 'test_details' array to understand actual test performance
       - Look for patterns across multiple tests in the same subject
       - Identify subjects and topics where performance is declining, improving, or consistent
       - Use 'topic_details' for additional context on average performance per topic
    
    2. **Generate Comprehensive Insights:**
       - Create subject-wise analysis showing performance trends
       - Identify priority focus areas based on consistent poor performance
       - Provide specific improvement strategies for weak topics
       - Highlight strengths and areas where the student is performing well
    
    3. **Create Targeted Practice Questions:**
       - Generate questions ONLY for topics identified as weak or needing improvement
       - Use the sample questions provided below as reference for style and difficulty
       - Ensure questions match the academic level and curriculum requirements
       - Focus on topics that appear in your priority_focus_areas analysis
    
    **Sample Questions for Reference:**
    {questions_for_topics}
    
    **Important Guidelines:**
    - Weight your analysis heavily toward test_details as it shows actual test performance
    - Be specific about which tests showed what performance levels
    - Provide context for performance changes across multiple tests
    - Generate questions only for improvement areas, not for strong topics
    - Ensure parent recommendations are practical and implementable at home
    
    Please provide your response in the required JSON format with comprehensive insights and targeted practice questions.
    """
    return prompt

In [None]:
def get_user_prompt(user_data):
    """Return the user prompt for `user_data`; a thin wrapper around create_user_prompt."""
    return create_user_prompt(user_data)

<h1>Get Response Format</h1>

In [None]:
def get_response_format():
    """Build the ``response_format`` payload (a strict JSON schema) for the chat request.

    The schema has three required top-level fields:

    * ``parent_recommendations`` - string
    * ``student_insights`` - object with ``overall_performance``,
      ``subject_wise_analysis`` and ``priority_focus_areas``
    * ``practice_questions`` - array of per-topic question groups

    Every object node lists all of its properties as required (in declaration
    order) and sets ``additionalProperties`` to False.

    Returns:
        dict: ``{"type": "json_schema", "json_schema": {...}}``.
    """

    def _text(description):
        # String-valued leaf.
        return {"type": "string", "description": description}

    def _choice(values, description):
        # String-valued leaf restricted to a fixed set of values.
        return {"type": "string", "enum": values, "description": description}

    def _list_of(description, items):
        # Array node whose elements follow ``items``.
        return {"type": "array", "description": description, "items": items}

    def _closed_object(properties, description=None):
        # Object node: every property is required, no extra keys allowed.
        node = {"type": "object"}
        if description is not None:
            node["description"] = description
        node["properties"] = properties
        node["required"] = list(properties)
        node["additionalProperties"] = False
        return node

    plain_string = {"type": "string"}

    subject_analysis = _closed_object({
        "subject": _text("Name of the subject (e.g., Maths, English, Physics, etc.)"),
        "performance_trend": _text("Analysis of performance trend across multiple tests in this subject"),
        "strong_topics": _list_of("Topics where student performed well (>75% average)", dict(plain_string)),
        "weak_topics": _list_of("Topics where student needs improvement (<65% average)", dict(plain_string)),
        "improvement_recommendations": _list_of(
            "Specific actionable recommendations for improvement in this subject",
            dict(plain_string),
        ),
    })

    priority_area = _closed_object({
        "topic": _text("Name of the topic that needs focus"),
        "subject": _text("Subject this topic belongs to"),
        "current_performance": _text("Current performance level in this topic"),
        "why_priority": _text("Explanation of why this topic needs immediate attention"),
        "improvement_strategy": _text("Specific strategy to improve in this topic"),
    })

    student_insights = _closed_object(
        {
            "overall_performance": _text(
                "Overall assessment of student's academic performance across all subjects."
            ),
            "subject_wise_analysis": _list_of(
                "Subject-wise detailed analysis based on test performance.",
                subject_analysis,
            ),
            "priority_focus_areas": _list_of(
                "Top 3-5 priority areas that need immediate attention based on test performance",
                priority_area,
            ),
        },
        description="Detailed analysis of student's academic performance across all subjects and tests.",
    )

    mcq_option = _closed_object({
        "key": _text("The key for the option (A, B, C, D)"),
        "option": _text("The option text. Math equations wrapped in $ and $."),
        "imageUrl": _text("Image URL if needed, empty string otherwise"),
    })

    # ``options`` is either a list of MCQ options or null (descriptive questions).
    options_node = {
        "anyOf": [
            _list_of("Options for multiple choice questions", mcq_option),
            {"type": "null", "description": "Null for descriptive questions"},
        ]
    }

    question = _closed_object({
        "type": _choice(["mcq", "descriptive"], "The type of the question."),
        "questionId": _text("Unique identifier for the question"),
        "question": _text("The question text. All math equations must be wrapped between $ and $."),
        "subject": _text("The subject of the question."),
        "chapter": _text("The chapter or topic this question belongs to."),
        "marks": {"type": "number", "description": "The marks assigned for the question."},
        "options": options_node,
        "difficulty": _choice(["easy", "medium", "hard"], "The difficulty level of the question."),
    })

    topic_questions = _closed_object({
        "topic": _text("The topic for which these questions are generated"),
        "subject": _text("The subject this topic belongs to"),
        "questions": _list_of("Array of 6-8 practice questions for this topic", question),
    })

    root_schema = _closed_object({
        "parent_recommendations": _text(
            "A comprehensive, personalized note for parents with specific recommendations for improvement, written in simple English that Indian parents can easily understand."
        ),
        "student_insights": student_insights,
        "practice_questions": _list_of(
            "Practice questions organized by priority topics that need improvement",
            topic_questions,
        ),
    })

    return {
        "type": "json_schema",
        "json_schema": {
            "name": "quiz_schema",
            "strict": True,
            "schema": root_schema,
        },
    }

<h1>Initialize OpenAI Client</h1>

In [None]:

# The OpenAI API key is read from the OPENAI_API_KEY environment variable so the
# secret never lives in the notebook. Set it before starting the kernel.
# NOTE(review): the key that was previously hard-coded on this line has been
# exposed in the notebook and should be revoked and rotated.
client.api_key = os.environ.get("OPENAI_API_KEY")
def get_questions_and_insights_for_individual_student(user_data):
    """Ask the model for insights and practice questions for one student.

    Sends ``SYSTEM_PROMPT`` and the per-student user prompt to the ``o4-mini``
    model with the structured-output schema from ``get_response_format()``,
    prints the token usage, and estimates the cost of the call.

    Args:
        user_data: Student record; passed unchanged to ``get_user_prompt``.

    Returns:
        list: ``[parsed_response, cost]`` where ``parsed_response`` is the
        JSON-decoded message content and ``cost`` is the estimated cost
        rounded to two decimals, as a string.
    """
    # Imported locally: the import cell at the top of the notebook does not import ``json``.
    import json

    # Rates are applied per 1,000,000 tokens; the sum is then multiplied by 90.
    # NOTE(review): presumably USD prices for o4-mini and a USD->INR rate - confirm.
    input_rate_per_million = 1.1
    output_rate_per_million = 4.4
    cost_multiplier = 90

    completion = client.beta.chat.completions.parse(
        model="o4-mini",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": get_user_prompt(user_data)},
        ],
        response_format=get_response_format(),
    )

    usage = completion.usage
    completion_tokens = usage.completion_tokens
    prompt_tokens = usage.prompt_tokens
    total_tokens = usage.total_tokens
    print_single_value_in_table("completion_tokens", completion_tokens)
    print_single_value_in_table("prompt tokens", prompt_tokens)
    print_single_value_in_table("total tokens", total_tokens)

    input_price = (prompt_tokens * input_rate_per_million) / 1000000
    output_price = (completion_tokens * output_rate_per_million) / 1000000
    final_cost = (input_price + output_price) * cost_multiplier

    parsed_response = json.loads(completion.choices[0].message.content)
    return [parsed_response, str(round(final_cost, 2))]

<h1>Generate Report For Each Student (Class Comparison)</h1>

In [4]:
def generate_individual_student_report(csv_path, student_name, output_folder):
    """Generate the class-comparison PDF report for one student.

    Reads the class CSV, treats every column except 'Student Names',
    'Attendance', "Teacher's Remarks" and columns ending in ' Topics' as a
    subject, computes the class highest/lowest/average per subject from the
    numeric marks, and delegates PDF creation to ``_generate_student_pdf``.

    Args:
        csv_path: Path of the class CSV file.
        student_name: Value to match exactly in the 'Student Names' column.
            If several rows match, the first one is used.
        output_folder: Folder for the PDF; created if it does not exist.

    Returns:
        The value returned by ``_generate_student_pdf`` (the PDF path).

    Raises:
        ValueError: If no row has the given student name.
    """
    # 1. Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # 2. Load data
    df = pd.read_csv(csv_path)

    # 3. Define subjects (excluding non-subject columns)
    non_subject_columns = ('Student Names', 'Attendance', "Teacher's Remarks")
    subjects = [
        col for col in df.columns
        if col not in non_subject_columns and not col.endswith(' Topics')
    ]

    # 4. Numeric copy for calculations: non-numeric marks such as 'AB' become NaN,
    #    while `df` keeps the original values for display.
    df_calc = df.copy()
    for subj in subjects:
        df_calc[subj] = pd.to_numeric(df_calc[subj], errors='coerce')

    # 5. Compute class stats (NaN values are ignored by max/min/mean)
    class_stats = {
        subj: {
            'highest': df_calc[subj].max(),
            'lowest': df_calc[subj].min(),
            'average': df_calc[subj].mean(),
        }
        for subj in subjects
    }

    # 6. Locate the student row (original values and numeric values)
    is_student = df['Student Names'] == student_name
    if not is_student.any():
        raise ValueError(f"Student '{student_name}' not found")
    student = df[is_student].iloc[0]
    student_calc = df_calc[is_student].iloc[0]

    # 7. Generate the PDF. The former debug print of the whole class DataFrame
    #    was removed: it dumped every student's marks on each call.
    return _generate_student_pdf(
        student, student_calc, subjects, class_stats, output_folder, df_calc
    )


def _generate_student_pdf(student, student_calc, subjects, class_stats, output_folder, df_calc=None):
    """
    Internal helper to build the PDF for one student with compact layout.

    A subject is shown as "Absent" whenever the student's mark is not numeric
    (for example 'AB', an empty cell or NaN); such subjects are left out of the
    performance highlights.

    Args:
        student: Student's row from the original dataframe
        student_calc: Student's row from the df_calc dataframe (with numeric values)
        subjects: List of subject columns
        class_stats: Dictionary with class statistics
        output_folder: Where to save the PDF
        df_calc: The full numeric dataframe for percentile calculations; when
            None, no subject is flagged as needing improvement

    Returns:
        str: Path of the written PDF, ``<output_folder>/<name>_report.pdf``
        with spaces in the student name replaced by underscores.

    NOTE(review): relies on ``FPDF`` and ``ATTENDANCE_DAYS`` being defined
    elsewhere in the notebook - confirm.
    """
    max_marks = 30  # marks are reported out of 30 throughout this report

    pdf = FPDF()
    pdf.add_page()

    # Use Times as it's closer to the "math" font in the template
    pdf.set_font('Times', 'B', 14)

    # Header - more compact
    name = student['Student Names']
    pdf.cell(0, 8, "Student Performance Report", ln=1, align='C')
    pdf.set_font('Times', '', 10)

    # Attendance with less spacing
    att = student['Attendance']
    att_pct = (int(att) / ATTENDANCE_DAYS) * 100 if pd.notna(att) else 0
    pdf.cell(0, 6, f"Attendance: {att} / {ATTENDANCE_DAYS} ({att_pct:.1f}%)", ln=1, align='C')

    # Minimal spacing before chart
    pdf.ln(2)

    # Comparison chart; the temporary image is deleted once embedded
    chart_path = create_comparison_chart(student, student_calc, subjects, class_stats)
    pdf.image(chart_path, x=20, w=170, h=75)
    os.remove(chart_path)

    # Compact spacing
    pdf.ln(2)

    # SECTION: Subject Analysis with underlined header
    pdf.set_font('Times', 'B', 12)
    pdf.cell(0, 8, "Subject Analysis", ln=1)
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())
    pdf.ln(1)

    pdf.set_font('Times', '', 10)

    # Subject analysis laid out as a table, three subjects per row
    col_width = 60
    row_height = 6
    per_row = 3

    pct_map = {}
    for idx, subj in enumerate(subjects):
        raw = student[subj]
        # Coerce instead of float(raw): any non-numeric mark counts as absent
        # rather than raising ValueError.
        score = pd.to_numeric(raw, errors='coerce')
        if pd.isna(score):
            # Absent subjects are not added to pct_map
            display_text = f"{subj}: Absent"
        else:
            pct = (float(score) / max_marks) * 100
            pct_map[subj] = pct
            display_text = f"{subj}: {raw}/{max_marks} ({pct:.1f}%)"

        # Break the line after the third column or after the last subject
        ends_row = (idx % per_row == per_row - 1) or (idx + 1 == len(subjects))
        pdf.cell(col_width, row_height, display_text, ln=1 if ends_row else 0)

    # SECTION: Performance Highlights
    pdf.ln(2)
    pdf.set_font('Times', 'B', 12)
    pdf.cell(0, 8, "Performance Highlights", ln=1)
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())
    pdf.ln(1)

    pdf.set_font('Times', '', 10)

    # Only show highlights if student has taken at least one test
    if pct_map:
        needs_improvement_subjects = []
        # Percentiles need the class-wide numeric data
        if df_calc is not None:
            for subj in pct_map:
                # All non-NaN class scores for this subject
                all_scores = df_calc[subj].dropna()
                if all_scores.empty:
                    continue
                percentile_80 = np.percentile(all_scores, 80)
                # Flag when below 75% or below the class 80th percentile
                if pct_map[subj] < 75 or student_calc[subj] < percentile_80:
                    needs_improvement_subjects.append((subj, pct_map[subj]))

        # Find strongest subject (highest percentage)
        best_subj = max(pct_map, key=pct_map.get)
        best_pct = pct_map[best_subj]

        # Display strongest subject (left half of the line)
        pdf.cell(95, 6, f"Strongest Subject: {best_subj} ({best_pct:.1f}%)", ln=0)

        if needs_improvement_subjects:
            # Sort by percentage ascending (lowest first)
            needs_improvement_subjects.sort(key=lambda x: x[1])
            pdf.cell(0, 6, "Subjects Needing Improvement:", ln=1)
            for subj, pct in needs_improvement_subjects:
                pdf.cell(0, 6, f"- {subj} ({pct:.1f}%)", ln=1)
        else:
            pdf.cell(95, 6, "All subjects meet expectations", ln=1)
    else:
        pdf.cell(0, 6, "No test scores available for performance analysis", ln=1)

    # SECTION: Teacher's Remarks
    pdf.ln(2)
    pdf.set_font('Times', 'B', 12)
    pdf.cell(0, 8, "Teacher's Remarks", ln=1)

    # str() guards against a remarks cell that pandas parsed as a number
    remarks = student["Teacher's Remarks"]
    remarks_text = str(remarks) if pd.notna(remarks) else ""
    has_remarks = remarks_text.strip() != ""

    # Only draw the line if there are remarks
    if has_remarks:
        pdf.line(10, pdf.get_y(), 200, pdf.get_y())
        pdf.ln(1)
        pdf.set_font('Times', '', 10)
        # Use multi_cell with smaller height to make text more compact
        pdf.multi_cell(0, 5, remarks_text)
    else:
        # No remarks - display message
        pdf.ln(1)
        pdf.set_font('Times', '', 10)
        pdf.cell(0, 5, "No remarks from teacher", ln=1)

    # SECTION: Topics Covered - only if we have space
    remaining_height = 270 - pdf.get_y()  # Approx A4 usable height

    if remaining_height > 20:  # Only show if we have room
        pdf.ln(2)
        pdf.set_font('Times', 'B', 12)
        pdf.cell(0, 8, "Topics Covered", ln=1)
        pdf.line(10, pdf.get_y(), 200, pdf.get_y())
        pdf.ln(1)

        pdf.set_font('Times', '', 10)
        for subj in subjects:
            tcol = f"{subj} Topics"
            if tcol in student.index and pd.notna(student[tcol]):
                topic_text = f"{subj}: {student[tcol]}"
                if subj == "Biology":
                    # Existing behavior: Biology always gets this extra topic appended
                    topic_text += " , Human reproduction"
                # Limited space, so keep it brief
                if len(topic_text) > 100:
                    topic_text = topic_text[:97] + "..."
                pdf.multi_cell(0, 5, topic_text)

    # Save
    safe_name = name.replace(' ', '_')
    path = os.path.join(output_folder, f"{safe_name}_report.pdf")
    pdf.output(path)
    return path

def create_comparison_chart(student, student_calc, subjects, class_stats):
    """
    Build and save a matplotlib chart comparing this student
    against class high/low/average in each subject.

    A subject is drawn as "Absent" (grey bar of height 0 with an "Absent"
    label) whenever the student's numeric mark is missing, i.e. the value in
    ``student_calc`` is NaN or not numeric. This covers 'AB' as well as empty
    cells, matching how the PDF table treats absence.

    Args:
        student: Student's row with the original values; 'Student Names' is
            used for the labels and the file name.
        student_calc: Student's row with numeric marks.
        subjects: List of subject columns to plot.
        class_stats: ``{subject: {'highest', 'lowest', 'average'}}``.

    Returns:
        str: Name of the PNG file written to the current working directory
        (``temp_chart_<name>.png``); the caller is expected to delete it.

    Side effects:
        Sets ``font.family`` and ``font.serif`` in matplotlib's global rcParams.
    """
    import matplotlib
    # Imported locally: the import cell at the top of the notebook does not import Patch.
    from matplotlib.patches import Patch

    # Use Times New Roman so the chart matches the PDF text.
    # NOTE(review): 'font.serif' is presumably only consulted when the 'serif'
    # family is requested, so it may not act as a fallback here - confirm.
    matplotlib.rcParams['font.family'] = 'Times New Roman'
    matplotlib.rcParams['font.serif'] = ['Times New Roman', 'Times', 'DejaVu Serif', 'serif']

    max_marks = 30

    # Data preparation - check for absent tests
    marks = []
    percentages = []
    absent_subjects = []

    for s in subjects:
        score = pd.to_numeric(student_calc[s], errors='coerce')
        if student[s] == 'AB' or pd.isna(score):
            marks.append(0)           # Use 0 for absent in chart
            percentages.append(None)  # No percentage for absent
            absent_subjects.append(s)
        else:
            marks.append(score)
            percentages.append((score / max_marks) * 100)

    highest = [class_stats[s]['highest'] for s in subjects]
    lowest = [class_stats[s]['lowest'] for s in subjects]
    average = [class_stats[s]['average'] for s in subjects]

    # Color palette
    student_color = '#4570B7'  # Vibrant blue for student
    avg_color = '#9FA7B2'      # Muted gray for average
    high_color = '#97D077'     # Soft green for highest
    low_color = '#F08B7E'      # Soft red for lowest
    absent_color = '#E8E8E8'   # Light gray for absent

    plt.figure(figsize=(7.5, 5.0))

    # Define positions and width
    x = np.arange(len(subjects))
    width = 0.18

    # Class bars: lowest, average, highest from left to right
    plt.bar(x, average, width, color=avg_color, edgecolor='white',
            linewidth=0.5, label='Class Average', zorder=1)
    plt.bar(x + width, highest, width, color=high_color, edgecolor='white',
            linewidth=0.5, label='Class Highest', zorder=1)
    plt.bar(x - width, lowest, width, color=low_color, edgecolor='white',
            linewidth=0.5, label='Class Lowest', zorder=1)

    # Student bars drawn on top, grey where the student was absent
    plt.bar(x + 2*width, marks, width,
            color=[absent_color if s in absent_subjects else student_color for s in subjects],
            edgecolor='white', linewidth=1.0,
            label=f"{student['Student Names']}", zorder=2)

    # Add student score percentages above their bars (except for absent)
    for i, (p, m, s) in enumerate(zip(percentages, marks, subjects)):
        if s in absent_subjects:
            plt.annotate("Absent",
                         xy=(x[i] + 2*width, 2),  # Position just above the x-axis
                         ha='center', va='bottom',
                         fontsize=8, fontweight='normal', color='#000',
                         family='Times New Roman')
        elif p is not None:
            plt.annotate(f"{int(p)}%",
                         xy=(x[i] + 2*width, m + 0.5),
                         ha='center', va='bottom',
                         fontsize=9, fontweight='normal', color=student_color,
                         family='Times New Roman')

    # Axis labels
    plt.xlabel('Subjects', fontsize=10, fontweight='normal', family='Times New Roman')
    plt.ylabel('Marks (out of 30)', fontsize=10, fontweight='normal', family='Times New Roman')

    # Title plus explanatory subtitle
    plt.suptitle(f"{student['Student Names']}'s Performance",
                 fontsize=12, fontweight='normal', family='Times New Roman', y=0.98)
    plt.title("This chart compares the student's performance with the class statistics",
              fontsize=10, fontstyle='normal', family='Times New Roman', pad=10)

    # Center the subject labels under each group of bars
    plt.xticks(x + width/2, subjects, fontsize=10, fontweight='normal', family='Times New Roman')

    # y-axis up to full marks, with a slight buffer for annotations
    plt.ylim(0, 32)
    plt.yticks(range(0, 31, 5), fontsize=10, family='Times New Roman')

    # Horizontal line and label at maximum possible marks
    plt.axhline(y=max_marks, color='#CCCCCC', linestyle='-', linewidth=1, alpha=0.7)
    plt.text(len(subjects) - 0.2, 30.5, 'Full Marks (30)',
             ha='right', va='bottom', fontsize=8, color='black', family='Times New Roman')

    # Add subtle grid for readability
    plt.grid(axis='y', linestyle='--', alpha=0.2, zorder=0)

    # Custom legend; "Absent" is listed only when it appears in the chart
    legend_elements = [
        Patch(facecolor=student_color, edgecolor='white', label=f"{student['Student Names']}"),
        Patch(facecolor=avg_color, edgecolor='white', label='Class Average'),
        Patch(facecolor=high_color, edgecolor='white', label='Class Highest'),
        Patch(facecolor=low_color, edgecolor='white', label='Class Lowest')
    ]
    if absent_subjects:
        legend_elements.append(
            Patch(facecolor=absent_color, edgecolor='white', label='Absent')
        )

    # Legend placed below the plot area
    plt.legend(handles=legend_elements,
               loc='upper center',
               bbox_to_anchor=(0.5, -0.12),
               fontsize=9,
               framealpha=0.7,
               edgecolor='#CCCCCC',
               ncol=min(5, len(legend_elements)),
               prop={'family': 'Times New Roman'})

    # Remove top and right spines for cleaner look
    axes = plt.gca()
    axes.spines['top'].set_visible(False)
    axes.spines['right'].set_visible(False)

    # Apply Times New Roman to tick labels
    for label in axes.get_xticklabels() + axes.get_yticklabels():
        label.set_fontname('Times New Roman')

    # Leave room at the bottom for the legend
    plt.tight_layout(rect=[0, 0.1, 1, 0.97])

    # Save with clean white background and release the figure
    fname = f"temp_chart_{student['Student Names'].replace(' ','_')}.png"
    plt.savefig(fname, dpi=200, bbox_inches='tight', facecolor='white')
    plt.close()
    return fname
# generate_individual_student_report('./4/student_data_final.csv', 'Anusha',f"./14/reports/")

<h1>Convert Worksheet HTML to PDF</h1>

In [5]:
def convert_latex_to_mathml(html_content):
    """
    Replace every ``$...$`` / ``$$...$$`` span in the text with MathML.

    ``$$...$$`` spans are converted with ``display='block'`` and ``$...$``
    spans with ``display='inline'``, using ``latex2mathml.converter.convert``.

    Args:
        html_content (str): HTML content with LaTeX equations

    Returns:
        str: The same content with each matched span replaced by MathML
    """
    # Display math is tried first so "$$x$$" is not read as two inline spans.
    math_span = re.compile(r'\$\$(.*?)\$\$|\$(.*?)\$')

    def to_mathml(found):
        block_expr, inline_expr = found.groups()
        if block_expr is not None:
            return latex2mathml.converter.convert(block_expr, display='block')
        return latex2mathml.converter.convert(inline_expr, display='inline')

    return math_span.sub(to_mathml, html_content)

def download_html_to_pdf(s3_url, output_pdf_path):
    """
    Ask the conversion API to render the HTML at an S3 URL into a PDF file.

    The HTML is first fetched directly, only to confirm the link is reachable;
    the API is then called with the original S3 URL and the target path.

    Args:
        s3_url (str): The S3 URL of the HTML file to convert
        output_pdf_path (str): Local file path where to save the PDF

    Returns:
        str: On success, the saved path reported by the API ('path', else
        'savedPath', else ``output_pdf_path``).
        dict: On any failure, ``{"success": False, "error": <message>}``.
    """
    # Ensure the output directory exists
    output_dir = os.path.dirname(output_pdf_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Reachability check on the S3 link before asking the API to convert it.
    # The fetched HTML itself is not used: the API downloads it again from S3.
    try:
        html_response = requests.get(s3_url)
        html_response.raise_for_status()
    except requests.exceptions.RequestException as e:
        err_box_red("Failed to fetch HTML from S3",str(e))
        return {"success": False, "error": str(e)}

    payload = {
        "s3Link": s3_url,
        "pathToSave": output_pdf_path,
    }

    try:
        response = requests.post(DOWNLOAD_FROM_S3_LINK, json=payload)
        response.raise_for_status()  # Raise an exception for bad status codes
        result = response.json()
    except requests.exceptions.RequestException as e:
        err_box_red("PDF generation failed",str(e))
        return {"success": False, "error": str(e)}

    if not result.get('success', False):
        # The API answered but reported a failure. No exception exists on this
        # path, so the message comes from the response body.
        error_message = str(result.get('error', 'API reported failure'))
        err_box_red("PDF generation failed", error_message)
        return {"success": False, "error": error_message}

    saved_path = result.get('path') or result.get('savedPath') or output_pdf_path
    print_single_value_in_table("PDF successfully saved to", saved_path)
    return saved_path

<h1>Save Student Record in Database </h1>

In [None]:
def save_student_record(student_name, standard,worksheet_s3_link,cost,school_name='Surya International School',timeout=30):
    """
    Send a POST request that stores one student's worksheet record.

    The request is sent to the module-level ``SAVE_STUDENT_COST_PER_WORKSHEET``
    URL with a JSON body built from the arguments.

    Args:
        student_name (str): Name of the student.
        standard (str): Grade or standard of the student.
        worksheet_s3_link (str): URL to the student's worksheet HTML in S3.
        cost: Associated cost; forwarded unchanged in the JSON payload.
        school_name (str): Name of the student's school.
        timeout (float): Seconds to wait for the server before giving up, so a
            stalled endpoint cannot block the notebook forever.

    Returns:
        dict | None: The parsed JSON response from the server, or None on error
        (connection failure, timeout, non-success HTTP status, or a response
        body that is not valid JSON). Errors are reported through
        ``err_box_red`` rather than raised.
    """
    payload = {
        "studentName": student_name,
        "schoolName": school_name,
        "standard": standard,
        "worksheet_s3_link": worksheet_s3_link,
        "cost": cost,
    }
    headers = {"Content-Type": "application/json"}

    try:
        response = requests.post(
            SAVE_STUDENT_COST_PER_WORKSHEET,
            json=payload,
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()
    # ValueError covers a non-JSON body on requests versions whose JSON decode
    # error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as err:
        # Pass a string, matching the other err_box_red call sites in this notebook.
        err_box_red("[ERROR] Failed to save student record:", str(err))
        return None


<h1>Generate worksheet and combine worksheet and report</h1>

In [None]:
def generate_worksheet_for_students_and_combine_report(data):
    """
    Build a worksheet PDF and a combined report PDF for every student in ``data``.

    For each student this:
      1. gets questions/insights and their cost from
         ``get_questions_and_insights_for_individual_student``;
      2. POSTs them to ``GET_WORKSHEET_HTML`` and reads ``worksheet_html`` from the reply;
      3. converts that link to a PDF with ``download_html_to_pdf``;
      4. generates the individual report with ``generate_individual_student_report``;
      5. merges report and worksheet with ``combine_pdfs``;
      6. stores the record with ``save_student_record``.

    Side effects:
        Resets the module-level ``TOTAL_COST`` to 0 and accumulates each
        student's cost into it, so the total is available to later cells.

    Args:
        data: Iterable of student records; each record is indexed with ``['name']``.

    Returns:
        list: One entry per student, holding whatever ``combine_pdfs`` returned
        (a path, or None when merging failed).

    Raises:
        requests.exceptions.HTTPError: If the worksheet endpoint answers with an
            error status (from ``raise_for_status``).
    """
    # Without this declaration the assignments below would bind a function-local
    # name and the module-level TOTAL_COST would stay at its initial value.
    global TOTAL_COST
    TOTAL_COST = 0
    output_paths = []
    for student in data:
        student_name = student['name']
        print_single_value_in_table("Student Being Processed",student_name)
        questions_and_insights_for_student, cost = get_questions_and_insights_for_individual_student(student)

        worksheet = requests.post(
            GET_WORKSHEET_HTML,
            json={'student_data':questions_and_insights_for_student},
        )
        worksheet.raise_for_status()
        student_specific_questions_and_insights = worksheet.json()
        student_worksheet_link = student_specific_questions_and_insights['worksheet_html']

        # NOTE(review): STANDARD is not defined in the notebook's Constants cell as
        # shown at the top of the file — confirm it is defined in an earlier cell.
        student_worksheet_pdf = download_html_to_pdf(student_worksheet_link,f"../analysis_insights_copy/{STANDARD}/worksheets/{student_name}.pdf")
        print_single_value_in_table("student_worksheet_link",student_worksheet_link)
        # The label says "cost" but the value shown is download_html_to_pdf's return value.
        print_single_value_in_table("student_worksheet_pdf_cost",student_worksheet_pdf)
        print_single_value_in_table("cost",cost)
        TOTAL_COST += float(cost)

        # NOTE(review): the Constants cell defines FILE_DESTINATION, not FILE_DEST —
        # confirm FILE_DEST exists elsewhere or whether FILE_DESTINATION was intended.
        student_comparison_report_pdf = generate_individual_student_report(FILE_DEST,student_name,f"./{STANDARD}/reports/")
        full_output_path = combine_pdfs(student_comparison_report_pdf,student_worksheet_pdf,f"./{STANDARD}/final_reports",f"{student_name}_insights.pdf")
        save_student_record(student_name,standard=STANDARD,cost=cost,worksheet_s3_link=student_worksheet_link)
        output_paths.append(full_output_path)
    print_single_value_in_table("Total Cost",TOTAL_COST)
    return output_paths


<h1>Merge PDFs</h1>

In [None]:
def combine_pdfs(pdf_path1, pdf_path2, output_folder, output_filename):
    """
    Combine two PDFs into a single PDF file.

    Args:
        pdf_path1 (str): Path to the first PDF file
        pdf_path2 (str): Path to the second PDF file
        output_folder (str): Folder where the combined PDF will be saved; it is
            created if missing. An empty string means the current directory.
        output_filename (str): Filename for the combined PDF

    Returns:
        str | None: Path to the combined PDF file, or None if anything failed
        (the failure is reported through ``err_box_red`` instead of raised).
    """
    try:
        # Create a PDF merger object
        merger = PdfMerger()
        try:
            # Append the PDFs to the merger
            merger.append(pdf_path1)
            merger.append(pdf_path2)
            # exist_ok avoids the check-then-create race; the guard skips
            # os.makedirs(""), which would raise for an empty folder.
            if output_folder:
                os.makedirs(output_folder, exist_ok=True)
            full_output_path = os.path.join(output_folder, output_filename)
            # Write the combined PDF to the output path
            merger.write(full_output_path)
        finally:
            # Always release the merger's file handles, even when an append or
            # the write fails part-way through.
            merger.close()
        print_single_value_in_table("Successfully combined PDFs and saved to",full_output_path)
        return full_output_path

    except Exception as e:
        err_box_red("Error combining pdfs",e)
        return None

In [None]:
# Pipeline step 1: run the topic-augmentation function defined earlier in the notebook.
# NOTE(review): presumably this produces FILE_DESTINATION, which the next cell reads — confirm.
add_topics_to_csv()

In [None]:
# Pipeline step 2: load the file at FILE_DESTINATION with read_data
# (a .csv path is read with pd.read_csv and yields a DataFrame).
student_data = read_data(FILE_DESTINATION)

In [None]:
# Pipeline step 3: pass the loaded data through validate_data (defined earlier in the notebook).
# Kept under a new name so the raw student_data stays available for re-runs.
student_data_validated = validate_data(student_data)

In [None]:
# Pipeline step 4: classify the validated data by topic.
# The result is iterated per student by generate_worksheet_for_students_and_combine_report,
# which reads each record's 'name'.
classified_students_data = classify_students_by_topic(student_data_validated)

In [None]:
# Pipeline step 5: fetch the questions for every chapter in CHAPTERS from the
# GET_QUESTIONS_FOR_TOPICS endpoint, then display them.
# NOTE(review): AUTH_TOKEN is hard-coded in the Constants cell; prefer reading it
# from an environment variable so the token is not committed with the notebook.
questions_for_topics_asked_in_examination = fetch_questions_for_topics(GET_QUESTIONS_FOR_TOPICS, AUTH_TOKEN,CHAPTERS)
print_question_data(questions_for_topics_asked_in_examination)

In [None]:
# Pipeline step 6: generate worksheets and combined reports for every classified student.
# The returned list of output paths is not kept here.
generate_worksheet_for_students_and_combine_report(classified_students_data)
# Shows the module-level TOTAL_COST (initialised to 0 in the Constants cell).
print_single_value_in_table("Total cost",TOTAL_COST)