<h1>Import Libs</h1>

In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import os

<h1>Constants</h1>

In [None]:
FILE_ORIGIN = "./student_data.csv"
FILE_DESTINATION = "./student_data_final.csv"
TESTS_AND_CHAPTERS_FOR_SUBJECTS =  [
        ("Test1", ["English", "Maths"], ["A,B", "C,D"]),
        ("Test2", ["English", "Maths"], ["B,C", "D,F"])
    ]
ATTENDANCE_DAYS_TOTAL = 22

<h1> Read Data</h1>

In [None]:
def read_data(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.csv':
        student_data = pd.read_csv(file_path)
        return student_data
    elif ext in ('.xls', '.xlsx'):
        student_data = pd.read_excel(file_path, sheet_name=None)
        return student_data
    else:
        raise ValueError("Unsupported file extension")

<h1>Add topics to csv Data</h1>

In [None]:
import csv
from typing import List, Tuple, Dict

def add_topics_to_csv():
    """
    - input_csv_path, output_csv_path: file paths.
    - tests: a list of (test_name, subjects, topic_values), e.g.:
        [
          ("Test1", ["English","Maths"], ["A,B","C,D"]),
          ("Test2", ["English","Maths"], ["B,C","E,F"])
        ]
    
    This will add columns:
      English Topics Test1, Maths Topics Test1,
      English Topics Test2, Maths Topics Test2,
      English All Topics,  Maths All Topics
    """
    # 1) Validate
    for test_name, subjects, chapters in TESTS_AND_CHAPTERS_FOR_SUBJECTS:
        if len(subjects) != len(chapters):
            raise ValueError(f"subjects vs topic_values length mismatch in {test_name}")

    # 2) Collect all unique subjects and append it to all_subjects
    all_subjects: List[str] = []
    for _, subjects, _ in TESTS_AND_CHAPTERS_FOR_SUBJECTS:
        for subj in subjects:
            if subj not in all_subjects:
                all_subjects.append(subj)

    # 3) Build per-test lookup maps
    test_topic_maps: Dict[str, Dict[str, str]] = {
        test_name: dict(zip(subjects, chapters))
        for test_name, subjects, chapters in TESTS_AND_CHAPTERS_FOR_SUBJECTS
    }

    # 4) Open I/O
    with open(FILE_ORIGIN, newline="", encoding="utf-8") as fin, \
         open(FILE_DESTINATION, "w", newline="", encoding="utf-8") as fout:

        reader = csv.DictReader(fin)
        # a) build the new header
        extra_cols: List[str] = []
        for test_name in test_topic_maps:
            for subj in all_subjects:
                if subj in test_topic_maps[test_name]:
                    extra_cols.append(f"{subj} Topics {test_name}")
        for subj in all_subjects:
            extra_cols.append(f"{subj} All Topics")

        writer = csv.DictWriter(fout, fieldnames=reader.fieldnames + extra_cols)
        writer.writeheader()

        # 5) Process each row
        for row in reader:
            # per-test columns
            for test_name, topics_map in test_topic_maps.items():
                for subj, topics_str in topics_map.items():
                    row[f"{subj} Topics {test_name}"] = topics_str

            # aggregated union columns
            for subj in all_subjects:
                all_toks: List[str] = []
                for topics_map in test_topic_maps.values():
                    if subj in topics_map:
                        # split on comma, strip whitespace
                        all_toks.extend([tok.strip() for tok in topics_map[subj].split(",")])
                # dedupe & sort (optional)
                unique = sorted(set(tok for tok in all_toks if tok))
                row[f"{subj} All Topics"] = ", ".join(unique)

            writer.writerow(row)


<h1>Validate Data</h1>

In [None]:
import pandas as pd
import re

def validate_data(df: pd.DataFrame, test_mark_cols=None) -> pd.DataFrame:
    """
    Cleans and validates a student score DataFrame.

    Steps:
    - Strips whitespace from all string columns
    - Normalizes topic strings like "A,B" to "A, B"
    - Detects missing names or marks
    - Standardizes absent marks as 'AB'
    - Converts valid marks to int/float
    - Fills missing attendance with 0
    - Prints a summary of absentees and preview of data

    Args:
        df (pd.DataFrame): Input student data.
        test_mark_cols (list[str], optional): List of test score columns to validate. If None, inferred.

    Returns:
        pd.DataFrame: Cleaned and validated dataframe.
    """
    df = df.copy()  # Avoid modifying the original DataFrame

    # 0. Remove leading/trailing spaces from all string columns
    for col in df.select_dtypes(include='object'):
        df[col] = df[col].str.strip()

    # 1. Normalize topic columns — ensure "A, B, C" format
    topic_cols = [c for c in df.columns if re.search(r'topics', c, re.IGNORECASE)]
    for col in topic_cols:
        df[col] = df[col].apply(
            lambda x: ', '.join(p.strip() for p in x.split(',')) if isinstance(x, str) else x
        )

    # 2. Check for missing student names and report
    if df['Student Names'].isna().any():
        missing = df[df['Student Names'].isna()].index.tolist()
        print(f"Missing Student Names in rows: {missing}")

    # 3. Identify test mark columns: use passed ones or detect those matching '<Subject>_Test<N>'
    if test_mark_cols is None:
        test_mark_cols = [c for c in df.columns if re.match(r'.+_Test\d+', c)]

    # Ensure the specified test mark columns exist in DataFrame
    missing_marks = [c for c in test_mark_cols if c not in df.columns]
    if missing_marks:
        raise KeyError(f"Expected mark columns not found: {missing_marks}")

    # 4. Validate each test mark column
    for col in test_mark_cols:
        # a) Treat empty or NaN cells as 'AB' (Absent)
        empty_mask = df[col].isna() | (df[col] == '')
        if empty_mask.any():
            for idx in df[empty_mask].index:
                name = df.at[idx, 'Student Names'] or '<Unknown>'
                print(f"Missing {col} for {name}; marking as absent ('AB')")
            df.loc[empty_mask, col] = 'AB'

        # b) Standardize all 'ab', 'Ab', etc. to uppercase 'AB'
        is_ab = df[col].astype(str).str.upper().str.strip() == 'AB'
        df.loc[is_ab, col] = 'AB'

        # c) Try to convert other values to numbers, else mark as 'AB'
        for idx in df.index:
            if df.at[idx, col] == 'AB':
                continue  # Skip if already marked absent
            val = df.at[idx, col]
            try:
                num = float(val)
                df.at[idx, col] = int(num) if num.is_integer() else num
            except Exception:
                name = df.at[idx, 'Student Names'] or '<Unknown>'
                print(f"Invalid {col} value '{val}' for {name}; marking as absent")
                df.at[idx, col] = 'AB'

    # 5. Handle Attendance column
    if 'Attendance' not in df.columns:
        raise KeyError("Expected column 'Attendance' not found")

    # Fill missing attendance with 0
    if df['Attendance'].isna().any():
        for idx in df[df['Attendance'].isna()].index:
            name = df.at[idx, 'Student Names'] or '<Unknown>'
            print(f"Attendance missing for {name}; setting to 0")
        df['Attendance'] = df['Attendance'].fillna(0)

    # Ensure attendance is numeric integers
    df['Attendance'] = pd.to_numeric(df['Attendance'], errors='coerce').fillna(0).astype(int)

    # 6. Summary statistics
    print(f"\nNumber of students loaded: {len(df)}")
    absent_summary = {
        col: (df[col] == 'AB').sum() for col in test_mark_cols if (df[col] == 'AB').any()
    }
    if absent_summary:
        print("\nStudents marked as absent:")
        for col, count in absent_summary.items():
            print(f"  {col}: {count}")

    # 7. Print first 5 rows for review
    print(df.head(5).to_string(index=False))

    return df

<h1>Classify Students Strong and Weak Topics </h1>

In [28]:
import re
import numpy as np
import pandas as pd

def classify_students_by_topic(
    df: pd.DataFrame,
    max_score: float = 100.0,
    strong_thresh: float = 85.0,
    weak_thresh: float = 70.0
) -> list[dict]:
    """
    Classify students into strong/weak/practice *topics* based on multiple tests.

    Logic & reasoning:
      1. We may have multiple tests per subject, each covering overlapping topics.
      2. For each student-topic pair, we collect all test percentages in which that topic appeared.
      3. We compute the *average percentage* for that topic.
      4. We apply *fixed thresholds* (85% for strong, 70% for weak) rather than class-level percentiles—
         because topic-level data can be sparse and unevenly distributed.
      5. Topics ≥ strong_thresh → strong; ≤ weak_thresh → weak; otherwise → practice.

    Args:
        df: Input DataFrame containing:
            - "Student Names", one or more "<Subject>_Test<N>" columns,
            - corresponding "<Subject> Topics Test<N>" columns,
            - "Attendance" and "Teacher's Remarks".
        max_score: Maximum possible raw score per test.
        strong_thresh: Percentage threshold above which a topic is 'strong'.
        weak_thresh: Percentage threshold below which a topic is 'weak'.

    Returns:
        A list of per-student dicts with keys:
          - name, attendance, remarks
          - strong_topics, weak_topics, practice_topics
          - topic_details: list of { topic, avg_pct, count_of_tests }
          - test_details: list of { test_col, subject, raw, pct, topics }
    """
    results = []

    # 1) Identify all test-score columns, e.g. 'English_Test1', 'Maths_Test2', etc.
    test_cols = [c for c in df.columns if re.match(r'.+_Test\d+', c)]
    # 2) Identify all topic columns for tests: '<Subject> Topics Test<N>'
    topic_cols = [c for c in df.columns if re.match(r'.+ Topics Test\d+', c)]

    # Build a mapping from each test column to its topic-column name
    # e.g. { 'English_Test1': 'English Topics Test1', ... }
    test_to_topics = {}
    for tc in test_cols:
        subj, num = tc.rsplit('_Test', 1)
        tcol = f"{subj} Topics Test{num}"
        if tcol in df.columns:
            test_to_topics[tc] = tcol
        else:
            raise KeyError(f"Missing topics column for {tc}: expected '{tcol}'")

    for _, row in df.iterrows():
        name       = row['Student Names']
        attendance = row.get('Attendance')
        remarks    = row.get("Teacher's Remarks", "")

        # Will collect raw details for debugging/reporting
        test_details  = []
        # topic_scores accumulates all pct values per topic
        topic_scores  = {}

        # 3) Loop through each test, parse marks and topics
        for tc, tcol in test_to_topics.items():
            raw = row[tc]
            # a) Handle absent
            if isinstance(raw, str) and raw.strip().upper() == 'AB':
                pct = np.nan
            else:
                raw_num = pd.to_numeric(raw, errors='coerce')
                pct     = (raw_num * 100.0 / max_score) if pd.notna(raw_num) else np.nan

            # b) Parse topics list for this test
            topics = []
            tstr = row.get(tcol, "")
            if isinstance(tstr, str) and tstr.strip():
                topics = [t.strip() for t in tstr.split(',')]

            # c) Record test detail
            #    (helps trace exactly which tests contributed to each topic)
            test_details.append({
                'test_col': tc,
                'subject':  tc.split('_Test')[0],
                'raw':      raw,
                'pct':      pct,
                'topics':   topics
            })

            # d) Append pct to each topic's list
            for topic in topics:
                topic_scores.setdefault(topic, []).append(pct)

        # 4) Compute average pct per topic & classify
        strong_topics  = []
        weak_topics    = []
        practice_topics = []
        topic_details   = []

        for topic, pcts in topic_scores.items():
            # ignore NaNs when averaging
            valid = [p for p in pcts if pd.notna(p)]
            avg_pct = float(np.nan) if not valid else sum(valid) / len(valid)

            # classify based on fixed thresholds
            if pd.notna(avg_pct):
                if avg_pct >= strong_thresh:
                    strong_topics.append(topic)
                elif avg_pct <= weak_thresh:
                    weak_topics.append(topic)
                else:
                    practice_topics.append(topic)

            topic_details.append({
                'topic':     topic,
                'avg_pct':   avg_pct,
                'num_tests': len(valid)
            })

        # 5) Assemble result for this student
        results.append({
            'name':             name,
            'attendance':       attendance,
            'remarks':          remarks,
            'strong_topics':    strong_topics,
            'weak_topics':      weak_topics,
            'practice_topics':  practice_topics,
            'topic_details':    topic_details,
            'test_details':     test_details
        })

    return results

In [24]:
add_topics_to_csv()

In [25]:
student_data = read_data(FILE_DESTINATION)

In [26]:
student_data_validated = validate_data(student_data)


Number of students loaded: 3
Student Names English_Test1 English_Test2 Maths_Test1 Maths_Test2  Attendance English Topics Test1 Maths Topics Test1 English Topics Test2 Maths Topics Test2 English All Topics Maths All Topics
        Alice            85            88          90          92          12                 A, B               C, D                 B, C               D, F            A, B, C          C, D, F
          Bob            78            82          88          85          23                 A, B               C, D                 B, C               D, F            A, B, C          C, D, F
      Charlie            92            94          76          78          24                 A, B               C, D                 B, C               D, F            A, B, C          C, D, F


  df.loc[is_ab, col] = 'AB'
  df.loc[is_ab, col] = 'AB'
  df.loc[is_ab, col] = 'AB'
  df.loc[is_ab, col] = 'AB'


In [None]:
classified_students_data = classify_students_by_topic(student_data_validated)
(classified_students_data)

[{'name': 'Alice', 'attendance': 12, 'remarks': '', 'strong_topics': ['A', 'B', 'C', 'D', 'F'], 'weak_topics': [], 'practice_topics': [], 'topic_details': [{'topic': 'A', 'avg_pct': 85.0, 'num_tests': 1}, {'topic': 'B', 'avg_pct': 86.5, 'num_tests': 2}, {'topic': 'C', 'avg_pct': 89.0, 'num_tests': 2}, {'topic': 'D', 'avg_pct': 91.0, 'num_tests': 2}, {'topic': 'F', 'avg_pct': 92.0, 'num_tests': 1}], 'test_details': [{'test_col': 'English_Test1', 'subject': 'English', 'raw': 85, 'pct': 85.0, 'topics': ['A', 'B']}, {'test_col': 'English_Test2', 'subject': 'English', 'raw': 88, 'pct': 88.0, 'topics': ['B', 'C']}, {'test_col': 'Maths_Test1', 'subject': 'Maths', 'raw': 90, 'pct': 90.0, 'topics': ['C', 'D']}, {'test_col': 'Maths_Test2', 'subject': 'Maths', 'raw': 92, 'pct': 92.0, 'topics': ['D', 'F']}]}, {'name': 'Bob', 'attendance': 23, 'remarks': '', 'strong_topics': ['C', 'D', 'F'], 'weak_topics': [], 'practice_topics': ['A', 'B'], 'topic_details': [{'topic': 'A', 'avg_pct': 78.0, 'num_tes