## Timetable to JSON

In [None]:
!pip install camelot-py[cv] pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting camelot-py[cv]
  Downloading camelot_py-1.0.0-py3-none-any.whl.metadata (9.4 kB)
Collecting pdfminer-six>=20240706 (from camelot-py[cv])
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdf<4.0,>=3.17 (from camelot-py[cv])
  Downloading pypdf-3.17.4-py3-none-any.whl.metadata (7.5 kB)
Collecting pypdfium2>=4 (from camelot-py[cv])
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━

In [None]:
import pandas as pd
from datetime import datetime, timedelta
import json
import re
from collections import defaultdict

def clean_dataframe(df):
    """Strip every cell, then drop lab-only columns and blank rows/columns.

    Args:
        df: Table whose cells are text (or NaN). Column labels may repeat.

    Returns:
        A new DataFrame of stripped strings with
        * columns whose non-blank cells all mention a lab keyword removed,
        * fully blank rows and fully blank columns removed,
        * the row index reset to 0..n-1.
    """
    df = df.fillna("").astype(str).apply(lambda col: col.str.strip())
    keywords = ("travaux pratiques", "lab. sessions", "projet en labo")

    def is_lab_only(column):
        # Blank cells are ignored, so a fully blank column also qualifies
        # (the blank-column pass below would remove it anyway).
        return all(
            any(keyword in cell.lower() for keyword in keywords)
            for cell in column
            if cell != ""
        )

    # Address columns by position: when labels repeat, df[label] returns a
    # DataFrame and iterating it yields column labels instead of cells.
    keep_columns = [not is_lab_only(df.iloc[:, pos]) for pos in range(df.shape[1])]
    df = df.loc[:, keep_columns]

    # Removing blank rows never changes which columns are blank, so both
    # masks can be computed from the same frame.
    has_text = df.ne("")
    df = df.loc[has_text.any(axis=1).to_numpy(), has_text.any(axis=0).to_numpy()]
    return df.reset_index(drop=True)

def normalize_day_headers(df):
    """Promote the first row (day names) to column labels and drop that row.

    A leading column is removed first when its label is "" / None or when its
    top cell is blank. The input frame is left untouched.

    Args:
        df: Raw table whose first row holds the day names.

    Returns:
        A new DataFrame labelled by the stripped first-row values, without
        that row, re-indexed from 0.
    """
    # Work on a copy: the row assignment below would otherwise write the
    # stripped header back into the caller's frame whenever no column is dropped.
    df = df.copy()
    if df.columns[0] == "" or df.columns[0] is None or str(df.iloc[0, 0]).strip() == "":
        df = df.drop(columns=df.columns[0])
    df.iloc[0] = df.iloc[0].astype(str).str.strip()
    # NOTE(review): ffill/bfill only fill missing values; blank strings in the
    # header row are kept as "" labels. Confirm whether blank header cells are
    # meant to inherit the neighbouring day name.
    day_row = df.iloc[0].ffill().bfill()
    df.columns = day_row
    df = df.drop(index=0).reset_index(drop=True)
    return df

def generate_times(max_rows):
    """Return the first ``max_rows`` teaching slots as ("HH:MM", "HH:MM") pairs.

    Slots last 75 minutes and are separated by 15 minutes, starting at 08:00.
    A slot that would begin inside the 12:15-13:30 window starts at 13:30.
    """
    fmt = "%H:%M"
    slot_length = timedelta(minutes=75)
    pause = timedelta(minutes=15)
    lunch_from = datetime.strptime("12:15", fmt)
    lunch_until = datetime.strptime("13:30", fmt)

    slots = []
    cursor = datetime.strptime("08:00", fmt)
    while len(slots) < max_rows:
        # No slot may begin inside the lunch window; jump to its end.
        if not (cursor < lunch_from or cursor >= lunch_until):
            cursor = lunch_until
        finish = cursor + slot_length
        slots.append((cursor.strftime('%H:%M'), finish.strftime('%H:%M')))
        cursor = finish + pause
    return slots

def extract_weeks(extra_info):
    """Derive the week pattern of a session from its free-text group info.

    Args:
        extra_info: Text such as "Gr. 1 sem. 3-7"; may be empty or None.

    Returns:
        "impaires" / "paires" when the text mentions odd / even weeks,
        "N" or "N-M" when it contains "sem", "sem." or "semaine(s)" followed
        by a week number or range, otherwise None.
    """
    if not extra_info:
        return None
    extra_info = extra_info.lower()
    # "impair" contains "pair", so the odd-week test has to come first.
    if "impair" in extra_info:
        return "impaires"
    elif "pair" in extra_info:
        return "paires"
    # Single backslashes: in a raw string "\\d" would match a literal
    # backslash followed by "d", so the pattern could never match real text.
    match = re.search(
        r"sem(?:\.|aines?)?[\s:]*(\d{1,2})(?:\s*[\u00e0\-]\s*(\d{1,2}))?",
        extra_info,
    )
    if match:
        start = match.group(1)
        end = match.group(2)
        return f"{start}-{end}" if end else f"{start}"
    return None

def parse_cell(cell_text):
    """Split one timetable cell into course, instructor and group info.

    The last non-blank line is taken as the instructor. Among the other
    lines, those opening with a group marker ("Gr", "Gr.", "Grp", "Groupe",
    "Group") are kept as ``extra_info`` (the last one wins); the rest are
    joined with spaces to form the course name. A "Gr." found inside the
    course name splits it: the text before it is the course, the rest
    becomes ``extra_info``.

    Args:
        cell_text: Raw cell content, lines separated by "\\n".

    Returns:
        ``{"course", "instructor", "extra_info"}`` or None when the cell is
        blank or not a string.
    """
    if not isinstance(cell_text, str) or cell_text.strip() == "":
        return None
    lines = [line.strip() for line in cell_text.split('\n') if line.strip()]
    if not lines:
        return None
    instructor = lines[-1]
    extra_info = ""
    course_lines = []
    for line in lines[:-1]:
        # Require the marker to end there: a bare startswith("Gr") would also
        # classify titles such as "Graph Theory" as group info.
        if re.match(r"(?:Groupes?|Groups?|Grp|Gr)(?![A-Za-z\u00c0-\u00ff])", line):
            extra_info = line
        else:
            course_lines.append(line)
    course = " ".join(course_lines).strip()
    if "Gr." in course:
        # Split on the first "Gr." only so later group mentions stay in extra_info.
        head, _, tail = course.partition("Gr.")
        course = head.rstrip(", ").strip()
        extra_info = "Gr." + tail.strip()
    course = course.rstrip(",").strip()
    return {
        "course": course,
        "instructor": instructor,
        "extra_info": extra_info
    }

def df_schedule_to_json(df):
    """Turn a cleaned timetable grid into a flat list of session records.

    Row ``k`` of ``df`` is paired with the ``k``-th slot of ``generate_times``;
    each column label is used as the week day. Rows whose slot starts at or
    after 19:15 are skipped, as are cells that ``parse_cell`` rejects.

    Args:
        df: Grid with one column per day and one row per time slot.

    Returns:
        List of dicts with keys course, instructor, start_time, end_time,
        week_day, weeks and extra_info.
    """
    days = df.columns.tolist()
    time_slots = generate_times(len(df))
    cutoff_time = datetime.strptime("19:15", "%H:%M")
    schedule_list = []

    # Walk rows and cells by position. Label-based row[day] returns a Series
    # when day labels repeat (the cell was then silently skipped), and using
    # the index label to pick the slot breaks on a non 0..n-1 index.
    rows = df.itertuples(index=False, name=None)
    for (start_time, end_time), row in zip(time_slots, rows):
        if datetime.strptime(start_time, "%H:%M") >= cutoff_time:
            continue
        for day, cell in zip(days, row):
            parsed = parse_cell(cell)
            if not parsed:
                continue
            schedule_list.append({
                "course": parsed["course"],
                "instructor": parsed["instructor"],
                "start_time": start_time,
                "end_time": end_time,
                "week_day": day,
                "weeks": extract_weeks(parsed.get("extra_info", "")),
                "extra_info": parsed["extra_info"]
            })
    return schedule_list

def group_schedule(schedule_list):
    """Merge session records that share the same (course, instructor) pair.

    The first record of each pair supplies ``extra_info``; every record adds
    one ``{start_time, end_time, week_day, weeks}`` entry to ``schedule``.
    Groups are returned in order of first appearance.
    """
    slot_fields = ("start_time", "end_time", "week_day", "weeks")
    by_course = {}
    for item in schedule_list:
        pair = (item["course"], item["instructor"])
        bucket = by_course.get(pair)
        if bucket is None:
            bucket = by_course[pair] = {
                "course": item["course"],
                "instructor": item["instructor"],
                "extra_info": item["extra_info"],
                "schedule": [],
            }
        bucket["schedule"].append({field: item[field] for field in slot_fields})
    return [*by_course.values()]

In [None]:
import pdfplumber
import re

def extract_program_sections(pdf_path):
    """Find the program name and semester announced on each page of a PDF.

    Every page with extractable text is scanned line by line. The first line
    matching one of the four header cases below decides the page's program
    (and possibly its semester) and ends the scan for that page. When no case
    matches, the line preceding an "Academic year" / "Année universitaire"
    line is used as the program, if one was seen.

    ``master_found`` and ``preparatory_count`` are shared by all pages, so
    case 3 can fire only once per document and case 4 only on the second
    "preparatory classes" line of the whole document.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        A list of ``{"program", "semester", "page"}`` dicts, one per page on
        which a program was found; ``page`` is 1-based and ``semester`` is an
        int or None.
    """
    with pdfplumber.open(pdf_path) as pdf:
        sections = []
        master_found = False
        preparatory_count = 0

        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            if not text:
                continue
            lines = text.split("\n")
            program = None
            semester = None
            fallback_program_candidate = None  # store fallback candidate if needed

            for idx, line in enumerate(lines):
                line = line.strip()

                # Case 1: French Cursus Ingénieur with semester and program
                if (match := re.search(r"Semestre\s+(\d+).*Cursus Ingénieur\s*:\s*(.+)", line, re.IGNORECASE)):
                    semester = int(match.group(1))
                    program = match.group(2).strip()
                    break

                # Case 2: English Bachelor program
                if (match := re.search(r"Semester\s+(\d+)\s*-\s*Bachelor of Engineering\s*:\s*(.+)", line, re.IGNORECASE)):
                    semester = int(match.group(1))
                    program = match.group(2).strip()
                    break

                # Case 3: First Master program
                # NOTE(review): `"in" in line.lower()` is a substring test, so it is
                # also true for words that merely contain "in" — confirm intent.
                if not master_found and "master" in line.lower() and "in" in line.lower():
                    program = line.strip()
                    master_found = True
                    # Look ahead for semester info
                    if idx + 1 < len(lines):
                        sem_line = lines[idx + 1]
                        sem_match = re.search(r"semester\s+(\d+)", sem_line, re.IGNORECASE)
                        if sem_match:
                            semester = int(sem_match.group(1))
                    break

                # Case 4: Preparatory Classes (any line containing "preparatory classes")

                if re.search(r"preparatory\s+classes", line, re.IGNORECASE):
                    preparatory_count += 1
                    # Only extract on the SECOND occurrence
                    if preparatory_count == 2:
                        program = line.strip()
                        # The semester, if any, is read from the following line
                        if idx + 1 < len(lines):
                            sem_match = re.search(r"semester\s+(\d+)", lines[idx + 1], re.IGNORECASE)
                            if sem_match:
                                semester = int(sem_match.group(1))
                        break


                # Collect fallback program candidate before "Academic year" or "Année universitaire"
                if ("academic year" in line.lower() or "année universitaire" in line.lower()) and idx > 0:
                    prev_line = lines[idx - 1].strip()
                    # Avoid lines with department or university names
                    if all(x.lower() not in prev_line.lower() for x in ["department", "département", "université", "university", "en ligne"]):
                        fallback_program_candidate = prev_line

                # Semester fallback: while none is known, take it from the first
                # line of the page containing "semester N" / "semestre N"
                if semester is None:
                    sem_match = re.search(r"(semester|semestre)\s+(\d+)", line, re.IGNORECASE)
                    if sem_match:
                        semester = int(sem_match.group(2))

            # No header case matched: fall back to the candidate, if any
            if program is None and fallback_program_candidate:
                program = fallback_program_candidate


            # Pages without a program produce no entry
            if program:
                sections.append({
                    "program": program,
                    "semester": semester,
                    "page": i + 1
                })

        return sections


In [None]:
import camelot
import json
import pandas as pd
from collections import defaultdict

# --- Use your existing functions: ---
# - extract_program_sections
# - normalize_day_headers
# - clean_dataframe
# - generate_times
# - parse_cell
# - df_schedule_to_json
# - group_schedule

# 1. Extract tables using Camelot
tables = camelot.read_pdf("/content/2024-2025-DEM - Semestre 1 (1).pdf", pages="all")
dfs_by_page = {}

for table in tables:
    # Key each table by the page Camelot found it on, not by its rank in the
    # result list: the two diverge as soon as a page yields zero or several
    # tables, and extract_program_sections reports real 1-based page numbers.
    page_number = int(table.page)
    df_raw = table.df
    try:
        df_cleaned = normalize_day_headers(df_raw)
        df_cleaned = clean_dataframe(df_cleaned)
    except Exception as e:
        print(f"❌ Failed processing table on page {page_number}: {e}")
        continue
    # When a page holds several tables, keep the one with the most cells.
    # NOTE(review): assumes the timetable is the largest table on its page — confirm.
    previous = dfs_by_page.get(page_number)
    if previous is None or df_cleaned.size > previous.size:
        dfs_by_page[page_number] = df_cleaned

# 2. Combine program+semester info with each course block
def combine_program_courses(pdf_path, dfs_by_page):
    """Attach the program and semester of each page to the courses on it.

    Args:
        pdf_path: PDF handed to ``extract_program_sections``.
        dfs_by_page: Mapping of page number to that page's cleaned timetable.

    Returns:
        A list of grouped course dicts, each prefixed with ``program`` and
        ``semester``. Pages without a detected program are skipped.
    """
    headers = {}
    for section in extract_program_sections(pdf_path):
        headers[section["page"]] = (section["program"], section["semester"])

    merged = []
    for page_num, df in dfs_by_page.items():
        header = headers.get(page_num)
        if header is None:
            continue
        program, semester = header
        merged.extend(
            {"program": program, "semester": semester, **course}
            for course in group_schedule(df_schedule_to_json(df))
        )
    return merged

# 3. Run combination logic
# NOTE(review): this literal repeats the path already passed to
# camelot.read_pdf earlier in this cell; keep the two in sync.
pdf_path = "/content/2024-2025-DEM - Semestre 1 (1).pdf"
# One dict per (page, course, instructor), carrying that page's program and semester
combined_courses = combine_program_courses(pdf_path, dfs_by_page)

# 4. Group final output
def group_courses_by_program_and_semester(combined_courses):
    """Bucket course entries by their (program, semester) pair.

    Returns:
        A list of ``{"program", "semester", "courses"}`` dicts in order of
        first appearance; each course keeps only its course, instructor,
        extra_info and schedule fields.
    """
    course_fields = ("course", "instructor", "extra_info", "schedule")
    buckets = {}
    for item in combined_courses:
        program, semester = item["program"], item["semester"]
        bucket = buckets.get((program, semester))
        if bucket is None:
            bucket = buckets[(program, semester)] = {
                "program": program,
                "semester": semester,
                "courses": [],
            }
        bucket["courses"].append({field: item[field] for field in course_fields})
    return list(buckets.values())

grouped_by_program = group_courses_by_program_and_semester(combined_courses)

# 5. Save JSON output (serialise first, then write the text in one go)
with open("grouped_courses_by_program_DEM_S1.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(grouped_by_program, ensure_ascii=False, indent=2))

print(" Saved structured data to grouped_courses_by_program_DEM_S1.json")


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


 Saved structured data to grouped_courses_by_program_DEM_S1.json


In [None]:
from collections import defaultdict
import json

def restructure_courses_by_semester(program_data_list):
    """Regroup per-(program, semester) blocks into one entry per program.

    Args:
        program_data_list: Dicts with ``program``, ``semester`` and ``courses``.

    Returns:
        ``{program: {"program": program, "courses_by_semester": {str(semester): [courses]}}}``.
        A block without courses still creates its program entry but adds no
        semester key.
    """
    programs = {}
    for block in program_data_list:
        name = block.get("program")
        semester_key = str(block.get("semester"))
        record = programs.get(name)
        if record is None:
            record = programs[name] = {"program": name, "courses_by_semester": {}}
        # Every course of the block belongs to the block's own semester
        for course in block.get("courses", []):
            record["courses_by_semester"].setdefault(semester_key, []).append(course)
    return programs

#  Usage example
json_path = "/content/grouped_courses_by_program_DEM_S1.json"
# Read through a context manager so the input handle is closed right away
# instead of being left open until garbage collection.
with open(json_path, encoding="utf-8") as f:
    program_data_list = json.load(f)
restructured = restructure_courses_by_semester(program_data_list)

# Save to file
output_path = "/content/restructured_grouped_courses_by_program_DEM_S1.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(restructured, f, ensure_ascii=False, indent=2)

print(f" Restructured data saved to: {output_path}")


 Restructured data saved to: /content/restructured_grouped_courses_by_program_DEM_S1.json


In [None]:
import json
import re
from pathlib import Path

def extract_weeks(extra_info):
    """Derive the week pattern of a session from its free-text group info.

    Args:
        extra_info: Text such as "Gr. 1 sem. 3-7"; may be empty or None.

    Returns:
        "impaires" / "paires" when the text mentions odd / even weeks,
        "N" or "N-M" when it contains "sem", "sem." or "semaine(s)" followed
        by a week number or range, otherwise None.
    """
    if not extra_info:
        return None
    info = extra_info.lower()
    # "impair" contains "pair", so the odd-week test has to come first.
    if "impair" in info:
        return "impaires"
    elif "pair" in info:
        return "paires"
    # Whitespace or ":" may separate the keyword from the number whether or
    # not the "." / "aine(s)" suffix is present ("sem 3", "sem: 3", "semaines 3").
    match = re.search(r"sem(?:\.|aines?)?[\s:]*(\d{1,2})(?:\s*[\u00e0\-]\s*(\d{1,2}))?", info)
    if match:
        start = match.group(1)
        end = match.group(2)
        return f"{start}-{end}" if end else start
    return None

def detect_session_type(extra_info: str):
    """Classify a session as "travaux pratiques" or "cours" from its group info.

    Empty or missing text yields "cours". The check is a case-insensitive
    substring search for "travaux pratiques" or "tp".
    NOTE(review): "tp" also matches inside longer words — confirm that is acceptable.
    """
    lowered = extra_info.lower() if extra_info else ""
    is_lab = "travaux pratiques" in lowered or "tp" in lowered
    return "travaux pratiques" if is_lab else "cours"

def flatten_schedule(schedule, extra_info=None):
    """Flatten a course schedule into a list of slot dicts.

    ``schedule`` is either a ``{day: [slot, ...]}`` dict or a plain list of
    slots. A slot is a "HH:MM-HH:MM" string or an already detailed dict.
    Strings become full slot dicts; dicts are copied and completed with
    ``week_day``, ``weeks`` ("all" when nothing better is known) and
    ``session_type``. Anything else is reported on stdout and skipped.

    Args:
        schedule: Day mapping or list of slots.
        extra_info: Course-level text used to derive weeks and session type.

    Returns:
        The list of completed slot dicts.
    """
    weeks = extract_weeks(extra_info)
    session_type = detect_session_type(extra_info)

    def split_range(text):
        # Exactly one "-" is required; None signals a malformed range.
        pieces = text.strip().split("-")
        return pieces if len(pieces) == 2 else None

    def timed_slot(bounds, day):
        return {
            "start_time": bounds[0],
            "end_time": bounds[1],
            "week_day": day,
            "weeks": weeks,
            "session_type": session_type
        }

    def fill_defaults(slot, day, day_first):
        # day_first only controls where a missing "week_day" key is inserted,
        # which keeps the key order of the two input shapes as they were.
        filled = slot.copy()
        if day_first:
            filled.setdefault("week_day", day)
        if filled.get("weeks") is None:
            filled["weeks"] = weeks or "all"
        if "session_type" not in filled:
            filled["session_type"] = session_type
        if not day_first:
            filled.setdefault("week_day", day)
        return filled

    flattened = []

    if isinstance(schedule, dict):
        # dict of day -> list of time-range strings or dicts
        for day, slots in schedule.items():
            for slot in slots:
                if isinstance(slot, str):
                    bounds = split_range(slot)
                    if bounds is None:
                        print(f"[WARNING] Skipping malformed time slot: {slot} on {day}")
                    else:
                        flattened.append(timed_slot(bounds, day))
                elif isinstance(slot, dict):
                    flattened.append(fill_defaults(slot, day, day_first=True))
                else:
                    print(f"[WARNING] Unexpected slot format: {slot}")

    elif isinstance(schedule, list):
        # No day information is available for a bare list
        for slot in schedule:
            if isinstance(slot, str):
                bounds = split_range(slot)
                if bounds is None:
                    print(f"[WARNING] Skipping malformed time slot: {slot}")
                else:
                    flattened.append(timed_slot(bounds, None))
            elif isinstance(slot, dict):
                flattened.append(fill_defaults(slot, None, day_first=False))
            else:
                print(f"[WARNING] Unexpected slot format: {slot}")
    else:
        print(f"[WARNING] Unexpected schedule format: {schedule} (type {type(schedule)})")

    return flattened



def transform_full_json(data: dict) -> dict:
    """Replace every course's ``schedule`` with its flattened form, in place.

    Walks ``data[program]["courses_by_semester"][semester]`` and rewrites only
    the ``schedule`` field of each course; all other fields are untouched.
    Returns the same ``data`` object.
    """
    for program in data.values():
        by_semester = program.get("courses_by_semester", {})
        for course_list in by_semester.values():
            for course in course_list:
                course["schedule"] = flatten_schedule(
                    course.get("schedule", {}),
                    course.get("extra_info", ""),
                )
    return data



if __name__ == "__main__":
    input_path = Path("/content/restructured_grouped_courses_by_program_DEM_S1.json")
    output_path = Path("/content/grouped_courses_by_program_DEM_S1_flatten_new.json")

    # Load the per-program structure produced by the restructuring step
    with input_path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    transformed = transform_full_json(data)

    # Serialise first, then write the text in one go
    with output_path.open("w", encoding="utf-8") as f:
        f.write(json.dumps(transformed, indent=2, ensure_ascii=False))

    print(f"Transformed schedule saved to {output_path}")

Transformed schedule saved to /content/grouped_courses_by_program_DEM_S1_flatten_new.json


In [None]:
import json
import re
from pathlib import Path

def normalize_schedule_format(course: dict) -> dict:
    """Wrap a course's schedule into numbered groups, in place.

    ``instructor`` and ``extra_info`` are removed from ``course``. A list
    schedule becomes a single ``group_1``; a dict schedule is renumbered
    ``group_1``, ``group_2``, ... in iteration order, its values being either
    slot lists or ``{"instructor", "slots"}`` dicts (other values are left
    out, their number is not reused). Slots whose ``weeks`` is missing or
    None get "all". Any other schedule type is left as is.

    Returns:
        The same ``course`` dict.
    """
    schedule = course.get("schedule", {})

    # Both keys leave the course level; only the instructor is reused below
    instructor = course.pop("instructor", None)
    course.pop("extra_info", None)

    def with_default_weeks(slots):
        # Mutates the slot dicts and hands back the same list
        for entry in slots:
            if entry.get("weeks") is None:
                entry["weeks"] = "all"
        return slots

    def as_group(teacher, slots):
        return {"instructor": teacher, "slots": with_default_weeks(slots)}

    if isinstance(schedule, list):
        # Flat list of slots: one implicit group
        course["schedule"] = {"group_1": as_group(instructor, schedule)}
    elif isinstance(schedule, dict):
        groups = {}
        for number, value in enumerate(schedule.values(), start=1):
            if isinstance(value, list):
                groups[f"group_{number}"] = as_group(instructor, value)
            elif isinstance(value, dict):
                # A group may carry its own instructor
                groups[f"group_{number}"] = as_group(
                    value.get("instructor", instructor),
                    value.get("slots", []),
                )
        course["schedule"] = groups

    return course

def transform_full_json(data: dict) -> dict:
    """Normalise the schedule of every course in ``data`` in place.

    Each entry of ``data[program]["courses_by_semester"][semester]`` is
    replaced by the result of ``normalize_schedule_format``. Returns ``data``.
    """
    for program in data.values():
        for courses in program.get("courses_by_semester", {}).values():
            for position in range(len(courses)):
                courses[position] = normalize_schedule_format(courses[position])
    return data

if __name__ == "__main__":
    input_path = Path("/content/grouped_courses_by_program_DEM_S1_flatten_new.json")
    output_path = Path("grouped_courses_by_program_DEM_S1_flatten_structured_new_.json")

    # Load the flattened schedules written by the previous step
    with input_path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    transformed = transform_full_json(data)

    # Serialise first, then write the text in one go
    with output_path.open("w", encoding="utf-8") as f:
        f.write(json.dumps(transformed, indent=2, ensure_ascii=False))

    print(f"Transformed schedule saved to {output_path}")


Transformed schedule saved to grouped_courses_by_program_DEM_S1_flatten_structured_new_.json


### Courses and Prerequisites

In [None]:
!pip install pdfminer.six



In [None]:
import re
import json

def extract_courses_from_lines(lines):
    """Parse catalogue lines into course records.

    A line of the form "<code> <title> <credits> Cr." opens a new course.
    Following lines are either a "Prérequis : ..." line (stored in
    ``prerequisites``; a later one overwrites an earlier one) or description
    text, joined with single spaces. Lines seen before the first course
    header are ignored.

    Args:
        lines: Iterable of text lines.

    Returns:
        List of dicts with keys code (whitespace removed), title,
        credits (int), description and prerequisites (str or None).
    """
    course_pattern = re.compile(
        r"^(?P<code>[0-9A-Z\s]{4,14}\d)\s+(?P<title>.+?)\s+(?P<credits>\d+)\s*Cr\.?", re.UNICODE
    )

    prereq_pattern = re.compile(r"^Prérequis ?: (?P<prerequisites>.+)", re.IGNORECASE)

    courses = []
    current_course = None

    def finalize():
        nonlocal current_course
        if current_course:
            print("Finalizing course:", current_course["code"])
            # Description lines are appended as " " + line, which leaves a
            # spurious leading space on every description; trim it here.
            current_course["description"] = current_course["description"].strip()
            courses.append(current_course.copy())
            current_course = None

    for line in lines:
        line = line.strip()
        if not line:
            continue

        match = course_pattern.match(line)
        if match:
            # A new header closes the course being collected
            finalize()
            code_raw = match.group("code")
            normalized_code = re.sub(r"\s+", "", code_raw)  # Remove spaces
            current_course = {
                "code": normalized_code,
                "title": match.group("title").strip(),
                "credits": int(match.group("credits")),
                "description": "",
                "prerequisites": None
            }
        elif current_course:
            prereq_match = prereq_pattern.match(line)
            if prereq_match:
                current_course["prerequisites"] = prereq_match.group("prerequisites")
            else:
                current_course["description"] += " " + line.strip()

    finalize()
    return courses

def is_course_section_header(line):
    """Tell whether a level-3 or level-5 Markdown header opens a course list.

    Only lines starting with "### " or "##### " are considered (and logged);
    they qualify when they contain "description des cours" or
    "descriptives des cours", case-insensitively.
    """
    if not line.startswith(("### ", "##### ")):
        return False
    header = line.lower()
    print(f"Checking header for courses: {header}")
    return any(marker in header for marker in ("description des cours", "descriptives des cours"))

def extract_courses_from_markdown(markdown_text):
    """Collect the courses of every department from a Markdown catalogue.

    The text is read line by line as a small state machine:

    * a "## " header starts a new department;
    * inside "Département des Etudes Doctorales", a "### Master" header
      starts a sub-department and a "##### contenu des cours" /
      "##### course content" header starts collecting its course lines;
    * elsewhere, a header accepted by ``is_course_section_header`` starts
      collecting;
    * while collecting, non-header lines are buffered; the buffer is parsed
      with ``extract_courses_from_lines`` and stored when a header ends the
      section, and once more at the end of the text.

    Args:
        markdown_text: Full catalogue text.

    Returns:
        ``{department: [course, ...]}`` for regular departments and
        ``{department: {master: {"courses": [course, ...]}}}`` for the
        doctoral department.
    """
    lines = markdown_text.splitlines()
    all_departments = {}
    current_department = None
    current_subdepartment = None
    collecting = False  # True while buffering the lines of a course section
    buffer = []

    for line in lines:
        line = line.strip()

        # --- Detect Department ---
        if line.startswith("## "):
            # Flush any buffer from previous collection
            # NOTE(review): `collecting` is only reset when the buffer is
            # non-empty, so an empty open section stays open across the
            # department change — confirm this is intended.
            if collecting and buffer:
                if current_department:
                    if current_department == "Département des Etudes Doctorales" and current_subdepartment:
                        print(f"Flushing Doctorat Master: {current_subdepartment}")
                        all_departments[current_department][current_subdepartment]["courses"].extend(extract_courses_from_lines(buffer))
                    else:
                        all_departments[current_department].extend(extract_courses_from_lines(buffer))
                buffer = []
                collecting = False

            current_department = line[3:].strip()
            current_subdepartment = None
            print(f"New department: {current_department}")
            # The doctoral department is keyed by master; the others hold a flat list
            if current_department == "Département des Etudes Doctorales":
                all_departments[current_department] = {}
            else:
                all_departments[current_department] = []
            continue

        # --- Doctorate Master Header ---
        if current_department == "Département des Etudes Doctorales" and line.startswith("### Master"):
            # Flush buffer if switching master
            if collecting and buffer and current_subdepartment:
                all_departments[current_department][current_subdepartment]["courses"].extend(extract_courses_from_lines(buffer))
                buffer = []

            current_subdepartment = line[4:].strip()
            print(f"Doctorat Master Found: {current_subdepartment}")
            all_departments[current_department][current_subdepartment] = {"courses": []}
            collecting = False
            continue

        # --- Doctorate Course Section Header ---
        # NOTE(review): `and` binds tighter than `or`, so the
        # "##### course content" test applies in every department, not only
        # the doctoral one — confirm whether parentheses are missing.
        # Any lines already buffered are discarded here without being parsed.
        if current_department == "Département des Etudes Doctorales" and line.lower().startswith("##### contenu des cours") or line.lower().startswith("##### course content"):
            print(f"Found 'Contenu des Cours' in {current_subdepartment}")
            collecting = True
            buffer = []
            continue

        # End of a content block in Doctorate Master
        if current_department == "Département des Etudes Doctorales" and line.startswith("##### ") and collecting:
            print(f"End of 'Contenu des Cours' in {current_subdepartment}")
            if current_subdepartment and buffer:
                all_departments[current_department][current_subdepartment]["courses"].extend(extract_courses_from_lines(buffer))
            buffer = []
            collecting = False
            continue

        # --- Standard Course Section Header ---
        elif is_course_section_header(line):
            print(f"Found course section header: {line}")
            # NOTE(review): `.extend` assumes the department holds a list; the
            # doctoral department holds a dict — confirm this branch cannot be
            # reached there while collecting.
            if collecting and buffer and current_department:
                all_departments[current_department].extend(extract_courses_from_lines(buffer))
                buffer = []
            collecting = True
            continue

        # --- End collecting on any other header ---
        elif (line.startswith("### ") or line.startswith("##### ")) and collecting:
            print(f"Hit new header {line} while collecting. Flushing buffer and stopping collection.")
            if current_department:
                if current_department == "Département des Etudes Doctorales" and current_subdepartment:
                    all_departments[current_department][current_subdepartment]["courses"].extend(extract_courses_from_lines(buffer))
                elif isinstance(all_departments[current_department], list):
                    all_departments[current_department].extend(extract_courses_from_lines(buffer))
            buffer = []
            collecting = False
            continue

        # --- Collect lines ---
        elif collecting:
            buffer.append(line)

    # --- Final flush ---
    if collecting and buffer:
        if current_department:
            if current_department == "Département des Etudes Doctorales" and current_subdepartment:
                print(f"Final flush for Doctorat Master {current_subdepartment}")
                all_departments[current_department][current_subdepartment]["courses"].extend(extract_courses_from_lines(buffer))
            elif isinstance(all_departments[current_department], list):
                print(f"Final flush for department {current_department}")
                all_departments[current_department].extend(extract_courses_from_lines(buffer))
        else:
            print("Warning: No current department set at final flush.")

    return all_departments

# Example usage
if __name__ == "__main__":
    # Read the whole catalogue (Markdown export) into memory
    with open("/content/Catalogue_ESIB_2022-2023.md", "r", encoding="utf-8") as f:
        markdown = f.read()

    dept_courses = extract_courses_from_markdown(markdown)

    # Serialise first, then write the text in one go
    with open("courses_by_department.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(dept_courses, ensure_ascii=False, indent=2))

    print(f"✅ Extracted courses from {len(dept_courses)} departments.")


New department: École Supérieure d’Ingénieurs de Beyrouth (ESIB)
Checking header for courses: ### historique
Checking header for courses: ### mission de l’esib
Checking header for courses: ### vision de l’esib
Checking header for courses: ### direction
Checking header for courses: ### administration
Checking header for courses: ### corps professoral
Checking header for courses: ### diplômes
Checking header for courses: ### admission
Checking header for courses: ##### admission en première année ingénieur (classe de mathématiques supérieures)
Checking header for courses: ##### admission par transfert d’une autre institution
Checking header for courses: ##### admission en master
Checking header for courses: ### frais de scolarité
Checking header for courses: ### organisation des enseignements
Checking header for courses: ### vie associative
Checking header for courses: ### aumônerie
New department: Département des Classes Préparatoires
Checking header for courses: ### responsable : melhe

In [None]:
import json
import re
from pathlib import Path

def extract_weeks(extra_info):
    """Derive the week pattern of a session, defaulting to "all".

    Args:
        extra_info: Text such as "Gr. 1 sem. 3-7"; may be empty or None.

    Returns:
        "impaires" / "paires" when the text mentions odd / even weeks,
        "N" or "N-M" when it contains "sem", "sem." or "semaine(s)" followed
        by a week number or range, otherwise "all".
    """
    if not extra_info:
        return "all"
    info = extra_info.lower()
    # "impair" contains "pair", so the odd-week test has to come first.
    if "impair" in info:
        return "impaires"
    elif "pair" in info:
        return "paires"
    # Also accept the plural "semaines" and a ":" separator ("sem: 4").
    match = re.search(r"sem(?:\.|aines?)?[\s:]*(\d{1,2})(?:\s*[\u00e0\-]\s*(\d{1,2}))?", info)
    if match:
        start = match.group(1)
        end = match.group(2)
        return f"{start}-{end}" if end else start
    return "all"

def detect_session_type(extra_info):
    """Classify a session as "travaux pratiques" or "cours" from its group info.

    Empty or missing text yields "cours". The check is a case-insensitive
    substring search for "travaux pratiques" or "tp".
    NOTE(review): "tp" also matches inside longer words — confirm that is acceptable.
    """
    lowered = extra_info.lower() if extra_info else ""
    for marker in ("travaux pratiques", "tp"):
        if marker in lowered:
            return "travaux pratiques"
    return "cours"

def flatten_schedule(schedule: dict, extra_info: str = None):
    """Expand a ``{day: ["HH:MM-HH:MM", ...]}`` mapping into slot dicts.

    Every slot receives the weeks and session type derived from
    ``extra_info``. A range that does not contain exactly one "-" is reported
    on stdout and skipped.

    Returns:
        List of dicts with start_time, end_time, week_day, weeks, session_type.
    """
    weeks = extract_weeks(extra_info)
    session_type = detect_session_type(extra_info)
    flattened = []
    for day, slots in schedule.items():
        for time_range in slots:
            bounds = time_range.strip().split("-")
            if len(bounds) != 2:
                print(f"[WARNING] Skipping malformed time slot: {time_range} on {day}")
                continue
            flattened.append({
                "start_time": bounds[0],
                "end_time": bounds[1],
                "week_day": day,
                "weeks": weeks,
                "session_type": session_type
            })
    return flattened

def transform_full_json(data: dict) -> dict:
    """Replace every course's ``schedule`` with its flattened form, in place.

    Walks ``data[program]["courses_by_semester"][semester]`` and rewrites only
    the ``schedule`` field of each course; all other fields are untouched.
    Returns the same ``data`` object.
    """
    for program in data.values():
        by_semester = program.get("courses_by_semester", {})
        for course_list in by_semester.values():
            for course in course_list:
                course["schedule"] = flatten_schedule(
                    course.get("schedule", {}),
                    course.get("extra_info", ""),
                )
    return data


if __name__ == "__main__":
    input_path = Path("/content/grouped_courses_by_program_DEM_S1_flatten.json")
    output_path = Path("grouped_courses_by_program_DEM_S1_flatten_structured.json")

    # Load the per-program structure to transform
    with input_path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    transformed = transform_full_json(data)

    # Serialise first, then write the text in one go
    with output_path.open("w", encoding="utf-8") as f:
        f.write(json.dumps(transformed, indent=2, ensure_ascii=False))

    print(f"Transformed schedule saved to {output_path}")


Transformed schedule saved to grouped_courses_by_program_DEM_S1_flatten_structured.json


In [None]:
import json

# Load the file
with open("grouped_courses_by_program_DEM_S1_current.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Give every course an "extra_info" key, defaulting to an empty string
for program in data.values():
    semesters = program.get("courses_by_semester", {})
    for course_list in semesters.values():
        for course in course_list:
            course.setdefault("extra_info", "")

# Save the updated structure under a new name
with open("grouped_courses_by_program_DEM_S1_current_updated.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data, indent=2, ensure_ascii=False))
