In [None]:
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250506


In [None]:
import re
import json

def extract_courses_from_lines(lines):
    """Parse catalogue text lines into a list of course dictionaries.

    A line made of a course code, a title and a credit count followed by
    "Cr" opens a new course. Until the next such line, a line starting with
    "Prérequis :" sets the prerequisites of the open course, and every other
    non-blank line is appended to its description. Lines that appear before
    the first course header are ignored.

    Args:
        lines: Iterable of text lines. Each line is stripped; blank lines
            are skipped.

    Returns:
        A list of dicts with the keys "code" (internal whitespace removed),
        "title", "credits" (int), "description" (lines joined by single
        spaces, without a leading space; "" when there is none) and
        "prerequisites" (str, or None when no prerequisite line was seen).
    """
    course_pattern = re.compile(
        r"^(?P<code>[0-9A-Z\s]{4,14}\d)\s+(?P<title>.+?)\s+(?P<credits>\d+)\s*Cr\.?", re.UNICODE
    )
    prereq_pattern = re.compile(r"^Prérequis ?: (?P<prerequisites>.+)", re.IGNORECASE)

    courses = []
    current_course = None

    def finalize():
        """Store the course being built, if any, and reset the builder."""
        nonlocal current_course
        if current_course:
            print("Finalizing course:", current_course["code"])
            courses.append(current_course)
            current_course = None

    for line in lines:
        line = line.strip()
        if not line:
            continue

        match = course_pattern.match(line)
        if match:
            # A new course header closes the previous course.
            finalize()
            current_course = {
                # Codes may be printed with inner spaces; store them compacted.
                "code": re.sub(r"\s+", "", match.group("code")),
                "title": match.group("title").strip(),
                "credits": int(match.group("credits")),
                "description": "",
                "prerequisites": None,
            }
        elif current_course:
            prereq_match = prereq_pattern.match(line)
            if prereq_match:
                current_course["prerequisites"] = prereq_match.group("prerequisites")
            elif current_course["description"]:
                current_course["description"] += " " + line
            else:
                # First description line: no separator, so the text does not
                # start with a stray space.
                current_course["description"] = line

    finalize()
    return courses

def is_course_section_header(line):
    """Tell whether a Markdown header line opens a course-description section.

    Only level-3 ("### ") and level-5 ("##### ") headers are considered; the
    header text is matched case-insensitively against the two known section
    titles. Every examined header is echoed to stdout.

    Args:
        line: A single line of Markdown text.

    Returns:
        True if the line is such a header, False otherwise.
    """
    if not line.startswith(("### ", "##### ")):
        return False

    header = line.lower()
    print(f"Checking header for courses: {header}")
    section_titles = ("description des cours", "descriptives des cours")
    return any(title in header for title in section_titles)

def extract_courses_from_markdown(markdown_text):
    """Collect the courses of a Markdown catalogue, grouped by department.

    The headers of the document drive a small state machine:

    * ``## <name>`` opens a department. Its courses are stored in a list,
      except for "Département des Etudes Doctorales", which maps every
      ``### Master ...`` header to ``{"courses": [...]}``.
    * Collection of lines starts at a header accepted by
      ``is_course_section_header`` or, inside the doctoral department only,
      at ``##### Contenu des cours`` / ``##### Course content``.
    * Any other ``###`` / ``#####`` header met while collecting ends the
      collection. Buffered lines are parsed with
      ``extract_courses_from_lines`` and stored in the active container.

    Buffered lines that have no valid destination (no department yet, or the
    doctoral department without an active master) are dropped.

    Args:
        markdown_text: Full Markdown text of the catalogue.

    Returns:
        A dict mapping each department name to a list of course dicts, or,
        for the doctoral department, to ``{master_name: {"courses": [...]}}``.
    """
    doctoral_department = "Département des Etudes Doctorales"
    content_headers = ("##### contenu des cours", "##### course content")

    all_departments = {}
    current_department = None
    current_subdepartment = None
    collecting = False
    buffer = []

    def flush_buffer(master_note=None, department_note=None):
        """Parse the buffered lines into the active container and empty the buffer.

        The optional notes are printed just before parsing, depending on
        whether the lines go to a doctoral master or to a plain department.
        """
        nonlocal buffer
        pending, buffer = buffer, []
        if not pending or current_department is None:
            return

        target = all_departments[current_department]
        if isinstance(target, dict):
            # Doctoral department: courses belong to a master. Without an
            # active master there is nowhere to store them.
            if current_subdepartment is None:
                return
            if master_note:
                print(master_note)
            target[current_subdepartment]["courses"].extend(extract_courses_from_lines(pending))
        else:
            if department_note:
                print(department_note)
            target.extend(extract_courses_from_lines(pending))

    for line in markdown_text.splitlines():
        line = line.strip()
        in_doctoral = current_department == doctoral_department

        # --- Detect Department ---
        if line.startswith("## "):
            # NOTE(review): collection is only switched off here when lines
            # were buffered; an empty section keeps collecting into the new
            # department. Kept as in the original code; confirm intent.
            if collecting and buffer:
                flush_buffer(master_note=f"Flushing Doctorat Master: {current_subdepartment}")
                collecting = False

            current_department = line[3:].strip()
            current_subdepartment = None
            print(f"New department: {current_department}")
            if current_department == doctoral_department:
                all_departments[current_department] = {}
            else:
                all_departments[current_department] = []
            continue

        # --- Doctorate Master Header ---
        if in_doctoral and line.startswith("### Master"):
            # Lines buffered so far belong to the previous master.
            flush_buffer()
            current_subdepartment = line[4:].strip()
            print(f"Doctorat Master Found: {current_subdepartment}")
            all_departments[current_department][current_subdepartment] = {"courses": []}
            collecting = False
            continue

        # --- Doctorate Course Section Header ---
        # Both spellings are only meaningful inside the doctoral department;
        # the department test must therefore apply to the whole alternative.
        if in_doctoral and line.lower().startswith(content_headers):
            print(f"Found 'Contenu des Cours' in {current_subdepartment}")
            # Keep what an immediately preceding section collected.
            flush_buffer()
            collecting = True
            continue

        # --- End of a content block in Doctorate Master ---
        if in_doctoral and collecting and line.startswith("##### "):
            print(f"End of 'Contenu des Cours' in {current_subdepartment}")
            flush_buffer()
            collecting = False
            continue

        # --- Standard Course Section Header ---
        if is_course_section_header(line):
            print(f"Found course section header: {line}")
            flush_buffer()
            collecting = True
            continue

        # --- End collecting on any other header ---
        if collecting and line.startswith(("### ", "##### ")):
            print(f"Hit new header {line} while collecting. Flushing buffer and stopping collection.")
            flush_buffer()
            collecting = False
            continue

        # --- Collect lines ---
        if collecting:
            buffer.append(line)

    # --- Final flush ---
    if collecting and buffer:
        if current_department is None:
            print("Warning: No current department set at final flush.")
        else:
            flush_buffer(
                master_note=f"Final flush for Doctorat Master {current_subdepartment}",
                department_note=f"Final flush for department {current_department}",
            )

    return all_departments

# Example usage: read the catalogue, extract the courses per department and
# save the result as pretty-printed UTF-8 JSON.
if __name__ == "__main__":
    catalogue_path = "/content/Catalogue_ESIB_2022-2023.md"
    output_path = "courses_by_department.json"

    with open(catalogue_path, "r", encoding="utf-8") as f:
        markdown = f.read()

    dept_courses = extract_courses_from_markdown(markdown)

    # ensure_ascii=False keeps accented characters readable in the file.
    serialized = json.dumps(dept_courses, ensure_ascii=False, indent=2)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(serialized)

    print(f"✅ Extracted courses from {len(dept_courses)} departments.")


New department: École Supérieure d’Ingénieurs de Beyrouth (ESIB)
Checking header for courses: ### historique
Checking header for courses: ### mission de l’esib
Checking header for courses: ### vision de l’esib
Checking header for courses: ### direction
Checking header for courses: ### administration
Checking header for courses: ### corps professoral
Checking header for courses: ### diplômes
Checking header for courses: ### admission
Checking header for courses: ##### admission en première année ingénieur (classe de mathématiques supérieures)
Checking header for courses: ##### admission par transfert d’une autre institution
Checking header for courses: ##### admission en master
Checking header for courses: ### frais de scolarité
Checking header for courses: ### organisation des enseignements
Checking header for courses: ### vie associative
Checking header for courses: ### aumônerie
New department: Département des Classes Préparatoires
Checking header for courses: ### responsable : melhe