<a href="https://colab.research.google.com/github/KeyboardSnail/AI_CET_Cell_Cut_Off_Pdf_to_CSV/blob/main/.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import logging
import re
from pathlib import Path
!pip install click
!pip install pdfplumber
!pip install PyPDF2
import click
import pdfplumber
import PyPDF2


# Matches "<digits> (<digits and dots>)" with optional whitespace before the
# opening parenthesis, e.g. "1234 (98.76)".
# Group 1 is the integer in front, group 2 the number inside the parentheses.
RANK_PERC_RE = re.compile(r"(\d+)\s*\(([\d.]+)\)")


# Fixed lines of text that look like the page header / legend of the source PDFs.
# NOTE(review): this set is not referenced anywhere in the visible code —
# presumably meant for filtering raw page text; confirm it is still needed.
SKIP_LINES = {
    "Government of Maharashtra",
    "State Common Entrance Test Cell",
    "Cut Off Merit of All India Seats of CAP Round-III for the Admission to the First Year Under Graduate Technical Course in B.Pharmacy & Post",
    "Graduate Pharm.D. for the Academic Year 2024-25",
    "Cut Off Indicates All India Merit No.; Figures in bracket Indicates NEET / MHT-CET Score",
    "AI to AI - All India Seats Allotted to All India Candidature Candidates.",
}

def configure_logging(log_file: Path):
    """Route root-logger output to *log_file*, replacing any earlier setup.

    The file is truncated on every call (``filemode="w"``) and records at
    ``INFO`` level and above are written to it.

    Args:
        log_file: Path of the log file to (re)create.
    """
    logging.basicConfig(
        filename=str(log_file),
        level=logging.INFO,
        format="%(asctime)s – %(levelname)s – %(message)s",
        filemode="w",
        # The format string contains en dashes; pin the encoding so the log
        # does not depend on the platform's default locale.
        encoding="utf-8",
        # basicConfig() is a no-op once the root logger has handlers. This
        # function is called once per PDF, so without force=True every call
        # after the first (or every call, in hosts that pre-install handlers)
        # would silently keep writing to the previous destination.
        force=True,
    )

def write_csv(out_path: Path, rows: list[list[str]]):
    """Write the extracted records to *out_path* as a UTF-8 CSV file.

    A fixed ten-column header line is emitted first, followed by one line
    per entry of *rows*. An existing file at *out_path* is overwritten.

    Args:
        out_path: Destination file.
        rows: Records to write, one sequence of cell values per record.
    """
    column_titles = [
        "Sr. No.", "All India Merit Rank", "Percentile", "Choice Code",
        "Institute Name", "Course Name", "Exam", "Type", "Seat Type",
        "PDF Page",
    ]
    # newline="" hands line-ending control to the csv module.
    with open(out_path, mode="w", encoding="utf-8", newline="") as sink:
        csv_out = csv.writer(sink)
        csv_out.writerow(column_titles)
        csv_out.writerows(rows)

def _parse_table_row(row: list, page_no: int):
    """Turn one raw table row into an output CSV row, or ``None`` to skip it.

    Args:
        row: Cell values of a single table row; individual cells may be
            ``None``.
        page_no: 1-based page number, stored in the last output column and
            used in log messages.

    Returns:
        A list of ten strings (serial number, rank, percentile, choice code,
        institute, course, exam, type, seat type, page number), or ``None``
        when the row is entirely empty or has fewer than eight cells.
    """
    if not any(row):
        return None

    cells = [(cell or "").strip() for cell in row]
    if len(cells) < 8:
        # Previously dropped without a trace; record it so gaps in the CSV
        # can be explained from the log.
        logging.warning(
            "Page %d: skipping row with %d cells: %r", page_no, len(cells), cells
        )
        return None

    sr_no, merit, choice_code, institute, course, exam, type_, seat_type = cells[:8]

    # The merit cell is expected to look like "<rank> (<number>)"; when it
    # does not, both derived columns are left empty.
    rank = percentile = ""
    match = RANK_PERC_RE.search(merit)
    if match:
        rank, percentile = match.groups()
    else:
        logging.warning(
            "Page %d: could not parse merit cell %r (Sr. No. %r)",
            page_no, merit, sr_no,
        )

    return [
        sr_no,
        rank,
        percentile,
        choice_code,
        institute,
        course,
        exam,
        type_,
        seat_type,
        str(page_no),
    ]


def process_pdf(pdf_path: Path, out_csv: Path, log_file: Path):
    """Extract the table rows of one PDF and write them to a CSV file.

    Logging is (re)configured to write to *log_file*, every table on every
    page is read with pdfplumber, and the resulting rows are written to
    *out_csv*. A one-line summary is printed when done.

    Args:
        pdf_path: PDF file to read.
        out_csv: CSV file to create or overwrite.
        log_file: Log file for this run.
    """
    configure_logging(log_file)
    rows = []

    # Only pdfplumber is needed for extraction. The file used to be opened a
    # second time and parsed by PyPDF2 into a reader that was never used,
    # which cost an extra parse and could fail on its own.
    with pdfplumber.open(pdf_path) as plumb:
        for page_no, page in enumerate(plumb.pages, start=1):
            logging.info("Page %d: Extracting tables...", page_no)

            tables = page.extract_tables()
            if not tables:
                logging.info("Page %d: No tables found", page_no)
                continue

            for table in tables:
                # The first row of each table is skipped (treated as header).
                for raw_row in table[1:]:
                    parsed = _parse_table_row(raw_row, page_no)
                    if parsed is not None:
                        rows.append(parsed)

    write_csv(out_csv, rows)
    print(f"✔ Extracted {len(rows)} rows → {out_csv}")


def process_folder(folder: Path):
    """Run ``process_pdf`` on every ``*.pdf`` file found below *folder*.

    The search is recursive. For each PDF, the CSV and log outputs are
    placed next to it, using the same file name with ``.csv`` and ``.log``
    suffixes.

    Args:
        folder: Directory to search.
    """
    for source_pdf in folder.rglob("*.pdf"):
        csv_target = source_pdf.with_suffix(".csv")
        log_target = source_pdf.with_suffix(".log")
        print(f"→ Processing {source_pdf}")
        process_pdf(source_pdf, csv_target, log_target)


@click.command()
@click.argument("folder", type=click.Path(exists=True, file_okay=False, path_type=Path))
def main(folder: Path) -> None:
    """Convert every PDF found under FOLDER to a CSV file.

    FOLDER must be an existing directory; it is searched recursively and
    each PDF gets a .csv and a .log file written next to it.
    """
    process_folder(folder)


if __name__ == "__main__":

    # Directory handed to the command as its FOLDER argument.
    # NOTE(review): '/content' looks like the Colab working directory, so every
    # PDF below it (at any depth) will be processed — confirm this is intended.
    folder_to_process = '/content'

    # Invoke the click command with an explicit argument list.
    # NOTE(review): standalone_mode=False presumably keeps click from handling
    # errors itself and exiting the interpreter when run inside a notebook —
    # confirm against the click documentation.
    main.main([folder_to_process], standalone_mode=False)