In [None]:
# Install libs
%pip install pytesseract opencv-python pillow
%pip install fitz
%pip install pymupdf
%pip install pdf2image pytesseract pillow
!sudo apt-get install poppler-utils tesseract-ocr

In [None]:
import os
import re
import cv2
import fitz  # PyMuPDF
import pytesseract
import pandas as pd
from PIL import Image
from google.colab import files
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment
from openpyxl.utils import get_column_letter
import time

# === === Config === ===
# Location of the Tesseract executable that pytesseract invokes.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'  # Adjust if needed
# Workbook of earlier results; when a file with this name exists,
# upload_and_process_files() merges the newly extracted rows into it.
EXCEL_FILE = "admission_data.xlsx"

# Column names, in output order, for every exported row. append_to_excel()
# fills keys missing from an extraction result with "" and ignores extracted
# keys that are not listed here.
ordered_keys = [
    "candidate_name", "cet_receipt_no", "rank",
    "category", "quota",
    "admission_order_no", "method_of_admission",
    "date_of_issue", "date_of_admission",
    "college_code", "college_name",
    "is_admitted_to", "branch",
    "fee_paid",
    "source_file",
]

# === === Utilities === ===
def clean_fee(t):
    """Convert fee text such as "Rs. 1,23,456.00" into a whole-number int.

    Args:
        t: Raw fee text. ``None`` and empty values are accepted; non-string
            values are converted with ``str`` first.

    Returns:
        The digits found in ``t`` as an ``int``, ignoring a one- or two-digit
        decimal fraction, or ``0`` when ``t`` contains no digits.
    """
    if t is None:
        return 0
    text = str(t)
    # Drop a decimal fraction ("23,000.00" -> "23,000") before collecting
    # digits; otherwise the fraction's digits are glued onto the amount and
    # inflate it a hundredfold. A '.' that is followed by more grouped digits
    # (e.g. "1.23.456") is treated as a separator and left alone.
    text = re.sub(r'(?<=\d)\.\d{1,2}(?!\d|[.,]\d)', '', text)
    digits = re.sub(r'[^0-9]', '', text)
    return int(digits) if digits else 0

def clean_name(name):
    """Remove leading hyphens from a name, then trim surrounding whitespace.

    Values that are empty/``None`` or that do not begin with '-' are returned
    exactly as given (no trimming is applied to them).
    """
    if not name or not name.startswith('-'):
        return name
    without_dashes = name.lstrip('-')
    return without_dashes.strip()

def normalize_key(k):
    """Return ``k`` lower-cased with every space replaced by an underscore."""
    lowered = k.lower()
    return "_".join(lowered.split(" "))

def detect_method(text):
    """Classify text as "COMEDK", "CET" or "UNKNOWN" by case-insensitive search.

    The COMEDK pattern is tried first; the CET pattern is a plain substring
    search, so it also matches "CET" inside longer words.
    """
    rules = (
        (r"COMEDK\s+UGET\s+\d{4}", "COMEDK"),
        (r"PGCET|CET|KARNATAKA EXAMINATIONS AUTHORITY", "CET"),
    )
    for pattern, label in rules:
        if re.search(pattern, text, re.IGNORECASE):
            return label
    return "UNKNOWN"

def has_text_layer(pdf_path):
    """Return True if any page of the PDF yields non-whitespace text."""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            if page.get_text().strip():
                return True
    return False

def append_to_excel(data, file_path):
    """Build a one-row DataFrame from an extraction result.

    Keys of ``data`` are passed through normalize_key(); the row then holds
    one column per entry of ``ordered_keys`` (missing values become ""), with
    ``source_file`` set to the base name of ``file_path``. Nothing is written
    to disk here.
    """
    normalized = {}
    for raw_key, value in data.items():
        normalized[normalize_key(raw_key)] = value
    normalized["source_file"] = os.path.basename(file_path)

    row = {column: normalized.get(column, "") for column in ordered_keys}
    return pd.DataFrame([row])

# === === Field Coordinates === ===
FIELD_COORDS_CET_IMAGE = {
    "Date of Issue":        (389, 177, 296, 52),
    "Admission Order No":   (414, 239, 299, 58),
    "College Code":         (385, 319, 310, 53),
    "Method of Admission":  (771, 229, 913, 73),
    "CET No":               (318, 444, 310, 64),
    "Rank":                 (731, 452, 419, 58),
    "Category Claimed":     (1302, 440, 375, 65),
    "Candidate Name":       (345, 515, 813, 65),
    "Admitted To":          (1363, 518, 308, 58),
    "College Name":         (215, 590, 1456, 65),
    "Branch":               (211, 662, 1046, 61),
    "Category":    (1346, 659, 338, 68),
    "Date of Admission":    (222, 737, 362, 62),
    "Fee Paid":             (1267, 2311, 299, 55),
}

# Crop rectangles as (x, y, width, height) in pixels, one per field, used by
# extract_fields_from_image() on uploaded image files.
FIELD_COORDS_IMAGE = dict(
    method_of_admission=(96, 182, 1040, 23),
    date_of_issue=(181, 241, 193, 32),
    admission_order_no=(530, 241, 169, 37),
    college_code=(833, 245, 94, 29),
    cet_no=(143, 282, 226, 25),
    rank=(441, 285, 240, 22),
    category=(797, 282, 115, 27),
    candidate_name=(154, 314, 509, 27),
    is_admitted_to=(807, 316, 123, 26),
    college_name=(82, 349, 836, 27),
    quota=(721, 383, 203, 24),
    fee_paid=(691, 411, 173, 29),
    branch=(82, 381, 576, 25),
)

# === === Extraction Functions === ===
def extract_from_text_pdf(pdf_path):
    """Extract admission fields from a PDF by searching its text for labels.

    The text of all pages is joined and each field is looked up with a
    "<label> : <value>" pattern, taking the rest of the line as the value.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        dict of field name to value. Fields whose label is not found are
        ``None``; ``fee_paid`` is always an int (0 when absent);
        ``method_of_admission`` comes from detect_method() on the full text;
        ``branch`` is the part of "Course Selected" after its last '-'.
    """
    # Use the context manager so the document handle is released as soon as
    # the text has been read, instead of staying open for the whole session.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text() for page in doc)

    def extract_field(label):
        # First "<label> : <rest of line>" occurrence, or None if absent.
        match = re.search(rf"{re.escape(label)}\s*:\s*(.+)", text)
        return match.group(1).strip() if match else None

    course = extract_field("Course Selected")

    return {
        "candidate_name": extract_field("Candidate Name"),
        "admission_order_no": extract_field("TAT No"),
        "college_code": extract_field("College Code"),
        "method_of_admission": detect_method(text),
        # Fall back to the application number when no receipt number is found.
        "cet_receipt_no": extract_field("Receipt No") or extract_field("Application Number"),
        "rank": extract_field("Rank"),
        "category": extract_field("Seat Category"),
        "is_admitted_to": course,
        "college_name": extract_field("College Name"),
        "branch": course.split('-')[-1].strip() if course and '-' in course else None,
        "category_allotted": None,
        "date_of_admission": extract_field("Date"),
        "fee_paid": clean_fee(extract_field("Total Fee paid so far"))
    }

def extract_from_image_pdf(image_path):
    """OCR each region listed in FIELD_COORDS_CET_IMAGE from a page image.

    Args:
        image_path: Path of the rendered page image.

    Returns:
        dict keyed by the FIELD_COORDS_CET_IMAGE labels, plus
        ``cet_receipt_no`` (cleaned "CET No") and ``is_admitted_to`` (copy of
        "Admitted To"). "Fee Paid" is an int and "Method of Admission" is
        reduced to the label returned by detect_method().

    Raises:
        ValueError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")

    config = '--oem 3 --psm 6'
    data = {}
    for field, (x, y, w, h) in FIELD_COORDS_CET_IMAGE.items():
        roi = image[y:y+h, x:x+w]
        text = pytesseract.image_to_string(roi, config=config).strip().replace("\n", " ")
        if field == "Fee Paid":
            text = clean_fee(text)
        elif field == "Candidate Name":
            # The coordinate table labels this field "Candidate Name"; the
            # previous check for "Name" never matched, so names kept their
            # leading '-' characters.
            text = clean_name(text)
        data[field] = text
    data["Method of Admission"] = detect_method(data.get("Method of Admission", ""))
    data["cet_receipt_no"] = clean_cet_no(data.get("CET No", ""))
    # The export column is "is_admitted_to"; "Admitted To" normalizes to
    # "admitted_to" and would otherwise be dropped from the output row.
    # NOTE(review): assumes this region is the same field the image-upload
    # path calls "is_admitted_to" — confirm against a sample document.
    data["is_admitted_to"] = data.get("Admitted To", "")
    return data

def extract_from_partial_pdf(pdf_path):
    """Choose text or OCR extraction for a PDF based on its first page.

    If the first page's text contains "comedk" (case-insensitive) the PDF is
    handled by extract_from_text_pdf(). Otherwise the first page is rendered
    at 300 dpi, saved as "temp_admission_image.png" in the working directory,
    and handed to extract_from_image_pdf().

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        The dict produced by whichever extractor was chosen.
    """
    img_path = "temp_admission_image.png"
    # Keep the document open only while the first page is inspected/rendered,
    # so the handle is released even if extraction fails afterwards.
    with fitz.open(pdf_path) as doc:
        first_page = doc[0]
        is_comedk = "comedk" in first_page.get_text().lower()
        if not is_comedk:
            pix = first_page.get_pixmap(dpi=300)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            img.save(img_path)

    if is_comedk:
        return extract_from_text_pdf(pdf_path)
    return extract_from_image_pdf(img_path)

def clean_cet_no(text):
    """Normalize a CET number to upper-case letters and digits only.

    After stripping every other character, a result longer than five
    characters whose second character is 'A' has that second character
    removed.
    """
    cleaned = re.sub(r'[^A-Z0-9]', '', text.upper())
    has_extra_a = len(cleaned) > 5 and cleaned[1] == 'A'
    if has_extra_a:
        cleaned = cleaned[:1] + cleaned[2:]
    return cleaned.strip()

def extract_fields_from_image(image_path):
    """OCR each region listed in FIELD_COORDS_IMAGE from an uploaded image.

    Code-like fields (college code, CET number, admission order number) are
    read in single-line mode with an upper-case alphanumeric whitelist; all
    other fields use the block-of-text mode.

    Args:
        image_path: Path of the image file.

    Returns:
        dict keyed by the FIELD_COORDS_IMAGE names, except that ``cet_no`` is
        replaced by ``cet_receipt_no``. ``fee_paid`` is an int and
        ``method_of_admission`` is one of "COMEDK", "CET" or "UNKNOWN".

    Raises:
        ValueError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Loop-invariant OCR settings, built once.
    code_fields = ("college_code", "cet_no", "admission_order_no")
    code_config = r'--oem 3 --psm 7 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    text_config = r'--oem 3 --psm 6'

    data = {}
    for field, (x, y, w, h) in FIELD_COORDS_IMAGE.items():
        cropped = image[y:y+h, x:x+w]
        config = code_config if field in code_fields else text_config
        text = pytesseract.image_to_string(cropped, config=config).strip().replace('\n', ' ')
        if field == "cet_no":
            text = clean_cet_no(text)
            data["cet_receipt_no"] = text
        elif field == "fee_paid":
            text = clean_fee(text)
        elif field == "candidate_name":
            text = clean_name(text)
        data[field] = text

    # The cleaned value now lives under "cet_receipt_no".
    data.pop("cet_no", None)

    moa = data.get("method_of_admission", "").upper()
    if "COMEDK" in moa:
        data["method_of_admission"] = "COMEDK"
    elif "CET" in moa or "KEA" in moa:
        data["method_of_admission"] = "CET"
    else:
        data["method_of_admission"] = "UNKNOWN"
    return data

# === === Function to save Excel with better formatting === ===
def save_to_excel_with_format(df, file_path):
    """Write ``df`` to ``file_path`` and style the header row.

    The header cells are made bold and centered, each column's width is set
    to its header text length plus 4, and the first row is frozen. The
    workbook is written once by pandas and saved again after styling.
    """
    df.to_excel(file_path, index=False, engine='openpyxl')

    workbook = load_workbook(file_path)
    sheet = workbook.active
    header_font = Font(bold=True)
    header_alignment = Alignment(horizontal="center", vertical="center")

    header_columns = sheet.iter_cols(min_row=1, max_row=1)
    for position, cells in enumerate(header_columns, start=1):
        widest = 0
        for header_cell in cells:
            header_cell.font = header_font
            header_cell.alignment = header_alignment
            text_length = len(str(header_cell.value)) if header_cell.value else 0
            if text_length > widest:
                widest = text_length
        sheet.column_dimensions[get_column_letter(position)].width = widest + 4

    sheet.freeze_panes = "A2"
    workbook.save(file_path)
    print(f"\n✅ Excel file saved with formatting: {file_path}")

# === === Upload + Process Loop === ===
def upload_and_process_files():
    """Interactively upload files, extract their fields and export to Excel.

    Repeats a Colab upload prompt until the user answers anything other than
    "yes". PDFs go through extract_from_partial_pdf() and .jpg/.jpeg/.png
    files through extract_fields_from_image(); a failure on one file is
    printed and does not stop the loop. The collected rows are merged with
    EXCEL_FILE when that file exists, de-duplicated by receipt number,
    numbered, written to a timestamped workbook and offered for download.
    """
    all_data = []

    while True:
        uploaded = files.upload()
        for filename in uploaded:
            ext = os.path.splitext(filename)[-1].lower()
            try:
                if ext == '.pdf':
                    data = extract_from_partial_pdf(filename)
                elif ext in ('.jpg', '.jpeg', '.png'):
                    data = extract_fields_from_image(filename)
                else:
                    print(f"❌ Unsupported file type: {ext}")
                    continue

                df_row = append_to_excel(data, filename)
                all_data.append(df_row)

                print("\nExtracted data:")
                for k, v in data.items():
                    print(f"{k}: {v}")

            except Exception as e:
                # Best effort per file: report and move on to the next upload.
                print(f"❌ Error processing {filename}: {e}")

        cont = input("Do you want to upload another file? (yes/no): ").strip().lower()
        if cont != 'yes':
            break

    if all_data:
        df_new = pd.concat(all_data, ignore_index=True)

        if os.path.exists(EXCEL_FILE):
            # Rows already in the existing workbook are updated in place;
            # rows whose key is not present yet are appended.
            df_existing = pd.read_excel(EXCEL_FILE)
            key_column = 'cet_receipt_no' if 'cet_receipt_no' in df_existing.columns else 'admission_order_no'
            df_existing.set_index(key_column, inplace=True)
            df_new.set_index(key_column, inplace=True)
            df_existing.update(df_new)
            new_only = df_new[~df_new.index.isin(df_existing.index)]
            df_combined = pd.concat([df_existing, new_only])
            df_combined.reset_index(inplace=True)
        else:
            df_combined = df_new

        # Keep only the known columns, in their canonical order.
        df_combined = df_combined[[col for col in ordered_keys if col in df_combined.columns]]

        # Remove repeated receipt numbers, keeping the first occurrence.
        # Rows with a blank/missing receipt number are NOT duplicates of each
        # other (the number simply could not be extracted), so they are all
        # kept instead of being collapsed into a single row.
        receipt = df_combined["cet_receipt_no"]
        has_receipt = receipt.notna() & (receipt.astype(str).str.strip() != "")
        repeated = df_combined.duplicated(subset=["cet_receipt_no"], keep="first")
        df_combined = df_combined[~(has_receipt & repeated)].reset_index(drop=True)

        df_combined.insert(0, "Sr. No", df_combined.index + 1)
        df_combined = df_combined.rename(columns={"cet_receipt_no": "CET / Receipt No"})

        timestamp = time.strftime("%Y%m%d_%H%M%S")
        new_file = f"admission_data_{timestamp}.xlsx"

        save_to_excel_with_format(df_combined, new_file)
        print(f"\n✅ {len(df_combined)} unique records saved to {new_file}")
        files.download(new_file)
    else:
        print("\n⚠ No new data to save. Excel file unchanged.")

    print("Exiting...")

# 🔄 Run the upload loop: starts the interactive upload / extract / export
# session as soon as this cell is executed.
upload_and_process_files()

In [None]:
# Filter the latest exported admission file by fee range:
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment
from openpyxl.utils import get_column_letter
import time
import glob
import os

def get_latest_admission_file():
    """Return the "admission_data_*.xlsx" file in the working directory with
    the greatest ``os.path.getctime`` value, or ``None`` if there is none."""
    candidates = glob.glob("admission_data_*.xlsx")
    return max(candidates, key=os.path.getctime) if candidates else None

def format_excel(file_path):
    """Style the header row of an existing workbook and save it in place.

    Header cells become bold and centered, each column's width is set to its
    header text length plus 4, and the first row is frozen.
    """
    workbook = load_workbook(file_path)
    sheet = workbook.active

    header_font = Font(bold=True)
    header_alignment = Alignment(horizontal="center", vertical="center")

    for number, header_cells in enumerate(sheet.iter_cols(min_row=1, max_row=1), start=1):
        lengths = [0]
        for header in header_cells:
            header.font = header_font
            header.alignment = header_alignment
            lengths.append(len(str(header.value)) if header.value else 0)
        letter = get_column_letter(number)
        sheet.column_dimensions[letter].width = max(lengths) + 4

    sheet.freeze_panes = "A2"
    workbook.save(file_path)
    print(f"✅ Filtered file saved as: {file_path}")

def filter_by_fee():
    """Prompt for a "min-max" fee range and export the matching rows.

    Reads the file returned by get_latest_admission_file(), keeps rows whose
    ``fee_paid`` lies within the inclusive range, writes them to a
    timestamped workbook, formats it, and attempts a Colab download. Every
    problem (no input file, unreadable file, missing column, malformed range,
    no matches) is reported with a printed message and the function returns.
    """
    input_file = get_latest_admission_file()
    if not input_file:
        print("❌ No 'admission_data_*.xlsx' file found.")
        return

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"❌ Error reading {input_file}: {e}")
        return

    if 'fee_paid' not in df.columns:
        print(f"❌ Column 'fee_paid' not found in {input_file}.")
        return

    fee_input = input("Enter fee range: ").strip()
    # Only the parsing is guarded: a ValueError raised later (e.g. while
    # writing the workbook) must not be misreported as invalid user input.
    try:
        min_fee, max_fee = map(int, fee_input.split('-'))
    except ValueError:
        print("⚠️ Invalid input. Please enter a fee range in the format min-max (e.g., 23000-24000).")
        return

    # Accept the bounds in either order.
    if min_fee > max_fee:
        min_fee, max_fee = max_fee, min_fee

    filtered_df = df[(df['fee_paid'] >= min_fee) & (df['fee_paid'] <= max_fee)]
    if filtered_df.empty:
        print(f"⚠️ No records found with fee_paid between {min_fee} and {max_fee}")
        return

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    output_file = f"filtered_fee_range_{min_fee}-{max_fee}_{timestamp}.xlsx"
    filtered_df.to_excel(output_file, index=False, engine='openpyxl')
    format_excel(output_file)

    try:
        from google.colab import files
        files.download(output_file)
    except Exception:
        # Import or download failed; the file itself has still been written.
        # (A bare except would also swallow KeyboardInterrupt.)
        print(f"✅ File ready: {output_file} (not downloaded automatically outside Colab)")


# Run the fee-range filter as soon as this cell is executed.
filter_by_fee()


In [None]:
# Filter branch-wise
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment
from openpyxl.utils import get_column_letter
import time
import glob
import os
import re

# Branch code -> branch name as originally written; the names are passed
# through normalize_branch_name() before being used for matching.
raw_branch_map = dict(
    AI="Artificial Intelligence and Machine Learning",
    AD="Artificial Intelligence and Data Science",
    BT="Biotechnology Engineering",
    CH="Chemical Engineering",
    CV="Civil Engineering",
    CS="Computer Science and Engineering",
    CI="CSE (Artificial Intelligence and Machine Learning)",
    CY="CSE(Cyber Security)",
    EE="Electrical and Electronics Engineering",
    EC="Electronics and Communication Engineering",
    EI="Electronics and Instrumentation Engineering",
    ET="Electronics Telecommunication Engineering",
    IM="Industrial Engineering and Management",
    IS="Information Science and Engineering",
    ME="Mechanical Engineering",
    MD="Medical Electronics Engineering",
)


def normalize_branch_name(name):
    """Normalize the "and"/"&" connective and whitespace in a branch name.

    Args:
        name: Branch name, or a missing value (``None``/NaN).

    Returns:
        ``name`` unchanged when it is missing; otherwise the name with every
        '&' or standalone word "and" (any case) rewritten as " and ", runs of
        whitespace collapsed to one space, and the ends trimmed.
    """
    if pd.isna(name):
        return name
    # \b keeps the match to the whole word "and": without it the letters
    # "and" inside a word were split apart ("Handloom" -> "H and loom").
    name = re.sub(r'\s*(?:&|\band\b)\s*', ' and ', name, flags=re.IGNORECASE)
    # Normalize multiple spaces to single space
    name = re.sub(r'\s+', ' ', name)
    return name.strip()

# Branch code -> normalized branch name, in the same order as raw_branch_map.
branch_map = dict(
    zip(raw_branch_map, map(normalize_branch_name, raw_branch_map.values()))
)

def get_latest_admission_file():
    """Pick the "admission_data_*.xlsx" file with the largest
    ``os.path.getctime`` in the working directory; ``None`` when none exist."""
    newest_path = None
    newest_ctime = None
    for path in glob.glob("admission_data_*.xlsx"):
        ctime = os.path.getctime(path)
        # Strict '>' keeps the earlier match when two files tie.
        if newest_path is None or ctime > newest_ctime:
            newest_path, newest_ctime = path, ctime
    return newest_path

def format_excel(file_path):
    """Apply header styling to the workbook at ``file_path`` and re-save it.

    The first row is set bold and centered, each column is sized to its
    header length plus 4, and the pane is frozen below the header.
    """
    book = load_workbook(file_path)
    active_sheet = book.active

    bold = Font(bold=True)
    centered = Alignment(horizontal="center", vertical="center")

    def style_header(cell):
        # Style one header cell and report the length of its text.
        cell.font = bold
        cell.alignment = centered
        return len(str(cell.value)) if cell.value else 0

    first_row_columns = active_sheet.iter_cols(min_row=1, max_row=1)
    for index, cells in enumerate(first_row_columns, start=1):
        longest = 0
        for cell in cells:
            longest = max(longest, style_header(cell))
        column = active_sheet.column_dimensions[get_column_letter(index)]
        column.width = longest + 4

    active_sheet.freeze_panes = "A2"
    book.save(file_path)
    print(f"✅ Filtered file saved as: {file_path}")

def normalize_branch_name(name):
    """Normalize a branch name read from the spreadsheet.

    Args:
        name: Branch name, or a missing value (``None``/NaN).

    Returns:
        ``name`` unchanged when it is missing; otherwise the name with
        "Computers" replaced by "Computer", every '&' or standalone word
        "and" (any case) rewritten as " and ", runs of whitespace collapsed
        to one space, and the ends trimmed.
    """
    if pd.isna(name):
        return name

    # Fix common typo
    name = name.replace("Computers", "Computer")

    # \b keeps the match to the whole word "and": without it the letters
    # "and" inside a word were split apart ("Handloom" -> "H and loom").
    name = re.sub(r'\s*(?:&|\band\b)\s*', ' and ', name, flags=re.IGNORECASE)

    # Normalize multiple spaces to single space
    name = re.sub(r'\s+', ' ', name)

    return name.strip()


def filter_by_branch_code():
    """Prompt for branch codes and export the rows of those branches.

    Reads the file returned by get_latest_admission_file(), normalizes its
    ``branch`` column, lists the codes in ``branch_map``, and keeps rows whose
    branch equals (case-insensitively) the name of any recognized code. The
    result is written to a timestamped workbook, formatted, and a Colab
    download is attempted. Problems are reported with a printed message.
    """
    input_file = get_latest_admission_file()
    if not input_file:
        print("❌ No 'admission_data_*.xlsx' file found.")
        return

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"❌ Error reading {input_file}: {e}")
        return

    # Normalize branch names in the DataFrame
    df['branch'] = df['branch'].apply(normalize_branch_name)

    print("\n📚 Available Branch Codes:")
    for code, name in branch_map.items():
        print(f"{code} → {name}")

    user_input = input("\nEnter branch codes separated by commas (e.g., CS, AI, CY): ").strip().upper()
    requested = [code.strip() for code in user_input.split(',')]
    # Keep recognized codes only, once each and in the order entered.
    valid_codes = list(dict.fromkeys(code for code in requested if branch_map.get(code)))
    branch_names = [branch_map[code] for code in valid_codes]

    if not branch_names:
        print("⚠️ No valid branch codes entered.")
        return

    wanted = {name.lower() for name in branch_names}
    # astype(str) keeps the .str accessor usable when the column holds only
    # missing values; those become "nan" and match no branch name.
    filtered_df = df[df['branch'].astype(str).str.lower().isin(wanted)]
    if filtered_df.empty:
        print(f"⚠️ No records found for branches: {', '.join(branch_names)}")
    else:
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        # Build the filename from validated codes only: raw input could carry
        # unrecognized text (including path separators) into the file name.
        joined_codes = '_'.join(valid_codes)
        output_file = f"branch_filtered_{joined_codes}_{timestamp}.xlsx"
        filtered_df.to_excel(output_file, index=False, engine='openpyxl')
        format_excel(output_file)

        try:
            from google.colab import files
            files.download(output_file)
        except Exception:
            # Import or download failed; the file itself has still been
            # written. (A bare except would also swallow KeyboardInterrupt.)
            print(f"✅ File ready: {output_file} (not downloaded automatically outside Colab)")

# Run the branch filter as soon as this cell is executed.
filter_by_branch_code()
