In [None]:
import json
import re

import cv2
import numpy as np
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

def preprocess_image(img):
    """Binarize a page image to make it easier for OCR to read.

    The image is reduced to a single grayscale channel and then thresholded
    with Otsu's method. No denoising step is performed.

    Args:
        img: A PIL image, or any object ``np.array`` can turn into a colour
            pixel array in RGB channel order.

    Returns:
        A PIL ``Image`` whose pixels are either 0 or 255.
    """
    # COLOR_RGB2GRAY expects a colour array; PIL images in single-channel
    # modes (e.g. "L", "1", "P") are expanded to RGB first so they do not
    # make cvtColor fail. RGB/RGBA inputs are passed through untouched.
    if isinstance(img, Image.Image) and img.mode not in ("RGB", "RGBA"):
        img = img.convert("RGB")

    gray = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)

    # cv2.threshold returns (threshold_used, image); only the image is kept.
    # With THRESH_OTSU the threshold is chosen automatically, so the 0 passed
    # here is just a placeholder.
    binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    return Image.fromarray(binary)


# Location of the PDF that will be OCR'd
pdf_path = "intern.pdf"

# Rasterise every page of the PDF at 300 DPI
images = convert_from_path(pdf_path, dpi=300)

# Run each page through preprocessing and then Tesseract.
# "--psm 6" selects Tesseract's page segmentation mode 6.
extracted_text = [
    pytesseract.image_to_string(preprocess_image(page), config="--psm 6")
    for page in images
]

# Stitch the per-page results into a single string, one page per line break
full_text = "\n".join(extracted_text)

# Regex for each field of interest; capture group 1 holds the field's value
patterns = {
    "Name": r"Name\s*:\s*(.+)",
    "Register Number": r"Register Number\s*:\s*(\d+)",
    "CGPA": r"CGPA\s*:\s*([\d.]+)",
    "Mobile Number": r"Mobile Number\s*:\s*(\d{10})",
    "Email ID": r"Email ID\s*:\s*([\w\.-]+@[\w\.-]+)",
    "Company": r"Name of the Company / Institution\s*:\s*(.+)",
    "Internship Start Date": r"Internship start date\s*:\s*([\d./-]+)",
    "Internship End Date": r"Internship end date\s*:\s*([\d./-]+)",
}

# Look up every field in the OCR text (case-insensitively); fields whose
# pattern does not match are recorded as "Not found"
extracted_info = {}
for field, regex in patterns.items():
    found = re.search(regex, full_text, re.IGNORECASE)
    if found:
        extracted_info[field] = found.group(1).strip()
    else:
        extracted_info[field] = "Not found"

# Print extracted data
for key, value in extracted_info.items():
    print(f"{key}: {value}")

# Write the results to disk so the confirmation message below is true;
# previously the message was printed without any file being written.
with open("extracted_data.json", "w", encoding="utf-8") as json_file:
    json.dump(extracted_info, json_file, ensure_ascii=False, indent=4)

print("\nExtracted data saved to extracted_data.json")


Name: Nikilesh Jayaguptha
Register Number: 3122225001081
CGPA: 8.137
Mobile Number: 9445323734
Email ID: nikilesh2210219@ssn.edu.in
Company: CDAC and tidel park
Internship Start Date: 12.1.18
Internship End Date: 12.3.18

Extracted data saved to extracted_data.json
