1. Store the text extracted from an image or PDF

1.1 Importing necessary libraries 

In [1]:
import tkinter as tk
from tkinter import filedialog
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
from symspellpy import SymSpell, Verbosity
import psycopg2
import os

1.2 PostgreSQL connection details

In [2]:
# Connection settings for PostgreSQL. Each value can be overridden through the
# environment variable named after its libpq counterpart; the fallbacks are the
# values this notebook used before, so existing runs behave the same.
DB_HOST = os.environ.get("PGHOST", "localhost")
DB_NAME = os.environ.get("PGDATABASE", "postgres")
DB_USER = os.environ.get("PGUSER", "postgres")
# NOTE(review): the literal fallback only keeps existing runs working. A password
# should not live in the notebook — set PGPASSWORD and remove the default before sharing.
DB_PASSWORD = os.environ.get("PGPASSWORD", "9965")
DB_PORT = os.environ.get("PGPORT", "5432")

1.3 Define paths

In [3]:
# Locations of the external tools used for PDF rendering and OCR.
# A raw string already keeps backslashes literal, so each path separator is
# written once (doubling them inside r"..." puts two backslashes in the path).
poppler_path = r"C:\Program Files\poppler-24.08.0\Library\bin"
tesseract_path = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Tell pytesseract which Tesseract executable to run.
pytesseract.pytesseract.tesseract_cmd = tesseract_path

1.4 Initialize SymSpell

In [4]:
# Spell-checker used by correct_spelling(): suggestions up to edit distance 2.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# Raw string, so each backslash is written once.
dictionary_path = r"D:\space kids india\en-80k.txt"

# The dictionary file holds the term in column 0 and its frequency in column 1.
# load_dictionary() returns a falsy value when loading fails; stop with an
# exception rather than exit(), which would shut the notebook kernel down.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    print("❌ Dictionary file not found! Check the file path.")
    raise FileNotFoundError(dictionary_path)

1.5 Function to correct spelling

In [5]:
def correct_spelling(text):
    """Return `text` with each word replaced by its closest dictionary match.

    The text is split on whitespace, so runs of spaces and line breaks are
    collapsed to single spaces in the result. A word with no suggestion within
    edit distance 2 is kept unchanged.

    Args:
        text: The raw text to correct.

    Returns:
        The corrected words joined by single spaces.
    """
    corrected_words = []
    for word in text.split():
        # One lookup per word; the result is reused for both the emptiness
        # check and the chosen term.
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        corrected_words.append(suggestions[0].term if suggestions else word)
    return " ".join(corrected_words)

# Sanity check: shows that the function object is defined.
print(correct_spelling)

<function correct_spelling at 0x0000023719537740>


1.6 Function to select a file (PDF or Image)

In [6]:
def select_file():
    """Show a file-open dialog and return the chosen path.

    Returns:
        The selected file path, or a falsy value when the dialog is cancelled.
    """
    # The dialog needs a Tk root; hide it so only the dialog itself appears.
    root = tk.Tk()
    root.withdraw()
    try:
        file_path = filedialog.askopenfilename(
            title="Select a File",
            # Patterns are passed as separate items so the filter also works
            # outside the Windows native dialog (a ";"-joined string does not).
            filetypes=[(
                "PDF and Image Files",
                ("*.pdf", "*.png", "*.jpg", "*.jpeg", "*.bmp", "*.tiff", "*.gif"),
            )],
        )
    finally:
        # Destroy the hidden root so repeated calls do not pile up Tk instances.
        root.destroy()
    return file_path


1.7 Get user-selected file

In [7]:
# Ask the user for the PDF or image to process.
file_path = select_file()
if not file_path:
    print("❌ No file selected. Exiting...")
    # Raise instead of calling exit(): exit() is the interactive-session helper
    # and, in a notebook, shuts the kernel down. SystemExit stops this run
    # while leaving the kernel usable.
    raise SystemExit(1)

print(file_path)

D:/space kids india/Screenshot 2025-03-06 190340.png


1.8 Connect to PostgreSQL

In [8]:
def connect_to_db():
    """Open a PostgreSQL connection and a cursor from the DB_* settings.

    Note: a later cell in this notebook redefines `connect_to_db` with a
    different return shape (connection only); this version is the one in
    effect until that cell runs.

    Returns:
        A `(conn, cursor)` tuple.

    Raises:
        Exception: whatever `psycopg2.connect` or `conn.cursor` raised; the
            error is printed first and then re-raised so the run stops with a
            traceback instead of shutting the kernel down via exit().
    """
    conn = None
    try:
        conn = psycopg2.connect(
            host=DB_HOST, database=DB_NAME, user=DB_USER, password=DB_PASSWORD, port=DB_PORT
        )
        cursor = conn.cursor()
    except Exception as e:
        # Do not leave a half-initialised connection open.
        if conn is not None:
            conn.close()
        print(f"❌ Failed to connect to PostgreSQL: {e}")
        raise
    print("✅ Connected to PostgreSQL database")
    return conn, cursor

# Open the connection and cursor that the following cells use as `conn` and `cursor`.
conn, cursor = connect_to_db()

✅ Connected to PostgreSQL database


1.9 Ensure necessary tables exist

In [9]:
# Schema for the OCR results: one row per stored page or image.
# page_number defaults to NULL, so rows inserted without it have no page.
create_table_sql = """
    CREATE TABLE IF NOT EXISTS extracted_text (
        id SERIAL PRIMARY KEY,
        file_name TEXT NOT NULL,
        page_number INT DEFAULT NULL,
        corrected_text TEXT NOT NULL
    )
"""
cursor.execute(create_table_sql)
# Persist the DDL.
conn.commit()


1.10 Process file

In [10]:
# Rows are keyed by the bare file name, not the full path.
file_name = os.path.basename(file_path)
if file_path.lower().endswith(".pdf"):
    # Render every PDF page to an image at 300 dpi, then OCR, spell-correct and
    # store each page as its own row, numbered from 1.
    images = convert_from_path(file_path, poppler_path=poppler_path, dpi=300)
    for i, image in enumerate(images, start=1):
        text = pytesseract.image_to_string(image, lang='eng')
        corrected_text = correct_spelling(text)
        cursor.execute("INSERT INTO extracted_text (file_name, page_number, corrected_text) VALUES (%s, %s, %s)",
                       (file_name, i, corrected_text))
        # Commit per page so pages already processed are saved.
        conn.commit()
        print(f"✅ Page {i} stored in database")
else:
    # Open the image in a context manager so its file handle is released as
    # soon as OCR has read it.
    with Image.open(file_path) as image:
        text = pytesseract.image_to_string(image, lang='eng')
    corrected_text = correct_spelling(text)
    # page_number is omitted here, so the column takes its default value.
    cursor.execute("INSERT INTO extracted_text (file_name, corrected_text) VALUES (%s, %s)",
                   (file_name, corrected_text))
    conn.commit()
    print("✅ Image text saved to PostgreSQL successfully!")


✅ Image text saved to PostgreSQL successfully!


1.11 Close database connection

In [11]:
# Release the database resources: the cursor first, then its connection.
for resource in (cursor, conn):
    resource.close()
print("\n✅ All data saved to PostgreSQL successfully!")


✅ All data saved to PostgreSQL successfully!


2. Retrieving the stored data 

2.1 Importing libraries

In [12]:
import psycopg2

2.2 PostgreSQL connection details

In [13]:
# Imported here as well so this section also runs on its own; re-importing is harmless.
import os

# Connection settings for PostgreSQL. Each value can be overridden through the
# environment variable named after its libpq counterpart; the fallbacks are the
# values this notebook used before, so existing runs behave the same.
DB_HOST = os.environ.get("PGHOST", "localhost")
DB_NAME = os.environ.get("PGDATABASE", "postgres")
DB_USER = os.environ.get("PGUSER", "postgres")
# NOTE(review): the literal fallback only keeps existing runs working. A password
# should not live in the notebook — set PGPASSWORD and remove the default before sharing.
DB_PASSWORD = os.environ.get("PGPASSWORD", "9965")
DB_PORT = os.environ.get("PGPORT", "5432")


2.3 Function to connect to PostgreSQL

In [14]:
def connect_to_db():
    """Open a PostgreSQL connection using the module-level DB_* settings.

    Returns:
        The new connection, or None when connecting fails for any reason
        (the error is printed before returning).
    """
    try:
        settings = {
            "host": DB_HOST,
            "database": DB_NAME,
            "user": DB_USER,
            "password": DB_PASSWORD,
            "port": DB_PORT,
        }
        connection = psycopg2.connect(**settings)
    except Exception as error:
        print(f"❌ Failed to connect to PostgreSQL: {error}")
        return None
    else:
        return connection


2.4 Function to fetch extracted text based on user input (ID, file_name, or page_number)


In [16]:
def fetch_text(identifier_type, identifier_value):
    """Print every stored row whose chosen column equals the given value.

    Args:
        identifier_type: Column to filter on: "id", "file_name" or "page_number".
        identifier_value: Value to match; passed to the query as a bound parameter.

    Returns:
        None. Results, or a message explaining why there are none, are printed.
    """
    # Only these fixed WHERE clauses are ever appended to the query; the user's
    # value itself is always sent as a parameter.
    where_clauses = {
        "id": "id = %s",
        "file_name": "file_name = %s",
        "page_number": "page_number = %s",
    }
    where_clause = where_clauses.get(identifier_type)
    if where_clause is None:
        # Validate before connecting so a bad type never opens a connection.
        print("❌ Invalid input type! Please choose 'id', 'file_name', or 'page_number'.")
        return

    conn = connect_to_db()
    if not conn:
        return

    query = "SELECT id, file_name, page_number, corrected_text FROM extracted_text WHERE " + where_clause

    try:
        # `with conn` only ends the transaction (commit or rollback); it does
        # not close the connection, so that is done explicitly in `finally`.
        with conn:
            with conn.cursor() as cursor:
                cursor.execute(query, (identifier_value,))
                results = cursor.fetchall()
    finally:
        conn.close()

    if results:
        print("\n✅ Extracted Text Results:")
        for row in results:
            print(f"\n📄 ID: {row[0]}, File Name: {row[1]}, Page Number: {row[2]}")
            print(f"📝 Extracted Text:\n{row[3]}\n" + "-"*50)
    else:
        print(f"⚠️ No results found for {identifier_type}: {identifier_value}")


2.5 User input selection

In [18]:
# Interactive lookup: ask which column to search on, then for the value.
print("\n🔍 Search by:")
print("1️⃣ ID")
print("2️⃣ File Name")
print("3️⃣ Page Number")
# Surrounding whitespace is ignored so " 2" is still a valid choice.
choice = input("Enter your choice (1/2/3): ").strip()

if choice == "1":
    raw_value = input("Enter ID: ")
    try:
        id_value = int(raw_value)
    except ValueError:
        # Non-numeric input is reported instead of aborting with a traceback.
        print(f"❌ Invalid ID: {raw_value!r}. Please enter a whole number.")
    else:
        fetch_text("id", id_value)
elif choice == "2":
    # Note: this reuses the name `file_name`, replacing any earlier value.
    file_name = input("Enter File Name (with extension): ")
    fetch_text("file_name", file_name)
elif choice == "3":
    raw_value = input("Enter Page Number: ")
    try:
        page_number = int(raw_value)
    except ValueError:
        print(f"❌ Invalid page number: {raw_value!r}. Please enter a whole number.")
    else:
        fetch_text("page_number", page_number)
else:
    print("❌ Invalid choice! Please enter 1, 2, or 3.")



🔍 Search by:
1️⃣ ID
2️⃣ File Name
3️⃣ Page Number


Enter your choice (1/2/3):  2
Enter File Name (with extension):  Screenshot 2025-03-06 190340.png



✅ Extracted Text Results:

📄 ID: 1, File Name: Screenshot 2025-03-06 190340.png, Page Number: None
📝 Extracted Text:
a of Intsoduetton a is on oe collection of soda to that it be easily accessed a to manage these databases DBMS a database h to system and used DBMS DataBase: of Relationol_DBMs of and DBMS, data Honed'in in a table format erin it ate a mon ___|_ th of and th are with a relational is a a RolINo same class a of a a
--------------------------------------------------

📄 ID: 36, File Name: Screenshot 2025-03-06 190340.png, Page Number: None
📝 Extracted Text:
a of Intsoduetton a is on oe collection of soda to that it be easily accessed a to manage these databases DBMS a database h to system and used DBMS DataBase: of Relationol_DBMs of and DBMS, data Honed'in in a table format erin it ate a mon ___|_ th of and th are with a relational is a a RolINo same class a of a a
--------------------------------------------------

📄 ID: 37, File Name: Screenshot 2025-03-06 190340.png, Pa