In [None]:
!pip install pytesseract pdf2image Pillow pymupdf requests
!sudo apt-get update
!sudo apt-get install poppler-utils tesseract-ocr

In [None]:
!pip install spellchecker
!pip install pyspellchecker

In [None]:
# symspellpy supplies the SymSpell / Verbosity classes imported by the
# spelling-correction cell.
!pip install symspellpy

In [None]:
import fitz
import os
import re
from tqdm import tqdm

# Root folder that is walked recursively (os.walk) to find the input PDFs.
ROOT_DIR = "ICAR DATA/"
# UTF-8 text file the combined extraction is written to; the spelling-correction
# step reads it back from this same path.
OUTPUT_FILE = "extracted_text.txt"

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one whitespace-normalized string.

    Each page is read with ``page.get_text("text")``, stripped of ``Page N``
    markers, collapsed to single spaces, and appended followed by one space.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The concatenated page texts. If opening or reading fails, the error is
        printed and whatever was extracted before the failure is returned
        (an empty string if nothing was).
    """
    pages = []
    try:
        # The context manager closes the document even when a page raises,
        # so file handles are not leaked across a long batch of PDFs.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_text = page.get_text("text")
                # Remove page markers before normalizing whitespace, so the
                # removal cannot leave double or trailing spaces behind.
                page_text = re.sub(r'Page\s*\d+', '', page_text)
                page_text = re.sub(r'\s+', ' ', page_text).strip()
                pages.append(page_text + " ")
    except Exception as e:
        # Best effort: report the failure and keep the batch going.
        print(f"Error extracting {pdf_path}: {e}")
    # Join once instead of repeated string concatenation inside the loop.
    return "".join(pages)

# Collect every PDF below ROOT_DIR. The extension check is case-insensitive so
# files named *.PDF are not skipped, and the list is sorted because os.walk
# yields entries in arbitrary order, which made the output order vary by run.
pdf_files = sorted(
    os.path.join(root, file)
    for root, _, files in os.walk(ROOT_DIR)
    for file in files
    if file.lower().endswith(".pdf")
)

print(f"Found {len(pdf_files)} PDFs. Extracting text...")

# One join instead of repeated `+=`, which re-copies the growing string for
# every document. Each document is followed by a blank-line separator.
all_text = "".join(extract_text_from_pdf(pdf) + "\n\n" for pdf in tqdm(pdf_files))

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    f.write(all_text)

print(f"Extraction complete! Text saved in {OUTPUT_FILE}")

In [None]:
from symspellpy import SymSpell, Verbosity

# Frequency dictionary used for lookups; column 0 holds the term and column 1
# its count. The path is hard-coded under /content (presumably a Colab upload
# location; adjust when running elsewhere).
DICTIONARY_PATH = "/content/frequency_dictionary_en_82_765.txt"

sym_spell = SymSpell()
# load_dictionary signals a missing file by returning False instead of raising.
# Ignoring that leaves an empty dictionary, and every later lookup then returns
# nothing, so the "corrected" text would silently equal the input.
if not sym_spell.load_dictionary(DICTIONARY_PATH, term_index=0, count_index=1):
    raise FileNotFoundError(
        f"SymSpell frequency dictionary not found: {DICTIONARY_PATH}"
    )

def _split_punctuation(token):
    """Split a token into (leading non-alphanumerics, core, trailing non-alphanumerics)."""
    start, end = 0, len(token)
    while start < end and not token[start].isalnum():
        start += 1
    while end > start and not token[end - 1].isalnum():
        end -= 1
    return token[:start], token[start:end], token[end:]


def _correct_token(token):
    """Spell-correct the alphabetic core of one token, keeping its punctuation."""
    lead, core, trail = _split_punctuation(token)
    # Only plain ASCII words are looked up. Numbers, units such as "kg/ha",
    # hyphenated or non-English tokens would otherwise be replaced by whatever
    # dictionary word happens to be within two edits.
    if not (core.isascii() and core.isalpha()):
        return token
    suggestions = sym_spell.lookup(
        core,
        Verbosity.CLOSEST,
        max_edit_distance=2,
        # Carry the original casing over to the suggestion so "Rice" is not
        # turned into "rice".
        transfer_casing=True,
    )
    return lead + (suggestions[0].term if suggestions else core) + trail


def correct_spelling(text):
    """Spell-correct ``text`` word by word with the module-level ``sym_spell``.

    Whitespace inside each line is collapsed to single spaces, while the line
    breaks themselves are kept, so blank-line separators in the input survive.
    Punctuation around a word is preserved, and tokens that are not plain
    ASCII alphabetic words are left untouched.

    Args:
        text: The text to correct.

    Returns:
        The corrected text; a word with no suggestion is kept as is.
    """
    return "\n".join(
        " ".join(_correct_token(word) for word in line.split())
        for line in text.split("\n")
    )

# Load the raw extraction written by the extraction step.
with open(OUTPUT_FILE, encoding="utf-8") as source:
    extracted_text = source.read()

# Run the word-by-word spelling correction over the whole text.
cleaned_text = correct_spelling(extracted_text)

# Persist the corrected text for the clean-up step.
with open("cleaned_text.txt", mode="w", encoding="utf-8") as target:
    target.write(cleaned_text)

print("Spelling correction complete! Saved as cleaned_text.txt")

In [None]:
import re

def clean_text(file_path, output_path):
    """Strip markup characters and normalize whitespace in a UTF-8 text file.

    Steps, in order:
      1. Remove the characters ``* _ # { } [ ] < > |``.
      2. Replace runs of two or more hyphens/en dashes/em dashes with a space.
      3. Collapse runs of two or more spaces/tabs (not newlines) to one space.
      4. Collapse any whitespace run containing a newline to a single newline.

    Args:
        file_path: Path of the text file to read.
        output_path: Path the cleaned text is written to (overwritten).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    text = re.sub(r"[*_#{}\[\]<>|]", "", text)
    text = re.sub(r"[\-–—]{2,}", " ", text)
    # Collapse horizontal whitespace only. The previous `\s{2,}` also matched
    # newlines, so blank lines were turned into spaces before the newline rule
    # could ever apply.
    text = re.sub(r"[^\S\n]{2,}", " ", text)
    # A whitespace run that contains a newline becomes exactly one newline,
    # which also drops spaces left dangling around the line break.
    text = re.sub(r"[^\S\n]*\n\s*", "\n", text)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)

    print("Text cleaned and saved to", output_path)


# Final pass: tidy the spell-corrected text and write the finished corpus file.
clean_text(
    file_path="cleaned_text.txt",
    output_path="icar_extracted_text.txt",
)