In [None]:
!pip install pymupdf pytesseract opencv-python pillow requests numpy


In [None]:
import pytesseract

# Point pytesseract at the Tesseract executable (standard Windows install location)
tesseract_exe = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
pytesseract.pytesseract.tesseract_cmd = tesseract_exe

# Sanity check: query the installed Tesseract version and show it
installed_version = pytesseract.get_tesseract_version()
print(installed_version)  # This should print the Tesseract version


In [None]:
import os
import fitz
from PIL import Image
import io
import cv2
import numpy as np

In [None]:
import os
import fitz  # PyMuPDF
import requests
from PIL import Image
import io
import pytesseract
import cv2
import numpy as np

# Tesseract locations. The defaults match a standard Windows install; set the
# TESSERACT_CMD environment variable to use a different executable.
DEFAULT_TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
DEFAULT_TESSDATA_DIR = r"C:\Program Files\Tesseract-OCR\tessdata"

# An explicit TESSERACT_CMD always wins. The Windows default is only applied
# when that file actually exists, so the notebook is not pointed at a missing
# executable on other machines.
tesseract_cmd_override = os.getenv("TESSERACT_CMD")
if tesseract_cmd_override or os.path.isfile(DEFAULT_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd_override or DEFAULT_TESSERACT_CMD

# Ensure TESSDATA_PREFIX is correctly set, but never to a directory that does not exist
if os.path.isdir(DEFAULT_TESSDATA_DIR):
    os.environ["TESSDATA_PREFIX"] = DEFAULT_TESSDATA_DIR

# Your OpenAI API Key (read from the environment so it never lives in the notebook)
API_KEY = os.getenv('OPENAI_API_KEY')
if not API_KEY:
    # Surface the problem once here instead of once per page during processing.
    print("Warning: OPENAI_API_KEY is not set; ChatGPT correction requests will fail.")

# Define input and output folders
pdf_folder = "./input_pdfs"
output_folder = "./OCR_results"
os.makedirs(output_folder, exist_ok=True)

def extract_page_as_image(pdf_path, page_index, resolution=900):
    """
    Render a single page of a PDF and return it as a PIL image.

    Args:
        pdf_path: Path of the PDF document to open.
        page_index: Zero-based index of the page to render.
        resolution: Rendering resolution (presumably DPI, since it is divided
            by 72 to obtain the zoom factor).

    Returns:
        A PIL image opened from the PNG bytes of the rendered page.

    Raises:
        IndexError: If ``page_index`` is negative or not below the page count.
    """
    with fitz.open(pdf_path) as document:
        if not 0 <= page_index < document.page_count:
            raise IndexError("Page index out of range.")

        scale = resolution / 72  # Scaling factor
        transform = fitz.Matrix(scale, scale)
        pixmap = document[page_index].get_pixmap(matrix=transform)
        png_bytes = pixmap.tobytes("png")
        return Image.open(io.BytesIO(png_bytes))

def perform_ocr(image, language="slv"):
    """
    Run Tesseract OCR on an image and return the recognised text.

    Args:
        image: Image handed unchanged to ``pytesseract.image_to_string``.
        language: Tesseract language code forwarded as ``lang`` (default ``"slv"``).

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    recognised_text = pytesseract.image_to_string(image, lang=language)
    return recognised_text

def send_text_to_chatgpt(text, api_key, model="gpt-4o", timeout=120):
    """
    Sends OCR text to ChatGPT API for correction and enhancement.

    Args:
        text: Raw OCR text of one page.
        api_key: OpenAI API key, sent as a Bearer token.
        model: Name of the chat model to request (default ``"gpt-4o"``).
        timeout: Timeout passed to ``requests.post`` so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The corrected text taken from the first choice of the API response.
        Blank input (empty or whitespace only) is returned unchanged without
        calling the API, because there is nothing to correct.

    Raises:
        ValueError: If ``api_key`` is empty or ``None``.
        requests.HTTPError: If the API answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    # Nothing to correct on a blank page; skip the API call entirely.
    if not text or not text.strip():
        return text or ""

    # Fail with a clear message instead of sending "Bearer None" to the API.
    if not api_key:
        raise ValueError("OpenAI API key is missing; set the OPENAI_API_KEY environment variable.")

    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json'
    }

    ocr_correction_prompt = f"""Correct OCR-induced errors in the text written in Slovene language (a historical text). Follow these guidelines:
1. Fix OCR-induced typos and errors:
   - Correct words split across line breaks
   - Remove unnecessary line breaks within sentences or paragraphs to ensure smooth reading flow.
   - Preserve meaningful paragraph breaks as they appear in the text.
   - Combine fragmented lines into full sentences where appropriate.
   - Fix common OCR errors (e.g., 'rn' misread as 'm')
   - Use context and linguistic knowledge to fix errors, but do not make speculative changes.
   - Focus only on clear errors; do not modify valid content unnecessarily.
   - Do not add extra periods or any unnecessary punctuation unless required by grammatical correctness.

2. Maintain original structure:
   - Preserve all headings, subheadings, and their formatting.
   - Do not merge or split paragraphs unless required to fix clear formatting issues caused by OCR.

3. Preserve original content:
   - Keep all important information from the original text unchanged.
   - Do not add, infer, or introduce any new information.
   
4. Maintain coherence:
   - Handle incomplete sentences gracefully: Correct partial sentences to make them grammatically and contextually correct and Resolve any disruptions caused by OCR errors that may fragment sentences or ideas.

IMPORTANT: Respond ONLY with the corrected text. Preserve all original formatting, including line breaks, except where fixing unnecessary line breaks within sentences or paragraphs. Do not include any introduction, explanation, or metadata.

Original Text in Slovene:
{text}"""

    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are an expert in correcting OCR errors in typewriter texts."},
            {"role": "user", "content": ocr_correction_prompt}
        ]
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=data,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']

def process_pdf(pdf_path):
    """
    OCR every page of one PDF, correct each page via ChatGPT and save both versions.

    For each page the raw OCR text and the corrected text are collected under a
    "=== Page N ===" header. A page whose processing raises is reported with a
    printed message and left out of both outputs. The results are written to
    "<name>_raw.txt" and "<name>_corrected.txt" inside the module-level
    ``output_folder``; the module-level ``API_KEY`` is used for the correction.

    Args:
        pdf_path: Path of the PDF file to process.
    """
    filename = os.path.basename(pdf_path).rsplit('.', 1)[0]
    raw_output_path = os.path.join(output_folder, f"{filename}_raw.txt")
    corrected_output_path = os.path.join(output_folder, f"{filename}_corrected.txt")

    # Per-page sections are gathered in lists and joined once when writing.
    raw_sections = []
    corrected_sections = []

    with fitz.open(pdf_path) as pdf:
        num_pages = pdf.page_count

        for page_number in range(1, num_pages + 1):
            print(f"Processing page {page_number}/{num_pages} of {filename}...")
            try:
                image = extract_page_as_image(pdf_path, page_number - 1)
                ocr_text = perform_ocr(image)
                corrected_text = send_text_to_chatgpt(ocr_text, API_KEY)

                page_header = f"\n\n=== Page {page_number} ===\n\n"
                raw_sections.append(f"{page_header}{ocr_text}")
                corrected_sections.append(f"{page_header}{corrected_text}")
            except Exception as e:
                print(f"Error processing page {page_number}: {e}")

    # Raw output first, then the corrected output.
    outputs = (
        (raw_output_path, raw_sections),
        (corrected_output_path, corrected_sections),
    )
    for target_path, sections in outputs:
        with open(target_path, "w", encoding="utf-8") as f:
            f.write("".join(sections))

    print(f"Processing complete. Results saved in {output_folder}.")

# Process all PDFs in the folder.
# The extension is matched case-insensitively so files such as "SCAN.PDF" are
# not skipped, and the names are sorted because os.listdir returns entries in
# arbitrary order.
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf"))

if not pdf_files:
    print(f"No PDF files found in {pdf_folder}.")

for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    process_pdf(pdf_path)


## Evaluation – WER and CER Analysis

In [None]:
# install modules
!pip install -q jiwer matplotlib seaborn

In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
from jiwer import wer, cer
import pandas as pd

# Define paths
directory = 'OCR_results'

# Initialize lists to store results
documents = []
wer_scores = []
cer_scores = []

# Collect one entry per document ("<name><ext>") from the "<name>_raw<ext>" and
# "<name>_corrected<ext>" files. Only a suffix directly in front of the
# extension is stripped, so a document whose own name contains "_raw" or
# "_corrected" keeps its name intact. Files with neither suffix are ignored.
unique_file_names = set()
for filename in os.listdir(directory):
    stem, ext = os.path.splitext(filename)
    for suffix in ('_raw', '_corrected'):
        if stem.endswith(suffix):
            unique_file_names.add(stem[:-len(suffix)] + ext)
            break

In [None]:
# Start from empty result lists so re-running this cell does not append
# duplicate rows to the ones collected by an earlier run.
documents = []
wer_scores = []
cer_scores = []

# Iterate over files (sorted, so the row order is the same on every run)
for filename in sorted(unique_file_names):
    name, ext = os.path.splitext(filename)
    raw_filename = f"{name}_raw{ext}"
    raw_path = os.path.join(directory, raw_filename)
    corrected_filename = f"{name}_corrected{ext}"
    corrected_path = os.path.join(directory, corrected_filename)

    # Both files of the pair are needed; report and skip incomplete pairs
    # instead of aborting the whole evaluation.
    if not (os.path.isfile(raw_path) and os.path.isfile(corrected_path)):
        print(f"Skipping {filename}: raw or corrected file is missing.")
        continue

    # Read files
    with open(raw_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()
    with open(corrected_path, 'r', encoding='utf-8') as f:
        corrected_text = f.read()

    # The corrected text is the reference; jiwer rejects an empty reference,
    # so documents with no corrected text are reported and skipped.
    if not corrected_text.strip():
        print(f"Skipping {filename}: corrected text is empty.")
        continue

    # Compute WER and CER (reference = corrected text, hypothesis = raw OCR text)
    wer_score = wer(corrected_text, raw_text)
    cer_score = cer(corrected_text, raw_text)

    # Store results
    documents.append(filename)
    wer_scores.append(wer_score)
    cer_scores.append(cer_score)

In [None]:
# Assemble the per-document scores into one table (columns: Document, WER, CER)
df = pd.DataFrame(dict(Document=documents, WER=wer_scores, CER=cer_scores))
df.head()

In [None]:
# Number of evaluated documents (rows in the results table)
df.shape[0]

In [None]:
# Persist the evaluation table as CSV, leaving out the index column
df.to_csv(path_or_buf='ocr_evaluation_results.csv', index=False)

In [None]:
# Rank all documents from highest to lowest WER and draw one bar per document
df_sorted_wer = df.sort_values(by='WER', ascending=False)

fig, ax = plt.subplots(figsize=(10, 50))
sns.barplot(data=df_sorted_wer, x='WER', y='Document', ax=ax)
ax.set_title('Word Error Rate (WER) per Document (Sorted Descending)')
fig.tight_layout()
fig.savefig('wer_per_document.png')
plt.show()

In [None]:
# The 20 documents with the highest WER
df_sorted_wer = df.sort_values(by='WER', ascending=False).head(20)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=df_sorted_wer, x='WER', y='Document', ax=ax)
ax.set_title('Word Error Rate (WER) per Document (Top 20 Most Error Rate)')
fig.tight_layout()
fig.savefig('wer_per_document_top_20.png')
plt.show()

In [None]:
# The 20 documents with the lowest WER
df_sorted_wer = df.sort_values(by='WER', ascending=True).head(20)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=df_sorted_wer, x='WER', y='Document', ax=ax)
ax.set_title('Word Error Rate (WER) per Document (Top 20 Least Error Rate)')
fig.tight_layout()
fig.savefig('wer_per_document_bottom_20.png')
plt.show()

In [None]:
# Rank all documents from highest to lowest CER and draw one bar per document
df_sorted_cer = df.sort_values(by='CER', ascending=False)

fig, ax = plt.subplots(figsize=(10, 50))
sns.barplot(data=df_sorted_cer, x='CER', y='Document', ax=ax)
ax.set_title('Character Error Rate (CER) per Document (Sorted Descending)')
fig.tight_layout()
fig.savefig('cer_per_document.png')
plt.show()

In [None]:
# The 20 documents with the highest CER
df_sorted_cer = df.sort_values(by='CER', ascending=False).head(20)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=df_sorted_cer, x='CER', y='Document', ax=ax)
ax.set_title('Character Error Rate (CER) per Document (Top 20 Most Error Rate)')
fig.tight_layout()
fig.savefig('cer_per_document_top_20.png')
plt.show()

In [None]:
# The 20 documents with the lowest CER
df_sorted_cer = df.sort_values(by='CER', ascending=True).head(20)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=df_sorted_cer, x='CER', y='Document', ax=ax)
ax.set_title('Character Error Rate (CER) per Document (Top 20 Least Error Rate)')
fig.tight_layout()
fig.savefig('cer_per_document_bottom_20.png')
plt.show()

In [None]:
# Draw the WER and CER distributions next to each other
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One entry per panel: (metric column, bar colour, panel title)
panels = [
    ("WER", 'skyblue', "Distribution of Word Error Rate (WER)"),
    ("CER", 'salmon', "Distribution of Character Error Rate (CER)"),
]

for ax, (metric, color, title) in zip(axes, panels):
    # Histogram with a KDE overlay for this metric
    sns.histplot(data=df, x=metric, kde=True, bins=30, color=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(metric)
    ax.set_ylabel("Frequency")

plt.tight_layout()
plt.show()

In [None]:
import numpy as np

# Plot the empirical cumulative distribution of both error rates in one figure
for metric in ["WER", "CER"]:
    sorted_vals = np.sort(df[metric])
    # Empirical CDF: the i-th smallest value (1-based) has i/n of the documents
    # at or below it, so the curve reaches 1.0 at the largest value instead of
    # stopping at (n-1)/n.
    cdf = np.arange(1, len(sorted_vals) + 1) / len(sorted_vals)
    plt.plot(sorted_vals, cdf, label=metric)

plt.title("Cumulative Distribution of WER and CER")
plt.xlabel("Error Rate")
plt.ylabel("Cumulative Proportion")
plt.legend()
plt.grid(True)
plt.show()