<a href="https://colab.research.google.com/github/Khaledhamza77/Document-Processing-Pipeline/blob/main/Document_Processing_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [10]:
import os
import re
import tempfile

import easyocr
import pandas as pd
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter
from pdf2image import convert_from_bytes
from pdf2image import convert_from_path
from PIL import Image
from PyPDF2 import PdfReader
from ultralytics import YOLO

# Models

Model from: https://huggingface.co/neuralshift/doc-layout-yolov8n/tree/main/weights

or from: https://huggingface.co/neuralshift/doc-layout-yolov8s/tree/main/weights

or from: https://huggingface.co/DILHTWD/documentlayoutsegmentation_YOLOv8_ondoclaynet/tree/main (best!)

In [17]:
# Layout-detection model: YOLOv8 weights loaded from a Google Drive folder
# (the path suggests a YOLOv8x checkpoint fine-tuned on DocLayNet).
# NOTE(review): this absolute path only resolves when Drive is mounted at
# /content/drive (Colab) — adjust it when running anywhere else.
model = YOLO('/content/drive/MyDrive/Yolov8 finetuned on DocLayNet/yolov8x-doclaynet-epoch64-imgsz640-initiallr1e-4-finallr1e-5.pt')
# OCR engine: EasyOCR reader configured for English only.
reader = easyocr.Reader(['en'])

# Custom Yolov8 Object Detection and OCR

In [18]:
def remove_extra_spaces(text):
    """Normalize whitespace in ``text``.

    Leading and trailing whitespace is dropped, and every remaining run of
    whitespace characters (spaces, tabs, newlines, ...) is collapsed into a
    single space.

    Args:
        text: The string to clean up.

    Returns:
        The cleaned string.
    """
    trimmed = text.strip()
    return re.sub(r'\s+', ' ', trimmed)

def get_top_y(box):
    """Return the second element of ``box``.

    Used as a sort key for bounding boxes; callers in this notebook pass
    ``xyxy`` boxes, where index 1 is the y-coordinate of the top edge.
    """
    top_edge = box[1]
    return top_edge

def crop_image(image, bounding_box):
    """Cut the region described by ``bounding_box`` out of ``image``.

    Args:
        image: Any object exposing a PIL-style ``crop(box_tuple)`` method.
        bounding_box: Exactly four values ``(x_min, y_min, x_max, y_max)``;
            any other length raises ``ValueError`` during unpacking.

    Returns:
        Whatever ``image.crop`` returns for that rectangle.
    """
    left, top, right, bottom = bounding_box
    return image.crop((left, top, right, bottom))

In [19]:
def _ocr_crop(region):
  """Run EasyOCR on one cropped PIL image and return its text.

  The crop is written to a uniquely named temporary JPEG so that concurrent
  requests cannot overwrite each other's file, and the file is removed even
  when saving or OCR fails.

  Returns:
    The recognized fragments concatenated, each followed by one space.
  """
  handle, temp_path = tempfile.mkstemp(suffix='.jpg')
  # mkstemp returns an open descriptor; close it so PIL can write by path.
  os.close(handle)
  try:
    region.save(temp_path)
    fragments = reader.readtext(temp_path)
  finally:
    os.remove(temp_path)
  # Each EasyOCR detection carries its text at index 1.
  return ''.join(fragment[1] + ' ' for fragment in fragments)


def Yolov8andOCR(image,con,filename,text_class_id=9):
  """Detect layout regions on one page image and OCR the text regions.

  Args:
    image: Page image (a PIL image in this notebook) passed to the YOLO model
      and cropped per detected box.
    con: Confidence threshold forwarded to ``model.predict``.
    filename: Label stored in the ``filename`` column of every output row.
    text_class_id: Class id whose boxes are OCR'd. Defaults to 9 —
      presumably the "Text" class of the DocLayNet-trained model; confirm
      against ``model.names``.

  Returns:
    A ``(df, images)`` tuple. ``df`` always has the columns ``text`` and
    ``filename`` (one row per OCR'd region, ordered top-to-bottom), even
    when nothing was detected. ``images`` is a list of PIL images with the
    detections drawn on them.
  """
  results = model.predict(image,conf=con,verbose=False)

  # plot() output is treated as BGR; reversing the last axis gives RGB for PIL.
  images = [Image.fromarray(r.plot()[..., ::-1]) for r in results]

  if not results:
    return pd.DataFrame(columns=['text', 'filename']), images

  detections = results[0].boxes
  # Approximate reading order: sort boxes by the y-coordinate of their top edge.
  ordered = sorted(
      zip(detections.xyxy.tolist(), detections.cls.tolist()),
      key=lambda pair: get_top_y(pair[0]),
  )

  # Collect rows in a list and build the frame once instead of growing it
  # cell by cell.
  records = [
      {'text': _ocr_crop(crop_image(image, box)), 'filename': filename}
      for box, cls in ordered
      if cls == text_class_id
  ]
  # Explicit columns keep the schema stable when no text region was found,
  # so callers can always read df['text'].
  return pd.DataFrame(records, columns=['text', 'filename']), images

In [39]:
def main_function(filepath,conf,filename,firstpage,chunk):
  """Turn a PDF into a chunked text dataset plus detection previews.

  Every page is rendered to an image, passed through ``Yolov8andOCR``, and
  the recognized text of all pages is concatenated and split into chunks
  with ``SpacyTextSplitter``.

  Args:
    filepath: Path of the PDF to process.
    conf: Detection confidence threshold forwarded to ``Yolov8andOCR``.
    filename: Label written to each row and embedded in each chunk's text.
    firstpage: Free-text flag; ``'n'`` or ``'no'`` (any case, surrounding
      whitespace ignored) skips the first page, anything else keeps it.
    chunk: Target chunk size for the splitter; coerced to ``int`` because
      numeric UI inputs may deliver a float.

  Returns:
    A ``(TrDf, imgs)`` tuple: a DataFrame with columns ``text`` and
    ``filename`` (one row per chunk, possibly empty) and the list of
    annotated page images.

  Raises:
    ValueError: If no file path was supplied.
  """
  if not filepath:
    raise ValueError('No PDF file was provided.')

  skip_first_page = str(firstpage).strip().lower() in ('n', 'no')

  page_frames = []
  imgs = []
  for page_index, page_image in enumerate(convert_from_path(filepath)):
    if page_index == 0 and skip_first_page:
      continue
    page_df, page_imgs = Yolov8andOCR(page_image,conf,filename)
    page_frames.append(page_df)
    imgs += page_imgs

  # Concatenate once after the loop; pd.concat rejects an empty list.
  df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame([])

  # The 'text' column is absent when no text region was detected anywhere.
  text = ''.join(df['text']) if 'text' in df.columns else ''

  pieces = []
  if text.strip():
    text_splitter = SpacyTextSplitter(chunk_size=int(chunk))
    pieces = text_splitter.split_text(text)

  rows = [
      {'text': 'In '+filename+' it says: '+piece, 'filename': filename}
      for piece in pieces
  ]
  TrDf = pd.DataFrame(rows, columns=['text', 'filename'])

  return TrDf, imgs

# Interface

In [41]:
# Gradio UI: upload a PDF, set the processing options, and view the generated
# dataset together with the annotated page images.
import gradio as gr

# Soft theme using the "sky" hue for both primary and secondary colors.
theme = gr.themes.Soft(
    primary_hue="sky",
    secondary_hue="sky",
)

# Components are laid out in the order they are created inside the nested
# Row/Column contexts, so the statement order below defines the page layout.
with gr.Blocks(theme=theme) as Interface:
  gr.Markdown("Upload pdf file for processing")
  with gr.Row():
    with gr.Column():
      # Single PDF upload; type='filepath' hands the callback a path string.
      filepath = gr.File(file_count='single', file_types=['.pdf'], type='filepath',scale=2)
      with gr.Row():
        # Detection confidence threshold forwarded to the YOLO model.
        conf = gr.Number(value=0.65,label="Confidence",step=0.1,minimum=0,maximum=1,scale=2)
        # Chunk size passed to the text splitter in main_function.
        chunk = gr.Number(value=1000,label="Chunk size",step=500,minimum=500,maximum=5000,scale=2)
        # Free-text flag; main_function skips the first page when this is 'n'.
        firstpage = gr.Text(label="Include 1st pg? (y/n)",scale=2)
        # Label written into every dataset row and prefixed to each chunk.
        filename=gr.Text(label="Enter Filename",scale=2)
      btn = gr.Button("Execute",scale=1)
    # Output table; headers match the columns main_function returns.
    dataset = gr.Dataframe(label='Generated dataset',scale=1,wrap=True,headers=['text','filename'],row_count=(20,'dynamic'))
  # Read-only gallery showing the annotated page images.
  Gallery = gr.Gallery(label="Yolov8 Detections",interactive=False,scale=1)


  # The inputs list must follow main_function's positional parameter order:
  # (filepath, conf, filename, firstpage, chunk).
  btn.click(main_function, inputs=[filepath,conf,filename,firstpage,chunk], outputs=[dataset,Gallery])

# share=True publishes a temporary public URL; debug=True keeps this cell
# running so errors and logs stay visible (stop the cell to close the server).
Interface.launch(share=True,debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://61f0814a82efcceb89.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://61f0814a82efcceb89.gradio.live


