# Detect Pdf layouts

###  1 Pdf to Images

In [1]:
from pdf2image import convert_from_path
import cv2

In [2]:
poppler_path = r"C:\Program Files\poppler-22.04.0\Library\bin"

In [3]:
images = convert_from_path('relevedecompte.pdf', poppler_path=poppler_path)

In [4]:
for i in range(len(images)):
    
    images[i].save('pages/page'+str(i)+'.jpg', 'JPEG')

### 2 extract pdf layout 

In [5]:
import layoutparser as lp
import numpy as np
from PIL import Image

In [6]:
image_path = 'pages/page0.jpg'
image = cv2.imread(image_path)

### model with publaynet dataset

In [7]:
model = lp.PaddleDetectionLayoutModel(config_path="lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config",
                                threshold=0.20,
                                label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"},
                                enforce_cpu=False,
                                enable_mkldnn=True)

In [8]:
layout1 = model.detect(image)
print(layout1)

image_with_boxes = lp.draw_box(image, layout1, box_width=3)

image_with_boxes_bgr = cv2.cvtColor(np.array(image_with_boxes), cv2.COLOR_RGB2BGR)

cv2.imwrite('layout_result.jpg', image_with_boxes_bgr)

Layout(_blocks=[TextBlock(block=Rectangle(x_1=79.516845703125, y_1=1366.831298828125, x_2=1578.6143798828125, y_2=2160.018798828125), text=None, id=None, type=Table, parent=None, next=None, score=0.9422100782394409), TextBlock(block=Rectangle(x_1=85.61798095703125, y_1=2114.412353515625, x_2=1116.262939453125, y_2=2159.010498046875), text=None, id=None, type=Text, parent=None, next=None, score=0.8313506245613098), TextBlock(block=Rectangle(x_1=74.46974182128906, y_1=1317.26611328125, x_2=472.6107177734375, y_2=1360.0478515625), text=None, id=None, type=Text, parent=None, next=None, score=0.8205502033233643), TextBlock(block=Rectangle(x_1=80.30029296875, y_1=1178.50390625, x_2=1428.054443359375, y_2=1250.24169921875), text=None, id=None, type=Text, parent=None, next=None, score=0.6170427203178406), TextBlock(block=Rectangle(x_1=74.70989990234375, y_1=206.97821044921875, x_2=1507.751708984375, y_2=1112.60693359375), text=None, id=None, type=Figure, parent=None, next=None, score=0.5726488

True

### test with the tablebank dataset

In [9]:
model2 = lp.PaddleDetectionLayoutModel(config_path="lp://TableBank/ppyolov2_r50vd_dcn_365e_tableBank_latex/config",
                                threshold=0.2,
                                label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"},
                                enforce_cpu=False,
                                enable_mkldnn=True)

In [10]:
layout2 = model2.detect(image)

image_with_boxes = lp.draw_box(image, layout2, box_width=3)
image_with_boxes_bgr = cv2.cvtColor(np.array(image_with_boxes), cv2.COLOR_RGB2BGR)

print(layout2)
cv2.imwrite('Layout_Result2.jpg', image_with_boxes_bgr)

Layout(_blocks=[TextBlock(block=Rectangle(x_1=269.1351318359375, y_1=875.8480834960938, x_2=1386.025390625, y_2=1111.2939453125), text=None, id=None, type=Text, parent=None, next=None, score=0.8451780676841736), TextBlock(block=Rectangle(x_1=73.22955322265625, y_1=1359.31396484375, x_2=1580.960205078125, y_2=2090.005126953125), text=None, id=None, type=Text, parent=None, next=None, score=0.7878875732421875)], page_data={})


True

### Text Detection and Recognition from tables

### extract text 

In [4]:
text_blocks_excluding_tables = [block for block in layout1._blocks if block.type != 'Table']
extracted_text = ' '.join([block.text for block in text_blocks_excluding_tables if block.text])

NameError: name 'layout1' is not defined

In [None]:
organized_text = extracted_text.replace('\n', ' ')

output_file_path = 'textbrut1.0.txt'
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.write(organized_text)

#### extract table 

In [11]:
for l in layout1:
  print(l)
  if l.type == 'Table':
    x_1 = int(l.block.x_1)
    print(l.block.x_1)
    y_1 = int(l.block.y_1)
    x_2 = int(l.block.x_2)
    y_2 = int(l.block.y_2)
    
    break

TextBlock(block=Rectangle(x_1=79.516845703125, y_1=1366.831298828125, x_2=1578.6143798828125, y_2=2160.018798828125), text=None, id=None, type=Table, parent=None, next=None, score=0.9422100782394409)
79.516846


In [12]:
print(x_1,y_1,x_2,y_2)

79 1366 1578 2160


In [13]:
cv2.imwrite('table.jpg', image[y_1:y_2,x_1:x_2])

True

#### detect text in tables

In [14]:
from paddleocr import PaddleOCR, draw_ocr

In [19]:
np.int = np.int32
np.float = np.float64
np.bool = np.bool_

In [32]:
ocr = PaddleOCR(lang='en')
image_path = 'table.jpg'
image_cv = cv2.imread(image_path)
image_height = image_cv.shape[0]
image_width = image_cv.shape[1]
output = ocr.ocr(image_path)

[2024/03/01 10:32:48] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, shape_info_filename=None, precision='fp32', gpu_mem=500, image_dir=None, det_algorithm='DB', det_model_dir='C:\\Users\\nonsh/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_sast_polygon=False, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_box_type='quad', det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, det_fce_box_type='poly', rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\nonsh/.paddleocr/whl\\rec\\en\\en_PP-OCRv3_rec_infer', rec_image_shape='3, 48, 320', rec_batch_n

In [33]:
print(output)

[[[[162.0, 10.0], [252.0, 14.0], [250.0, 47.0], [160.0, 43.0]], ('Valeur', 0.998108446598053)], [[[37.0, 16.0], [103.0, 16.0], [103.0, 45.0], [37.0, 45.0]], ('Date', 0.9987195730209351)], [[[561.0, 17.0], [829.0, 17.0], [829.0, 43.0], [561.0, 43.0]], ("Nature de l'operation", 0.9390084147453308)], [[[1176.0, 14.0], [1251.0, 14.0], [1251.0, 43.0], [1176.0, 43.0]], ('Debit', 0.9974892735481262)], [[[1360.0, 16.0], [1443.0, 16.0], [1443.0, 43.0], [1360.0, 43.0]], ('Credit', 0.9953996539115906)], [[[693.0, 67.0], [1091.0, 67.0], [1091.0, 88.0], [693.0, 88.0]], ('SOLDE PRECEDENT AUXX/XX/XXXX', 0.9577488899230957)], [[[1382.0, 68.0], [1476.0, 68.0], [1476.0, 95.0], [1382.0, 95.0]], ('2 543,19', 0.9416338205337524)], [[[23.0, 115.0], [117.0, 115.0], [117.0, 141.0], [23.0, 141.0]], ('10/06/10', 0.999289870262146)], [[[161.0, 113.0], [253.0, 113.0], [253.0, 141.0], [161.0, 141.0]], ('10/06/10', 0.9994224905967712)], [[[298.0, 115.0], [526.0, 115.0], [526.0, 136.0], [298.0, 136.0]], ('VIR RECU71

In [34]:
boxes = [line[0] for line in output]
texts = [line[1][0] for line in output]
probabilities = [line[1][1] for line in output]

In [35]:
image_boxes = image_cv.copy()

In [36]:
for box,text in zip(boxes,texts):
  cv2.rectangle(image_boxes, (int(box[0][0]),int(box[0][1])), (int(box[2][0]),int(box[2][1])),(0,0,255),1)
  cv2.putText(image_boxes, text,(int(box[0][0]),int(box[0][1])),cv2.FONT_HERSHEY_SIMPLEX,1,(222,0,0),1)

In [37]:
cv2.imwrite('detections.jpg', image_boxes)

True

### define vertical and horizontal columns

In [38]:
im = image_cv.copy()

In [39]:
horiz_boxes = []
vert_boxes = []

for box in boxes:
  x_h, x_v = 0,int(box[0][0])
  y_h, y_v = int(box[0][1]),0
  width_h,width_v = image_width, int(box[2][0]-box[0][0])
  height_h,height_v = int(box[2][1]-box[0][1]),image_height

  horiz_boxes.append([x_h,y_h,x_h+width_h,y_h+height_h])
  vert_boxes.append([x_v,y_v,x_v+width_v,y_v+height_v])

  cv2.rectangle(im,(x_h,y_h), (x_h+width_h,y_h+height_h),(0,0,255),1)
  cv2.rectangle(im,(x_v,y_v), (x_v+width_v,y_v+height_v),(0,255,0),1)

In [40]:
cv2.imwrite('horiz_vert.jpg',im)

True

In [42]:
import tensorflow as tf




In [43]:
horiz_out = tf.image.non_max_suppression(
    horiz_boxes,
    probabilities,
    max_output_size = 1000,
    iou_threshold=0.1,
    score_threshold=float('-inf'),
    name=None
)

In [44]:
horiz_lines = np.sort(np.array(horiz_out))
print(horiz_lines)

[ 1  5  8 11 12 13 15 19 23 26 30 35 38 43 46 50 51 55 58 59 60]


In [45]:
im_nms = image_cv.copy()

In [46]:
for val in horiz_lines:
  cv2.rectangle(im_nms, (int(horiz_boxes[val][0]),int(horiz_boxes[val][1])), (int(horiz_boxes[val][2]),int(horiz_boxes[val][3])),(0,0,255),1)
  

In [47]:
cv2.imwrite('im_nms.jpg',im_nms)

True

### non max supression

In [48]:
vert_out = tf.image.non_max_suppression(
    vert_boxes,
    probabilities,
    max_output_size = 1000,
    iou_threshold=0.1,
    score_threshold=float('-inf'),
    name=None
)

In [49]:
print(vert_out)

tf.Tensor([43 42  3 44  4  5], shape=(6,), dtype=int32)


In [50]:
vert_lines = np.sort(np.array(vert_out))
print(vert_lines)

[ 3  4  5 42 43 44]


In [51]:
for val in vert_lines:
  
   cv2.rectangle(im_nms, (int(vert_boxes[val][0]),int(vert_boxes[val][1])), (int(vert_boxes[val][2]),int(vert_boxes[val][3])),(255,0,0),1)
  

In [52]:
cv2.imwrite('im_nms.jpg',im_nms)

True

### convert csv

In [53]:
out_array = [["" for i in range(len(vert_lines))] for j in range(len(horiz_lines))]
print(np.array(out_array).shape)
print(out_array)

(21, 6)
[['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', ''], ['', '', '', '', '', '']]


In [54]:

unordered_boxes = []

for i in vert_lines:
  print(vert_boxes[i])
  unordered_boxes.append(vert_boxes[i][0])

[1176, 0, 1251, 794]
[1360, 0, 1443, 794]
[693, 0, 1091, 794]
[23, 0, 119, 794]
[161, 0, 255, 794]
[298, 0, 431, 794]


In [55]:
ordered_boxes = np.argsort(unordered_boxes)
print(ordered_boxes)

[3 4 5 2 0 1]


In [56]:
def intersection(box_1, box_2):
  return [box_2[0], box_1[1],box_2[2], box_1[3]]

In [57]:
def iou(box_1, box_2):

  x_1 = max(box_1[0], box_2[0])
  y_1 = max(box_1[1], box_2[1])
  x_2 = min(box_1[2], box_2[2])
  y_2 = min(box_1[3], box_2[3])

  inter = abs(max((x_2 - x_1, 0)) * max((y_2 - y_1), 0))
  if inter == 0:
      return 0
      
  box_1_area = abs((box_1[2] - box_1[0]) * (box_1[3] - box_1[1]))
  box_2_area = abs((box_2[2] - box_2[0]) * (box_2[3] - box_2[1]))
  
  return inter / float(box_1_area + box_2_area - inter)

In [58]:
for i in range(len(horiz_lines)):
  for j in range(len(vert_lines)):
    resultant = intersection(horiz_boxes[horiz_lines[i]], vert_boxes[vert_lines[ordered_boxes[j]]] )

    for b in range(len(boxes)):
      the_box = [boxes[b][0][0],boxes[b][0][1],boxes[b][2][0],boxes[b][2][1]]
      if(iou(resultant,the_box)>0.1):
        out_array[i][j] = texts[b]

In [59]:
out_array=np.array(out_array)

In [60]:
out_array

array([['Date', 'Valeur', '', "Nature de l'operation", 'Debit', 'Credit'],
       ['', '', '', 'SOLDE PRECEDENT AUXX/XX/XXXX', '', '2 543,19'],
       ['10/06/10', '10/06/10', 'VIR RECU7141686480', '', '', '109,43'],
       ['', '', 'DESDEI', '', '', ''],
       ['', '', 'MOTIF8010230150949505', '', '', ''],
       ['', '', '2SDEI028010230150', '', '', ''],
       ['10/06/10', '10/06/10', 'CARTEX34030906CARREFOURMARKET', '',
        '30,65', ''],
       ['10/06/10', '10/06/10', 'CARTEX3403 09/06 GRENIER GOURMAND', '',
        '60,00', ''],
       ['10/06/10', '10/06/10', 'CHEQUE 458', '', '150,00', ''],
       ['10/06/10', '10/06/10', 'CHEQUE 459', '', '70,00', ''],
       ['10/06/10', '10/06/10', 'CARTEX3403 0906CARREFOURMARKET', '',
        '20,00', ''],
       ['10/06/10', '10/06/10', 'CARTEX3403 09/06 GIEDU SECBRON', '',
        '25,00', ''],
       ['10/06/10', '10/06/10', 'CHEQUE 460', '', '49,99', ''],
       ['10/06/10', '10/06/10', 'CHEQUE461', '', '26,00', ''],
       ['10/06

In [62]:
import pandas as pd

In [63]:
pd.DataFrame(out_array).to_csv('sample.csv')

### test docQuery Donuts

In [2]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

In [3]:
from docquery import document, pipeline
p = pipeline('document-question-answering')
doc = document.load_document("relevedecompte.pdf")
for q in ["What is the Credits?", "What is the Debits?"]:
  print(q, p(question=q, **doc.context))

PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?