# **PIPELINE**

## PDF to Image

In [None]:
!pip install pdf2image
!apt-get install -y poppler-utils

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 29 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.3 [186 kB]
Fetched 186 kB in 1s (212 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 121658 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.3_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.3) ...
Setting up poppler-utils (22.02.0-2ubuntu0.3) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
from pdf2image import convert_from_path
from IPython.display import display

def pdf_to_images(pdf_path):
  """Convert the PDF at ``pdf_path`` into page images.

  Thin wrapper around ``pdf2image.convert_from_path`` called with its
  default settings; whatever that call returns is handed back unchanged.
  """
  return convert_from_path(pdf_path)



## Table Detection

In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q timm
!pip install numpy

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m


In [None]:
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import DetrFeatureExtractor
from transformers import TableTransformerForObjectDetection
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image, ImageOps
from IPython.display import display
import torch


# Colour palette cycled through when drawing detection boxes.
COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
          [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]

def plot_results(pil_img,model, scores, labels, boxes):
    """Show ``pil_img`` with one labelled rectangle per detection.

    ``scores``, ``labels`` and ``boxes`` must expose ``.tolist()``; each box is
    read as ``(xmin, ymin, xmax, ymax)``. Class names are looked up in
    ``model.config.id2label``.
    """
    plt.figure(figsize=(16,10))
    plt.imshow(pil_img)
    axes = plt.gca()
    palette = COLORS * 100  # repeat the palette so zip() is not cut short by it
    detections = zip(scores.tolist(), labels.tolist(), boxes.tolist(), palette)
    for confidence, label_id, (left, top, right, bottom), colour in detections:
        outline = plt.Rectangle((left, top), right - left, bottom - top,
                                fill=False, color=colour, linewidth=3)
        axes.add_patch(outline)
        caption = f'{model.config.id2label[label_id]}: {confidence:0.2f}'
        axes.text(left, top, caption, fontsize=15,
                  bbox=dict(facecolor='yellow', alpha=0.5))
    plt.axis('off')
    plt.show()


def img_page_to_img_table(pil_image,padding=18,preprocessing=True):
  """Detect tables on a page image and return them as cropped images.

  Args:
    pil_image: PIL image of a single page.
    padding: number of pixels added on every side of each detected box
      before cropping; the padded box is clamped to the page bounds.
    preprocessing: when truthy, each crop gets a 5px black frame and then a
      20px margin filled with the colour of the crop's top-left pixel.
      When falsy the plain crop is returned.

  Returns:
    ``(tables_list, bbox_list)``: the table images whose detection score is
    above 0.96, and the matching un-padded boxes ``[xmin, ymin, xmax, ymax]``
    as returned by the detector.
  """
  image = pil_image.convert("RGB")
  width, height = image.size
  # NOTE: detection runs on the full-resolution page. An earlier
  # `image.resize(...)` call discarded its result, so it never had any effect;
  # it is dropped here to keep the boxes and the cropped pixels consistent.

  feature_extractor = DetrFeatureExtractor()
  encoding = feature_extractor(image, return_tensors="pt")
  model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")
  with torch.no_grad():
    outputs = model(**encoding)

  results = feature_extractor.post_process_object_detection(outputs, threshold=0.7, target_sizes=[(height, width)])[0]
  #plot_results(image,model, results['scores'], results['labels'], results['boxes'])

  # The page array does not change between detections, so build it once.
  page_array = np.array(image)

  tables_list=[]
  bbox_list=[]
  detections = zip(results['boxes'].tolist(), results['labels'].tolist(), results['scores'].tolist())
  for box, label, score in detections:
    # Only high-confidence detections are kept.
    if score <= 0.96:
      continue

    x1=max(int(box[0])-padding,0)
    y1=max(int(box[1])-padding,0)
    x2=min(int(box[2])+padding,width)
    y2=min(int(box[3])+padding,height)
    if x2 <= x1 or y2 <= y1:
      # Degenerate box: nothing to crop.
      continue

    cropped_image = Image.fromarray(page_array[y1:y2, x1:x2])
    # Label 1 is rotated back upright (presumably the detector's
    # "table rotated" class; confirm against model.config.id2label).
    if label==1:
      cropped_image=cropped_image.rotate(-90, expand=1)

    if preprocessing:
      background_color=cropped_image.getpixel((0,0))
      framed_image = ImageOps.expand(cropped_image, border=5, fill='black')
      final_image = ImageOps.expand(framed_image, border=20, fill=background_color)
    else:
      final_image=cropped_image

    tables_list.append(final_image)
    bbox_list.append(box)

  return (tables_list,bbox_list)



## Table Data Extraction

In [None]:
!pip install transformers
!pip install huggingface-hub
!pip install Pillow
!pip install torch
!pip install timm
!pip install pytesseract
!apt install tesseract-ocr
!apt install libtesseract-dev
!pip install opencv-python
!sudo apt-get install tesseract-ocr
!pip install pandas
!pip install easyocr

In [None]:
from PIL import Image, ImageEnhance
import copy
import cv2
import pytesseract
import pandas as pd
from google.colab.patches import cv2_imshow
import easyocr

def filter_boxes_by_label(data, label_to_filter=1, treshold_score=0.0):
    """Return the boxes of one class whose score is above a threshold.

    Args:
        data: mapping with parallel ``'labels'``, ``'boxes'`` and ``'scores'``
            sequences; every box must expose ``.tolist()``.
        label_to_filter: class id to keep.
        treshold_score: a detection is kept only when its score is strictly
            greater than this value. (The spelling matches the keyword used by
            the callers in this notebook.)

    Returns:
        A list of boxes, each a list of coordinates rounded to integers.
    """
    filtered_boxes = []

    for label, box, score in zip(data['labels'], data['boxes'], data['scores']):
        if label == label_to_filter and score > treshold_score:
            # Integer coordinates so the boxes can be used directly as slices.
            filtered_boxes.append([round(coordinate) for coordinate in box.tolist()])

    return filtered_boxes


def get_cell_boxes(row_boxes, col_boxes):
    """Build the grid of cell boxes from row and column boxes.

    Every cell takes its x-extent from a column box and its y-extent from a
    row box. The result is rounded with ``round_numbers_in_list_of_lists`` and
    ordered by ``(y_start, x_start)``.
    """
    grid = [
        [column[0], row[1], column[2], row[3]]
        for row in row_boxes
        for column in col_boxes
    ]
    grid = round_numbers_in_list_of_lists(grid)
    grid.sort(key=lambda box: (box[1], box[0]))
    return grid

def round_numbers_in_list_of_lists(input_list, decimal_places=2):
    """Round every number of a nested list to the nearest integer.

    ``decimal_places`` is accepted but not used: values are always rounded
    with ``round(number)``, i.e. to zero decimals.

    Returns a new list of lists; ``input_list`` is left untouched.
    """
    return [[round(value) for value in inner] for inner in input_list]

def crop_cell(pil_image, cell_box, show_output=False):
    """Cut ``cell_box`` out of ``pil_image`` and return it as a BGR array.

    ``cell_box`` is ``(x_min, y_min, x_max, y_max)`` and is used directly as
    slice bounds. With ``show_output`` the whole image and the crop are
    displayed through ``cv2_imshow``.
    """
    left, top, right, bottom = tuple(cell_box)
    bgr_page = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    cell_pixels = bgr_page[top:bottom, left:right]
    if show_output:
        cv2_imshow(bgr_page)
        cv2_imshow(cell_pixels)
    return cell_pixels

def extract_text_from_cell(cell_image,ocr="pytesseract", show_output=False):
    """Run OCR on a single cell image and return the recognised text.

    Args:
        cell_image: BGR image array of the cell.
        ocr: ``'pytesseract'`` (Tesseract with ``--psm 6``) or ``'easy'``
            (EasyOCR, English; the detected fragments are joined with spaces).
        show_output: when true, display the cell and print the text.

    Returns:
        The recognised text; an empty string for an empty (zero-pixel) cell.

    Raises:
        ValueError: if ``ocr`` is not one of the supported engine names.
    """
    # Validate up front: an unknown engine used to print "ERROR" and then fail
    # later on an unbound variable.
    if ocr not in ('pytesseract', 'easy'):
        raise ValueError(f"Unsupported OCR engine {ocr!r}; expected 'pytesseract' or 'easy'")

    if cell_image.size == 0:
        # A degenerate crop has nothing to read, and cvtColor rejects it.
        return ''

    gray_image = cv2.cvtColor(cell_image, cv2.COLOR_BGR2GRAY)
    if ocr == 'pytesseract':
        text = pytesseract.image_to_string(gray_image, config=r'--psm 6')
    else:
        # Building an EasyOCR reader is expensive, so it is created once and
        # reused for every subsequent cell.
        reader = getattr(extract_text_from_cell, '_easyocr_reader', None)
        if reader is None:
            reader = easyocr.Reader(['en'])
            extract_text_from_cell._easyocr_reader = reader
        detections = reader.readtext(np.array(gray_image))
        text = " ".join(detection[1] for detection in detections)

    if show_output:
        cv2_imshow(cell_image)
        print(text)
    return text

def extract_text_from_table(pil_image, cell_boxes, col_n, ocr):
    """OCR every cell of a table and group the texts into rows.

    Cells are read in the order given by ``cell_boxes``; a row is closed every
    ``col_n`` cells. Newlines and form feeds are stripped from each text.

    Returns a list of rows, each a list of cell strings.
    """
    table_rows = []
    current_row = []
    for position, box in enumerate(cell_boxes, start=1):
        raw_text = extract_text_from_cell(crop_cell(pil_image, box), ocr)
        current_row.append(raw_text.replace('\n', '').replace('\x0c', ''))
        if position % col_n == 0:
            table_rows.append(current_row)
            current_row = []
    return table_rows

"""
def are_boxes_overlapping(rect1, rect2, threshold=0.0):
    # rect = (top-left_x, top-left_y, bottom-right_x, bottom-right_y)
    rect1_x1, rect1_y1, rect1_x2, rect1_y2 = rect1
    rect2_x1, rect2_y1, rect2_x2, rect2_y2 = rect2

    # Check if one rectangle is to the left of the other
    if rect1_x2 < rect2_x1 or rect2_x2 < rect1_x1:
        return False

    # Check if one rectangle is above the other
    if rect1_y2 < rect2_y1 or rect2_y2 < rect1_y1:
        return False

    # Calculate overlapping area
    overlap_x1 = max(rect1_x1, rect2_x1)
    overlap_y1 = max(rect1_y1, rect2_y1)
    overlap_x2 = min(rect1_x2, rect2_x2)
    overlap_y2 = min(rect1_y2, rect2_y2)

    overlap_width = max(0, overlap_x2 - overlap_x1)
    overlap_height = max(0, overlap_y2 - overlap_y1)

    overlap_area = overlap_width * overlap_height

    # Calculate the total area of each rectangle
    area_rect1 = (rect1_x2 - rect1_x1) * (rect1_y2 - rect1_y1)
    area_rect2 = (rect2_x2 - rect2_x1) * (rect2_y2 - rect2_y1)

    # Calculate the percentage of overlapping area relative to the total area of both rectangles
    overlap_percentage = overlap_area / (area_rect1 + area_rect2 - overlap_area)
    print(overlap_percentage)
    # Check if the percentage of overlapping area meets the threshold
    return overlap_percentage >= threshold
"""

def is_box_inside(rect1, rect2, threshold=0.0):
    """Tell whether a large enough share of ``rect1`` lies inside ``rect2``.

    Args:
        rect1, rect2: boxes as ``(x1, y1, x2, y2)`` with the top-left corner
            first.
        threshold: minimum fraction of ``rect1``'s area that must overlap
            ``rect2``.

    Returns:
        ``True`` when ``overlap_area / area(rect1) >= threshold``. A ``rect1``
        with no area (zero width or height) is never considered inside, which
        also avoids dividing by zero.
    """
    # rect = (top-left_x, top-left_y, bottom-right_x, bottom-right_y)
    rect1_x1, rect1_y1, rect1_x2, rect1_y2 = rect1
    rect2_x1, rect2_y1, rect2_x2, rect2_y2 = rect2

    area_rect1 = (rect1_x2 - rect1_x1) * (rect1_y2 - rect1_y1)
    if area_rect1 <= 0:
        # Degenerate cell (possible after rounding coordinates to integers).
        return False

    # Extent of the intersection; clamped at 0 when the boxes do not meet.
    overlap_width = max(0, min(rect1_x2, rect2_x2) - max(rect1_x1, rect2_x1))
    overlap_height = max(0, min(rect1_y2, rect2_y2) - max(rect1_y1, rect2_y1))
    overlap_area = overlap_width * overlap_height

    return overlap_area / area_rect1 >= threshold

def get_row_headers_indexes(cell_boxes, row_projected_header_boxes, row_n, col_n, threshold=0.5):
    """Find the table rows that are entirely covered by projected row headers.

    For every cell, each header box that contains it (per ``is_box_inside``
    with ``threshold``) adds one hit to the cell's row, where the row is
    ``cell_position // col_n``. A row is reported when its hit count equals
    ``col_n``.

    Returns the list of matching row indexes in ascending order.
    """
    hits_per_row = [0] * row_n

    for cell_position, cell in enumerate(cell_boxes):
        for header_box in row_projected_header_boxes:
            if is_box_inside(cell, header_box, threshold):
                hits_per_row[cell_position // col_n] += 1

    return [row for row, hits in enumerate(hits_per_row) if hits == col_n]

def clean_row_headers(df, row_header_indexes):
    """Collapse each header row into its first cell.

    For every positional row index in ``row_header_indexes`` the texts of all
    cells in that row are concatenated (left to right, missing values
    skipped), the row is blanked, and the concatenation is stored in the first
    column.

    ``df`` is modified in place and also returned.
    """
    if df.shape[1] == 0:
        return df

    for row_position in row_header_indexes:
        merged_text = ''.join(str(value) for value in df.iloc[row_position] if pd.notna(value))
        # Single-step .iloc assignments: chained `df.iloc[i][j] = ...` writes
        # to a temporary row object and is not guaranteed to reach `df`.
        df.iloc[row_position, :] = ''
        df.iloc[row_position, 0] = merged_text

    return df

def move_projected_rows(df, row_headers_indexes, row_header_index, new_column_index):
    """Copy each header row's text onto the rows below it, then drop the headers.

    Walking the rows top to bottom, the value found at ``row_header_index`` of
    the most recent header row is written into column ``new_column_index`` of
    every non-header row (an empty string before the first header). Header
    rows are then removed from ``df`` in place; ``df`` is also returned.
    """
    current_header = ''
    total_rows = df.shape[0]

    row = 0
    while row < total_rows:
        if row not in row_headers_indexes:
            df.at[row, new_column_index] = current_header
        else:
            current_header = df.iloc[row][row_header_index]
        row += 1

    for header_row in row_headers_indexes:
        df.drop(header_row, inplace=True)

    return df

def extract_text_boxes_from_cell(cell_image, show_output=False):
    """Return pytesseract's ``image_to_data`` output for a cell image.

    The BGR cell is converted to grayscale and analysed with ``--psm 6``.
    With ``show_output`` the cell is displayed and the output printed.
    """
    grayscale = cv2.cvtColor(cell_image, cv2.COLOR_BGR2GRAY)
    tsv_report = pytesseract.image_to_data(grayscale, config=r'--psm 6')
    if not show_output:
        return tsv_report
    cv2_imshow(cell_image)
    print(tsv_report)
    return tsv_report


def ocr_result(image_path):
    """OCR an image file and return the confidently recognised words.

    The image contrast is doubled before running pytesseract's
    ``image_to_data`` with ``--psm 6``.

    Returns:
        A list of ``{'text': word, 'boundingBox': [x1, y1, x2, y2]}`` dicts,
        one per word whose confidence is above 50.
    """
    contrast = 2.0
    # The context manager closes the file; enhance() returns a new image.
    with Image.open(image_path) as source_image:
        image = ImageEnhance.Contrast(source_image).enhance(contrast)

    tsv_report = pytesseract.image_to_data(image, config='--psm 6')

    word_infos = []
    threshold = 50

    # First line is the column header. Fields 6-9 are left/top/width/height,
    # field 10 the confidence and field 11 the text.
    for line in tsv_report.split('\n')[1:]:
        fields = line.split()
        if len(fields) < 12:
            # Blank line, or a structural entry without any text.
            continue
        # Confidence may be reported as a float string, so int() is not safe.
        conf = float(fields[10])
        if conf <= threshold:
            # Also skips the -1 used for entries that are not words.
            continue
        x1 = int(fields[6])
        y1 = int(fields[7])
        x2 = x1 + int(fields[8])
        y2 = y1 + int(fields[9])
        word_infos.append({'text': fields[11], 'boundingBox': [x1, y1, x2, y2]})

    return word_infos


def merge_groups(groups1):
    """Merge groups that share at least one element, transitively.

    Args:
        groups1: iterable of iterables of hashable, sortable items (here:
            lists of cell indexes). It is not modified.

    Returns:
        A list of sorted lists. No two returned groups share an element; the
        groups appear in the order of their first contributing input group.
        Empty input groups are returned as empty lists.
    """
    pending = [set(group) for group in groups1]
    merged_groups = []

    while pending:
        merged = pending.pop(0)

        i = 0
        while i < len(pending):
            if merged & pending[i]:
                merged |= pending.pop(i)
                # The merged set just grew, so groups skipped earlier may now
                # overlap it: rescan from the start.
                i = 0
            else:
                i += 1

        merged_groups.append(sorted(merged))

    return merged_groups

In [None]:
def table_img_to_df(table_pil_img, th=0.81, ocr='pytesseract'):
  """Recognise the structure of a table image and OCR it into DataFrames.

  Args:
    table_pil_img: PIL image containing a single table.
    th: score threshold applied to the structure detections.
    ocr: OCR engine name forwarded to ``extract_text_from_cell``
      (``'pytesseract'`` or ``'easy'``).

  Returns:
    A two-element list ``[df, df_simplepostprocess]``:
      * ``df``: the cell grid after post-processing. Cells covered by a
        spanning cell all hold the text read from the merged area, and rows
        detected as projected row headers are collapsed into their first
        column.
      * ``df_simplepostprocess``: the plain cell-by-cell OCR grid.
  """
  model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-structure-recognition")
  image = table_pil_img.convert("RGB")

  feature_extractor = DetrFeatureExtractor()
  encoding = feature_extractor(image, return_tensors="pt")

  with torch.no_grad():
    outputs = model(**encoding)

  # post_process_object_detection expects (height, width); PIL gives (width, height).
  target_sizes = [image.size[::-1]]
  results = feature_extractor.post_process_object_detection(outputs, threshold=th, target_sizes=target_sizes)[0]

  #POST-PROCESSING
  # Class ids used below: 1 = column, 2 = row, 4 = projected row header,
  # 5 = spanning cell.
  col_boxes = filter_boxes_by_label(results, label_to_filter=1,treshold_score=th)
  row_boxes = filter_boxes_by_label(results, label_to_filter=2,treshold_score=th)
  row_projected_header_boxes = filter_boxes_by_label(results, label_to_filter=4,treshold_score=th)
  spanning_boxes = filter_boxes_by_label(results, label_to_filter=5,treshold_score=th)
  col_n = len(col_boxes)
  row_n = len(row_boxes)
  cell_boxes = get_cell_boxes(row_boxes, col_boxes)
  table_data = extract_text_from_table(table_pil_img, cell_boxes, col_n, ocr)
  df = pd.DataFrame(table_data)
  df_simplepostprocess= df.copy()

  #spanning cell
  if len(spanning_boxes) > 0:
      # For each spanning box, the indexes of the grid cells it covers.
      spanning_cell_groups = [
          [i for i, c in enumerate(cell_boxes) if is_box_inside(c, s, threshold=0.4)]
          for s in spanning_boxes
      ]

      # Each group's text is read and written in the same iteration, so a
      # skipped (empty) group can never shift texts onto the wrong group.
      for group in merge_groups(spanning_cell_groups):
          if len(group) == 0:
              continue

          cells = [cell_boxes[cell_index] for cell_index in group]
          # Bounding box of all the cells in the group.
          box_coord = [
              min(p[0] for p in cells),
              min(p[1] for p in cells),
              max(p[2] for p in cells),
              max(p[3] for p in cells),
          ]

          # Use the caller's OCR engine here as well.
          cell_text = extract_text_from_cell(crop_cell(table_pil_img, box_coord), ocr)
          cell_text = cell_text.replace('\n', ' ')
          cell_text = cell_text.replace('\x0c', '')

          for c in group:
              row_id, col_id = divmod(c, col_n)
              if row_id < df.shape[0] and col_id < df.shape[1]:
                  df.at[row_id, col_id] = cell_text

      # Rows are deliberately not de-duplicated: the row-header step below
      # addresses rows by grid position, which dropping rows would invalidate.

  #row headers
  if len(row_projected_header_boxes) > 0:
      row_headers_indexes = get_row_headers_indexes(cell_boxes, row_projected_header_boxes, row_n=row_n, col_n=col_n,  threshold=0.5)
      # Header rows are collapsed into their first column; they are not moved
      # to a separate column.
      df = clean_row_headers(df, row_headers_indexes)

  return [df, df_simplepostprocess]


## Main

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import glob

def _ensure_directory(path, label):
  """Create ``path`` when it is missing and report what happened."""
  if not os.path.exists(path):
      os.makedirs(path)
      print(f"Cartella '{label}' creata con successo in {path}")
  else:
      print(f"La cartella '{label}' esiste già in {path}")


def main_tables(input_path="/content/drive/MyDrive/ADSP_Project/Paola's file/pdffiles_psoriasis", output_path="/content/drive/MyDrive/ADSP_Project/Code/output_table_pipeline", mode="standard", th_conf=0.8,pre_precessing=True, ocr_type="pytesseract"):
  """Extract every table from every PDF of a folder and save the results.

  For each ``*.pdf`` directly inside ``input_path`` a folder named after the
  PDF is created under ``<output_path>/<input folder name>/`` containing:
    * ``bbox.txt``: one ``<page>_<table>,<bbox>`` line per table;
    * ``table_img/<page>_<table>.png``: the cropped table image;
    * ``dataframes/<page>_<table>.csv``: the extracted table;
    * ``dataframes.xlsx``: one sheet per table (written only when at least
      one table was found).

  Args:
    input_path: folder containing the PDF files.
    output_path: folder under which the results are written.
    mode: ``"standard"`` saves the post-processed table, ``"analysis"`` the
      plain cell-by-cell OCR table.
    th_conf: score threshold passed to ``table_img_to_df``.
    pre_precessing: forwarded to ``img_page_to_img_table`` as
      ``preprocessing``.
    ocr_type: OCR engine passed to ``table_img_to_df``.

  Returns:
    A dict mapping each PDF name (without extension) to the list of its
    table images.

  Raises:
    ValueError: if ``mode`` is neither ``"standard"`` nor ``"analysis"``.
  """
  # Fail before any work is done; an unknown mode would otherwise leave the
  # table to save undefined.
  if mode not in ("standard", "analysis"):
    raise ValueError(f"Unknown mode {mode!r}; expected 'standard' or 'analysis'")

  diz_img=dict() # key = PDF name, value = list of table images

  cartella_pdf_path=input_path
  # normpath so a trailing slash does not produce an empty folder name.
  nome_cartella_pdf = os.path.basename(os.path.normpath(cartella_pdf_path))
  percorso_cartella_globale = os.path.join(output_path, nome_cartella_pdf)
  _ensure_directory(percorso_cartella_globale, nome_cartella_pdf)

  tutti_i_file = glob.glob(os.path.join(cartella_pdf_path, '*'))
  print(tutti_i_file)
  pdf_files = [file for file in tutti_i_file if file.lower().endswith('.pdf')]

  for pdf_path in pdf_files:

    nome_file_senza_estensione, _ = os.path.splitext(os.path.basename(pdf_path))
    nome_cartella = nome_file_senza_estensione

    percorso_cartella = os.path.join(percorso_cartella_globale, nome_cartella)
    _ensure_directory(percorso_cartella, nome_cartella)

    percorso_file_txt = os.path.join(percorso_cartella,"bbox.txt" )

    percorso_cartella_img = os.path.join(percorso_cartella, "table_img" )
    _ensure_directory(percorso_cartella_img, nome_cartella)

    percorso_cartella_df = os.path.join(percorso_cartella, "dataframes" )
    _ensure_directory(percorso_cartella_df, nome_cartella)

    percorso_file_xlsx = os.path.join(percorso_cartella,"dataframes.xlsx" )

    diz_img[nome_file_senza_estensione]=[]
    # (sheet name, DataFrame) pairs, written to the workbook after the loop.
    excel_sheets = []
    with open(percorso_file_txt , 'w') as bbox_file:
      for npage,page_img in enumerate(pdf_to_images(pdf_path)):
        tables, bboxes = img_page_to_img_table(page_img, padding=18,preprocessing=pre_precessing)
        for ntable,(table_img,table_bbox) in enumerate(zip(tables, bboxes)):
          diz_img[nome_file_senza_estensione].append(table_img)
          temp=table_img_to_df(table_pil_img=table_img,th=th_conf,ocr=ocr_type)
          table_dfs = temp[0] if mode=="standard" else temp[1]
          excel_sheets.append((f'{npage}_{ntable}', table_dfs))
          display(table_img)
          print(f"PAGINA:{npage}\tTABELLA:{ntable}\tBBOX:{table_bbox}")
          print(table_dfs)
          print("\n\n\n\n\n\n\n\n\n\n\n")

          stringa_risultante = ', '.join(map(str, table_bbox))
          print(f"STRINGA RISULTANTE:{stringa_risultante}")
          bbox_file.write(f"{npage}_{ntable},{stringa_risultante}\n")

          table_img.save(os.path.join(percorso_cartella_img, f"{npage}_{ntable}.png"))

          percorso_file_csv = os.path.join(percorso_cartella_df,f"{npage}_{ntable}.csv" )
          table_dfs.to_csv(percorso_file_csv, index=False)

    # A workbook without any sheet cannot be saved, so the file is only
    # created when the PDF actually produced tables.
    if excel_sheets:
      with pd.ExcelWriter(percorso_file_xlsx) as writer:
        for sheet_name, sheet_df in excel_sheets:
          sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)

  return(diz_img)


In [None]:
#with pd.ExcelWriter("/content/df_easy_40248.xlsx") as writer:
#  for npage,page_img in enumerate(pdf_to_images("/content/40248_2016_Article_44.pdf")):
#          temp_result=img_page_to_img_table(page_img, padding=18)
#          for ntable,(table_img,table_bbox) in enumerate(zip(temp_result[0],temp_result[1])):
#            temp=table_img_to_df(table_img, ocr="easy")
#            table_dfs=temp[0]
#            table_dfs.to_excel(writer, sheet_name=f'{npage}_{ntable}', index=False)
#            display(table_img)
#            print(f"PAGINA:{npage}\tTABELLA:{ntable}\tBBOX:{table_bbox}")
#            print(table_dfs)
#            print("\n\n\n\n\n\n\n\n\n\n\n")

In [None]:
 #main_tables(input_path="/content/ingresso", output_path="/content/uscita")

Output hidden; open in https://colab.research.google.com to view.