# Handling PDF Tables using img2Table library:

- So far out performs most of the libraries used before
- It even works when images or web pages are converted to PDF and fed to it.
- Limitation: doesn't detect tables without vertical borders

In [15]:
!pip install img2table pandas openpyxl
!pip install pytesseract
!sudo apt-get install tesseract-ocr

Collecting img2table
  Downloading img2table-1.2.11-py3-none-any.whl.metadata (20 kB)
Collecting polars>=0.20.5 (from polars[pandas]>=0.20.5->img2table)
  Downloading polars-1.2.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting pymupdf>=1.19.1 (from img2table)
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting xlsxwriter>=3.0.6 (from img2table)
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Collecting PyMuPDFb==1.24.9 (from pymupdf>=1.19.1->img2table)
  Downloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading img2table-1.2.11-py3-none-any.whl (90 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.2/90.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading polars-1.2.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.9/30.9

In [12]:
import os
import pandas as pd
from img2table.document import PDF
from img2table.ocr import TesseractOCR


In [13]:
def extract_tables_from_pdfs(pdf_folder, output_folder, ocr_lang="eng"):
    # Ensure the output directory exists
    os.makedirs(output_folder, exist_ok=True)

    # Instantiate the OCR engine
    ocr = TesseractOCR(lang=ocr_lang)

    # Iterate over all PDF files in the specified folder
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            base_name = os.path.splitext(filename)[0]

            # Create a directory for the current PDF file
            pdf_output_dir = os.path.join(output_folder, base_name)
            os.makedirs(pdf_output_dir, exist_ok=True)

            # Instantiate the PDF document
            pdf = PDF(src=pdf_path)

            # Extract tables
            pdf_tables = pdf.extract_tables(ocr=ocr)

            # Save the extracted tables to an XLSX file
            xlsx_path = os.path.join(pdf_output_dir, f"{base_name}.xlsx")
            pdf.to_xlsx(xlsx_path, ocr=ocr)
            print(f"Saved tables to {xlsx_path}")

            # Convert the XLSX file to CSV
            xlsx_to_csv(xlsx_path, pdf_output_dir, base_name)

def xlsx_to_csv(xlsx_path, pdf_output_dir, base_name):
    # Load the XLSX file
    xlsx = pd.ExcelFile(xlsx_path)

    # Convert each sheet in the XLSX file to a CSV file
    for sheet_name in xlsx.sheet_names:
        df = pd.read_excel(xlsx, sheet_name=sheet_name)
        csv_path = os.path.join(pdf_output_dir, f"{base_name}_{sheet_name}.csv")
        df.to_csv(csv_path, index=False)
        print(f"Converted {sheet_name} to CSV: {csv_path}")


In [69]:
# Example usage
pdf_folder = "/content/newpapers"  # Replace with the path to your PDF folder
output_folder = "/content/newcsv"  # Replace with your desired output folder
extract_tables_from_pdfs(pdf_folder, output_folder)


  .rename({col: f"{col}_" for col in df_h_lines.columns})
  .rename({col: f"{col}_" for col in df_cells.columns})
  .rename({col: f"{col}_" for col in df_h_lines.columns})
  .rename({col: f"{col}_" for col in df_cells.columns})


Saved tables to /content/newcsv/PAW Patrol - Wikipedia/PAW Patrol - Wikipedia.xlsx
Converted Page 1 - Table 1 to CSV: /content/newcsv/PAW Patrol - Wikipedia/PAW Patrol - Wikipedia_Page 1 - Table 1.csv
Converted Page 2 - Table 1 to CSV: /content/newcsv/PAW Patrol - Wikipedia/PAW Patrol - Wikipedia_Page 2 - Table 1.csv
Converted Page 3 - Table 1 to CSV: /content/newcsv/PAW Patrol - Wikipedia/PAW Patrol - Wikipedia_Page 3 - Table 1.csv
Converted Page 3 - Table 2 to CSV: /content/newcsv/PAW Patrol - Wikipedia/PAW Patrol - Wikipedia_Page 3 - Table 2.csv
Converted Page 11 - Table 1 to CSV: /content/newcsv/PAW Patrol - Wikipedia/PAW Patrol - Wikipedia_Page 11 - Table 1.csv
Converted Page 12 - Table 1 to CSV: /content/newcsv/PAW Patrol - Wikipedia/PAW Patrol - Wikipedia_Page 12 - Table 1.csv
Converted Page 13 - Table 1 to CSV: /content/newcsv/PAW Patrol - Wikipedia/PAW Patrol - Wikipedia_Page 13 - Table 1.csv
Converted Page 14 - Table 1 to CSV: /content/newcsv/PAW Patrol - Wikipedia/PAW Patrol

## Attempt to handle limitation (Failed)

In [72]:
!pip install timm

Collecting timm
  Downloading timm-1.0.7-py3-none-any.whl.metadata (47 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/47.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.5/47.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading timm-1.0.7-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: timm
Successfully installed timm-1.0.7


In [3]:
!pip install torch transformers pillow pymupdf matplotlib




In [6]:
import torch
from transformers import DetrImageProcessor, TableTransformerForObjectDetection
from PIL import Image
import requests
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import pytesseract
import pandas as pd
import cv2
import numpy as np
import fitz
import os

In [15]:
import os
import fitz
import torch
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from transformers import DetrImageProcessor, TableTransformerForObjectDetection

# Initialize the processor and model
processor = DetrImageProcessor.from_pretrained("microsoft/table-transformer-detection")
model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")

# Load a PDF
pdf_path = "/content/failedDocs/ESWC2024.pdf"  # Replace with your PDF path
pdf_document = fitz.open(pdf_path)

# Ensure the output directory exists
output_dir = "/content/withBoundingBox"
os.makedirs(output_dir, exist_ok=True)

# Loop through each page in the PDF
for page_num in range(len(pdf_document)):
    page = pdf_document.load_page(page_num)
    pix = page.get_pixmap()  # Convert page to an image
    image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    # Preprocess the image
    inputs = processor(images=image, return_tensors="pt")

    # Perform table detection
    outputs = model(**inputs)

    # Convert outputs to COCO API format
    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(outputs, target_sizes=target_sizes)[0]

    # Visualize the results and extract table regions
    draw = ImageDraw.Draw(image)
    tables = []

    for idx, (score, label, box) in enumerate(zip(results["scores"], results["labels"], results["boxes"])):
        if score > 0.95:  # Threshold to filter low-confidence predictions
            box = box.int().numpy()
            draw.rectangle([box[0], box[1], box[2], box[3]], outline="red", width=2)
            draw.text((box[0], box[1]), f"Table: {score:.2f}", fill="white")

            # Crop the detected table area
            table_image = image.crop((box[0], box[1], box[2], box[3]))
            tables.append(table_image)

            # Save the table image as a separate PDF
            table_pdf_path = os.path.join(output_dir, f"table_page_{page_num}_table_{idx}.pdf")
            table_image.save(table_pdf_path, "PDF", resolution=100.0)

    # Save the page image with bounding boxes as a PDF
    page_pdf_path = os.path.join(output_dir, f"page_{page_num}_with_boxes.pdf")
    image.save(page_pdf_path, "PDF", resolution=100.0)

    # Optionally, display the image with bounding boxes
    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    plt.axis('off')
    plt.show()


Output hidden; open in https://colab.research.google.com to view.

In [17]:
# Example usage
pdf_folder = "/content/withBoundingBox"  # Replace with the path to your PDF folder
output_folder = "/content/withBoundingBox"  # Replace with your desired output folder
extract_tables_from_pdfs(pdf_folder, output_folder)


  .rename({col: f"{col}_" for col in df_h_lines.columns})
  .rename({col: f"{col}_" for col in df_cells.columns})


Saved tables to /content/finalTestYarab/page_6_with_boxes (1)/page_6_with_boxes (1).xlsx
Converted Sheet1 to CSV: /content/finalTestYarab/page_6_with_boxes (1)/page_6_with_boxes (1)_Sheet1.csv


# Handling tables in word documents

In [None]:
# Postponed: use your own implementation or convert word document to PDF/images and use img2Table library

# Handling web tables

In [68]:
# Postponed: use your own implmentation for web scraping

# Generating table summaries using lightweight Mistral-7B

In [None]:
!pip install huggingface_hub




In [7]:
from huggingface_hub import login
login('HuggingFaceToken')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.2.11-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.23 (from langchain)
  Downloading langchain_core-0.2.23-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.93-py3-none-any.whl.metadata (13 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.23->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain-core<

In [3]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.21.3-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.2.10-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (

In [1]:
!pip install -q -U langchain transformers bitsandbytes accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m871.6 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m990.3/990.3 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.2/374.2 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import torch
from transformers import BitsAndBytesConfig
from langchain import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [5]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [8]:
model_4bit = AutoModelForCausalLM.from_pretrained( "mistralai/Mistral-7B-Instruct-v0.1", device_map="auto",quantization_config=quantization_config, )
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [9]:
pipeline_inst = pipeline(
        "text-generation",
        model=model_4bit,
        tokenizer=tokenizer,
        use_cache=True,
        device_map="auto",
        max_length=2500,
        do_sample=True,
        top_k=5,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
)

llm = HuggingFacePipeline(pipeline=pipeline_inst)

  warn_deprecated(


In [10]:
template = """<s>[INST] You are an respectful and helpful assistant, respond always be precise, assertive and politely answer in few words conversational english.
Answer the question below from context below :
{question} [/INST] </s>
"""

def generate_response(question):
  prompt = PromptTemplate(template=template, input_variables=["question","context"])
  llm_chain = LLMChain(prompt=prompt, llm=llm)
  response = llm_chain.run({"question":question})
  return response

In [11]:
test = generate_response("Name one president of america?")

print(test)

  warn_deprecated(
  warn_deprecated(
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


'\nOne president of the United States is George Washington.'

In [13]:
t = generate_response("""Write a summary for the following table "Name,Age,Occupation,Country
John Doe,28,Software Engineer,United States
Jane Smith,34,Data Scientist,Canada
Albert Brown,45,Teacher,United Kingdom
Emily White,29,Graphic Designer,Australia
Michael Green,50,Doctor,Germany" ?
""")



In [14]:
t

'<s>[INST] You are an respectful and helpful assistant, respond always be precise, assertive and politely answer in few words conversational english.\nAnswer the question below from context below :\nWrite a summary for the following table "Name,Age,Occupation,Country\nJohn Doe,28,Software Engineer,United States\nJane Smith,34,Data Scientist,Canada\nAlbert Brown,45,Teacher,United Kingdom\nEmily White,29,Graphic Designer,Australia\nMichael Green,50,Doctor,Germany" ?\n [/INST] </s>\n\nThe table shows the personal information of five people from different countries. The names, ages, occupations, and countries of the individuals are listed. The table includes information about John Doe (28, Software Engineer, United States), Jane Smith (34, Data Scientist, Canada), Albert Brown (45, Teacher, United Kingdom), Emily White (29, Graphic Designer, Australia), and Michael Green (50, Doctor, Germany).\n\nThe data suggests that the individuals come from diverse backgrounds and have a range of occup

In [34]:
template = """<s>[INST]
You are Inclusive Insight, an empathetic and detail-oriented assistant committed to making data accessible for visually impaired users. You excel at summarizing complex tables and charts into clear, concise, and meaningful information.

Your passion for inclusivity drives you to break down intricate data and technical jargon into simple explanations, ensuring everyone can understand key points and trends. With patience and attentiveness, you empower users by providing vivid descriptions and highlighting significant details.

Your goal is to create an inclusive environment where everyone has equal access to knowledge.

Summarize the table below for a visually impaired user:
{question} [/INST] </s>
"""

def generate_summary(question):
  prompt = PromptTemplate(template=template, input_variables=["question"])
  llm_chain = LLMChain(prompt=prompt, llm=llm)
  response = llm_chain.run({"question":question})
  return response

In [58]:
import pandas as pd

# Path to your CSV file
csv_file_path = '/content/newcsv/Exercise1/Exercise1_Page 1 - Table 1.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)
# Convert DataFrame to HTML table
html_table = df.to_html(index=False)

# Save the HTML table to a file
with open('/content/csv/table.html', 'w') as f:
    f.write(html_table)

print("HTML table saved to 'table.html'")
#print(html_table)

HTML table saved to 'table.html'


In [59]:
Query1 = "Write a summary for the following table : ' "+ html_table+ " '"
response = generate_summary(Query1)
print(response)

<s>[INST] 
You are Inclusive Insight, an empathetic and detail-oriented assistant committed to making data accessible for visually impaired users. You excel at summarizing complex tables and charts into clear, concise, and meaningful information.

Your passion for inclusivity drives you to break down intricate data and technical jargon into simple explanations, ensuring everyone can understand key points and trends. With patience and attentiveness, you empower users by providing vivid descriptions and highlighting significant details.

Your goal is to create an inclusive environment where everyone has equal access to knowledge.

Summarize the table below for a visually impaired user:
Write a summary for the following table : ' <table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>Institution</th>
      <th>Data Set</th>
      <th>Notes</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>QRH+NCH</td>
      <td>patients</td>
      <td>Patients wi