**STEP 1: EXTRACT THE TEXT FROM THE PDF**

In [7]:
!pip install pdfplumber
!pip install pdf2image
!pip install pytesseract
import requests
import pdfplumber
from pdf2image import convert_from_path
import pytesseract

def download_pdf(url, save_path, timeout=30):
    """Download the file at ``url`` and write it to ``save_path``.

    Args:
        url: Address of the PDF to fetch.
        save_path: Local path the response body is written to (binary mode).
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, hanging the notebook cell.

    Returns:
        None. The outcome is reported on stdout; nothing is written to disk
        unless the server answers with HTTP 200.

    Raises:
        Any exception raised by ``requests.get`` (including a timeout) or by
        opening/writing ``save_path`` propagates to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"PDF downloaded successfully: {save_path}")
    else:
        print(f"Failed to download PDF. Status code: {response.status_code}")

# Function to extract text using pdfplumber
def extract_text_with_pdfplumber(pdf_path):
    """Extract the text layer of every page of a PDF with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        dict mapping page numbers (starting at 1) to that page's text. A page
        without extractable text is always represented by an empty string.
    """
    extracted_text = {}
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # extract_text() may yield None for a page with no text layer
            # (behaviour differs between pdfplumber releases); normalise to ""
            # so later string processing such as sent_tokenize never sees None.
            extracted_text[page_number] = page.extract_text() or ""
    return extracted_text

# Fallback function to use OCR if pdfplumber fails
def extract_text_with_ocr(pdf_path):
    """OCR every page of the PDF at ``pdf_path``.

    The PDF is rendered to images with ``convert_from_path`` and each image is
    passed to Tesseract with the English language pack.

    Returns:
        dict mapping page numbers (starting at 1) to the recognised text.
    """
    rendered_pages = convert_from_path(pdf_path)
    return {
        page_number: pytesseract.image_to_string(page_image, lang='eng')
        for page_number, page_image in enumerate(rendered_pages, start=1)
    }

# Source document and the local filename it is saved under
pdf_url = "https://www.hunter.cuny.edu/dolciani/pdf_files/workshop-materials/mmc-presentations/tables-charts-and-graphs-with-examples-from.pdf"
local_pdf_path = "downloaded_pdf.pdf"

# Step 1: fetch the PDF to disk
download_pdf(pdf_url, local_pdf_path)

# Step 2: read the embedded text layer page by page
plumber_text = extract_text_with_pdfplumber(local_pdf_path)

# Step 3: pick the text source. OCR is used only when pdfplumber produced no
# text for ANY page; a document with some empty pages keeps those pages empty.
if any(plumber_text.values()):
    text_by_page = plumber_text
else:
    print("Text extraction failed with pdfplumber. Switching to OCR...")
    ocr_text = extract_text_with_ocr(local_pdf_path)
    text_by_page = ocr_text

# Print whichever extraction was selected, one page at a time
for page, text in text_by_page.items():
    print(f"--- Page {page} ---")
    print(text)


PDF downloaded successfully: downloaded_pdf.pdf
--- Page 1 ---
Tables, Charts, and
Graphs
with Examples from History, Economics,
Education, Psychology, Urban Affairs and
Everyday Life
REVISED: MICHAEL LOLKUS 2018
--- Page 2 ---

--- Page 3 ---
Tables, Charts, and
Graphs Basics
--- Page 4 ---
 We use charts and graphs to visualize data.
 This data can either be generated data, data gathered from
an experiment, or data collected from some source.
 A picture tells a thousand words so it is not a surprise that
many people use charts and graphs when explaining data.
--- Page 5 ---
Types of Visual
Representations of Data
--- Page 6 ---
Table of Yearly U.S. GDP by
Industry (in millions of dollars)
Source: U.S. Bureau of Labor Statistics
Year 2010 2011 2012 2013 2014 2015
All Industries 26093515 27535971 28663246 29601191 30895407 31397023
Manufacturing 4992521 5581942 5841608 5953299 6047477 5829554
Finance,
Insurance, Real
4522451 4618678 4797313 5031881 5339678 5597018
Estate, Rental,
Le

In [8]:
# Inspect the raw {page number: text} mapping returned by extract_text_with_pdfplumber
print(plumber_text)

{1: 'Tables, Charts, and\nGraphs\nwith Examples from History, Economics,\nEducation, Psychology, Urban Affairs and\nEveryday Life\nREVISED: MICHAEL LOLKUS 2018', 2: '', 3: 'Tables, Charts, and\nGraphs Basics', 4: '\uf075 We use charts and graphs to visualize data.\n\uf075 This data can either be generated data, data gathered from\nan experiment, or data collected from some source.\n\uf075 A picture tells a thousand words so it is not a surprise that\nmany people use charts and graphs when explaining data.', 5: 'Types of Visual\nRepresentations of Data', 6: 'Table of Yearly U.S. GDP by\nIndustry (in millions of dollars)\nSource: U.S. Bureau of Labor Statistics\nYear 2010 2011 2012 2013 2014 2015\nAll Industries 26093515 27535971 28663246 29601191 30895407 31397023\nManufacturing 4992521 5581942 5841608 5953299 6047477 5829554\nFinance,\nInsurance, Real\n4522451 4618678 4797313 5031881 5339678 5597018\nEstate, Rental,\nLeasing\nArts,\nEntertainment,\nRecreation, 964032 1015238 1076249 11

***STEP 2: DIVIDE INTO CHUNKS***



In [9]:
import nltk
from nltk.tokenize import sent_tokenize

# Download the 'punkt_tab' NLTK resource (presumably the data sent_tokenize relies on — confirm)
nltk.download('punkt_tab')

def segment_text(text):
    """Split ``text`` into a list of sentences using NLTK's ``sent_tokenize``."""
    return sent_tokenize(text)

# One list of sentence chunks per page, in the page order of plumber_text
chunks = list(map(segment_text, plumber_text.values()))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [10]:
# Show the per-page lists of sentence chunks built by segment_text
print(chunks)

[['Tables, Charts, and\nGraphs\nwith Examples from History, Economics,\nEducation, Psychology, Urban Affairs and\nEveryday Life\nREVISED: MICHAEL LOLKUS 2018'], [], ['Tables, Charts, and\nGraphs Basics'], ['\uf075 We use charts and graphs to visualize data.', '\uf075 This data can either be generated data, data gathered from\nan experiment, or data collected from some source.', '\uf075 A picture tells a thousand words so it is not a surprise that\nmany people use charts and graphs when explaining data.'], ['Types of Visual\nRepresentations of Data'], ['Table of Yearly U.S. GDP by\nIndustry (in millions of dollars)\nSource: U.S. Bureau of Labor Statistics\nYear 2010 2011 2012 2013 2014 2015\nAll Industries 26093515 27535971 28663246 29601191 30895407 31397023\nManufacturing 4992521 5581942 5841608 5953299 6047477 5829554\nFinance,\nInsurance, Real\n4522451 4618678 4797313 5031881 5339678 5597018\nEstate, Rental,\nLeasing\nArts,\nEntertainment,\nRecreation, 964032 1015238 1076249 1120496

***STEP 3: EMBED THE TEXT***

In [11]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model once; generate_embeddings reads it as a module-level global
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(chunks):
    """Encode ``chunks`` with the notebook-level ``model``.

    The result is whatever ``model.encode`` returns when called with
    ``convert_to_tensor=True``.
    """
    return model.encode(chunks, convert_to_tensor=True)
# Collapse the per-page chunk lists into one flat list, preserving page order,
# then embed every chunk in a single call.
flat_chunks = []
for page_chunks in chunks:
    flat_chunks.extend(page_chunks)
embeddings = generate_embeddings(flat_chunks)


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
# Display the embedding tensor returned by generate_embeddings
print(embeddings)


tensor([[ 0.1010, -0.0341,  0.0138,  ...,  0.0192, -0.0571,  0.0476],
        [ 0.0133, -0.0165, -0.0226,  ...,  0.0410, -0.0189, -0.0785],
        [ 0.0623, -0.0066, -0.0302,  ..., -0.0133, -0.0026, -0.0922],
        ...,
        [ 0.0678,  0.0643, -0.0673,  ...,  0.0209,  0.0322,  0.0086],
        [ 0.0447,  0.0492, -0.0442,  ..., -0.0145,  0.0371, -0.0250],
        [ 0.0202,  0.0019, -0.0224,  ...,  0.0410, -0.0240, -0.0318]])


In [13]:
pip install faiss-gpu



In [14]:
import faiss
import numpy as np
import torch
# Build a flat L2 FAISS index over the chunk embeddings.
# Fail early with a clear message when there is nothing to index, instead of
# an opaque indexing error further down.
if len(flat_chunks) == 0:
    raise ValueError("No text chunks to index; text extraction produced no sentences.")
# Convert embeddings to a NumPy array (moved to CPU first)
embedding_matrix = embeddings.cpu().numpy()
# One row per chunk, so the second axis is the vector dimension
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)
# Metadata mapping: row position in the index -> original chunk text
metadata = dict(enumerate(flat_chunks))
# Every indexed vector must have a metadata entry, or lookups would mislabel results
assert index.ntotal == len(metadata), "FAISS index and metadata are out of sync"


In [15]:
# Show the lookup table from index position to chunk text
print(metadata)

{0: 'Tables, Charts, and\nGraphs\nwith Examples from History, Economics,\nEducation, Psychology, Urban Affairs and\nEveryday Life\nREVISED: MICHAEL LOLKUS 2018', 1: 'Tables, Charts, and\nGraphs Basics', 2: '\uf075 We use charts and graphs to visualize data.', 3: '\uf075 This data can either be generated data, data gathered from\nan experiment, or data collected from some source.', 4: '\uf075 A picture tells a thousand words so it is not a surprise that\nmany people use charts and graphs when explaining data.', 5: 'Types of Visual\nRepresentations of Data', 6: 'Table of Yearly U.S. GDP by\nIndustry (in millions of dollars)\nSource: U.S. Bureau of Labor Statistics\nYear 2010 2011 2012 2013 2014 2015\nAll Industries 26093515 27535971 28663246 29601191 30895407 31397023\nManufacturing 4992521 5581942 5841608 5953299 6047477 5829554\nFinance,\nInsurance, Real\n4522451 4618678 4797313 5031881 5339678 5597018\nEstate, Rental,\nLeasing\nArts,\nEntertainment,\nRecreation, 964032 1015238 1076249

In [16]:
!sudo apt-get update
!sudo apt-get install -y tesseract-ocr
!pip install pytesseract pdf2image


0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building depe

***QUERY 1: From page 2 get the exact unemployment information***

In [17]:
import pytesseract
from pdf2image import convert_from_path
import re
from google.colab import files

# Tell pytesseract which tesseract executable to run (path assumes a Linux/Colab install — confirm on other systems)
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

def convert_page_to_image(pdf_path, page_number):
    """Render one page of a PDF and return it as an image.

    ``page_number`` is passed unchanged as both ``first_page`` and
    ``last_page`` to ``convert_from_path``; the first image of the result is
    returned. An empty result raises ``IndexError``.
    """
    return convert_from_path(pdf_path, first_page=page_number, last_page=page_number)[0]

# Function to perform OCR and extract text from the image
def extract_text_with_ocr(image):
    """Run English Tesseract OCR on an image, or on every page of a PDF.

    This definition replaces the earlier ``extract_text_with_ocr(pdf_path)``
    in the kernel namespace. To keep the earlier call site working when cells
    are re-run out of order, a string ending in ``.pdf`` is handled the way
    the earlier definition handled it.

    Args:
        image: Either an image accepted by ``pytesseract.image_to_string``,
            or a ``str`` path ending in ``.pdf``.

    Returns:
        For an image: the recognised text as a string.
        For a ``.pdf`` path: dict mapping page numbers (starting at 1) to the
        recognised text of each rendered page.
    """
    if isinstance(image, str) and image.lower().endswith(".pdf"):
        # A PDF path cannot be OCR'd directly; render its pages first.
        rendered_pages = convert_from_path(image)
        return {
            page_number: pytesseract.image_to_string(page_image, lang="eng")
            for page_number, page_image in enumerate(rendered_pages, start=1)
        }
    return pytesseract.image_to_string(image, lang="eng")

def get_unemployment_info(text, degree_type):
    """Find the percentage that follows ``degree_type`` in ``text``.

    The label is matched case-insensitively and literally: it is escaped
    before being placed in the pattern, so characters such as ``(``, ``.`` or
    ``+`` in the label cannot change the regex or shift the capture group.
    Between the label and the number any run of colons, whitespace and
    hyphens is allowed.

    Args:
        text: Text to search (here, OCR output of a page).
        degree_type: Label whose percentage should be reported.

    Returns:
        ``"<degree_type>: <number>%"`` when a match is found, otherwise
        ``"Unemployment data for <degree_type> not found."``.
    """
    pattern = rf"{re.escape(degree_type)}[:\s-]*?(\d+\.?\d*)%"
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return f"{degree_type}: {match.group(1)}%"
    return f"Unemployment data for {degree_type} not found."
# Query parameters: which file, which page, and which label to look up
pdf_path = "downloaded_pdf.pdf"
page_number = 2
degree_type_input = "All workers"

# Render the page, OCR it, then search the OCR text for the label's percentage
page_image = convert_page_to_image(pdf_path, page_number)
page_text = extract_text_with_ocr(page_image)
unemployment_info = get_unemployment_info(page_text, degree_type_input)

print("\n--- Unemployment Data ---", unemployment_info, sep="\n")



--- Unemployment Data ---
All workers: 6.1%


***QUERY 2: From page 6 get the tabular data***

In [18]:
import pandas as pd

def extract_table_from_pdf(pdf_path, page_number):
    """Extract the table on one page of a PDF as a DataFrame.

    The first extracted row becomes the column headers; the remaining rows
    become the data.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: Page to read, counting from 1.

    Returns:
        pandas.DataFrame built from the rows returned by ``extract_table()``.

    Raises:
        IndexError: ``page_number`` is outside ``1..number of pages``. Without
            this check 0 or a negative value would index from the end of the
            page list and silently return the wrong page.
        ValueError: no table was found on the requested page.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)
        if not 1 <= page_number <= page_count:
            raise IndexError(
                f"page_number must be between 1 and {page_count}, got {page_number}"
            )
        table = pdf.pages[page_number - 1].extract_table()
    if not table:
        # extract_table() gave back nothing usable; slicing it would fail obscurely
        raise ValueError(f"No table found on page {page_number} of {pdf_path}")
    return pd.DataFrame(table[1:], columns=table[0])

# Pull the table on page 6 of the downloaded PDF into a DataFrame
table_data = extract_table_from_pdf(pdf_path="downloaded_pdf.pdf", page_number=6)


In [19]:
# Print the DataFrame returned by extract_table_from_pdf for page 6
print(table_data)

                                                Year      2010      2011  \
0                                     All Industries  26093515  27535971   
1                                      Manufacturing   4992521   5581942   
2  Finance,\nInsurance, Real\nEstate, Rental,\nLe...   4522451   4618678   
3  Arts,\nEntertainment,\nRecreation,\nAccommodat...    964032   1015238   
4                                              Other  15614511  16320113   

       2012      2013      2014      2015  
0  28663246  29601191  30895407  31397023  
1   5841608   5953299   6047477   5829554  
2   4797313   5031881   5339678   5597018  
3   1076249   1120496   1189646   1283813  
4  16948076  17495515  18318606  18686638  
