<a href="https://colab.research.google.com/github/Gk58IISERTvm98/RAG-Application/blob/main/Extract_Data_from_PDF_HTML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install "unstructured[all-docs]"

In [None]:
!pip install PyPDF2 pdfminer.six pdfplumber pytesseract langchain openai weaviate-client faiss-cpu pinecone-client


In [3]:
!pip install pdf2image



In [4]:
!apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
# Install the OCR-related Python packages.
# NOTE(review): `tesseract-ocr` is also the name of the apt package installed
# with apt-get in the OCR section further down; confirm that the PyPI package
# of the same name is really what is wanted here.
!pip install tesseract-ocr
!pip install unstructured-pytesseract
# !pip install unstructured

In [None]:
!pip install --upgrade nltk

In [None]:

# Fetch the NLTK resources 'punkt' and 'averaged_perceptron_tagger'.
# NOTE(review): presumably required by the `unstructured` partitioning used
# later in this notebook — TODO confirm.
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

## **Extract Text from Text-based PDFs**

In [8]:
# Path of the text-based sample PDF passed to the PyPDF2, pdfminer and
# pdfplumber extraction helpers in the cells below.
harry_potter = "/content/harry_potter.pdf"

#### **Using PyPDF2**

In [9]:
import PyPDF2

def extract_text_pypdf2(pdf_path):
    """Extract the text of every page of a PDF using PyPDF2.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The text of all pages concatenated in page order, with no
        separator inserted between pages. A page for which extraction yields
        nothing contributes an empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps a page that yields None (e.g. no extractable text)
        # from raising TypeError; join() avoids repeated string concatenation.
        # Extraction runs inside the `with` so the file is still open.
        return "".join(page.extract_text() or "" for page in reader.pages)

def extract_metadata(pdf_path):
    """Return the metadata PyPDF2 exposes for a PDF file.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The value of the ``metadata`` attribute of a ``PyPDF2.PdfReader``
        built over the opened file.
    """
    with open(pdf_path, 'rb') as pdf_stream:
        # Read the attribute while the underlying stream is still open.
        document_info = PyPDF2.PdfReader(pdf_stream).metadata
    return document_info


In [10]:
# Extract the full text of the sample PDF with the PyPDF2 helper.
potter = extract_text_pypdf2(harry_potter)

In [11]:
# Preview the first 100 characters of the PyPDF2 extraction.
potter[0:100]

'114\nCopyright © Canadian Academy of Oriental and Occidental Culture ISSN 1923-1555[Print] \nISSN 1923'

In [12]:
# Read the sample PDF's metadata with PyPDF2 and display it as the cell result.
meta_data = extract_metadata(harry_potter)
meta_data

{'/CreationDate': 'D:20240824140400',
 '/Creator': 'PDFium',
 '/Producer': 'PDFium'}

#### **Using pdfminer.six**

In [13]:
from pdfminer.high_level import extract_text

def extract_text_pdfminer(pdf_path):
    """Extract the text of a PDF via pdfminer's high-level ``extract_text``.

    Args:
        pdf_path: Path of the PDF file, forwarded unchanged to ``extract_text``.

    Returns:
        Whatever ``extract_text(pdf_path)`` returns.
    """
    document_text = extract_text(pdf_path)
    return document_text


In [14]:
# Extract the same sample PDF with pdfminer for comparison with PyPDF2.
harry = extract_text_pdfminer(harry_potter)

In [15]:
# Display the pdfminer result.
# NOTE(review): this renders the entire extracted string; a slice such as
# harry[:500] would keep the saved notebook output small.
harry



In [16]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

def extract_metadata_pdfminer(pdf_path):
    """Return the ``info`` metadata of a PDF as parsed by pdfminer.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The ``info`` attribute of the ``PDFDocument`` built over the file:
        a list of metadata dictionaries.
    """
    with open(pdf_path, 'rb') as pdf_stream:
        # Parser and document are built, and `info` is read, while the stream is open.
        document = PDFDocument(PDFParser(pdf_stream))
        return document.info


In [17]:
# Read the sample PDF's metadata with pdfminer and display it; in the recorded
# output the values are bytes, unlike the PyPDF2 result above.
harry_meta = extract_metadata_pdfminer(harry_potter)
harry_meta

[{'CreationDate': b'D:20240824140400',
  'Creator': b'PDFium',
  'Producer': b'PDFium'}]

#### **Using pdfplumber**

In [18]:
import pdfplumber

def extract_text_pdfplumber(pdf_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The text of all pages concatenated in page order, with no
        separator inserted between pages. A page for which extraction yields
        nothing contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` guards against a page whose extract_text() yields None
        # (no extractable text), which previously raised TypeError on `+=`.
        # join() also avoids repeated string concatenation.
        return "".join(page.extract_text() or "" for page in pdf.pages)


In [19]:
# Extract the sample PDF with pdfplumber and display the result.
# NOTE(review): this rebinds `harry`, discarding the pdfminer text stored under
# the same name above, and the bare `harry` renders the entire string.
harry = extract_text_pdfplumber(harry_potter)
harry



## **Extract Text from Scanned PDFs (OCR)**

In [None]:
!sudo apt-get install tesseract-ocr

In [20]:
# Path of the PDF used for the OCR and `unstructured` examples below.
ragllm = "/content/RAG_LLM.pdf"

In [21]:
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

def convert_pdf_to_images(pdf_path):
    """Render a PDF to images with pdf2image's ``convert_from_path``.

    Args:
        pdf_path: Path of the PDF file, forwarded unchanged.

    Returns:
        Whatever ``convert_from_path(pdf_path)`` returns.
    """
    page_images = convert_from_path(pdf_path)
    return page_images


def extract_text_from_images(images):
    """Run Tesseract OCR over each image and concatenate the recognised text.

    Args:
        images: Iterable of images accepted by ``pytesseract.image_to_string``.

    Returns:
        str: The OCR output of every image, in order, joined with no separator
        (an empty string when ``images`` is empty).
    """
    return "".join(pytesseract.image_to_string(page_image) for page_image in images)

def extract_text_from_scanned_pdf(pdf_path):
    """OCR a scanned PDF: render it to images, then recognise their text.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The result of ``extract_text_from_images`` applied to the images
        produced by ``convert_pdf_to_images(pdf_path)``.
    """
    return extract_text_from_images(convert_pdf_to_images(pdf_path))


In [22]:
# OCR the whole PDF: every page is rendered to an image and passed to Tesseract.
ragllm_text = extract_text_from_scanned_pdf(ragllm)

In [55]:
# Display the OCR result.
# NOTE(review): this renders the full OCR text of the document; a slice would
# keep the saved output short.
ragllm_text

'2312.10997v5 [cs.CL] 27 Mar 2024\n\narXiv\n\nRetrieval-Augmented Generation for Large\nLanguage Models: A Survey\n\nYunfan Gao*, Yun Xiong>, Xinyu Gao?, Kangxiang Jia, Jinliu Pan>, Yuxi Bi®, Yi Dai*, Jiawei Sun*, Meng\nWang*, and Haofen Wang **\n\n*Shanghai Research Institute for Intelligent Autonomous Systems, Tongji University\n>Shanghai Key Laboratory of Data Science, School of Computer Science, Fudan University\n“College of Design and Innovation, Tongji University\n\nAbstract—Large Language Models (LLMs) showcase impres-\nsive capabilities but encounter challenges like hallucination,\noutdated knowledge, and non-transparent, untraceable reasoning\nprocesses. Retrieval-Augmented Generation (RAG) has emerged\nas a promising solution by incorporating knowledge from external\ndatabases. This enhances the accuracy and credibility of the\ngeneration, particularly for knowledge-intensive tasks, and allows\nfor continuous knowledge updates and integration of domain-\nspecific information.

## **Handling Complex PDFs (Images, Tables, etc.)**

In [23]:
from unstructured.partition.pdf import partition_pdf

In [24]:
# Partition the PDF into typed elements with `unstructured`.
# The call requests the "hi_res" strategy and asks for "Image" and "Table"
# blocks to be extracted. With extract_image_block_to_payload=False and an
# output directory given, the extracted blocks are presumably written to
# ./extracted_data2 rather than embedded in the elements — TODO confirm
# against the unstructured documentation.
raw_pdf_elements= partition_pdf(
    filename=ragllm,
    strategy= "hi_res",
    extract_images_in_pdf=True,
    extract_image_block_types= ["Image", "Table"],
    extract_image_block_to_payload=False,
    extract_image_block_output_dir="extracted_data2"
    )

In [25]:
# Display the partitioned elements; the repr shows only each element's class
# and memory address, not its text.
raw_pdf_elements

[<unstructured.documents.elements.Text at 0x7852971b7e80>,
 <unstructured.documents.elements.Header at 0x7851e5b2f160>,
 <unstructured.documents.elements.Title at 0x7851e5b2cc70>,
 <unstructured.documents.elements.NarrativeText at 0x7851e5b2cd00>,
 <unstructured.documents.elements.NarrativeText at 0x7851e5b2fc40>,
 <unstructured.documents.elements.NarrativeText at 0x7851e5b2f460>,
 <unstructured.documents.elements.NarrativeText at 0x7851e5b2d750>,
 <unstructured.documents.elements.Title at 0x7851e83f4df0>,
 <unstructured.documents.elements.NarrativeText at 0x7851e5b2d0f0>,
 <unstructured.documents.elements.NarrativeText at 0x7851e5b2ea70>,
 <unstructured.documents.elements.NarrativeText at 0x7851e5b2d060>,
 <unstructured.documents.elements.NarrativeText at 0x7851e5e9c4c0>,
 <unstructured.documents.elements.Title at 0x7851e83f52a0>,
 <unstructured.documents.elements.NarrativeText at 0x7851e83f5210>,
 <unstructured.documents.elements.Title at 0x7851e3a945b0>,
 <unstructured.documents.ele

In [26]:
# Bucket the text of each partitioned element by its exact element class.
# Only the six classes listed below are collected; elements of any other type
# are skipped. Matching is done on the class's module and name rather than on
# a substring of str(type(element)), so a class whose qualified name merely
# contains one of these names cannot land in the wrong bucket.
_ELEMENTS_MODULE = "unstructured.documents.elements"

Header = []
Footer = []
Title = []
NarrativeText = []
Text = []
ListItem = []

# Class name -> list that receives str(element) for elements of that class.
_buckets_by_class_name = {
    "Header": Header,
    "Footer": Footer,
    "Title": Title,
    "NarrativeText": NarrativeText,
    "Text": Text,
    "ListItem": ListItem,
}

for element in raw_pdf_elements:
    element_class = type(element)
    if element_class.__module__ != _ELEMENTS_MODULE:
        continue
    bucket = _buckets_by_class_name.get(element_class.__name__)
    if bucket is not None:
        bucket.append(str(element))

## **Extracting Data From HTML Files**

In [27]:
!pip install beautifulsoup4 requests lxml




In [28]:
from bs4 import BeautifulSoup

def load_html_file(file_path):
    """Read an HTML file from disk and return its contents.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        str: The full text of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as html_file:
        return html_file.read()

# Load the sample HTML page that the BeautifulSoup examples below parse.
html_file_path = "/content/wild_life.html"
html_content = load_html_file(html_file_path)


In [29]:
# Parse the loaded HTML with BeautifulSoup using the lxml parser.
soup = BeautifulSoup(html_content, 'lxml')

In [30]:
# Extract all text content from the parsed HTML and print it.
# No `strip`/`separator` arguments are passed, so the page's original
# whitespace is kept (hence the many blank lines in the recorded output).
text = soup.get_text()
print(text)






Wildlife Explorer






Wildlife Explorer
Discover the beauty of the wild





















Home
Animals
Habitats
Conservation
Contact














Welcome to Wildlife Explorer!
Explore the diverse and fascinating world of wildlife.

Featured Animals / Birds / Snake Species





Lion
The king of the jungle, known for its majestic mane and powerful roar ... 
                                Lions, boasting distinctive manes, powerful roars, and cooperative hunting tactics,
                                symbolizing strength and royalty while facing conservation challenges.

Read more





Elephant
One of the largest land mammals, with impressive tusks and a gentle demeanor.
Read more





Giant Panda
An endangered species known for its distinctive black-and-white fur.
Read more


❮
❯





Resplendent quetzal
The resplendent quetzal, revered by ancient civilizations in Central America, is known for
                            its stunning iridescent plumage and long, trailing tai

In [31]:
def extract_table_data(soup):
    """Collect the cell text of every row of every table in a parsed document.

    Args:
        soup: Parsed document exposing ``find_all`` (e.g. a BeautifulSoup object).

    Returns:
        list[list[str]]: One inner list per ``<tr>`` found, in document order
        across all ``<table>`` elements, holding the stripped text of that
        row's ``<td>`` and ``<th>`` cells. Rows of different tables end up in
        the same flat list.
    """
    return [
        [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
        for table in soup.find_all('table')
        for row in table.find_all('tr')
    ]

# Gather the rows of every table in the parsed page and print them one per line.
table_data = extract_table_data(soup)
for row in table_data:
    print(row)


In [32]:
def extract_links(soup):
    """Return the ``href`` value of every anchor that has one.

    Args:
        soup: Parsed document exposing ``find_all`` (e.g. a BeautifulSoup object).

    Returns:
        list: The ``href`` attribute of each ``<a>`` tag carrying that
        attribute, in the order ``find_all`` yields them.
    """
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]

# Example usage: collect the href of every link on the parsed page and print the list.
links = extract_links(soup)
print(links)


['#home', '#animals', '#habitats', '#conservation', '#contact']


In [33]:
def extract_images(soup):
    """Return the ``src`` value of every image tag that has one.

    Args:
        soup: Parsed document exposing ``find_all`` (e.g. a BeautifulSoup object).

    Returns:
        list: The ``src`` attribute of each ``<img>`` tag carrying that
        attribute, in the order ``find_all`` yields them.
    """
    return [image['src'] for image in soup.find_all('img', src=True)]

In [36]:
# Collect the src of every image on the parsed page and print the list.
images = extract_images(soup)
print(images)

['https://miro.medium.com/v2/resize:fit:640/format:webp/0*U5W4ghnlzPFl1fIx.jpg', 'https://img.freepik.com/premium-photo/most-beautiful-hummingbird-world-small-colorful-hummingbird-flight_597321-635.jpg', 'https://qph.cf2.quoracdn.net/main-qimg-ea1b382b9b6dc3ccca7dcb5129976c16-lq', 'https://miro.medium.com/v2/resize:fit:640/format:webp/0*k5LYzW1osI106Xz5.jpg', 'https://miro.medium.com/v2/resize:fit:720/format:webp/0*b8FY_lTv9IhD6rDX.jpg', 'https://miro.medium.com/v2/resize:fit:720/format:webp/0*9d_64r5WTCTzkHMn.', 'https://miro.medium.com/v2/resize:fit:1400/format:webp/0*CWFIlODqVm7oq3uK.', 'https://img.huffingtonpost.com/asset/5b9e91ee250000320036ea9d.png?ops=scalefit_630_400_noupscale', 'https://miro.medium.com/v2/resize:fit:720/format:webp/0*JWocE8Ntnk0hKQHT.jpg', 'https://miro.medium.com/v2/resize:fit:1400/format:webp/0*lNZX04dL8zeuQBlN.jpg', 'https://r4.wallpaperflare.com/wallpaper/870/405/200/lion-nature-animals-baby-animals-wallpaper-3251b2907d667e9b1a48b2d5f0783902.jpg', 'https:

In [39]:
# Extract elements using a CSS selector and print the stripped text of each match.
# NOTE(review): the recorded run shows no output for this cell, which suggests
# the sample page has no element with class 'title-class' — confirm the
# selector against the page's markup.
titles = soup.select('.title-class')  # Select elements with the class 'title-class'
for title in titles:
    print(title.get_text(strip=True))


In [40]:
# Navigate the parse tree: print each direct child of the first <div>.
# soup.find() returns None when nothing matches, so guard before touching
# `.children` instead of failing with AttributeError on a page without a <div>.
first_div = soup.find('div')
if first_div is None:
    print("No <div> element found")
else:
    for child in first_div.children:
        print(child)




<h1 id="h_h1">Wildlife Explorer</h1>


<p id="h_p">Discover the beauty of the wild</p>


<div class="box">
<span style="--i:0"><img src="https://miro.medium.com/v2/resize:fit:640/format:webp/0*U5W4ghnlzPFl1fIx.jpg"/></span>
<span style="--i:1"><img src="https://img.freepik.com/premium-photo/most-beautiful-hummingbird-world-small-colorful-hummingbird-flight_597321-635.jpg"/></span>
<span style="--i:2"><img src="https://qph.cf2.quoracdn.net/main-qimg-ea1b382b9b6dc3ccca7dcb5129976c16-lq"/></span>
<span style="--i:3"><img src="https://miro.medium.com/v2/resize:fit:640/format:webp/0*k5LYzW1osI106Xz5.jpg"/></span>
<span style="--i:4"><img src="https://miro.medium.com/v2/resize:fit:720/format:webp/0*b8FY_lTv9IhD6rDX.jpg"/></span>
<span style="--i:5"><img src="https://miro.medium.com/v2/resize:fit:720/format:webp/0*9d_64r5WTCTzkHMn."/></span>
<span style="--i:6"><img src="https://miro.medium.com/v2/resize:fit:1400/format:webp/0*CWFIlODqVm7oq3uK."/></span>
<span style="--i:7"><img src="https:

In [41]:
import csv

def save_to_csv(data, output_file):
    """Write rows to a CSV file, replacing any existing file.

    Args:
        data: Iterable of rows, each an iterable of cell values.
        output_file: Path of the CSV file to create or overwrite (UTF-8).
    """
    # newline='' lets the csv module control line endings itself.
    with open(output_file, mode='w', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerows(data)

# Example usage: write the table rows collected earlier to output.csv in the
# current working directory.
save_to_csv(table_data, 'output.csv')


In [42]:
# Survey of a second HTML file: report which kinds of content it contains.
# `BeautifulSoup` and `load_html_file` are reused from the earlier
# "Extracting Data From HTML Files" cells rather than being re-imported and
# re-defined here, so there is a single definition to maintain.
# NOTE: this cell rebinds `html_file_path`, `html_content`, `soup`, `images`
# and `links`, which earlier cells populated from /content/wild_life.html;
# afterwards `images` and `links` hold find_all() results, not URL strings.

# Parse the HTML content
html_file_path = "index.html"
html_content = load_html_file(html_file_path)
soup = BeautifulSoup(html_content, 'lxml')

# Check for text content
text_content = soup.get_text(strip=True)
print("Text content found:", len(text_content) > 0)

# Collect each category of element once.
tables = soup.find_all('table')
images = soup.find_all('img')
links = soup.find_all('a', href=True)
forms = soup.find_all('form')
unordered_lists = soup.find_all('ul')
ordered_lists = soup.find_all('ol')
scripts = soup.find_all('script')
iframes = soup.find_all('iframe')  # embedded content
audio_files = soup.find_all('audio')
video_files = soup.find_all('video')

# Report the counts in a fixed order: one (label, matches) pair per line.
_element_report = [
    ("Number of tables found:", tables),
    ("Number of images found:", images),
    ("Number of links found:", links),
    ("Number of forms found:", forms),
    ("Number of unordered lists found:", unordered_lists),
    ("Number of ordered lists found:", ordered_lists),
    ("Number of scripts found:", scripts),
    ("Number of iframes found:", iframes),
    ("Number of audio files found:", audio_files),
    ("Number of video files found:", video_files),
]
for label, found in _element_report:
    print(label, len(found))


Text content found: True
Number of tables found: 1
Number of images found: 0
Number of links found: 13
Number of forms found: 0
Number of unordered lists found: 6
Number of ordered lists found: 0
Number of scripts found: 0
Number of iframes found: 0
Number of audio files found: 0
Number of video files found: 0
