In [16]:
import pdfplumber

def extract_double_column_text(pdf_path):
    """Split the third page of a PDF down the middle and return the text of each half.

    The page is addressed as ``pdf.pages[2]`` (zero-based index, i.e. the third
    page) and is cut vertically at half of its width.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A ``(left_text, right_text)`` tuple with whatever ``extract_text()``
        yields for the left and the right half of the page.

    Raises:
        ValueError: If the PDF has fewer than three pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Index 2 needs at least three pages. The earlier "< 2" guard let
        # two-page PDFs through, which then failed with an IndexError.
        if len(pdf.pages) < 3:
            raise ValueError("The PDF has less than 3 pages.")

        third_page = pdf.pages[2]
        width = third_page.width
        height = third_page.height
        middle = width / 2

        # Bounding boxes are (x0, top, x1, bottom) and cover the full page height.
        left_bbox = (0, 0, middle, height)
        right_bbox = (middle, 0, width, height)

        left_text = third_page.within_bbox(left_bbox).extract_text()
        right_text = third_page.within_bbox(right_bbox).extract_text()

        return left_text, right_text

# Run the two-column extraction on the monitor datasheet and print both halves,
# each under its own heading.
pdf_path = '/content/dell-24-monitor-e2423h-datasheet.pdf'
left_text, right_text = extract_double_column_text(pdf_path)
print("Left Column Text:", left_text, "Right Column Text:", right_text, sep="\n")


Left Column Text:
Features & Techn
Monitor Dell 24 Monitor – E2423H
Diagonal Viewing Size 60.47 cm (23.8 inches)
Horizontal 527.04 mm (20.75 inches)
Vertical 296.46 mm (11.67 inches)
Maximum Preset Resolution 1920 x 1080 at 60 Hz
Aspect Ratio 16:9
Pixel Pitch 0.2745 mm x 0.2745 mm
Pixel Per Inch (PPI) 93
Brightness 250 cd/m2 (typical)
Color Support Color gamut (typical): 72% (CIE1
Color depth: 16.7 Million
Contrast Ratio 3000:1 (typical)
Viewing Angle 178°/178°
Response Time 5 ms typical (Fast)
8 ms typical (Normal) (gray to gr
Panel Type VA (Vertical Alignment)
Display Screen Coating Anti-Glare
Backlight Technology LED
Low Blue Light solution with Yes (ComfortView)
Flicker-free screen
Dell Display Manager Compatibility Yes
Remote Asset Management Yes, via Dell Display Manager
Connectivity
Connectors 1 x VGA
1 x DisplayPort 1.2
Design Features
Adjustability Tilt (-5° to 21°)
Security Security lock slot (cable lock sold
Flat Panel Mount Interface VESA (100 mm)
Power
AC input voltage/fre

In [26]:
import pdfplumber
import pandas as pd
import re

def extract_double_column_text(pdf_path, column="left"):
    """Return the text of one half of the third page of a PDF.

    The page is addressed as ``pdf.pages[2]`` (zero-based index, i.e. the third
    page) and is cut vertically at half of its width. Only the requested half
    is extracted; the previous version also extracted the right half and then
    threw the result away.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        column: ``"left"`` (default, the original behaviour) or ``"right"``.

    Returns:
        Whatever ``extract_text()`` yields for the selected half of the page.

    Raises:
        ValueError: If ``column`` is not ``"left"`` or ``"right"``, or if the
            PDF has fewer than three pages.
    """
    if column not in ("left", "right"):
        raise ValueError(f"column must be 'left' or 'right', got {column!r}")

    with pdfplumber.open(pdf_path) as pdf:
        if len(pdf.pages) < 3:
            raise ValueError("The PDF has less than 3 pages.")

        third_page = pdf.pages[2]
        width = third_page.width
        height = third_page.height
        middle = width / 2

        # Bounding boxes are (x0, top, x1, bottom) and cover the full page height.
        if column == "left":
            bbox = (0, 0, middle, height)
        else:
            bbox = (middle, 0, width, height)

        return third_page.within_bbox(bbox).extract_text()


def _append_fragment(data, key, fragment):
    """Add ``fragment`` to ``data[key]``, joining non-empty pieces with one space."""
    existing = data.get(key, "")
    if existing and fragment:
        data[key] = existing + " " + fragment
    else:
        # At most one side is non-empty: keep it, without a stray separator.
        data[key] = existing or fragment


def parse_text_to_dict(text):
    """Turn line-oriented text into a ``{key: value}`` dictionary.

    A line that starts with an ASCII letter opens a key: the key is the first
    space-delimited word and the rest of the line (stripped) is its value.
    Any other non-blank line is appended to the value of the most recent key.
    When a key occurs more than once its values are concatenated.

    Changes from the previous version:
      * a single-word line no longer wipes the value already collected for
        that key (it used to reset it to ``""``);
      * pieces are joined with exactly one space, so appending to an empty
        value no longer produces a leading space;
      * empty or ``None`` input returns an empty dict instead of failing.

    Args:
        text: The text to parse, with lines separated by ``"\\n"``.

    Returns:
        dict mapping each key to its accumulated value string.
    """
    if not text:
        return {}

    data = {}
    current_key = None

    for line in text.split('\n'):
        if re.match(r'^[A-Za-z].*$', line):
            # The line starts with a letter, so the first word is never empty.
            current_key, _, remainder = line.partition(' ')
            _append_fragment(data, current_key, remainder.strip())
        elif current_key and line.strip():
            # Continuation of the current key's value.
            _append_fragment(data, current_key, line.strip())

    return data

def merge_dicts(dict1, dict2):
    """Combine two string-valued dictionaries without modifying ``dict1``.

    Keys found only in one dictionary keep their value. For keys found in
    both, the value from ``dict2`` is appended to the value from ``dict1``
    after a single space.

    Args:
        dict1: Base dictionary; a shallow copy of it is the starting point.
        dict2: Dictionary whose entries are added or appended.

    Returns:
        The merged copy.
    """
    combined = dict1.copy()
    for name, addition in dict2.items():
        if name not in combined:
            combined[name] = addition
            continue
        combined[name] += " " + addition
    return combined

pdf_path = '/content/dell-24-monitor-e2423h-datasheet.pdf'
left_text = extract_double_column_text(pdf_path)

# The call above yields the left half only. Read the right half of the same page
# (index 2, split at half the width) here, so this cell no longer depends on a
# `right_text` variable left behind by an earlier cell.
with pdfplumber.open(pdf_path) as pdf:
    page = pdf.pages[2]
    right_half = (page.width / 2, 0, page.width, page.height)
    right_text = page.within_bbox(right_half).extract_text()

left_data = parse_text_to_dict(left_text)
right_data = parse_text_to_dict(right_text)

# Merge dictionaries from both columns
merged_data = merge_dicts(left_data, right_data)

# Convert dictionary to DataFrame: one row per key
df = pd.DataFrame(list(merged_data.items()), columns=['Key', 'Value'])

display(df.head(100))

# Optionally, save to a CSV file
df.to_csv('extracted_data.csv', index=False)


Unnamed: 0,Key,Value
0,Features,& Techn
1,Monitor,Dell 24 Monitor – E2423H
2,Diagonal,Viewing Size 60.47 cm (23.8 inches)
3,Horizontal,527.04 mm (20.75 inches)
4,Vertical,296.46 mm (11.67 inches)
5,Maximum,Preset Resolution 1920 x 1080 at 60 Hz
6,Aspect,Ratio 16:9
7,Pixel,Pitch 0.2745 mm x 0.2745 mm Per Inch (PPI) 93
8,Brightness,250 cd/m2 (typical)
9,Color,Support Color gamut (typical): 72% (CIE1 depth...


Unnamed: 0,Key,Value
0,Features,& Techn
1,Monitor,Dell 24 Monitor – E2423H
2,Diagonal,Viewing Size 60.47 cm (23.8 inches)
3,Horizontal,527.04 mm (20.75 inches)
4,Vertical,296.46 mm (11.67 inches)
5,Maximum,Preset Resolution 1920 x 1080 at 60 Hz
6,Aspect,Ratio 16:9
7,Pixel,Pitch 0.2745 mm x 0.2745 mm Per Inch (PPI) 93
8,Brightness,250 cd/m2 (typical)
9,Color,Support Color gamut (typical): 72% (CIE1 depth...


In [28]:
import pdfplumber
import os
from PIL import Image

def extract_images_from_pdf(pdf_path, page_number=1, output_folder='images'):
    """Save one PNG per image region listed on a PDF page.

    For every entry of ``page.images`` the page is cropped to that entry's
    bounding box (clamped to the page), and the result of ``to_image()`` is
    saved as ``image_<page_number>_<index>.png`` in ``output_folder``. The
    folder is created when missing.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        page_number: 1-indexed page to process.
        output_folder: Directory that receives the PNG files.

    Raises:
        ValueError: If ``page_number`` is below 1 or beyond the last page.
    """
    # Without this check 0 or a negative number passes the length test below
    # and then indexes the page list from its end.
    if page_number < 1:
        raise ValueError(f"page_number must be 1 or greater, got {page_number}.")

    with pdfplumber.open(pdf_path) as pdf:
        if len(pdf.pages) < page_number:
            raise ValueError(f"The PDF has less than {page_number} pages.")

        page = pdf.pages[page_number - 1]  # page_number is 1-indexed
        page_width = page.width
        page_height = page.height

        # exist_ok avoids the gap between a separate existence check and the creation.
        os.makedirs(output_folder, exist_ok=True)

        for i, img in enumerate(page.images):
            # Keep the bounding box within the page dimensions.
            x0 = max(0, img["x0"])
            top = max(0, img["top"])
            x1 = min(page_width, img["x1"])
            bottom = min(page_height, img["bottom"])

            cropped_image = page.within_bbox((x0, top, x1, bottom)).to_image()

            image_path = os.path.join(output_folder, f'image_{page_number}_{i}.png')
            cropped_image.save(image_path)
            print(f"Saved image to {image_path}")

# Process the first page of the datasheet and write the PNGs to the relative
# 'images' folder (both values are the function's defaults, spelled out here).
pdf_path = '/content/dell-24-monitor-e2423h-datasheet.pdf'
extract_images_from_pdf(pdf_path, page_number=1, output_folder='images')


Saved image to images/image_1_0.png
Saved image to images/image_1_1.png


In [54]:
import fitz
import os

def ensure_directory_exists(directory):
    """Create ``directory``, including missing parents, unless it already exists.

    ``exist_ok=True`` replaces the separate existence check, so nothing can
    create the directory between the check and the creation.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

def extract_images_from_pdf(pdf_path, page_number=1, output_folder='images'):
    """Write every image referenced by a PDF page to disk in its stored format.

    Each entry returned by ``page.get_images(full=True)`` is looked up with
    ``doc.extract_image`` and its bytes are written to
    ``image_<page_number>_<index>.<ext>`` inside ``output_folder``.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        page_number: 1-indexed page to process.
        output_folder: Directory that receives the files (created if missing).

    Returns:
        list of the file paths written, in extraction order.

    Raises:
        ValueError: If ``page_number`` is below 1 or beyond the last page.
    """
    # Without this check 0 passes the length test below and asks for page index -1.
    if page_number < 1:
        raise ValueError(f"page_number must be 1 or greater, got {page_number}.")

    ensure_directory_exists(output_folder)

    image_paths = []

    # The context manager closes the document on every exit path; the handle
    # used to stay open, including when the page check raised.
    with fitz.open(pdf_path) as doc:
        if len(doc) < page_number:
            raise ValueError(f"The PDF has less than {page_number} pages.")

        page = doc.load_page(page_number - 1)  # page_number is 1-indexed

        for i, img in enumerate(page.get_images(full=True)):
            xref = img[0]  # first field of each entry is the image's xref
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # Save the image in its original format
            image_path = os.path.join(output_folder, f'image_{page_number}_{i}.{image_ext}')
            with open(image_path, "wb") as image_file:
                image_file.write(image_bytes)

            image_paths.append(image_path)
            print(f"Saved image to {image_path}")

    return image_paths

# Demo: pull the images off page 1 of the datasheet and list what was written.
pdf_path = '/content/dell-24-monitor-e2423h-datasheet.pdf'
output_folder = 'images'

image_paths = extract_images_from_pdf(pdf_path, 1, output_folder)

# Report the outcome; an empty list means the page referenced no images.
if not image_paths:
    print("No images found on the specified page.")
else:
    print("Extracted images:")
    for path in image_paths:
        print(path)


Saved image to images/image_1_0.jpeg
Saved image to images/image_1_1.jpeg
Extracted images:
images/image_1_0.jpeg
images/image_1_1.jpeg


In [4]:
# -y answers apt's confirmation prompt so the install can run unattended in a notebook cell.
!sudo apt-get install -y tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 2s (2,754 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [1]:
!pip install PyMuPDF

!pip install pdfplumber
!pip install pillow
!pip install pytesseract



In [8]:
pip install google-cloud-vision


Collecting google-cloud-vision
  Downloading google_cloud_vision-3.7.4-py2.py3-none-any.whl.metadata (5.2 kB)
Downloading google_cloud_vision-3.7.4-py2.py3-none-any.whl (467 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m467.5/467.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-vision
Successfully installed google-cloud-vision-3.7.4


In [5]:
# Never store the API key in the notebook: read it from the GOOGLE_API_KEY
# environment variable, or prompt for it when that variable is not set.
# The key that used to be hard-coded on this line has been exposed and should be revoked.
# NOTE(review): `genai` is not imported in any cell visible here — presumably
# `import google.generativeai as genai`; confirm and add it to the imports cell.
import os
from getpass import getpass

genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: "))


In [9]:
import fitz  # PyMuPDF
import os
from PIL import Image
from google.cloud import vision
from google.cloud.vision_v1 import types

def ensure_directory_exists(directory):
    """Create ``directory``, including missing parents, unless it already exists.

    ``exist_ok=True`` replaces the separate existence check, so nothing can
    create the directory between the check and the creation.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

def extract_images_from_pdf(pdf_path, page_number=1, output_folder='images', dpi=300):
    """Write every image referenced by a PDF page to disk in its stored format.

    Each entry returned by ``page.get_images(full=True)`` is looked up with
    ``doc.extract_image`` and its bytes are written to
    ``image_<page_number>_<index>.<ext>`` inside ``output_folder``.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        page_number: 1-indexed page to process.
        output_folder: Directory that receives the files (created if missing).
        dpi: Not used by this function; kept so existing calls that pass it
            keep working.

    Returns:
        list of the file paths written, in extraction order.

    Raises:
        ValueError: If ``page_number`` is below 1 or beyond the last page.
    """
    # Without this check 0 passes the length test below and asks for page index -1.
    if page_number < 1:
        raise ValueError(f"page_number must be 1 or greater, got {page_number}.")

    ensure_directory_exists(output_folder)

    image_paths = []

    # The context manager closes the document on every exit path; the handle
    # used to stay open, including when the page check raised.
    with fitz.open(pdf_path) as doc:
        if len(doc) < page_number:
            raise ValueError(f"The PDF has less than {page_number} pages.")

        page = doc.load_page(page_number - 1)  # page_number is 1-indexed

        for i, img in enumerate(page.get_images(full=True)):
            xref = img[0]  # first field of each entry is the image's xref
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # Save the image in its original format
            image_path = os.path.join(output_folder, f'image_{page_number}_{i}.{image_ext}')
            with open(image_path, "wb") as image_file:
                image_file.write(image_bytes)

            image_paths.append(image_path)
            print(f"Saved image to {image_path}")

    return image_paths

def extract_text_from_image_with_vision_api(image_path):
    """Run Vision API text detection on an image file and return the detected text.

    Args:
        image_path: Path of the image file; its raw bytes are sent to the API.

    Returns:
        The ``description`` of the first text annotation, or ``""`` when the
        response contains no annotations.

    Raises:
        RuntimeError: If the response carries an error message. Previously such
            a failure was indistinguishable from "no text found".
    """
    client = vision.ImageAnnotatorClient()

    with open(image_path, "rb") as image_file:
        content = image_file.read()

    image = types.Image(content=content)
    response = client.text_detection(image=image)

    # A failure for this image is reported inside the response object, so it
    # has to be checked explicitly before the annotations are trusted.
    if response.error.message:
        raise RuntimeError(
            f"Vision API text detection failed for {image_path}: {response.error.message}"
        )

    texts = response.text_annotations
    return texts[0].description if texts else ""

def clean_extracted_text(text):
    """Drop every line that contains one of the table-marker substrings.

    A line is discarded when it contains ``'Table'``, ``'Column'``, ``'|'``,
    ``'-'`` or ``'+'`` anywhere (case-sensitive substring match). Note that this
    also removes ordinary lines with a hyphen, such as ``Anti-Glare``.

    Args:
        text: Newline-separated text.

    Returns:
        The surviving lines joined with ``"\\n"``, in their original order.
    """
    markers = ('Table', 'Column', '|', '-', '+')
    kept = [
        row
        for row in text.split('\n')
        if all(marker not in row for marker in markers)
    ]
    return '\n'.join(kept)

# Demo: extract the images of page 1, then OCR the second one with the Vision API.
pdf_path = '/content/dell-24-monitor-e2423h-datasheet.pdf'
output_folder = 'images'

image_paths = extract_images_from_pdf(pdf_path, page_number=1, output_folder=output_folder)

# The OCR step needs the image at index 1, so bail out when fewer than two were found.
if len(image_paths) < 2:
    print("Less than two images found on the specified page.")
else:
    second_image_path = image_paths[1]

    # Raw OCR output first, then the same text with the marker lines filtered out.
    extracted_text = extract_text_from_image_with_vision_api(second_image_path)
    cleaned_text = clean_extracted_text(extracted_text)

    print("Cleaned Text from the Image:", cleaned_text, sep="\n")


Saved image to images/image_1_0.jpeg
Saved image to images/image_1_1.jpeg


ERROR:grpc._plugin_wrapping:AuthMetadataPluginCallback "<google.auth.transport.grpc.AuthMetadataPlugin object at 0x7b9c91d7f910>" raised exception!
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/google/auth/compute_engine/credentials.py", line 128, in refresh
    self._retrieve_info(request)
  File "/usr/local/lib/python3.10/dist-packages/google/auth/compute_engine/credentials.py", line 101, in _retrieve_info
    info = _metadata.get_service_account_info(
  File "/usr/local/lib/python3.10/dist-packages/google/auth/compute_engine/_metadata.py", line 323, in get_service_account_info
    return get(request, path, params={"recursive": "true"})
  File "/usr/local/lib/python3.10/dist-packages/google/auth/compute_engine/_metadata.py", line 248, in get
    raise exceptions.TransportError(
google.auth.exceptions.TransportError: ("Failed to retrieve http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/?recursive=true from the Go

ServiceUnavailable: 503 Getting metadata from plugin failed with error: ("Failed to retrieve http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/?recursive=true from the Google Compute Engine metadata service. Status: 404 Response:\nb''", <google.auth.transport.requests._Response object at 0x7b9ca9c9ebf0>)

In [16]:
!pip install pdf2image pillow pytesseract opencv-python


Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
