# Mounted Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Code; Reading Image, format and Size

In [2]:
import os
import cv2  # or Pillow, Matplotlib

# Path to the directory containing images
image_dir = '/content/drive/MyDrive/Helix Good Images'

jpg_count = 0
jpeg_count = 0
png_count = 0

# Iterate through all files in the directory
for filename in os.listdir(image_dir):
    file_path = os.path.join(image_dir, filename)  # Full path to the image

    # Count file types and process images
    if filename.lower().endswith('.jpg'):
        jpg_count += 1
    elif filename.lower().endswith('.jpeg'):
        jpeg_count += 1
    elif filename.lower().endswith('.png'):
        png_count += 1

    if filename.endswith(('.jpg','.jpeg','.png')):  # Check for image file extensions
        # Read the image
        image = cv2.imread(file_path)
        if image is not None:
            print(f"Processed: {filename}, Dimensions: {image.shape}")
        else:
            print(f"Failed to read: {filename}")

# Print the counts for each file type
print("\nFile type counts:")
print(f"Number of .jpg files: {jpg_count}")
print(f"Number of .jpeg files: {jpeg_count}")
print(f"Number of .png files: {png_count}")


Processed: 12 4.jpg, Dimensions: (307, 164, 3)
Processed: 13 2.jpg, Dimensions: (259, 194, 3)
Processed: 11 2.jpg, Dimensions: (640, 665, 3)
Processed: 4 8.jpg, Dimensions: (600, 800, 3)
Processed: 4 7.jpg, Dimensions: (4032, 3024, 3)
Processed: 1 14.jpg, Dimensions: (560, 433, 3)
Processed: 5 10.jpg, Dimensions: (519, 400, 3)
Processed: 6 6.jpg, Dimensions: (600, 900, 3)
Processed: 9 2.jpg, Dimensions: (1000, 748, 3)
Processed: Air Conditioning label 600x527 (1).jpg, Dimensions: (527, 600, 3)
Processed: Condenser 2 1.jpg, Dimensions: (125, 402, 3)
Processed: Coffee Brewer Wattage.jpg, Dimensions: (525, 700, 3)
Processed: Air Handler 1 2.jpeg, Dimensions: (500, 666, 3)
Processed: Air Conditioning label 600x527.jpg, Dimensions: (527, 600, 3)
Processed: Coffee Brewer Wattage (1).jpg, Dimensions: (525, 700, 3)
Processed: rh8.jpg, Dimensions: (1008, 756, 3)
Processed: samsung_refrigerator__like_new_1643367650_f9c93423_progressive (1).jpg, Dimensions: (1080, 810, 3)
Processed: Furnace 2 1.p

In [6]:
pip install paddleocr paddlepaddle


Collecting paddleocr
  Downloading paddleocr-2.9.1-py3-none-any.whl.metadata (8.5 kB)
Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp310-cp310-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecti

# Code; Extracting Attributes from Paddle

In [7]:
import os
import pandas as pd
from paddleocr import PaddleOCR

# Initialize PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')  # Set language to English

# Path to the directory containing images
image_dir = '/content/drive/MyDrive/Helix Good Images'

# Path to store the results
output_path = '/content/drive/MyDrive/paddle1.csv'

# Initialize counters and storage
processed_count = 0
failed_count = 0
extracted_info = []

# Function to extract text and parse attributes
def extract_attributes(image_path):
    """
    Extract manufacturer attributes from an image using PaddleOCR.
    Returns a dictionary with serial number, model number, and manufacturer name.
    """
    # Run OCR on the image
    results = ocr.ocr(image_path, det=True, rec=True)

    if not results or not results[0]:
        print(f"No text detected in: {image_path}")
        return None

    # Extract text from results
    text_lines = [line[1][0] for line in results[0]]

    # Initialize attributes
    attributes = {
        "Serial Number": None,
        "Model Number": None,
        "Manufacturer Name": None
    }

    # Parse text to find specific attributes
    for line in text_lines:
        line = line.strip()
        if "serial" in line.lower():
            attributes["Serial Number"] = line
        elif "model" in line.lower():
            attributes["Model Number"] = line
        elif "manufacturer" in line.lower() or "made by" in line.lower():
            attributes["Manufacturer Name"] = line

    return attributes

# Iterate through all images in the directory
for filename in os.listdir(image_dir):
    file_path = os.path.join(image_dir, filename)
    if filename.lower().endswith(('.jpg', '.jpeg', '.png')):
        attributes = extract_attributes(file_path)
        if attributes:
            processed_count += 1
            extracted_info.append(attributes)
            print(f"Processed: {filename}, Extracted Attributes: {attributes}")
        else:
            failed_count += 1
            print(f"Failed to process: {filename}")

# Save results to a CSV file
df = pd.DataFrame(extracted_info)
df.to_csv(output_path, index=False)

# Print summary
print("\nProcessing results:")
print(f"Number of images processed successfully: {processed_count}")
print(f"Number of images failed to process: {failed_count}")
print(f"Results saved in: {output_path}")


download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:17<00:00, 221.90it/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10000/10000 [00:17<00:00, 574.61it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:14<00:00, 151.15it/s]

[2024/12/09 12:23:30] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c




[2024/12/09 12:23:31] ppocr DEBUG: dt_boxes num : 41, elapsed : 0.11588740348815918
[2024/12/09 12:23:31] ppocr DEBUG: cls num  : 41, elapsed : 0.1280210018157959
[2024/12/09 12:23:35] ppocr DEBUG: rec_res num  : 41, elapsed : 3.8605611324310303
Processed: 12 4.jpg, Extracted Attributes: {'Serial Number': None, 'Model Number': 'MODEL NUMBER:SCGSNI-TS', 'Manufacturer Name': None}
[2024/12/09 12:23:35] ppocr DEBUG: dt_boxes num : 22, elapsed : 0.10782098770141602
[2024/12/09 12:23:35] ppocr DEBUG: cls num  : 22, elapsed : 0.10451316833496094
[2024/12/09 12:23:37] ppocr DEBUG: rec_res num  : 22, elapsed : 1.9667000770568848
Processed: 13 2.jpg, Extracted Attributes: {'Serial Number': None, 'Model Number': None, 'Manufacturer Name': None}
[2024/12/09 12:23:38] ppocr DEBUG: dt_boxes num : 28, elapsed : 0.4141724109649658
[2024/12/09 12:23:38] ppocr DEBUG: cls num  : 28, elapsed : 0.10953903198242188
[2024/12/09 12:23:43] ppocr DEBUG: rec_res num  : 28, elapsed : 4.6268861293792725
Processed