<a href="https://colab.research.google.com/github/JustaTirkey/Breast-Cancer-Prediction/blob/main/final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install pytesseract
!pip install pytesseract
!apt-get install -y tesseract-ocr


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 

In [9]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [10]:
import pandas as pd
import requests
from PIL import Image, ImageEnhance
from io import BytesIO
import pytesseract
import re
import matplotlib.pyplot as plt


In [11]:
entity_unit_map = {
    "width": ["centimetre", "foot", "millimetre", "metre", "inch", "yard"],
    "depth": ["centimetre", "foot", "millimetre", "metre", "inch", "yard"],
    "height": ["centimetre", "foot", "millimetre", "metre", "inch", "yard"],
    "item_weight": ["milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"],
    "maximum_weight_recommendation": ["milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"],
    "voltage": ["millivolt", "kilovolt", "volt"],
    "wattage": ["kilowatt", "watt"],
    "item_volume": ["cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon",
                    "pint", "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"]
}

In [12]:
entity_keywords = {
    "width": ["width"],
    "depth": ["depth"],
    "height": ["height"],
    "item_weight": ["weight", "item weight", "weight of item"],
    "maximum_weight_recommendation": ["max weight", "maximum weight", "weight recommendation"],
    "voltage": ["voltage", "volt"],
    "wattage": ["wattage", "watt", "power", "kilowatt"],
    "item_volume": ["volume", "item volume", "capacity", "size", "amount"]
}

In [13]:

# Step 1: Download images from URLs
def download_image(url):
    """Download the image from the given URL and return a PIL Image."""
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))
    return img

In [14]:
def enhance_image(image):
    """Enhance the contrast and sharpness of an image."""
    # Increase sharpness
    enhancer = ImageEnhance.Sharpness(image)
    image = enhancer.enhance(2.0)  # Adjust sharpness factor as needed
    # show_image(image, title="After Sharpness Enhancement")

    # Increase contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2.0)  # Adjust contrast factor as needed
    # show_image(image, title="After Contrast Enhancement")

    # Increase sharpness
    enhancer = ImageEnhance.Sharpness(image)
    image = enhancer.enhance(2.0)  # Adjust sharpness factor as needed
    # show_image(image, title="After Sharpness Enhancement")

    return image

In [15]:
# Helper function to display an image
def show_image(image, title="Image"):
    """Display an image using matplotlib."""
    plt.figure(figsize=(8, 8))
    plt.imshow(image)
    plt.axis('off')
    plt.title(title)
    plt.show()

In [16]:
# Step 3: Rotate image to find the best orientation for OCR
def get_best_image_orientation(image):
    """Rotate the image to find the best orientation for OCR."""
    best_text = ""
    best_orientation = 0

    for angle in [0, 90, 180, 270]:
        rotated_image = image.rotate(angle, expand=True)
        text = pytesseract.image_to_string(rotated_image)
        if len(text) > len(best_text):  # Choose the orientation with the most extracted text
            best_text = text
            best_orientation = angle

        # show_image(rotated_image, title=f"Rotated by {angle} degrees")

    return image.rotate(best_orientation, expand=True)

In [17]:
text_df = []

In [18]:
# Step 4: Extract text from images using OCR
def extract_text_from_image(image):
    """Extract text from the image using Tesseract OCR."""
    text = pytesseract.image_to_string(image)
    text_df.append(text)  # Append to the global list
    return text

In [19]:
def parse_entity_value(text):
    # Pattern to match numbers followed by optional units (e.g., g, mg, %, kcal)
    pattern = r'\b\d+(\.\d+)?\s?(mg|g|%|kcal)?\b'

    # Extract all matches
    matches = re.findall(pattern, text)

    # Combine the matched numbers and units
    values = [''.join(match) for match in matches]

    return values

In [20]:
# def process_images_for_entities(image_urls, entity_type):
#     """Process the list of image URLs and extract the entity values."""
#     results = []

#     for url, et in zip(image_urls, entity_type):
#         try:
#             # Download the image
#             img = download_image(url)

#             # Enhance the image
#             enhanced_img = enhance_image(img)

#             # Find the best orientation
#             best_img = get_best_image_orientation(enhanced_img)

#             # Extract text from the image
#             extracted_text = extract_text_from_image(best_img)

#             # Parse the entity value based on the extracted text
#             entity_value = parse_entity_value(extracted_text)

#             # Append results
#             results.append({
#                 'image_url': url,
#                 'extracted_text': extracted_text,
#                 'entity_value': entity_value
#             })
#         except Exception as e:
#             # Handle any errors in processing
#             print(f"Error processing image {url}: {e}")
#             results.append({
#                 'image_url': url,
#                 'extracted_text': '',
#                 'entity_value': ''
#             })

#     # Return the results as a DataFrame
#     return pd.DataFrame(results)


In [24]:
import time
import concurrent.futures
import pandas as pd

def process_single_image(index, url, et):
    try:
        # Download the image
        img = download_image(url)

        # Enhance the image
        enhanced_img = enhance_image(img)

        # Find the best orientation
        best_img = get_best_image_orientation(enhanced_img)

        # Extract text from the image
        extracted_text = extract_text_from_image(best_img)

        # Parse the entity value based on the extracted text
        entity_value = parse_entity_value(extracted_text)

        return {
            'index': index,
            'image_url': url,
            'extracted_text': extracted_text,
            'entity_value': entity_value
        }
    except Exception as e:
        # Handle any errors in processing
        print(f"Error processing image {url}: {e}")
        return {
            'index': index,
            'image_url': url,
            'extracted_text': '',
            'entity_value': ''
        }

def process_images_for_entities(image_urls, entity_type, max_workers=8):
    """Process the list of image URLs and extract the entity values."""
    results = []
    start_time = time.time()
    total_images = len(image_urls)

    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {executor.submit(process_single_image, idx, url, et): idx for idx, (url, et) in enumerate(zip(image_urls, entity_type))}

        for future in concurrent.futures.as_completed(future_to_index):
            result = future.result()
            results.append(result)
            index = result['index']
            processed_count = len(results)
            elapsed_time = time.time() - start_time

            if elapsed_time // 1200 > (processed_count // (total_images / (elapsed_time // 1200 + 1))):
                remaining_images = total_images - processed_count
                print(f"Time elapsed: {elapsed_time / 60:.2f} minutes. Index {index} processed. Images left: {remaining_images}.")

    results_df = pd.DataFrame(results)
    return results_df

# Example usage:
# image_urls = [...]
# entity_type = [...]
# df = process_images_for_entities(image_urls, entity_type)
# print(df)


In [25]:
# /content/drive/MyDrive/dataset/train.csv

df = pd.read_csv('/content/drive/MyDrive/dataset/train.csv')
image_urls = df['image_link'][:10].tolist()  # Example: taking the first 4 URLs
entity_type = df['entity_name'][:10].tolist()

In [26]:
results_df = process_images_for_entities(image_urls, entity_type)

In [27]:
results_df

Unnamed: 0,index,image_url,extracted_text,entity_value
0,0,https://m.media-amazon.com/images/I/61I9XdN6OF...,PROPSS'\nNATURE\n\nINGREDIENT MENAGER\n\nNua\n...,[]
1,4,https://m.media-amazon.com/images/I/617Tl40LOX...,"HIGH STRENGTH\n\nAS Rl,\n\nPLANTAGOO\n1400mc P...",[]
2,1,https://m.media-amazon.com/images/I/71gSRbyXmo...,SEU i Dene OR LL ETRE VALEY Ape\n: 3 : Z :\n\n...,[]
3,7,https://m.media-amazon.com/images/I/71DiLRHeZd...,fallen s)o\n\nCe FB AAR er -[ |\n\nWOdds 35a4\...,[]
4,5,https://m.media-amazon.com/images/I/61QsBSE7jg...,\n\n \n \n \n\n¢ Naturally-Sourced Psy...,"[mg, ]"
5,3,https://m.media-amazon.com/images/I/612mrlqiI4...,PS\nSarving Size: 1 Tablet (0.709 g) | Each se...,"[, .709g, , , , , .07, mg, .5, , , , , , , , ,..."
6,2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,Serving Size: 1 Tablet (0.709 g) | Each servin...,"[, .709g, , , .5, , .07, .5, , , , mg, mg, mg,..."
7,9,https://m.media-amazon.com/images/I/71jBLhmTNl...,\n\n\YDeeqJsoH ee a : : se s earn onioyeae\n\...,[]
8,8,https://m.media-amazon.com/images/I/91Cma3Rzse...,"PEPERGION (See\n\nSe,\n\n \n \n \n \n \n ...",[]
9,6,https://m.media-amazon.com/images/I/81xsq6vf2q...,"\n\n \n\n \n\n \n\nDirections: For adults, ta...","[, mg, .5]"


In [None]:
df.head()