In [1]:
import cv2
import pandas as pd
import numpy as np
import pytesseract
import re
from pyzbar.pyzbar import decode

In [2]:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [3]:
#Creating barcode reader function

def barcode_reader(image):
     
   
    img = cv2.imread(image)
      
    
    detectedBarcodes = decode(img)
      
   
    if not detectedBarcodes:
        return False
    else:
        return detectedBarcodes[0].data

In [4]:
#Creating function for extracting list of ingredients from picture using pytesseract 

def extract_text(img):
    image = cv2.imread(img)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    
    text = pytesseract.image_to_string(thresh, lang='eng')
    text = text.replace('\n', '')
    
    pattern = r'(?:INGREDIENTS|Ingredients):\s*([\w\s|,]+)'
    
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    if matches:
        return matches[0].split(',')
    else:
        return ''

In [5]:
#loading excel files
barcode_data = pd.read_excel("barcode_data.xlsx")

ingredient_data = pd.read_excel("ingredients_after_all_scraping.xlsx")


In [6]:
#Creating list from substitute_names columns so we can iterate thru seperate ingredients
ingredient_data['substitute_names'] = ingredient_data['substitute_names'].apply(lambda x: x.split(','))

In [7]:
#List of images

image_path = "IMG_6949.jpg"
#image_path = "IMG_6957.jpg"
#image_path = "IMG_6955.jpg"
#image_path = "IMG_6964.jpg"
#image_path = "Mouth_Wash.jpg"



In [8]:
#Extract barcode from image
barcode = barcode_reader(image_path)

In [9]:
#If there is no barcode or we dont have it in the excel file extract it form the image using OCR
if barcode == False:
    ingredients = extract_text(image_path)
    print('No Barcode ')
else:
    if int(barcode) not in list(barcode_data["Barcode"].values):
        ingredients = extract_text(image_path)
        print('No entry in Barcode file, we will extract the ingredients from the picture')
    else:
        ingredients = barcode_data[barcode_data["Barcode"] == int(barcode)]["Ingredient List"].values[0].split(', ')
        print('Barcode valid')

No entry in Barcode file, we will extract the ingredients from the picture


In [10]:
#list of ingredients
for ingredient in ingredients:
    print(ingredient)

Sodium Palmate
 Aqua
Glycerin
 Sodium Palm Kernelate
 Parfum
Palm Kernel Acid
 Rubus Idaeus Fruit Extract
Vaccinium Macrocarpon Fruit Extract
 PrunusAmygdalus Dulcis Seed Extract
 Zea MaysStarch
 Xanthan Gum
 Citric Acid
 SodiumBenzoate
 Potassium Sorbate
 Tetrasodium RartonGlutamate Diacetate
 Sodium Citrate
 SodiumChloride
 Benzyl Salicylate
 Geraniol
 Benzy|Alcohol
 Hexyl Cinnamal
 Linalool
 Cl 77891


In [11]:

#image_path = "zip/IMG_6949.jpg"

from fuzzywuzzy import process
import pandas as pd


#Create function for finding the best match between our database and the extracted list of ingredients  
def get_best_match_and_extract(ingredient_name, name_choices, substitute_choices, dataframe):
    best_match = None
    score = 0
    matched_row = None

    for choice_list in [name_choices, substitute_choices]:
        for choice in choice_list:
            result = process.extractOne(ingredient_name, [choice])
            if result[1] > score:
                best_match = result[0]
                score = result[1]
                matched_row = dataframe[dataframe.isin([choice]).any(axis=1)]

    return best_match, score, matched_row

# Map each ingredient to its best-matched universal name from the DataFrame columns
ingredient_mappings = {}
for ingredient in ingredients:
    best_match, score, matched_row = get_best_match_and_extract(
        ingredient, 
        ingredient_data['Name'],
        ingredient_data['substitute_names'],
        ingredient_data
    )
    
    if not matched_row.empty:
        ingredient_entry = matched_row.iloc[0]
        ingredient_name = ingredient_entry['Name']
        matched_row_info = {
            'Ingredient Name': ingredient_name,
            'Matched Entry': best_match,
            'Score': score,
            'Carcinogens': bool(ingredient_entry['Carcinogens']),
            'EndocrineDisruptors': bool(ingredient_entry['EndocrineDisruptors']),
            'Allergen': bool(ingredient_entry['Allergen']),
            'SkinIrritant': bool(ingredient_entry['SkinIrritant']),
            'Id': ingredient_entry['Id']  
        }
        if score < 85:
            matched_row_info['MatchQuality'] = 'Not a good match'
        ingredient_mappings[ingredient] = matched_row_info
    else:
        ingredient_mappings[ingredient] = "No match found"


for ingredient, result in ingredient_mappings.items():
    print(f"{ingredient}:")
    if result != "No match found":
        if 'MatchQuality' in result:
            print(f"Score: {result['Score']}. {result['MatchQuality']}")
        else:
            print(result)
    else:
        print("No match found")


Sodium Palmate:
{'Ingredient Name': 'Sodium Palmate', 'Matched Entry': 'Sodium Palmate', 'Score': 100, 'Carcinogens': False, 'EndocrineDisruptors': False, 'Allergen': False, 'SkinIrritant': False, 'Id': 270.0}
 Aqua:
{'Ingredient Name': 'Aqua', 'Matched Entry': 'Aqua', 'Score': 100, 'Carcinogens': False, 'EndocrineDisruptors': False, 'Allergen': False, 'SkinIrritant': False, 'Id': 2247.0}
Glycerin:
{'Ingredient Name': 'Glycerin', 'Matched Entry': 'Glycerin', 'Score': 100, 'Carcinogens': False, 'EndocrineDisruptors': False, 'Allergen': False, 'SkinIrritant': False, 'Id': 1558.0}
 Sodium Palm Kernelate:
{'Ingredient Name': 'Sodium Palm Kernelate', 'Matched Entry': 'Sodium Palm Kernelate', 'Score': 100, 'Carcinogens': False, 'EndocrineDisruptors': False, 'Allergen': False, 'SkinIrritant': False, 'Id': 272.0}
 Parfum:
{'Ingredient Name': 'Parfum', 'Matched Entry': 'Parfum', 'Score': 100, 'Carcinogens': False, 'EndocrineDisruptors': False, 'Allergen': False, 'SkinIrritant': False, 'Id': 124