In [2]:
# Import packages
import copy
import json
import os
import re
import time
from datetime import datetime

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import AzureOpenAI

In [3]:
# Load environment variables from the .env file
load_dotenv()

True

In [4]:
# Read the three Azure OpenAI connection settings from the process environment.
# os.getenv yields None for any variable that is not set.
api_key, api_version, azure_endpoint = (
    os.getenv(variable_name)
    for variable_name in (
        'AZUREOPENAI_API_KEY',
        'AZUREOPENAI_API_VERSION',
        'AZUREOPENAI_API_ENDPOINT',
    )
)

In [5]:
# Instantiate the Azure OpenAI client shared by the helper functions below.
# The key, API version and endpoint must already be available as environment variables.
client = AzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint)

In [6]:
# Define chat completion function
def completeChat(prompt, style, client, model="gpt-4o-mini"):
    """Send one system + user exchange to the chat endpoint and return the reply text.

    Args:
        prompt: Text sent as the ``user`` message.
        style: Text sent as the ``system`` message.
        client: Object exposing ``chat.completions.create``.
        model: Value forwarded as the ``model`` argument of the request.

    Returns:
        The first choice's message content with surrounding whitespace removed,
        or an empty string when the service returned no text for that choice.
    """
    # Sampling is pinned (temperature 0, fixed seed, single choice) so repeated
    # runs of the notebook request the same generation settings every time.
    result = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": style},
            {"role": "user", "content": prompt},
        ],
        max_tokens=1000,
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
        stream=False,
        seed=42,
        n=1,
    )

    # The content of a choice is optional: it can come back as None, and calling
    # .strip() on it would raise AttributeError. Report that case as "" instead.
    content = result.choices[0].message.content
    if content is None:
        return ""

    return content.strip()

In [7]:
# Define text embedding function
def embedText(text, client, model="text-embedding-ada-002"):
    """Embed ``text`` and return the vector scaled to unit Euclidean length.

    Args:
        text: Value forwarded as the ``input`` of the embeddings request.
        client: Object exposing ``embeddings.create``.
        model: Value forwarded as the ``model`` argument of the request.

    Returns:
        A NumPy array built from the first returned embedding, divided by its L2 norm.
    """
    response = client.embeddings.create(model=model, input=text)

    # Only the first item of the response is used.
    vector = np.array(response.data[0].embedding)
    length = np.linalg.norm(vector)
    vector /= length

    return vector

## === PRODUCT TEST DATASET ===
### Dictionaries of 10 test products with their information

In [8]:
product_1 = {
    'brand': 'EvasHair',
    'name': 'Haitian Black Castor Oil (Lwil Maskriti)',
    'url': 'https://evashair.fr/fr/boutique/22-40-huile-de-ricin-noire-dhaiti-0745114560452.html#/1-taille-120ml',
    'product_information': """
EvasHair - Haitian Black Castor Oil (Lwil Maskriti)
Rating: 4.8 / 5 - 1,629 reviews
Price: ‚Ç¨14.40 (TTC)
Available sizes: 120ml, 250ml, 500ml
Reference: EVAS-HUILLE-120M
EAN-13: 0745114560452

Secure payment methods: Mastercard, Visa, Amex, Carte Bancaire, Google Pay

ORIGIN OF OUR BLACK CASTOR OIL:
Haiti, a small island in the Caribbean, is home to rich natural treasures. It all begins in Eva‚Äôs grandparents‚Äô backyard, in the village of Piton. In this lush environment, the family prepares a unique recipe based on castor seeds. These seeds come from the castor bean plant (Ricinus communis), from which a precious vegetable oil is extracted: Haitian Black Castor Oil. Known locally as ‚ÄúLwil Maskriti‚Äù in Haitian Creole, or ‚ÄúHuile de Carapate‚Äù in French Antilles, this oil is produced using a traditional ancestral method.

The castor seeds are roasted, then ground and boiled in water until a rich amber-colored oil with a roasted hazelnut scent is obtained. This process preserves omega 3, 6, and 9 fatty acids, vitamin E, and ricinoleic acid ‚Äî making it a unique oil worldwide.

HISTORY OF OUR HAITIAN BLACK CASTOR OIL:
The founder‚Äôs grandfather was the first to produce the oil when Eva began receiving her first online orders through her YouTube videos. He helped produce the oil in larger quantities to meet demand. Though he has since passed, Eva honors his memory and legacy within EvasHair.

Message from Eva Biassou-Andr√©:
‚ÄúFrom where you are, thank you so much, Grandpa, for believing in us and leaving EvasHair as your legacy.‚Äù

HOW OUR OIL IS PRODUCED TODAY:
Initially handcrafted by Eva‚Äôs grandfather and a small team, production has since expanded through cooperatives in Haitian provinces such as L√©og√¢ne, Kenscoff, Petit-Go√¢ve, and C√¥teaux. This traditional yet scaled-up process sustains around 70 Haitian families. The business model encourages rural employment, helping reduce urban migration as more youth return to the provinces to work in oil production.

BENEFITS OF HAITIAN BLACK CASTOR OIL:
‚Ä¢ Stimulates hair growth and thickening (scalp, eyebrows, eyelashes, beard, and nails)
‚Ä¢ Strengthens and coats the hair fiber
‚Ä¢ Locks in moisture for long-lasting hydration
‚Ä¢ Repairs dry, brittle, and damaged hair
‚Ä¢ Protects and fortifies fragile curls and coils

HOW TO USE:
1. To promote growth and thickness:
   Apply a pistachio-sized amount to a clean, dry scalp and massage until fully absorbed.
   Recommended twice per week. Can also be applied to eyebrows, eyelashes, beard, and nails.

2. To retain moisture:
   Work section by section on clean hair. Apply slightly less than a teaspoon of oil after hydrating with water or cream. This seals in moisture.

3. As a hot oil or deep treatment:
   Apply 3‚Äì5 tablespoons of oil to the hair, leave on for 45 minutes under a heating cap, then shampoo and apply a moisturizing conditioner or mask.

4. As a massage oil:
   Warm the desired amount in a water bath, then massage until absorbed.

5. For beard care:
   Apply 2 drops daily, massage and comb. Promotes faster, thicker beard growth.

INGREDIENTS:
‚Ä¢ 100% Pure Ricinus Communis
‚Ä¢ 100% natural ingredient
(Produced in Haiti and packaged in France)

IDEAL FOR:
Curly, coily, and kinky hair (Type 3A to 4C)
Hair strengthening, growth stimulation, deep hydration, and shine enhancement
"""
}


In [9]:
product_2 = {
    'brand': 'Chebhair',
    'name': 'ChebElixir Traditional Hair Oil Bath - 200ml',
    'url': 'https://chebhair.com/products/chebelixir-huile',
    'product_information': """
Chebhair - ChebElixir Oil Bath - 200ml
Rating: 68 reviews
Price: FC 92,697.79 CDF
Status: Out of stock (taxes included, shipping calculated at checkout)

Your 100% natural volume-boosting ally. Your crown loves it!
Secure payment / Fast delivery

DESCRIPTION:
Nourish your hair with the rich, natural properties of Chebhair‚Äôs ChebElixir oil.
Formulated with 100% natural ingredients, this oil promotes growth, strength, and shine.
Chebe and peppermint stimulate hair growth and fortify strands thanks to ultra-hydrating and strengthening properties.
Olive oil and shea butter deliver intense nutrition and radiant shine, reducing dryness and brittleness.
ChebElixir suits all hair types ‚Äî from wavy to coily, even relaxed hair.

PRODUCT DETAILS:
‚Ä¢ SKU: CHEB-OIL200-CHEB
‚Ä¢ Weight: 200 g
‚Ä¢ Product type: Hair oil
‚Ä¢ Brand: chebhairbycjv
‚Ä¢ Texture: Liquid, slightly thick
‚Ä¢ Packaging: Recyclable PET plastic bottle
‚Ä¢ Volume: 200ml
‚Ä¢ 100% natural product
‚Ä¢ Made in France

INGREDIENTS:
BUTYROSPERMUM PARKII OIL (shea butter),
RICINUS COMMUNIS SEED OIL (castor oil),
ALLIUM CEPA BULB OIL (onion oil),
OLEA EUROPAEA FRUIT OIL (olive oil),
CROTON GRATISSIMUS SEED EXTRACT (chebe),
MENTHA PIPERITA LEAF EXTRACT (peppermint),
CAPRYLIC/CAPRIC TRIGLYCERIDE,
PARFUM,
TOCOPHEROL (vitamin E),
INULA CRITHMOIDE FLOWER/LEAF EXTRACT,
LINALOOL

USAGE:
‚Ä¢ Shake well before each use.
‚Ä¢ For daily use: apply a small amount to your hair after your cream or leave-in conditioner.
‚Ä¢ For an intensive nourishing treatment: apply a generous amount as a hot oil bath treatment.

BENEFITS:
‚Ä¢ 100% natural purity ‚Äî crafted with carefully selected natural ingredients.
‚Ä¢ Ideal daily ally ‚Äî lightweight formula suitable for everyday use to nourish and revitalize hair without weighing it down.
‚Ä¢ Ingredient synergy ‚Äî combines peppermint, olive, shea, and chebe for visibly stronger, healthier hair.

PRECAUTIONS:
Avoid contact with eyes. Do not ingest. Keep out of reach of children.
To avoid allergic reactions, perform a patch test on a small section of hair before full use.

LABELS:
FAST DELIVERY / SECURE PAYMENT / 100% NATURAL / MADE IN FRANCE

ADDITIONAL DESCRIPTION:
ChebElixir represents the excellence of natural hair care, designed to enhance textured and curly hair daily.
Its unique formula offers an unmatched care experience ‚Äî a true natural beauty elixir that nourishes curls, promotes growth, and maintains healthy hair.
"""
}


In [10]:
product_3 = {
    'brand': 'Soarn',
    'name': 'Shea Whipped Butter ‚Äì Tropical Juicy',
    'url': 'https://www.soarn.fr/produit/chantilly-de-karite-tropical-juicy/',
    'product_information': """
Soarn - Shea Whipped Butter "Tropical Juicy"
Price: ‚Ç¨24.90
Weight: 170 g
Volume: 250 mL

DESCRIPTION:
This shea whipped butter is rich and ultra-nourishing.
Enriched with kokum, passion fruit, and tiar√© flower, it enhances the reparative and softening properties of shea butter.
Its delicate fragrance takes you on a gentle tropical escape.

Particularly rich in fatty acids and antioxidants, this whipped butter effectively repairs the hair fiber, soothes the skin, and protects against external aggressions.
Ideal for dry, damaged hair and dry skin alike.
Its whipped, silky, and airy texture makes application easy and pleasant.

USES:
‚Ä¢ To seal in hydration after applying your leave-in cream or styling milk, extending moisture retention.
‚Ä¢ Apply to damaged ends for nourishment.
‚Ä¢ Use as a body massage butter for a soft, pleasant skin-care experience.

BENEFITS:
‚Ä¢ Deeply repairs and nourishes dry, brittle hair.
‚Ä¢ Restores radiance and strength to tired, dry skin.
‚Ä¢ Delicately scented for a soothing tropical escape.

HOW TO APPLY:
Scoop a small amount and melt it between your palms.
‚Äì Skin & face: massage gently for full absorption.
‚Äì Hair: apply along the lengths after hydration to seal in moisture, or before hydration to nourish.
In case of contact with eyes or irritation, rinse thoroughly.

PRECAUTIONS:
Store properly to preserve quality. Keep tightly closed and out of reach of children.
Store away from light and heat, preferably in a cool, dark place to maintain effectiveness and extend shelf life.

INGREDIENTS:
Butyrospermum Parkii (Shea Butter),
Garcinia Indica Seed Oil (Kokum Butter),
Passiflora Edulis Seed Oil (Passion Fruit Oil),
Gardenia Tahitensis Flower (Tiar√© Extract),
Tocopherol (Vitamin E),
Coumarin,
D-Limonene.

ADDITIONAL DETAILS:
‚Ä¢ Made in France
‚Ä¢ 100% ingredients of natural origin
‚Ä¢ Vegan friendly
‚Ä¢ 0% parabens, 0% coloring agents, 0% SLS

CATEGORIES:
Beard, Whipped Butters, Kids & Babies, Pregnant & Nursing Women, Daily Hair Care, ‚ÄúBesties of the Team,‚Äù Our Products, Nourish

TAGS:
100% natural, butter, shea butter, whipped shea, hair, body, jojoba, castor, natural care, stretch marks, vegan
"""
}

In [11]:
# Test product 4: leave-in milk from Mango Butterfull.
# The sheet text is kept free of the ":contentReference[oaicite:N]{index=N}"
# citation markers that the copied source carried after most sentences; they are
# not part of the product page and would otherwise reach the model as product text.
product_4 = {
    'brand': 'Mango Butterfull',
    'name': 'Moisturizing & Nourishing Milk ‚Äì Nourish',
    'url': 'https://mangobutterfull.com/products/lait-mango-nourish',
    'product_information': """
Mango Butterfull ‚Äì Moisturizing & Nourishing Milk ‚ÄúNourish‚Äù
Price: ‚Ç¨16.90 (100 ml)
Variants: 100 ml (‚Ç¨16.90), 200 ml (‚Ç¨22.00), 500 ml (‚Ç¨49.90)
YUKA rating: 100/100

DESCRIPTION:
Give lightness to your dense and thirsty hair with this fluid and moisturizing milk boasting vegetal scents.
The irresistible blend of mango, shea, and cocoa butters together with avocado and karapate oils will help nourish and soften your mane, while aloe vera and vegetable glycerin gently hydrate each strand.
For soft and velvety skin, apply over the body until fully absorbed.

BENEFITS:
- Aloe vera: its richness in mucilages, trace elements, enzymes and animating acids brings a hydrating, soothing and repairing effect.
- Vegetable glycerin: by its emollient action, it will relax the hair fiber.
- The trio of butters (mango, shea, cocoa): soften and relax hair, deliver shine and prevent split ends.
- Avocado oil: smoothing, protective and restructuring effect on skin and hair.
- Karapate oil: helps sheath and discipline very dense hair.
- Fresh West-Indian wood leaves (wood of India): tone and purify the scalp.

HOW TO USE:
Hair: Use as a leave-in milk on dry or damp hair. Distribute a small amount over the entire hair, then seal in hydration with an oil, butter or whipped cream.
Body: Suitable for all skin types as a face and body care.

PRECAUTIONS:
- Do not overdose, it is effective in small amount.
- Do not ingest.
- It is recommended to perform a patch test in the fold of the elbow before using on hair.
- PAO (Period After Opening): 12 months.

INGREDIENTS (selected key list):
Aqua, Aloe vera, Glycerin, Vitis vinifera oil, Carica papaya, Carthamus tinctorius seed oil, Oryza sativa bran oil, Sesamum indicum seed oil, Tilia platyphyllos Cocos nucifera oil, Dicaprylyl carbonate, Glyceryl stearate citrate, Polyglyceryl-3 stearate, Glyceryl stearate, Mangifera indica seed butter, Butyrospermum parkii butter, Theobroma cacao seed butter, Coco nucifera milk, Tocopherol, Hydrogenated lecithin, Lactobacillus, Sodium levulinate C10-18 Triglycerides, Xanthan gum, Sodium anisate, Levulinic acid, Sodium benzoate, Potassium sorbate.

ADDITIONAL DETAILS:
‚Ä¢ 100% natural composition.
‚Ä¢ Handmade, artisanal production.
‚Ä¢ Free from palm oil, mineral oils, sulfates, endocrine disruptors.
"""
}


In [12]:
# Test product 5: three-butter jar ("Care", "Sensitive", "Nourish").
# NOTE(review): 'brand' is 'Mango Trio', but the URL points to mangobutterfull.com,
# the same shop as product_4 whose brand is 'Mango Butterfull' - confirm which brand
# value the extraction is expected to be compared against.
# Unlike most of the other sheets, this text starts directly after the opening
# triple quote (no leading newline) and ends without a trailing newline.
product_5 = {
    'brand': 'Mango Trio',
    'name': 'Trio Mango Wellness',
    'url': 'https://mangobutterfull.com/products/trio-mango-bien-etre',
    'product_information': """Price: ‚Ç¨32.90 (250 ml) / ‚Ç¨24.00 (200 ml ‚Äì sold out)

Full Description: A trio of butters (Care, Sensitive, Nourish) for dry skin and hair. Concentrated wellness in a single 250ml jar. Recommended for children, adults, dry hair of all textures and porosities, relaxed or transitioning hair, colored and dehydrated hair, dreadlocks.

Step 1 ‚Äì Establish a good base: Mango Care Butter
Hair: Repair and revitalize dry hair with propolis.
Skin: Nourishing and soothing for dry skin.
Ingredients: Mangifera indica seed oil, Vitis vinifera oil, Cera flava, honey, propolis extract, fragrance, tocopherol.

Step 2 ‚Äì Strengthen hair: Mango Sensitive Butter
Hair: Strengthens the hair fiber with proteins and fatty acids from coconut milk and cocoa butter.
Skin: Repairs very dry and fragile skin.
Ingredients: Mangifera indica seed oil, Vitis vinifera oil, Carica papaya oil, Theobroma cacao, Cocos nucifera oil and milk, Cera flava, honey, fragrance, tocopherol.

Step 3 ‚Äì Discipline and promote growth: Mango Nourish Butter
Hair: Nourishes, softens, and disciplines hair with avocado oil, Karapate oil, and fresh wood leaves. Recommended for children to use after the Sensitive range as protein needs decrease.
Skin: Provides intense hydration.
Ingredients: Mangifera indica seeds, Persea gratissima oil, Ricinus communis oil (castor oil), Olea europaea oil, Mangifera indica seed oil, Pimenta dioica, Cera flava, honey, Mentha x piperita (peppermint), Pimenta racemosa, fragrance.

Usage Tips: For very dry hair, apply a moisturizing spray or leave-in first, then seal with the butters during the treatment. To benefit from all three butters at once, melt them in a bowl over a bain-marie and pour the liquid mixture back into the jar.

Precautions: Test on the inner elbow before use. Do not overdose. Do not ingest. PAO (Period After Opening): 12 months. Store in a dry place, away from heat and light.

Features: Handmade, artisanal; vegan; free from palm oil, mineral oils, sulfates, endocrine disruptors; cruelty-free.

Full Ingredient Details by Range:
- Mango Care: Mangifera indica seed oil, Vitis vinifera oil, Cera flava, honey, propolis extract, fragrance, tocopherol.
- Mango Sensitive: Vitis vinifera oil, Carica papaya oil, Theobroma cacao, Cocos nucifera oil, Mangifera indica seed extract, Cera flava, honey, glycerin, fragrance, tocopherol.
- Mango Nourish: Persea gratissima oil, Ricinus communis oil, Olea europaea oil, Mangifera indica seed oil, Pimenta dioica, Cera flava, honey, Mentha x piperita, Pimenta racemosa, fragrance, tocopherol.

All products are handmade and artisanal:
- Free from palm oil
- Free from mineral oils
- Sulfate-free
- Free from endocrine disruptors
- Not tested on animals"""
}

In [13]:
product_6 = {
    "brand": "Maishea Natural",
    "name": "Moisturizing & Strengthening Hair Cream",
    "url": "https://www.maisheanatural.com/en/collections/hair-care/products/moisturizing-strengthening-hair-cream-300ml",
    "product_information": """
MOISTURIZING & STRENGTHENING HAIR CREAM
300ML

Our hair cream enriched with hibiscus oil and mango butter, a luxurious formula specially designed to enhance frizzy, curly and curly hair.

Hibiscus oil, known for its nourishing and regenerating properties, penetrates deep into the hair fiber to hydrate and revitalize hair, while promoting growth and improving elasticity. Combined with mango butter, rich in fatty acids and vitamins, our hair cream provides intense nutrition, leaving hair soft, supple and shiny.

Our lightweight, non-greasy formula defines curls, reduces frizz and makes styling easier, for perfectly bouncy hair.

HAIR TYPE:
Dry and fragile hair, curly, frizzy

BENEFITS:
Nourishes and deeply hydrates hair, strengthens hair fiber, defines curls

INGREDIENTS:
Hibiscus oil, aloe vera juice, vegetable glycerin, mango butter

USAGE TIPS:
After the hair mask:
When your hair is clean and well towel-dried, divide it into four sections for even application.

NOURISH DEEPLY:
Apply the hair cream generously to the lengths and ends.

2 IN 1:
Use our hair cream as a leave-in for continuous hydration without rinsing or, use it as a conditioner leaving it on for a few minutes then rinsing thoroughly with lukewarm water.

KEY INGREDIENTS:

HIBISCUS OIL:
Hibiscus oil deeply nourishes, stimulates hair growth, strengthens follicles and adds shine and softness to hair, while promoting a healthy scalp.

ALOE VERA JUICE:
Aloe vera juice soothes the scalp while deeply moisturizing hair. It helps maintain natural moisture balance and prevent dryness.

VEGETABLE GLYCERIN:
Glycerin acts as a natural moisturizer, attracting and retaining moisture for healthy, hydrated hair that resists frizz and breakage.

MANGO BUTTER:
Mango butter is rich in vitamins and nutrients. It intensely nourishes hair deep down, sealing in moisture and adding softness to curls.

INCI LIST:
Aqua, Aloe Barbadensis Leaf Juice, Hibiscus Sabdariffa Oil, Glycerin, Cetearyl Alcohol, Mangifera Indica Seed Butter, Behentrimonium Methosulfate, Butyrospermum Parkii (Shea) Butter, Panthenol, Cetyl Alcohol, Ricinus Communis Seed Oil, Sesamum Indicum Seed Oil, Cocos Nucifera Oil, Benzyl Alcohol, Hydrolyzed Silk, Tocopherol, Butylene Glycol, Xanthan Gum, Parfum, Helianthus Annuus Seed Oil, Dehydroacetic Acid

NATURAL INGREDIENTS:
Ingredients 99.2% of natural origin

TEXTURED HAIR:
Suitable for frizzy to curly hair

RECYCLABLE POT:
Our pots are 100% recyclable

LONG-LASTING HYDRATION AND INCREASED SUPPLENESS:
Our hair cream, enriched with hibiscus oil, has been designed to intensely moisturize and revitalize dry and damaged hair. Thanks to its unique properties, hibiscus oil helps strengthen the roots and lengths, reducing hair loss and promoting denser, more robust hair.

Rich in 38% aloe vera juice, hibiscus hair cream offers a powerful solution for hydrating hair, without weighing it down. Our formula is also enriched with vegetable glycerin, a natural humectant that attracts and retains moisture inside the hair, ensuring long-lasting hydration and increased suppleness. Additionally, mango butter acts as a deep conditioner, leaving hair smooth and silky, while improving manageability for easy styling.
"""
}


In [14]:
product_7 = {
    "brand": "EvasHair",
    "name": "Moisturizing Shampoo",
    "url": "https://evashair.fr/fr/produits-hydratants/27-shampooing-hydratant-0745110054634.html",
    "product_information": """
MOISTURIZING SHAMPOO
4.8 / 5 - 243 REVIEWS
PRICE: ‚Ç¨16.90 (250ML)

SIZE: 250ML / 500ML

USAGE INSTRUCTIONS:
Thoroughly wet the hair. Apply a generous amount and gently massage the scalp until it foams. For shampoo application, work methodically, section by section, to avoid tangles. Then rinse thoroughly. Next, apply the 2-in-1 Deep Conditioner Mask Jacmel.

PRECAUTIONS:
Shampoo should be applied primarily to the scalp to remove buildup and product deposits. Massage gently with your fingers to remove deposits efficiently. Do not scratch the scalp with your nails to avoid injury. Avoid contact with eyes. If contact occurs, rinse carefully with water for several minutes. Keep out of reach of children. Do not swallow. Suitable for adults and children over 3 years old.

INGREDIENTS:
AQUA (WATER), HELIANTHUS ANNUUS SEED OIL, DIETHYLHEXYL SODIUM SULFOSUCCINATE, RICINUS COMMUNIS (CASTOR) SEED OIL, DISODIUM LAURYL SULFOSUCCINATE, COCAMIDOPROPYL BETAINE, SORBITAN STEARATE, MORINGA OLEIFERA SEED OIL, GLYCERIN, PROPYLENE GLYCOL, BENZYL ALCOHOL, XANTHAN GUM, PANTHENOL, FRAGRANCE, POLYQUATERNIUM-10, DEHYDROACETIC ACID, CARICA PAPAYA FRUIT EXTRACT, TOCOPHEROL, GLYCINE SOJA (SOYBEAN) OIL, CI 19140, POTASSIUM SORBATE, SODIUM BENZOATE, CITRIC ACID, CI 14720, SODIUM SULFATE

88% NATURAL ORIGIN INGREDIENTS
Product made and packaged in France

HAIR TYPE:
Suitable for high, normal, and low porosity hair. Ideal for dry and damaged hair, afro-textured, curly, and wavy hair.

DESCRIPTION:
EvasHair moisturizing shampoo gently cleanses hair while maintaining optimal hydration. Nourishes the hair, leaving it soft and supple. Can be used as the first or second step in a hair care routine for afro-textured, curly, and wavy hair.

BENEFITS:
Respects the hair‚Äôs natural hydrolipidic film, deeply hydrates, reduces breakage, stimulates blood circulation in the scalp, and promotes hair growth. Facilitates styling with protective twists and circular scalp massages.

HOW TO USE:
Recommended frequency: once a week or every two weeks. Create twists on your hair to prevent tangles, then apply shampoo to the scalp in circular motions. Foam is sufficient to cleanse the lengths. Complete your routine with the 2-in-1 deep conditioner mask and moisturizing hair cream.

KEY INGREDIENTS:
Sunflower oil (Helianthus Annuus Seed Oil), castor oil (Ricinus Communis Seed Oil), moringa oil (Moringa Oleifera Seed Oil), glycerin (Glycerin), panthenol (Panthenol), papaya extract (Carica Papaya Fruit Extract), tocopherol (Tocopherol), and others.

ORIGIN:
88% natural origin ingredients. Made and packaged in France.
"""
}


In [15]:
product_8 = {
    "brand": "Devance Cosm√©tiques",
    "name": "OIL 100% NAT&BIO ‚Äì ORIGINAL COSMOS ORGANIC",
    "url": "https://devance-cosmetiques.fr/index.php/product/oil-100-natbio-original/",
    "product_information": """
OIL 100% NAT&BIO ‚Äì ORIGINAL COSMOS ORGANIC
Out of stock
SKU: DEVOIL-100
PRICE: ‚Ç¨22.00

DESCRIPTION:
Nourishing oil suitable for all hair types, particularly thick and very dry hair. Free from sulfates, synthetic products, silicones, preservatives, and fragrances. Vegan product. Available in a 100ml glass bottle. Made in France.

ACTION:
OIL 100% NAT&BIO is a concentrated blend of 5 natural and organic ingredients designed to nourish and repair all hair types, including dry, split, or brittle hair. Can be used as a daily treatment, as a finishing oil, or before blow-drying on damp hair; does not need to be rinsed. Multi-purpose: can also be used as a body oil for the whole family and to nourish the beard.

RESULTS:
Strengthens hair and leaves it soft.

KEY INGREDIENTS:
Cold-pressed vegetable oils selected for their protective and repairing properties. Enriched with avocado oil and coconut oil to deeply nourish hair fiber, prevent split ends, reduce breakage, and care for dry scalp.
Ingredients: PRUNUS AMYGDALUS DULCIS OIL*, PERSEA GRATISSIMA OIL*, BUTYROSPERMUM PARKII BUTTER*, COCOS NUCIFERA OIL*, SIMMONDSIA CHINENSIS OIL*.
*Ingredients from organic farming. 100% of total ingredients are organic.

CLAIMS:
Organic, natural, and vegan product.
Free from preservatives, fragrances, essential oils, sulfates, and quaternary ammonium compounds.
COSMOS Organic certified by Ecocert Greenlife according to the COSMOS standard (http://COSMOS.ecocert.com).

USAGE INSTRUCTIONS:
Shake well before use. Place a few drops in the palm, warm slightly by rubbing hands together. Apply to hair, massaging in, focusing on split ends. Can be used on dry or damp hair. Store away from light. For external use only; do not ingest. In case of contact with eyes, rinse immediately.

PACKAGING:
100ml, Weight: 0.205 kg, Dimensions: 4.1 √ó 4.1 √ó 12.5 cm
"""
}


In [16]:
# NOTE(review): this binding is dead. The very next cell assigns product_9 again
# (same URL, different brand/name spelling and a fuller description), so this
# shorter sheet never reaches all_products. Delete one of the two definitions,
# or give this one its own name if both variants are meant to be tested.
product_9 = {
    'brand': 'Devance',
    'name': 'Masque hydratation intense Cosmos Organic',
    'url': 'https://devance-cosmetiques.fr/index.php/product/masque-capillaire-en-poudre-devance-cosmetiques/',
    'product_information': """
Devance - Cosmos Organic Intense Hydration Mask
Hair mask with coconut milk and phytokeratin
Powder to dilute, to rinse, for dry hair
Sulfate-free formula
60g powder
Price: 16.00‚Ç¨

Ingredients: phytokeratin, coconut milk, oatmeal, natural polyols

For dry and damaged hair types 2A to 4C
"""
}

In [17]:
product_9 = {
    "brand": "Devance Cosm√©tiques",
    "name": "Mask ‚Äì Intense Hydration",
    "url": "https://devance-cosmetiques.fr/index.php/product/masque-capillaire-en-poudre-devance-cosmetiques/",
    "product_information": """
PRICE: ‚Ç¨16.00
SIZE: 60 g
AVAILABILITY: In stock

DESCRIPTION: Powder hair mask to be mixed with water, formulated with coconut milk and phytokeratin, designed for dry hair. Rinse-off product, travel-friendly and economical (equivalent to 4‚Äì10 conventional masks).

KEY INGREDIENTS: Sorbitol, Avena Sativa Kernel Flour*, Cocos Nucifera Milk Powder*, Erythritol, Hydrolyzed Wheat Protein, Maltodextrin**, Xanthan Gum, Maranta Arundinacea Root Extract, Calcium Carbonate. (*from organic farming, **processed from organic ingredients).

CLAIMS: Natural, vegan product. Free from sulfates, silicones, preservatives, fragrances, and essential oils. COSMOS ORGANIC certified by Ecocert Greenlife.

USAGE INSTRUCTIONS: Mix 15 g of powder with 50 ml of very hot water to obtain a smooth cream. After shampooing, apply strand by strand on wet hair, leave on for 10 minutes under heat, then rinse. Can be combined with deep treatments for an extra hydration boost. External use only; avoid contact with eyes.

WEIGHT: 0.068 kg
DIMENSIONS: 18.3 √ó 10.9 √ó 0.7 cm
ORIGIN: Made in France
"""
}

In [18]:
product_10 = {
    "brand": "EvasHair",
    "name": "Apr√®s-Shampooing Masque Profond 2 en 1",
    "url": "https://evashair.fr/fr/produits-hydratants/28-apres-shampooing-masque-profond-2-en-1-3770027914013.html",
    "product_information": """
PRICE: ‚Ç¨24.90
SIZE: 300 ml (also available in 500 ml)
AVAILABILITY: In stock

DESCRIPTION: Multifunctional conditioner containing castor oil, sunflower oil, papaya extract, honey, and the phytobioactive HAIRILINE¬Æ (Lindera Strychnifolia Root Extract). Hydrates, repairs, and adds suppleness and shine to hair. Facilitates detangling and protects the scalp microbiome. Suitable for low, normal, and high porosity hair.

USAGE INSTRUCTIONS:
- **Detangling / Pre-poo**: Apply to damp hair, detangle strand by strand.
- **Conditioner**: Apply generously on 4 sections, leave on 5‚Äì20 minutes, then rinse.
- **Deep Mask**: Apply on 4 sections, wrap in cling film, cover with a shower cap, use a heat cap for 45 min (or 1h without heat), then rinse.

PRECAUTIONS: Apply to lengths only, avoid contact with eyes and mucous membranes. Keep out of reach of children. For external use only.

INGREDIENTS: Aqua (Water), Coco-Caprylate/Caprate, Ricinus Communis (Castor) Seed Oil, Cetearyl Alcohol, Helianthus Annuus (Sunflower) Seed Oil, Behentrimonium Methosulfate, Xylitylglucoside, Citrus Aurantium Amara Flower Water, Propanediol, Benzyl Alcohol, Anhydroxylitol, Glycerin, Xylitol, Parfum (Fragrance), Caramel, Dehydroacetic Acid, Carica Papaya Fruit Extract, Lindera Strychnifolia Root Extract, Honey Extract, Tocopherol, Potassium Sorbate, Citric Acid, Sodium Benzoate, Sorbic Acid.

97.8% of ingredients are of natural origin.
ORIGIN: Made and packaged in France
EAN-13: 3770027914013
"""
}

In [19]:
# Collect the ten product fixtures into one list, in numeric order.
# NOTE: product_9 is assigned in two separate cells above; the list holds
# whichever assignment ran last.
all_products = [product_1, product_2, product_3, product_4, product_5,
                product_6, product_7, product_8, product_9, product_10]

# Print a numbered "brand: name" listing as a quick check of what was loaded.
print(f"üì¶ Loaded {len(all_products)} test products")
for i, product in enumerate(all_products, 1):
    print(f"   {i}. {product['brand']}: {product['name']}")

üì¶ Loaded 10 test products
   1. EvasHair: Haitian Black Castor Oil (Lwil Maskriti)
   2. Chebhair: ChebElixir Traditional Hair Oil Bath - 200ml
   3. Soarn: Shea Whipped Butter ‚Äì Tropical Juicy
   4. Mango Butterfull: Moisturizing & Nourishing Milk ‚Äì Nourish
   5. Mango Trio: Trio Mango Wellness
   6. Maishea Natural: Moisturizing & Strengthening Hair Cream
   7. EvasHair: Moisturizing Shampoo
   8. Devance Cosm√©tiques: OIL 100% NAT&BIO ‚Äì ORIGINAL COSMOS ORGANIC
   9. Devance Cosm√©tiques: Mask ‚Äì Intense Hydration
   10. EvasHair: Apr√®s-Shampooing Masque Profond 2 en 1


## Enhanced Standardization Functions

In [20]:
# Unit spellings recognised in a quantity string, checked in this order:
# (pattern, multiplier applied to the number, unit written to the output).
_QUANTITY_UNITS = (
    (r'ml|millilit(?:er|re)s?', 1, 'ml'),
    (r'cl|centilit(?:er|re)s?', 10, 'ml'),
    (r'l|lit(?:er|re)s?', 1000, 'ml'),
    (r'kg|kilogram(?:me)?s?', 1000, 'g'),
)


def _normalize_price(raw_price):
    """Return the first amount found in ``raw_price`` as a dot-decimal string ('' if none)."""
    # Only the first number is taken, so digits from a later range bound or a
    # size note such as "(100 ml)" can no longer be glued onto the price.
    # First alternative: space-grouped thousands ("1 234,56"); second: everything else.
    match = re.search(
        r'\d{1,3}(?:[ \u00a0\u202f]\d{3})+(?:[.,]\d+)?|\d+(?:[.,]\d+)*',
        str(raw_price),
    )
    if match is None:
        return ''

    amount = re.sub(r'\s', '', match.group(0))
    last_comma, last_dot = amount.rfind(','), amount.rfind('.')

    if last_comma != -1 and last_dot != -1:
        # Both separators present: the right-most one is the decimal mark.
        if last_comma > last_dot:
            amount = amount.replace('.', '').replace(',', '.')   # 1.234,56
        else:
            amount = amount.replace(',', '')                      # 1,234.56
    elif amount.count(',') == 1:
        amount = amount.replace(',', '.')                         # 16,90
    elif amount.count(',') > 1:
        amount = amount.replace(',', '')                          # 1,234,567
    elif amount.count('.') > 1:
        amount = amount.replace('.', '')                          # 1.234.567

    return amount


def _normalize_quantity(raw_quantity):
    """Return ``raw_quantity`` as '<number> <ml|g>', or None when it contains no number."""
    text = str(raw_quantity)
    number = re.search(r'\d+(?:[.,]\d+)?', text)
    if number is None:
        return None

    value = float(number.group(0).replace(',', '.'))

    # Unrecognised or missing units are reported as grams.
    factor, unit = 1, 'g'
    for pattern, multiplier, target in _QUANTITY_UNITS:
        # Letter-based lookarounds instead of \b: a digit counts as a word
        # character, so \b never matches between "250" and "ml" in "250ml".
        if re.search(rf'(?<![a-z])(?:{pattern})(?![a-z])', text, re.IGNORECASE):
            factor, unit = multiplier, target
            break

    value = round(value * factor, 3)
    shown = str(int(value)) if value.is_integer() else str(value)
    return f"{shown} {unit}"


def standardize_numeric_fields(extraction_dict):
    """
    Normalise the price and quantity of an extracted product sheet.

    Intended to run after the LLM extraction and before validation. Reads
    extraction_dict['Product Info']['Product Sheet'] and rewrites two keys:

    - 'Price (euros)': first amount in the text as a dot-decimal string
      (currency symbols, thousands separators and later numbers dropped).
    - 'Quantity (ml)': '<number> ml' or '<number> g'; litres and centilitres
      are converted to ml, kilograms to g. Left as-is when it holds no number.

    Missing or empty fields are skipped. A line is printed for every value
    that changes.

    Args:
        extraction_dict: Nested extraction result; it is not modified.

    Returns:
        A deep copy of ``extraction_dict`` with the two fields normalised.
    """
    # Deep copy: a shallow copy shares the nested dicts, so writing the cleaned
    # values would also overwrite them in the caller's dictionary.
    standardized = copy.deepcopy(extraction_dict)
    sheet = standardized.get('Product Info', {}).get('Product Sheet', {})

    # 1. FIX PRICE FORMATTING
    price = sheet.get('Price (euros)', '')
    if price:
        cleaned_price = _normalize_price(price)
        sheet['Price (euros)'] = cleaned_price
        if price != cleaned_price:
            print(f"‚úÖ Fixed price: '{price}' ‚Üí '{cleaned_price}'")

    # 2. FIX QUANTITY FORMATTING
    quantity = sheet.get('Quantity (ml)', '')
    if quantity:
        standardized_quantity = _normalize_quantity(quantity)
        if standardized_quantity is not None:
            sheet['Quantity (ml)'] = standardized_quantity
            if quantity != standardized_quantity:
                print(f"‚úÖ Fixed quantity: '{quantity}' ‚Üí '{standardized_quantity}'")

    return standardized

def validate_extraction_quality(extraction_dict):
    """
    Report whether the critical numeric fields of an extraction are filled in.

    Looks up 'Price (euros)' and 'Quantity (ml)' under
    extraction_dict['Product Info']['Product Sheet'] (missing levels are
    treated as empty), prints a one-line summary and returns True only when
    both values are truthy.
    """
    sheet = extraction_dict.get('Product Info', {}).get('Product Sheet', {})

    # (field key, message reported when the field is absent or falsy)
    critical_fields = (
        ('Price (euros)', "Missing price"),
        ('Quantity (ml)', "Missing quantity"),
    )
    issues = [message for key, message in critical_fields if not sheet.get(key, '')]

    if not issues:
        print("‚úÖ All critical fields populated")
        return True

    print(f"‚ö†Ô∏è  Quality issues: {', '.join(issues)}")
    return False

In [21]:
# === COMPREHENSIVE STANDARDIZATION FUNCTIONS ===

def standardize_all_fields(extraction_dict):
    """
    Return a standardized deep copy of an extraction result.

    Four fields under extraction_dict['Product Info']['Product Sheet'] are
    normalized; every other field is passed through unchanged:

    1. 'Price (euros)'  -> digits with '.' as decimal separator. Handles both
       '1.299,00' and '1,299.00' style grouping; for a range ('12.50 - 15.00')
       the first price is kept.
    2. 'Quantity (ml)'  -> '<number> ml' or '<number> g'. Units glued to the
       number ('250ml'), decimals ('0.5 L') and cl / l / kg are converted.
       A value without a recognizable unit falls back to '<first integer> g',
       as before.
    3. 'Ages involved'  -> the default age ranges when the 'EN' list is
       missing, empty or [""].
    4. 'Key ingredients' -> at most the first 8 entries of the 'EN' list.

    Args:
        extraction_dict: parsed extraction (nested dict). It is NOT modified;
            missing 'Product Info' / 'Product Sheet' / 'Ages involved' levels
            are created in the returned copy only.

    Returns:
        dict: the standardized copy.
    """
    # Function-scope import: `copy` is not among the notebook's top-level imports.
    import copy

    # A shallow .copy() would share the nested sheet with the caller's dict.
    standardized = copy.deepcopy(extraction_dict)
    sheet = _child_dict(_child_dict(standardized, 'Product Info'), 'Product Sheet')

    # 1. Price standardization
    price = sheet.get('Price (euros)', '')
    if price:
        cleaned_price = _standardize_price(price)
        sheet['Price (euros)'] = cleaned_price
        print(f"üí∞ Price standardized: '{price}' ‚Üí '{cleaned_price}'")

    # 2. Quantity standardization (left untouched when it contains no digits)
    quantity = sheet.get('Quantity (ml)', '')
    if quantity:
        standardized_quantity = _standardize_quantity(quantity)
        if standardized_quantity is not None:
            sheet['Quantity (ml)'] = standardized_quantity
            print(f"‚öñÔ∏è  Quantity standardized: '{quantity}' ‚Üí '{standardized_quantity}'")

    # 3. Ages standardization (never empty)
    ages_field = _child_dict(sheet, 'Ages involved')
    ages = ages_field.get('EN', [])
    if not ages or ages == [""]:
        ages_field['EN'] = [
            "6-12 years", "13-17 years", "18-24 years",
            "25-44 years", "45-64 years", "65 years and over"
        ]
        print("üë• Ages: Default ranges applied")

    # 4. Ingredients limitation (keep at most 8)
    ingredients_field = sheet.get('Key ingredients') or {}
    ingredients = ingredients_field.get('EN') or []
    if len(ingredients) > 8:
        ingredients_field['EN'] = ingredients[:8]
        print("üß™ Ingredients limited to 8 main actives")

    return standardized


# A number directly followed (optionally after spaces) by a volume or mass unit,
# e.g. "250ml", "0,5 L", "100 gr". The look-ahead stops "2 large" matching as litres.
_QUANTITY_WITH_UNIT = re.compile(
    r'(\d+(?:[.,]\d+)?)\s*'
    r'(millilit(?:er|re)s?|ml|centilit(?:er|re)s?|cl|lit(?:er|re)s?|l|'
    r'kilograms?|kg|gram(?:me)?s?|gr|g)(?![a-z])',
    re.IGNORECASE,
)

# First letter of the matched unit -> (output unit, multiplier).
_UNIT_BY_INITIAL = {
    'm': ('ml', 1),
    'c': ('ml', 10),
    'l': ('ml', 1000),
    'k': ('g', 1000),
    'g': ('g', 1),
}


def _child_dict(parent, key):
    """Return parent[key], creating an empty dict there when it is missing or None."""
    child = parent.get(key)
    if child is None:
        child = parent[key] = {}
    return child


def _standardize_price(raw_price):
    """Reduce a price text to digits with '.' as the decimal separator."""
    cleaned = re.sub(r'[^\d.,-]', '', str(raw_price))
    # Price range: keep the first price; drop stray leading/trailing punctuation.
    cleaned = cleaned.split('-')[0].strip('.,')

    last_comma = cleaned.rfind(',')
    last_dot = cleaned.rfind('.')
    if last_comma != -1 and last_dot != -1:
        # With both separators present, the right-most one is the decimal mark.
        if last_comma > last_dot:
            cleaned = cleaned.replace('.', '').replace(',', '.')
        else:
            cleaned = cleaned.replace(',', '')
    elif last_comma != -1:
        cleaned = cleaned.replace(',', '.')
    return cleaned


def _standardize_quantity(raw_quantity):
    """Return '<number> ml' or '<number> g', or None when the text has no digits."""
    text = str(raw_quantity)

    match = _QUANTITY_WITH_UNIT.search(text)
    if match:
        unit, factor = _UNIT_BY_INITIAL[match.group(2)[0].lower()]
        # round() removes float noise introduced by the unit conversion.
        amount = round(float(match.group(1).replace(',', '.')) * factor, 3)
        number = str(int(amount)) if amount == int(amount) else str(amount)
        return f"{number} {unit}"

    # No number+unit pair found: keep the previous behaviour (first integer,
    # unit searched anywhere in the text, grams as the last resort).
    digits = re.findall(r'\d+', text)
    if not digits:
        return None
    number = digits[0]
    if re.search(r'\b(ml|milliliter)\b', text, re.IGNORECASE):
        return f"{number} ml"
    if re.search(r'\b(l|liter)\b', text, re.IGNORECASE):
        return f"{int(number) * 1000} ml"
    return f"{number} g"

def validate_all_fields(extraction_dict):
    """
    Validate the main fields of an extraction and print a summary.

    Checks performed on extraction_dict['Product Info']['Product Sheet']
    (missing or None levels are treated as empty):

    - 'Price (euros)', 'Quantity (ml)', 'Ages involved', 'Key ingredients'
      and 'Category' must be populated. A dict counts as populated only if
      its 'EN' entry is; a list only if it holds at least one non-blank item.
    - The price must not contain one of the listed currency symbols.
    - The quantity must contain a number followed by 'ml' or 'g'.

    Non-string prices/quantities (e.g. JSON numbers) are converted with str()
    before the format checks instead of raising.

    Args:
        extraction_dict: parsed extraction (nested dict).

    Returns:
        bool: True when no issue was found, False otherwise.
    """
    sheet = (extraction_dict.get('Product Info') or {}).get('Product Sheet') or {}
    issues = []

    # Critical field checks
    fields_to_check = [
        ('Price (euros)', 'Price missing'),
        ('Quantity (ml)', 'Quantity missing'),
        ('Ages involved', 'Ages empty'),
        ('Key ingredients', 'No ingredients'),
        ('Category', 'Category missing')
    ]
    for field, issue in fields_to_check:
        if _is_blank_value(sheet.get(field)):
            issues.append(issue)

    # Format specific validations
    price = sheet.get('Price (euros)', '')
    if price and any(char in str(price) for char in ['‚Ç¨', '$', '¬£']):
        issues.append("Price contains currency symbols")

    quantity = sheet.get('Quantity (ml)', '')
    # Require "<digit><optional space>ml|g" rather than a bare substring test,
    # which accepted any text that merely contained the letter "g".
    if quantity and not re.search(r'\d\s*(?:ml|g)\b', str(quantity), re.IGNORECASE):
        issues.append("Quantity missing units")

    if issues:
        print(f"‚ö†Ô∏è  Validation issues: {', '.join(issues)}")
        return False

    print("‚úÖ All fields validated successfully")
    return True


def _is_blank_value(value):
    """Return True when a field value carries no usable content."""
    if isinstance(value, dict):
        return _is_blank_value(value.get('EN'))
    if isinstance(value, str):
        return not value.strip()
    if isinstance(value, (list, tuple)):
        return all(_is_blank_value(item) for item in value)
    return not value

In [22]:
# === ENHANCED PROMPT TEMPLATE ===
# Configuration
# Chat-completion model name; same value as the default `model` argument of completeChat.
completion_model = "gpt-4o-mini"
# Embedding model name; same value as the default `model` argument of embedText.
embedding_model = "text-embedding-ada-002"
# System-message text: the processing functions further down pass it to completeChat as `style`.
style = "You are a precise cosmetics product data extractor."

# Example output structure
# Sample of the nested "Product Info" -> "Product Sheet" JSON layout, kept as a raw string.
# NOTE(review): neither prompt template in this part of the notebook has an {example}
# placeholder, so this string does not appear to be sent to the model - confirm before relying on it.
example = """

{
	"Product Info" : {
		"Product Sheet" : {
			"Brand" : "Elola Beaut√©",
			"Product name" : "Shampoing Boucl√©s",
			"Marketing Description" : {
				"EN" : "Silk shampoo that regenerates and gently cleanses curly and curly hair"
			},
			"Key ingredients" : {
				"EN" : [
					"Silk"
				]
			},
            "Price (euros)" : "28.99",
			"Quantity (ml)" : "500 ml",
			"Category" : {
				"EN" : "Shampoo"
			},
            "Ages involved" : {
				"EN" : [
					"13-17 years",
					"18-24 years",
					"25-44 years",
					"45-64 years",
					"65 years and over"
				]
			},
            "Suitable for pregnant women?" : {
				"EN" : [
					"Yes"
				]
			},
            "Compatible with allergies?" : {
				"EN" : [
					"Yes"
				]
			},
            "q001" : {
				"EN" : [
					"3A",
					"3B",
					"3C",
					"4A",
					"4B",
					"4C"
				]
			},
            "q002" : {
				"EN" : [
					"Natural"
				]
			},
			"q003" : {
				"EN" : [
					"Curl definition",
					"Healthy hair"
				]
			},
            "q004" : {
				"EN" : [
					"Dryness",
					"Frizz"
				]
			},
            "q005" : {
				"EN" : [
					"Oily",
					"Flaky",
					"Sensitive",
					"Dandruff",
					"Dermatitis",
					"Psoriasis"
				]
			}
		}
	}
}

"""

# Questionnaire definition
# JSON text listing the allowed answers for questions q001-q005 and for the three
# "contraindications" fields. It is kept as a string and interpolated verbatim into the
# prompt through the {questionnaire} placeholder.
questionnaire = """

{
	"questions" : [
		{
			"label" : "q001",
			"question" : {
				"EN" : "Texture(s) concerned"
			},
			"answers" : {
				"EN" : ["1A", "1B", "1C", "2A", "2B", "2C", "3A", "3B", "3C", "4A", "4B", "4C"]
			}
		},
		{
			"label" : "q002",
			"question" : {
				"EN" : "Condition(s)"
			},
			"answers" : {
				"EN": [
					"Natural",
					"Straightened/chemically treated",
					"In transition",
					"Locs",
					"Braids"
				]
			}
		},
		{
			"label" : "q003",
			"question" : {
				"EN" : "Desired objective"
			},
			"answers" : {
				"EN": [
					"Curl definition",
					"Length retention",
					"Moisture retention",
					"Shine enhancement",
					"Healthy heat styling",
					"Colour-treated hair care",
					"Manageability",
					"Stronger hair",
					"Volume enhancement",
					"Healthy hair",
					"Shrinkage",
					"None"
				  ]
			}
		},
		{
			"label" : "q004",
			"question" : {
				"EN" : "Problem encountered"
			},
			"answers" : {
				"EN": [
					"Product build-up",
					"Dryness",
					"Greasy hair",
					"Breakage",
					"Frizz",
					"Hair loss",
					"Dull hair",
					"Porous hair",
					"Heat damage",
					"Physical damage(pulling)",
					"Hair transition",
					"Colour change",
					"Manageability",
					"Thinning hair",
					"Weak edges",
					"None"
				  ]
			},
			"max_selections" : 3,
			"importance" : 4,
			"Tag" : "Hair Challenges"
		},
		{
			"label" : "q005",
			"question" : {
				"EN" : "Suitable scalp"
			},
			"answers" : {
				"EN": [
					"Dry",
					"Oily",
					"Flaky",
					"Sensitive",
					"Dandruff",
					"Dermatitis",
					"Alopecia",
					"Psoriasis",
					"None"
				]
			}
		}
	],
	"contraindications" : [
		{
			"contraindication" : "Ages involved",
			"answers" : {
				"EN" : [
					"0-1 year",
					"2-5 years",
					"6-12 years",
					"13-17 years",
					"18-24 years",
					"25-44 years",
					"45-64 years",
					"65 years and over"
                ]
			}
		},
		{
			"contraindication" : "Suitable for pregnant women?",
			"answers" : {
				"EN" : [
					"Yes",
					"No"
				]
			}
		},
		{
			"contraindication" : "Compatible with allergies?",
			"answers" : {
				"EN" : [
					"Yes",
					"No"
				]
			}
		}
	]
}

"""

# Hair type dictionary
# Maps each hair-type code (1A-4C) to an English description. Despite the name this is a
# string (Python-dict syntax with single quotes, not JSON); it is interpolated verbatim
# into the prompt through the {hair_type_dict_en} placeholder.
hair_type_dict_en = """

{
        '1A': 'straight and fine, known for its sleekness and smooth texture but may lack volume and get oily quickly',
        '1B': 'straight with some body, which holds styles better than finer hair and adds a bit more volume',
        '1C': 'straight with texture and body, making it versatile but prone to frizz in humid conditions',
        '2A': 'soft, loose waves that give your hair a gentle texture without too much frizz',
        '2B': 'wavy with more defined curls, giving your hair great texture and body but prone to frizz',
        '2C': 'wavy with thick, textured waves that bring volume and require moisture to maintain definition',
        '3A': 'curly with loose, well-defined curls that offer bounce and texture, requiring hydration for best results',
        '3B': 'curly with tighter ringlets that provide volume and definition but often need moisture to reduce frizz',
        '3C': 'curly with tight, springy curls that offer great texture but can shrink when dry, requiring intense moisture',
        '4A': 'coily with tight, well-defined curls that need deep hydration to avoid dryness and maintain strength',
        '4B': 'coily with less defined curls, offering volume and versatility but requiring moisture for definition',
        '4C': 'coily with very tight, zigzag curls, which thrive on intense moisture and need careful styling'
}

"""

# Enhanced prompt template with systematic fixes
# str.format() template: {hair_type_dict_en}, {questionnaire} and {product_info} are the only
# placeholders; the literal JSON braces are doubled ({{ }}) so .format() leaves them intact.
# NOTE: the "OPTIMAL EXTRACTION PROMPT TEMPLATE" cell further down assigns
# `enhanced_prompt_template` again, so on a top-to-bottom run that later text is the one used.
enhanced_prompt_template = """
Your task is to extract and return **only** the following product information from the provided product description text below.
Use only the information that is **explicitly mentioned**.
Do **not guess** or infer any data.
Use only the exact values found in the allowed choices from the questionnaire or hair type dictionary.

CRITICAL BUSINESS RULES - BASED ON 4-REALITY ANALYSIS:
1. AGES INVOLVED: Never leave empty. Use default ranges if not specified: ["6-12 years", "13-17 years", "18-24 years", "25-44 years", "45-64 years", "65 years and over"]
2. PRICE: Extract numeric value only as "XX.XX" without symbols (e.g., "24.90" not "24.90‚Ç¨")
3. QUANTITY: Always include units as "XXX ml" or "XXX g" (e.g., "200 ml" not "200")
4. INGREDIENTS: Extract 3-8 main active ingredients, exclude water and common preservers
5. SAFETY: Default to "No" for pregnancy unless explicitly stated, default to "No" for allergies unless explicitly stated

Only include these fields in the output:

- Brand (Analyse all the given information and give the real brand of this product.)
- Product name
- Marketing Description (Extract the full sentence(s) that describe the product's benefits or function. Do not rephrase or summarize.)
- Key ingredients (Return the list of ingredients exactly as they appear. If no INCI list, extract only key active ingredients explicitly mentioned. Limit to 3-8 main ingredients.)
- Price (euros) (Format: "XX.XX" numeric only without symbols. Remove all currency symbols.)
- Quantity (ml) (Format: "XXX ml" or "XXX g" always with units. Convert liters to milliliters.)
- Category
- Ages involved (CRITICAL: Never leave empty. Use default ranges if not explicitly specified.)
- Suitable for pregnant women? (Must be either "Yes" or "No". Default to "No" if not explicitly mentioned.)
- Compatible with allergies? (Must be either "Yes" or "No". Default to "No" if not explicitly mentioned.)
- q001 ‚Äì Hair texture(s) (Must match exactly values from hair type dictionary. Only include explicitly mentioned textures.)
- q002 ‚Äì Hair condition(s) (Must match predefined options. Only include explicitly mentioned conditions.)
- q003 ‚Äì Desired objective(s) (Must match questionnaire options. Only include explicitly mentioned objectives.)
- q004 ‚Äì Hair problem(s) (Must match questionnaire options. Only include explicitly mentioned problems.)
- q005 ‚Äì Suitable scalp type(s) (Only include if explicitly mentioned in product information.)

Return your result in this exact JSON format:

{{
  "Product Info": {{
    "Product Sheet": {{
      "Brand": "...",
      "Product name": "...",
      "Marketing Description": {{
        "EN": "..."
      }},
      "Key ingredients": {{
        "EN": [
          "..."
        ]
      }},
      "Price (euros)": "...",
      "Quantity (ml)": "...",
      "Category": {{
        "EN": "..."
      }},
      "Ages involved": {{
        "EN": [
          "..."
        ]
      }},
      "Suitable for pregnant women?": {{
        "EN": ["Yes" or "No"]
      }},
      "Compatible with allergies?": {{
        "EN": ["Yes" or "No"]
      }},
      "q001": {{
        "EN": [
          "..."
        ]
      }},
      "q002": {{
        "EN": [
          "..."
        ]
      }},
      "q003": {{
        "EN": [
          "..."
        ]
      }},
      "q004": {{
        "EN": [
          "..."
        ]
      }},
      "q005": {{
        "EN": [
          "..."
        ]
      }}
    }}
  }}
}}

Use only values that appear in the following references:
- Hair types (q001): {hair_type_dict_en}
- Questionnaire options: {questionnaire}

Source product description: {product_info}

Return only the JSON output. Do not include comments, explanations, or introductory text. Do not infer or fabricate data.
"""

# Confirmation messages only; they do not inspect the template.
print("‚úÖ Enhanced prompt template loaded with systematic fixes")
print("üìã Includes critical business rules based on 4-reality analysis")
print("üîß Addresses: Empty ages, price formatting, quantity units, ingredient limits")

‚úÖ Enhanced prompt template loaded with systematic fixes
üìã Includes critical business rules based on 4-reality analysis
üîß Addresses: Empty ages, price formatting, quantity units, ingredient limits


## Focused Enhancement: Price & Quantity + Ages Involved + 2 boolean fields + q001-q005

In [47]:
# OPTIMAL EXTRACTION PROMPT TEMPLATE
# str.format() template used by the processing functions below. The only placeholders are
# {hair_type_dict_en}, {questionnaire} and {product_info}; the literal JSON braces in the
# output skeleton are doubled ({{ }}) so .format() leaves them intact.
# This assignment replaces the template defined in the earlier "ENHANCED PROMPT TEMPLATE" cell.
enhanced_prompt_template = """
Your task is to extract and return **only** the following product information from the provided product description text below.
Use only the information that is **explicitly mentioned**.
Do **not guess** or infer any data.
Use only the exact values found in the allowed choices from the questionnaire or hair type dictionary.

CRITICAL BUSINESS RULES - PRECISE EXTRACTION:

1. PRICE (euros):
   - Format: "XX.XX" numeric only, no symbols
   - Examples: "14.40" not "14.40‚Ç¨", "24.90" not "24,90 ‚Ç¨"
   - Remove all currency symbols, keep only numbers and decimal point
   - If multiple prices, use the main product price only

2. QUANTITY (ml):
   - Format: "XXX ml" or "XXX g" always with units
   - Examples: "200 ml" not "200", "100 g" not "100gr"
   - Convert liters to ml: "1 L" ‚Üí "1000 ml", "0.5 L" ‚Üí "500 ml"
   - If no quantity found, leave as ""

3. AGES INVOLVED (CRITICAL: Never leave empty):
   - DEFAULT: ["6-12 years", "13-17 years", "18-24 years", "25-44 years", "45-64 years", "65 years and over"]
   - APPLY DEFAULTS UNLESS explicit age restrictions are mentioned
   - No age mention in product description ‚Üí USE FULL DEFAULT RANGES

   - Only modify defaults if explicit age mention:
     "for children" or "for kids" or "children's" ‚Üí ["6-12 years"]
     "for adults" or "adults only" ‚Üí ["18-24 years", "25-44 years", "45-64 years", "65 years and over"]
     "for babies" or "for infants" or "baby" ‚Üí ["0-1 year", "2-5 years"]
     "for teenagers" or "for teens" or "teen" ‚Üí ["13-17 years"]

     # Mentions avec √¢ges sp√©cifiques
     "from X years" or "over X years" or "X+ years" or "more than X years" ‚Üí include all ranges from that age
       Example: "from 12 years" ‚Üí ["13-17 years", "18-24 years", "25-44 years", "45-64 years", "65 years and over"]
       Example: "from 18 years" ‚Üí ["18-24 years", "25-44 years", "45-64 years", "65 years and over"]

     "up to X years" or "under X years" or "less than X years" ‚Üí include all ranges up to that age
       Example: "up to 12 years" ‚Üí ["0-1 year", "2-5 years", "6-12 years"]
       Example: "under 18 years" ‚Üí ["0-1 year", "2-5 years", "6-12 years", "13-17 years"]

     # Mentions en mois
     "from X months" ‚Üí convert to years and apply same logic
       Example: "from 6 months" ‚Üí ["0-1 year", "2-5 years", "6-12 years", "13-17 years", "18-24 years", "25-44 years", "45-64 years", "65 years and over"]
       Example: "up to 18 months" ‚Üí ["0-1 year"]

     # Combinaisons d'√¢ges
     "from X to Y years" ‚Üí include all ranges between X and Y
       Example: "from 6 to 12 years" ‚Üí ["6-12 years"]
       Example: "from 18 to 45 years" ‚Üí ["18-24 years", "25-44 years"]

     # Exclusions explicites
     "not for children" or "adults only" ‚Üí ["18-24 years", "25-44 years", "45-64 years", "65 years and over"]
     "not for babies" ‚Üí ["6-12 years", "13-17 years", "18-24 years", "25-44 years", "45-64 years", "65 years and over"]

   - "all ages" or "for all ages" or "everyone" ‚Üí use all default ranges
   - No age restrictions mentioned ‚Üí USE FULL DEFAULT RANGES (CRITICAL RULE)

4. PREGNANCY SAFETY:
   - Default: "No" (conservative safety approach)
   - Only "Yes" if explicit mention:
     "pregnancy safe" or "safe for pregnancy"
     "safe for pregnant women" or "safe during pregnancy"
     "suitable for pregnant women" or "suitable during pregnancy"
     "pregnancy-friendly" or "pregnancy approved"
     "recommended for pregnant women"
     "formulated for pregnancy" or "pregnancy formula"
     "obstetrician approved" or "gynecologist approved"
     "maternity safe" or "safe for expecting mothers"
     "baby-safe" (implies pregnancy safety)
     "tested for pregnancy safety"

   - Always "No" (never infer safety):
     "natural ingredients" or "100% natural"
     "organic" or "certified organic"
     "chemical-free" or "toxic-free"
     "gentle formula" or "mild formula"
     "hypoallergenic" or "dermatologist tested"
     "vegan" or "cruelty-free"
     "no parabens" or "no sulfates"
     "herbal" or "botanical"
     "baby shampoo" or "children's product"

   - Ambiguous terms that require other explicit safety mention:
     "family safe" ‚â† "Yes" (unless combined with pregnancy mention)
     "all skin types" ‚â† "Yes"
     "sensitive skin" ‚â† "Yes"

5. ALLERGY COMPATIBILITY:
   - Default: "Yes" (natural products generally safe)
   - Only "No" if explicit warning:
     "not for allergic" or "not suitable for allergies"
     "avoid if allergic" or "avoid if you have allergies"
     "contains allergens" or "may contain allergens"
     "allergy warning" or "allergen warning"
     "not for sensitive skin" (implies allergy risk)
     "contains nuts" or "contains nut oils"
     "contains essential oils" (if presented as warning)
     "patch test recommended" (implies potential reactions)
     "may cause reactions" or "may cause allergic reactions"
     "consult doctor if allergic"

   - Always "Yes" (safety indicators):
     "hypoallergenic" or "allergy tested"
     "fragrance-free" or "unscented"
     "100% natural" or "all natural"
     "dermatologist tested" or "clinically tested"
     "sensitive skin formula"
     "gentle formula" or "mild formula"
     "non-irritating" or "non-allergenic"
     "free from common allergens"
     "vegan" or "cruelty-free"
     "organic" or "certified organic"

   - Specific ingredient warnings that trigger "No":
     "contains fragrance" (only if presented as warning)
     "contains parabens" (only if presented as warning)
     "contains sulfates" (only if presented as warning)
     "contains [specific known allergen]" as caution

6. HAIR TEXTURES (q001):
   - Extract ONLY explicitly mentioned textures
   - PRECISE MAPPING:
     "straight hair" or "type 1" or "1a/1b/1c" ‚Üí ["1A", "1B", "1C"]
     "wavy hair" or "waves" or "type 2" or "2a/2b/2c" ‚Üí ["2A", "2B", "2C"]
     "curly hair" or "curls" or "type 3" or "3a/3b/3c" ‚Üí ["3A", "3B", "3C"]
     "coily hair" or "coils" or "kinky hair" or "type 4" or "4a/4b/4c" ‚Üí ["4A", "4B", "4C"]
     "frizzy hair" or "frizz" ‚Üí ["3A", "3B", "3C", "4A", "4B", "4C"]
     "afro hair" or "afro-textured" ‚Üí ["4A", "4B", "4C"]

   - Combined mentions:
     "wavy to curly" ‚Üí ["2A", "2B", "2C", "3A", "3B", "3C"]
     "curly to coily" ‚Üí ["3A", "3B", "3C", "4A", "4B", "4C"]
     "wavy to coily" ‚Üí ["2A", "2B", "2C", "3A", "3B", "3C", "4A", "4B", "4C"]

   - "all hair types" or "for all hair" or "universal" ‚Üí all 12 textures
   - Specific number mentions:
     "for type 3 and 4 hair" ‚Üí ["3A", "3B", "3C", "4A", "4B", "4C"]
     "for 2a to 3c hair" ‚Üí ["2A", "2B", "2C", "3A", "3B", "3C"]

   - No explicit mention ‚Üí []

7. HAIR CONDITIONS (q002):
   - Extract ONLY explicitly mentioned conditions
   - PRECISE MAPPING:
     "natural hair" or "virgin hair" or "unprocessed hair" ‚Üí ["Natural"]
     "chemically treated" or "relaxed hair" or "even relaxed hair" or "straightened hair" or "permed hair" or "color-treated hair" or "bleached hair" or "keratin treated" ‚Üí ["Straightened/chemically treated"]
     "transitioning hair" or "in transition" or "growing out relaxer" or "growing out color" ‚Üí ["In transition"]
     "protective styles" or "locs" or "braids" or "dreadlocks" or "twists" or "weaves" or "extensions" ‚Üí ["Locs", "Braids"]

   - Specific product mentions:
     "for relaxed and natural hair" ‚Üí ["Natural", "Straightened/chemically treated"]
     "ideal for transitioning hair" ‚Üí ["In transition"]
     "perfect for braids and locs" ‚Üí ["Locs", "Braids"]
     "even relaxed hair" ‚Üí ["Straightened/chemically treated"] (CRITICAL FIX)

   - Multiple conditions:
     "for natural, relaxed, and transitioning hair" ‚Üí ["Natural", "Straightened/chemically treated", "In transition"]
     "for both virgin and color-treated hair" ‚Üí ["Natural", "Straightened/chemically treated"]

   - CRITICAL BUSINESS RULE:
     If product mentions "all hair types" but no specific conditions ‚Üí [] (empty)
     Only extract when specific hair conditions are explicitly mentioned

   - No explicit mention ‚Üí []

8. DESIRED OBJECTIVES (q003) vs PROBLEMS (q004):
   - q003 OBJECTIVES: Extract when product PROMISES benefits:
     "promotes growth" or "stimulates growth" or "encourages growth" ‚Üí ["Healthy hair"]
     "strengthens" or "fortifies" or "reinforces" or "reduces breakage" ‚Üí ["Stronger hair"]
     "adds shine" or "radiant shine" or "enhances shine" or "adds gloss" ‚Üí ["Shine enhancement"]
     "adds moisture" or "hydrating" or "moisturizing" or "hydrates" ‚Üí ["Moisture retention"]
     "defines curls" or "curl definition" or "enhances curls" ‚Üí ["Curl definition"]
     "volume" or "adds volume" or "volumizing" ‚Üí ["Volume enhancement"]
     "length retention" or "retains length" ‚Üí ["Length retention"]
     "manageability" or "easier to manage" ‚Üí ["Manageability"]
     "heat protection" or "heat styling" ‚Üí ["Healthy heat styling"]
     "color protection" or "color care" ‚Üí ["Colour-treated hair care"]

   - q004 PROBLEMS: Extract when product SOLVES issues:
     "dryness" or "dry hair" or "dehydrated" ‚Üí ["Dryness"]
     "breakage" or "brittle" or "brittleness" or "split ends" ‚Üí ["Breakage"]  # CRITICAL FIX
     "frizz" or "frizzy" or "unruly hair" ‚Üí ["Frizz"]
     "dull hair" or "lack of shine" or "lifeless hair" ‚Üí ["Dull hair"]
     "hair loss" or "thinning" or "shedding" ‚Üí ["Hair loss"]
     "greasy hair" or "oily hair" ‚Üí ["Greasy hair"]
     "product build-up" or "build-up" ‚Üí ["Product build-up"]
     "porous hair" or "high porosity" ‚Üí ["Porous hair"]
     "heat damage" ‚Üí ["Heat damage"]
     "weak edges" or "thinning edges" ‚Üí ["Weak edges"]

   - CLEAR SEPARATION:
     "reduces breakage" ‚Üí q003 ["Stronger hair"] (OBJECTIVE - strengthens)
     "repairs breakage" ‚Üí q004 ["Breakage"] (PROBLEM - solves breakage)
     "adds moisture to dry hair" ‚Üí q003 ["Moisture retention"] AND q004 ["Dryness"]
     "revives dull hair" ‚Üí q003 ["Shine enhancement"] AND q004 ["Dull hair"]
     "reducing dryness and brittleness" ‚Üí q004 ["Dryness", "Breakage"]  # CRITICAL FIX

   - CRITICAL BUSINESS RULES:
     "brittleness" always maps to "Breakage" in q004
     "reducing [problem]" ‚Üí extract problem in q004
     "promoting [benefit]" ‚Üí extract objective in q003

   - No explicit mention ‚Üí [] for both

9. SCALP CONDITIONS (q005):
   - Extract ONLY if explicitly mentioned with scalp context
   - PRECISE MAPPING:
     "dry scalp" or "for dry scalp" or "dryness scalp" or "scalp dryness" ‚Üí ["Dry"]
     "oily scalp" or "for oily scalp" or "greasy scalp" or "scalp oiliness" ‚Üí ["Oily"]
     "sensitive scalp" or "for sensitive scalp" or "irritated scalp" or "scalp sensitivity" ‚Üí ["Sensitive"]
     "dandruff" or "against dandruff" or "anti-dandruff" or "flakes" or "scalp dandruff" ‚Üí ["Dandruff"]
     "flaky scalp" or "flaky" or "scalp flaking" or "scalp flakes" ‚Üí ["Flaky"]
     "scalp psoriasis" or "psoriasis" or "psoriatic scalp" ‚Üí ["Psoriasis"]
     "scalp dermatitis" or "dermatitis" or "seborrheic dermatitis" or "scalp eczema" ‚Üí ["Dermatitis"]
     "scalp alopecia" or "alopecia" or "hair loss scalp" ‚Üí ["Alopecia"]
     "itchy scalp" or "scalp itching" or "scalp irritation" ‚Üí ["Sensitive"]
     "scalp eczema" or "eczema" ‚Üí ["Dermatitis"]

   - CLEAR CONTEXT REQUIRED:
     Must include "scalp" or clear scalp-specific context
     "for dandruff control" ‚Üí ["Dandruff"] (clear scalp context)
     "relieves itchy scalp" ‚Üí ["Sensitive"]
     "soothes irritated scalp" ‚Üí ["Sensitive"]
     "scalp treatment" ‚Üí [] (too generic, no specific condition)
     "scalp care" ‚Üí [] (too generic, no specific condition)

   - NO INFERENCE - STRICT EXTRACTION ONLY:
     "for healthy hair" ‚â† scalp mention
     "promotes growth" ‚â† scalp mention
     "strengthens hair" ‚â† scalp mention
     "reduces dryness" ‚â† "Dry" for scalp (must specify "scalp dryness")
     "controls oil" ‚â† "Oily" for scalp (must specify "scalp oil")
     "calms irritation" ‚â† "Sensitive" for scalp (must specify "scalp irritation")

   - CRITICAL BUSINESS RULE:
     When multiple scalp conditions mentioned, extract all that apply
     "for dry and itchy scalp" ‚Üí ["Dry", "Sensitive"]
     "controls dandruff and flaking" ‚Üí ["Dandruff", "Flaky"]

   - No explicit scalp conditions mentioned ‚Üí [] (empty, not "None")

10. KEY INGREDIENTS:
    - Extract 3-8 main active ingredients
    - Exclude: water, common preservers, generic excipients
    - Prioritize marketing-active ingredients

FIELD-SPECIFIC EXTRACTION:

- Brand: Real brand from product information
- Product name: Complete marketing name
- Marketing Description: Full sentences, no rephrasing
- Category: Most specific applicable category
- All fields: Use only explicitly mentioned information

Return your result in this exact JSON format:

{{
  "Product Info": {{
    "Product Sheet": {{
      "Brand": "...",
      "Product name": "...",
      "Marketing Description": {{
        "EN": "..."
      }},
      "Key ingredients": {{
        "EN": [
          "..."
        ]
      }},
      "Price (euros)": "...",
      "Quantity (ml)": "...",
      "Category": {{
        "EN": "..."
      }},
      "Ages involved": {{
        "EN": [
          "..."
        ]
      }},
      "Suitable for pregnant women?": {{
        "EN": ["Yes" or "No"]
      }},
      "Compatible with allergies?": {{
        "EN": ["Yes" or "No"]
      }},
      "q001": {{
        "EN": [
          "..."
        ]
      }},
      "q002": {{
        "EN": [
          "..."
        ]
      }},
      "q003": {{
        "EN": [
          "..."
        ]
      }},
      "q004": {{
        "EN": [
          "..."
        ]
      }},
      "q005": {{
        "EN": [
          "..."
        ]
      }}
    }}
  }}
}}

Use only values that appear in the following references:
- Hair types (q001): {hair_type_dict_en}
- Questionnaire options: {questionnaire}

Source product description: {product_info}

Return only the JSON output. Do not include comments, explanations, or introductory text. Do not infer or fabricate data.
"""

# Confirmation banner only; it summarizes the rule groups above and does not inspect the template.
print("‚úÖ PERFECTED EXTRACTION PROMPT LOADED")
print("üéØ PRECISE FIELD MAPPING ACHIEVED:")
print("   1.  Price: 'XX.XX' numeric only")
print("   2.  Quantity: 'XXX ml/g' with units")
print("   3.  Ages: Smart defaults with explicit triggers")
print("   4.  Pregnancy: Conservative 'No' with exact triggers")
print("   5.  Allergies: Liberal 'Yes' with specific warnings")
print("   6.  Hair Textures: Precise semantic mapping")
print("   7.  Hair Conditions: Exact phrase matching")
print("   8.  Objectives vs Problems: Benefit/Solution separation")
print("   9.  Scalp: Explicit 'scalp' mention required")
print("   10. Ingredients: 3-8 main actives")

‚úÖ PERFECTED EXTRACTION PROMPT LOADED
üéØ PRECISE FIELD MAPPING ACHIEVED:
   1.  Price: 'XX.XX' numeric only
   2.  Quantity: 'XXX ml/g' with units
   3.  Ages: Smart defaults with explicit triggers
   4.  Pregnancy: Conservative 'No' with exact triggers
   6.  Hair Textures: Precise semantic mapping
   7.  Hair Conditions: Exact phrase matching
   8.  Objectives vs Problems: Benefit/Solution separation
   9.  Scalp: Explicit 'scalp' mention required
   10. Ingredients: 3-8 main actives


## Batch Processing Function

In [48]:
def process_product_batch(products_list, client, completion_model="gpt-4o-mini"):
    """
    Process multiple products through the enhanced extraction pipeline.

    For every product the notebook-level `enhanced_prompt_template` is filled
    with `hair_type_dict_en`, `questionnaire` and the product's
    'product_information', sent to `completeChat` (with the notebook-level
    `style` as system message), parsed as JSON, passed through
    `standardize_numeric_fields` and checked with `validate_extraction_quality`.

    NOTE: a later cell defines another `process_product_batch` with the same
    signature; on a top-to-bottom run that later definition replaces this one.

    Args:
        products_list: sized sequence of product dicts. 'product_information'
            is required; 'brand', 'name' and 'url' are copied into the result
            (None when absent).
        client: passed through to `completeChat`.
        completion_model: model name passed through to `completeChat`.

    Returns:
        list[dict]: one record per product, in input order. Every record has
        'product_index' (1-based), 'brand', 'name', 'url' and 'processed_at'
        (ISO timestamp). Successful records add 'raw_response',
        'standardized_response', 'extraction_time' (seconds) and 'is_valid';
        failed records add 'error' (the exception text) instead.
    """
    results = []
    total = len(products_list)

    for i, product in enumerate(products_list, 1):
        # .get() so a record lacking a descriptive key cannot abort the whole
        # batch (or make the error handler itself raise).
        brand = product.get('brand')
        name = product.get('name')
        url = product.get('url')

        print(f"\n{'='*60}")
        print(f"üîÑ PROCESSING PRODUCT {i}/{total}")
        print(f"üì¶ {brand} - {name}")
        print(f"{'='*60}")

        try:
            # Create product-specific prompt
            product_prompt = enhanced_prompt_template.format(
                hair_type_dict_en=hair_type_dict_en,
                questionnaire=questionnaire,
                product_info=product['product_information']
            )

            # Execute extraction (perf_counter: monotonic clock for intervals)
            start_time = time.perf_counter()
            raw_response = completeChat(product_prompt, style, client, completion_model)
            extraction_time = time.perf_counter() - start_time

            print(f"‚è±Ô∏è  LLM extraction: {extraction_time:.2f}s")

            # Chat models often wrap JSON in a Markdown code fence; unwrap it
            # so an otherwise valid answer is not recorded as a failure.
            json_text = raw_response.strip()
            fenced = re.match(r'^```(?:json)?\s*(.*?)\s*```$', json_text, re.DOTALL | re.IGNORECASE)
            if fenced:
                json_text = fenced.group(1)

            # Apply standardization
            extraction_dict = json.loads(json_text)
            standardized_dict = standardize_numeric_fields(extraction_dict)

            # Validate quality
            is_valid = validate_extraction_quality(standardized_dict)

            results.append({
                'product_index': i,
                'brand': brand,
                'name': name,
                'url': url,
                'raw_response': raw_response,
                'standardized_response': standardized_dict,
                'extraction_time': extraction_time,
                'is_valid': is_valid,
                'processed_at': datetime.now().isoformat()
            })

            print(f"‚úÖ Product {i} processed successfully")

        except Exception as e:
            # Deliberate best-effort: record the failure and continue with the next product.
            print(f"‚ùå Failed to process product {i}: {e}")
            results.append({
                'product_index': i,
                'brand': brand,
                'name': name,
                'url': url,
                'error': str(e),
                'processed_at': datetime.now().isoformat()
            })

    return results

## Individual Product Processing Function

In [49]:
# === INDIVIDUAL PRODUCT PROCESSING FUNCTION ===
def process_single_product(product, client, completion_model="gpt-4o-mini"):
    """
    Extract, standardize and validate the data of one product.

    The prompt is built from ``enhanced_prompt_template`` (filled with
    ``hair_type_dict_en``, ``questionnaire`` and the product's
    'product_information'), sent through ``completeChat`` together with the
    module-level ``style`` system message, and the reply is parsed as JSON
    before ``standardize_numeric_fields`` and ``validate_extraction_quality``
    are applied.

    Args:
        product: Mapping providing 'brand', 'name', 'url' and
            'product_information'.
        client: Client object forwarded to ``completeChat``.
        completion_model: Model name forwarded to ``completeChat``.

    Returns:
        dict: On success the keys are 'brand', 'name', 'url', 'raw_response',
        'standardized_response', 'extraction_time', 'is_valid' and
        'processed_at'. If any step inside the pipeline raises, the dict
        holds 'brand', 'name', 'url', 'error' (the exception text) and
        'processed_at' instead.
    """
    print(f"\n{'='*60}")
    print(f"üîÑ PROCESSING: {product['brand']} - {product['name']}")
    print(f"{'='*60}")

    try:
        # Fill the shared template with this product's raw information
        prompt_text = enhanced_prompt_template.format(
            hair_type_dict_en=hair_type_dict_en,
            questionnaire=questionnaire,
            product_info=product['product_information'],
        )

        # Time the LLM round-trip only
        call_started = time.time()
        llm_reply = completeChat(prompt_text, style, client, completion_model)
        elapsed = time.time() - call_started

        print(f"‚è±Ô∏è  LLM extraction: {elapsed:.2f}s")

        # Parse the reply, normalise it, then run the quality check
        normalized = standardize_numeric_fields(json.loads(llm_reply))
        passed_validation = validate_extraction_quality(normalized)

        outcome = dict(
            brand=product['brand'],
            name=product['name'],
            url=product['url'],
            raw_response=llm_reply,
            standardized_response=normalized,
            extraction_time=elapsed,
            is_valid=passed_validation,
            processed_at=datetime.now().isoformat(),
        )

        print(f"‚úÖ Successfully processed: {product['brand']} - {product['name']}")
        return outcome

    except Exception as exc:
        # Failures are reported in-band as an error record instead of propagating
        print(f"‚ùå Failed to process {product['brand']}: {exc}")
        return dict(
            brand=product['brand'],
            name=product['name'],
            url=product['url'],
            error=str(exc),
            processed_at=datetime.now().isoformat(),
        )

## Batch Processing with Individual Product Loop

In [50]:
# === BATCH PROCESSING WITH INDIVIDUAL PRODUCT LOOP ===
def process_product_batch(products_list, client, completion_model="gpt-4o-mini", delay_seconds=1):
    """
    Process several products one after another, each with its own prompt.

    Every product is handed to ``process_single_product``; the returned record
    gets a 1-based 'product_index' before it is collected.

    Args:
        products_list: Sized sequence of product mappings.
        client: Client object forwarded to ``process_single_product``.
        completion_model: Model name forwarded to ``process_single_product``.
        delay_seconds: Pause inserted between two consecutive products to
            avoid rate limiting. No pause is added after the last product.

    Returns:
        list: One result dict per product, in input order.
    """
    results = []
    total = len(products_list)

    for i, product in enumerate(products_list, 1):
        print(f"\nüì¶ PRODUCT {i}/{total}")

        # Process each product individually
        product_result = process_single_product(product, client, completion_model)
        product_result['product_index'] = i
        results.append(product_result)

        # Pause only between requests: sleeping after the final product
        # would just delay the return without protecting any API call.
        if i < total and delay_seconds > 0:
            time.sleep(delay_seconds)

    return results

In [51]:
# Example: Process first 3 products for testing
def run_test_batch():
    """
    Smoke-test the batch pipeline on the first three entries of ``all_products``.

    Uses the module-level ``client``, prints how many records succeeded
    (contain 'standardized_response') and failed (contain 'error'), and
    returns the list produced by ``process_product_batch``.
    """
    subset = all_products[:3]  # First 3 products
    subset_size = len(subset)
    print("üß™ STARTING TEST BATCH PROCESSING")
    print(f"üì¶ Processing {subset_size} products")

    outcomes = process_product_batch(subset, client)

    # Tally the two outcome kinds for the summary
    ok_count = sum(1 for outcome in outcomes if 'standardized_response' in outcome)
    error_count = sum(1 for outcome in outcomes if 'error' in outcome)

    print(f"\nüéØ TEST BATCH COMPLETE")
    print(f"‚úÖ Successful: {ok_count}/{subset_size}")
    print(f"‚ùå Failed: {error_count}/{subset_size}")

    return outcomes

## Enhanced Batch Processing for All Products

In [52]:
# ===  ENHANCED BATCH PROCESSING FOR ALL PRODUCTS ===
def process_all_products_together(client, completion_model="gpt-4o-mini", products=None):
    """
    Process a list of products in a single timed batch execution.

    For each product the prompt is built from ``enhanced_prompt_template``,
    sent through ``completeChat``, parsed as JSON, standardized with
    ``standardize_numeric_fields`` and checked with
    ``validate_extraction_quality``. A failure on one product is recorded as
    an error entry and does not stop the batch.

    Args:
        client: Client object forwarded to ``completeChat``.
        completion_model: Model name forwarded to ``completeChat``.
        products: Products to process; defaults to the module-level
            ``all_products`` list.

    Returns:
        tuple: ``(all_results, total_time)`` where ``all_results`` holds one
        dict per product (with 'standardized_response' on success or 'error'
        on failure, plus a 1-based 'product_index') and ``total_time`` is the
        wall-clock duration of the whole loop in seconds, pauses included.
    """
    if products is None:
        products = all_products
    # Progress messages use the real batch size instead of a hard-coded 10
    total = len(products)

    print(f"üöÄ PROCESSING ALL {total} PRODUCTS TOGETHER")
    print("=" * 60)

    all_results = []
    total_start_time = time.time()

    for i, product in enumerate(products, 1):
        print(f"\nüì¶ PROCESSING PRODUCT {i}/{total}: {product['brand']} - {product['name']}")
        print("-" * 50)

        try:
            # Create product-specific prompt
            product_prompt = enhanced_prompt_template.format(
                hair_type_dict_en=hair_type_dict_en,
                questionnaire=questionnaire,
                product_info=product['product_information']
            )

            # Execute extraction (only the LLM call is timed)
            start_time = time.time()
            raw_response = completeChat(product_prompt, style, client, completion_model)
            extraction_time = time.time() - start_time

            print(f"‚è±Ô∏è  LLM extraction: {extraction_time:.2f}s")

            # Apply standardization
            extraction_dict = json.loads(raw_response)
            standardized_dict = standardize_numeric_fields(extraction_dict)

            # Validate quality
            is_valid = validate_extraction_quality(standardized_dict)

            result = {
                'product_index': i,
                'brand': product['brand'],
                'name': product['name'],
                'url': product['url'],
                'raw_response': raw_response,
                'standardized_response': standardized_dict,
                'extraction_time': extraction_time,
                'is_valid': is_valid,
                'processed_at': datetime.now().isoformat()
            }

            all_results.append(result)
            print(f"‚úÖ Successfully processed: {product['brand']}")

        except Exception as e:
            # Keep the batch going: record the failure and move on
            print(f"‚ùå Failed to process {product['brand']}: {e}")
            error_result = {
                'product_index': i,
                'brand': product['brand'],
                'name': product['name'],
                'url': product['url'],
                'error': str(e),
                'processed_at': datetime.now().isoformat()
            }
            all_results.append(error_result)

        # Small delay to avoid rate limiting (not needed after the last product)
        if i < total:
            time.sleep(2)

    total_time = time.time() - total_start_time

    return all_results, total_time

## Comprehensive Results Analysis

In [53]:
# === COMPREHENSIVE RESULTS ANALYSIS ===
def analyze_batch_results(batch_results, total_time):
    """
    Print an analysis of a processed batch and return its headline statistics.

    A record counts as successful when it holds 'standardized_response' and
    as failed when it holds 'error'. For successful records the function
    reports how often each product-sheet field is filled; localized fields
    are read from their 'EN' entry.

    Args:
        batch_results: List of result dicts with at least 'brand' and 'name'.
            'product_index' is used for the listing when present, otherwise
            the 1-based position in the list is shown.
        total_time: Total processing time of the batch in seconds.

    Returns:
        dict: 'total_products', 'successful', 'failed', 'success_rate'
        (fraction between 0 and 1), 'total_time' and 'avg_time_per_product'.
        The two ratios are 0.0 for an empty batch.
    """
    def _product_sheet(result):
        # Walk to the product sheet, tolerating missing or non-dict levels
        node = result['standardized_response']
        for key in ('Product Info', 'Product Sheet'):
            node = node.get(key) if isinstance(node, dict) else None
        return node if isinstance(node, dict) else {}

    def _is_filled(product_sheet, key, localized):
        # Localized fields keep their value under an 'EN' sub-key
        value = product_sheet.get(key)
        if localized:
            value = value.get('EN') if isinstance(value, dict) else None
        return bool(value)

    print("\n" + "="*70)
    print("üìä COMPREHENSIVE BATCH RESULTS ANALYSIS")
    print("="*70)

    successful = [r for r in batch_results if 'standardized_response' in r]
    failed = [r for r in batch_results if 'error' in r]

    # Guard the ratios: an empty batch must not raise ZeroDivisionError
    total_products = len(batch_results)
    success_rate = len(successful) / total_products if total_products else 0.0
    avg_time_per_product = total_time / total_products if total_products else 0.0

    # Basic statistics
    print(f"üì¶ TOTAL PRODUCTS PROCESSED: {total_products}")
    print(f"‚úÖ SUCCESSFUL EXTRACTIONS: {len(successful)}")
    print(f"‚ùå FAILED EXTRACTIONS: {len(failed)}")
    print(f"üéØ SUCCESS RATE: {success_rate*100:.1f}%")
    print(f"‚è±Ô∏è  TOTAL PROCESSING TIME: {total_time:.2f}s")
    print(f"üìà AVERAGE TIME PER PRODUCT: {avg_time_per_product:.2f}s")

    # Field completion analysis
    if successful:
        print(f"\nüîç FIELD COMPLETION ANALYSIS:")
        print("-" * 40)

        # (label shown in the report, key in the product sheet, stored under 'EN'?)
        field_specs = [
            ('Ages involved', 'Ages involved', True),
            ('Price', 'Price (euros)', False),
            ('Quantity', 'Quantity (ml)', False),
            ('Ingredients', 'Key ingredients', True),
            ('Category', 'Category', True),
            ('Marketing Description', 'Marketing Description', True),
        ]
        sheets = [_product_sheet(result) for result in successful]

        for label, key, localized in field_specs:
            count = sum(1 for sheet in sheets if _is_filled(sheet, key, localized))
            percentage = (count / len(successful)) * 100
            print(f"  {label}: {count}/{len(successful)} ({percentage:.1f}%)")

    # Product-by-product summary
    print(f"\nüìã PRODUCT-BY-PRODUCT SUMMARY:")
    print("-" * 50)
    for position, result in enumerate(batch_results, 1):
        status = "‚úÖ SUCCESS" if 'standardized_response' in result else "‚ùå FAILED"
        time_str = f"{result.get('extraction_time', 0):.2f}s" if 'extraction_time' in result else "N/A"
        # Records coming straight from process_single_product carry no index
        index = result.get('product_index', position)
        print(f"  {index:2d}. {result['brand']:15} - {result['name']:30} [{status}] - {time_str}")

    return {
        'total_products': total_products,
        'successful': len(successful),
        'failed': len(failed),
        'success_rate': success_rate,
        'total_time': total_time,
        'avg_time_per_product': avg_time_per_product
    }

## Enhanced Export Functions

In [54]:
def export_individual_product(result, export_dir="individual_exports"):
    """
    Write one product's extraction result to its own timestamped JSON file.

    The file name is ``<timestamp>_<brand>_<name>.json``. Both brand and name
    are reduced to alphanumerics, spaces, '-' and '_' so that characters such
    as '/' or ':' cannot break the path.

    Args:
        result: Result dict with 'brand', 'name' and 'url'. The optional
            'standardized_response', 'extraction_time' and 'is_valid' entries
            default to ``{}``, ``0`` and ``False``.
        export_dir: Target directory, created when missing.

    Returns:
        str | None: Path of the written file, or None when the directory or
        the file could not be written (the error is printed).
    """
    def _safe_component(text):
        # Keep only characters that are safe inside a file name
        return "".join(c for c in str(text) if c.isalnum() or c in (' ', '-', '_')).rstrip()

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_brand = _safe_component(result['brand'])
    safe_name = _safe_component(result['name'])
    filename = f"{timestamp}_{safe_brand}_{safe_name}.json"
    filepath = os.path.join(export_dir, filename)

    export_data = {
        "metadata": {
            "export_timestamp": datetime.now().isoformat(),
            "brand": result['brand'],
            "product_name": result['name'],
            "url": result['url'],
            "extraction_time": result.get('extraction_time', 0),
            "is_valid": result.get('is_valid', False),
            "processing_version": "enhanced_individual_v1"
        },
        "extraction_results": result.get('standardized_response', {})
    }

    try:
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(export_dir, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)
        print(f"‚úÖ Individual export: {filepath}")
        return filepath
    except Exception as e:
        print(f"‚ùå Individual export failed: {e}")
        return None

def export_batch_results(batch_results, export_dir="batch_exports"):
    """
    Export every successful product on its own, then the whole batch as one JSON file.

    Args:
        batch_results: List of result dicts. Entries holding
            'standardized_response' are also exported one by one through
            ``export_individual_product`` into the same directory.
        export_dir: Target directory, created when missing.

    Returns:
        str | None: Path of the batch JSON file, or None when the batch could
        not be serialized or written (the error is printed).
    """
    os.makedirs(export_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Export individual products first so their paths can be listed in the metadata
    individual_files = []
    for result in batch_results:
        if 'standardized_response' in result:
            individual_file = export_individual_product(result, export_dir)
            if individual_file:
                individual_files.append(individual_file)

    # Export detailed batch JSON
    detailed_filepath = os.path.join(export_dir, f"{timestamp}_batch_extractions.json")

    export_data = {
        "metadata": {
            "export_timestamp": datetime.now().isoformat(),
            "total_products": len(batch_results),
            "successful_extractions": sum(1 for r in batch_results if 'standardized_response' in r),
            "failed_extractions": sum(1 for r in batch_results if 'error' in r),
            "individual_files": individual_files,
            "processing_version": "enhanced_individual_v1"
        },
        "batch_results": batch_results
    }

    try:
        # Serialize before opening the file so that a non-serializable value
        # cannot leave a truncated export behind.
        payload = json.dumps(export_data, indent=2, ensure_ascii=False)
        with open(detailed_filepath, 'w', encoding='utf-8') as f:
            f.write(payload)
        print(f"‚úÖ Batch results exported: {detailed_filepath}")
    except Exception as e:
        print(f"‚ùå Batch export failed: {e}")
        # Do not hand back the path of a file that was never written
        return None

    return detailed_filepath

def generate_standardization_report(batch_results, export_dir="reports"):
    """
    Write a plain-text report on age-field standardization and timing.

    The "before" count comes from re-parsing each 'raw_response'; responses
    that cannot be parsed or do not have the expected nesting are left out of
    that count. The "after" count is measured on each
    'standardized_response' instead of being assumed to be zero.

    Args:
        batch_results: List of result dicts as produced by the processing
            functions of this notebook.
        export_dir: Directory for the report, created when missing.

    Returns:
        str: Path of the written report file.
    """
    def _ages_empty(extraction):
        # True when the English age list is missing or empty
        ages = extraction.get('Product Info', {}).get('Product Sheet', {}).get('Ages involved', {}).get('EN', [])
        return not ages

    os.makedirs(export_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_filepath = os.path.join(export_dir, f"{timestamp}_standardization_report.txt")

    # Age field analysis, before (raw LLM JSON) and after (standardized dict)
    empty_ages_count = 0
    empty_after_count = 0
    for result in batch_results:
        if 'raw_response' in result:
            try:
                if _ages_empty(json.loads(result['raw_response'])):
                    empty_ages_count += 1
            except (ValueError, TypeError, AttributeError):
                # Invalid JSON (JSONDecodeError is a ValueError), a non-string
                # payload or unexpected nesting: skip this record
                pass
        if 'standardized_response' in result:
            try:
                still_empty = _ages_empty(result['standardized_response'])
            except AttributeError:
                # Unexpected nesting means no usable age list
                still_empty = True
            if still_empty:
                empty_after_count += 1

    with open(report_filepath, 'w', encoding='utf-8') as f:
        f.write("PRODUCT DATA EXTRACTION STANDARDIZATION REPORT\n")
        f.write("=" * 60 + "\n")
        f.write(f"Generated: {datetime.now().isoformat()}\n")
        f.write(f"Total Products Processed: {len(batch_results)}\n\n")

        f.write("AGE FIELD STANDARDIZATION:\n")
        f.write("-" * 40 + "\n")
        f.write(f"Empty age fields before standardization: {empty_ages_count}/{len(batch_results)}\n")
        f.write(f"Empty age fields after standardization: {empty_after_count}/{len(batch_results)}\n")
        if empty_after_count == 0:
            f.write("‚úÖ Age field standardization: 100% effective\n\n")
        else:
            # Only claim full effectiveness when it was actually measured
            f.write(f"Age field standardization incomplete: {empty_after_count} field(s) still empty\n\n")

        # Processing statistics
        successful = [r for r in batch_results if 'standardized_response' in r]
        if successful:
            total_extraction_time = sum(r.get('extraction_time', 0) for r in successful)
            avg_time = total_extraction_time / len(successful)
            f.write("PROCESSING STATISTICS:\n")
            f.write("-" * 40 + "\n")
            f.write(f"Average extraction time: {avg_time:.2f}s per product\n")
            f.write(f"Total processing time: {total_extraction_time:.2f}s\n")
            f.write(f"Success rate: {len(successful)}/{len(batch_results)} ({len(successful)/len(batch_results)*100:.1f}%)\n")

    print(f"üìã Standardization report: {report_filepath}")
    return report_filepath

In [55]:
# === COMPREHENSIVE EXPORT FOR ALL PRODUCTS ===
def export_all_products_together(batch_results, analysis_results, export_dir="all_products_export"):
    """
    Export a processed batch as a detailed JSON file, a CSV summary and per-product JSON files.

    Args:
        batch_results: List of result dicts. Each needs 'product_index',
            'brand', 'name', 'url' and 'processed_at', plus either
            'standardized_response' (success) or 'error' (failure).
        analysis_results: Statistics dict providing 'successful', 'failed',
            'success_rate', 'total_time' and 'avg_time_per_product'.
        export_dir: Directory that receives the exports; created if missing.
            Per-product files go to its 'individual_products' sub-directory.

    Returns:
        dict: 'detailed_json' (path of the detailed file; returned even when
        writing it failed, in which case the error was printed),
        'summary_csv' (path of the CSV, or None when no product succeeded),
        'individual_files_dir' and 'individual_files_count'.
    """
    def _safe_component(text):
        # Keep only characters that are safe inside a file name
        return "".join(c for c in str(text) if c.isalnum() or c in (' ', '-', '_')).rstrip()

    if not os.path.exists(export_dir):
        os.makedirs(export_dir, exist_ok=True)
        print(f"üìÅ Created export directory: {export_dir}")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # 1. Export detailed JSON with all products
    detailed_filepath = os.path.join(export_dir, f"{timestamp}_all_10_products_detailed.json")

    export_data = {
        "metadata": {
            "export_timestamp": datetime.now().isoformat(),
            "total_products": len(batch_results),
            "successful_extractions": analysis_results['successful'],
            "failed_extractions": analysis_results['failed'],
            "success_rate": analysis_results['success_rate'],
            "total_processing_time": analysis_results['total_time'],
            "average_time_per_product": analysis_results['avg_time_per_product'],
            "processing_version": "enhanced_batch_v1"
        },
        "analysis_summary": analysis_results,
        "products": []
    }

    # Add all products to export data
    for result in batch_results:
        product_data = {
            "product_index": result['product_index'],
            "brand": result['brand'],
            "name": result['name'],
            "url": result['url'],
            "processing_status": "success" if 'standardized_response' in result else "failed",
            "extraction_time": result.get('extraction_time', 0),
            "is_valid": result.get('is_valid', False),
            "processed_at": result['processed_at']
        }

        if 'standardized_response' in result:
            product_data["extraction_results"] = result['standardized_response']
        if 'error' in result:
            product_data["error"] = result['error']

        export_data["products"].append(product_data)

    try:
        with open(detailed_filepath, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)
        print(f"‚úÖ Detailed export (all 10 products): {detailed_filepath}")
    except Exception as e:
        print(f"‚ùå Detailed export failed: {e}")

    # 2. Export simplified CSV for quick analysis (successful products only)
    csv_data = []
    for result in batch_results:
        if 'standardized_response' in result:
            extraction = result['standardized_response']
            product_sheet = extraction.get('Product Info', {}).get('Product Sheet', {})

            csv_data.append({
                'product_index': result['product_index'],
                'brand': result['brand'],
                'name': result['name'],
                'price': product_sheet.get('Price (euros)', ''),
                'quantity': product_sheet.get('Quantity (ml)', ''),
                'ages_count': len(product_sheet.get('Ages involved', {}).get('EN', [])),
                'ingredients_count': len(product_sheet.get('Key ingredients', {}).get('EN', [])),
                'category': product_sheet.get('Category', {}).get('EN', ''),
                'extraction_time': result.get('extraction_time', 0),
                'is_valid': result.get('is_valid', False)
            })

    # Stays None when there is nothing to summarize
    csv_filepath = None
    if csv_data:
        csv_filepath = os.path.join(export_dir, f"{timestamp}_all_10_products_summary.csv")
        df = pd.DataFrame(csv_data)
        df.to_csv(csv_filepath, index=False, encoding='utf-8')
        print(f"üìä CSV summary export: {csv_filepath}")

    # 3. Export individual product files
    individual_dir = os.path.join(export_dir, "individual_products")
    os.makedirs(individual_dir, exist_ok=True)

    individual_files = []
    for result in batch_results:
        if 'standardized_response' in result:
            # Sanitize the brand as well as the name: a '/' or ':' in either
            # would otherwise produce an unusable path and lose the export.
            safe_brand = _safe_component(result['brand'])
            safe_name = _safe_component(result['name'])
            individual_filename = f"{result['product_index']:02d}_{safe_brand}_{safe_name}.json"
            individual_filepath = os.path.join(individual_dir, individual_filename)

            individual_data = {
                "metadata": {
                    "product_index": result['product_index'],
                    "brand": result['brand'],
                    "name": result['name'],
                    "url": result['url'],
                    "extraction_time": result.get('extraction_time', 0),
                    "is_valid": result.get('is_valid', False),
                    "processed_at": result['processed_at']
                },
                "extraction_results": result['standardized_response']
            }

            try:
                with open(individual_filepath, 'w', encoding='utf-8') as f:
                    json.dump(individual_data, f, indent=2, ensure_ascii=False)
                individual_files.append(individual_filename)
            except Exception as e:
                print(f"‚ùå Individual export failed for {result['brand']}: {e}")

    print(f"üìÅ Individual product files: {len(individual_files)} files in {individual_dir}")

    return {
        'detailed_json': detailed_filepath,
        'summary_csv': csv_filepath,
        'individual_files_dir': individual_dir,
        'individual_files_count': len(individual_files)
    }

## Comprehensive Standardization Report

In [56]:
# === COMPREHENSIVE STANDARDIZATION REPORT ===
def generate_comprehensive_report(batch_results, analysis_results, export_files, export_dir="reports"):
    """
    Write the comprehensive plain-text report for a processed batch.

    The report contains the headline statistics, the effect of
    standardization on the age field, per-field completion rates, a
    per-product listing and the locations of the exported files. Counts are
    expressed relative to the actual batch size, and the number of age fields
    still empty after standardization is measured on each
    'standardized_response'.

    Args:
        batch_results: List of result dicts; each needs 'product_index',
            'brand' and 'name'.
        analysis_results: Statistics dict providing 'total_products',
            'successful', 'failed', 'success_rate', 'total_time' and
            'avg_time_per_product'.
        export_files: Dict with the optional keys 'detailed_json',
            'summary_csv', 'individual_files_count' and
            'individual_files_dir'.
        export_dir: Directory for the report, created when missing.

    Returns:
        str: Path of the written report file.
    """
    def _ages_empty(extraction):
        # True when the English age list is missing or empty
        ages = extraction.get('Product Info', {}).get('Product Sheet', {}).get('Ages involved', {}).get('EN', [])
        return not ages

    os.makedirs(export_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_filepath = os.path.join(export_dir, f"{timestamp}_comprehensive_standardization_report.txt")

    total_products = len(batch_results)

    # Age field analysis, before (raw LLM JSON) and after (standardized dict)
    empty_ages_before = 0
    empty_ages_after = 0
    for result in batch_results:
        if 'raw_response' in result:
            try:
                if _ages_empty(json.loads(result['raw_response'])):
                    empty_ages_before += 1
            except (ValueError, TypeError, AttributeError):
                # Invalid JSON, a non-string payload or unexpected nesting:
                # skip this record
                pass
        if 'standardized_response' in result:
            try:
                still_empty = _ages_empty(result['standardized_response'])
            except AttributeError:
                still_empty = True
            if still_empty:
                empty_ages_after += 1

    # Share of the batch whose empty age field was filled by standardization
    if total_products:
        improvement = (empty_ages_before - empty_ages_after) * 100 / total_products
    else:
        improvement = 0.0

    with open(report_filepath, 'w', encoding='utf-8') as f:
        f.write("COMPREHENSIVE PRODUCT DATA EXTRACTION REPORT\n")
        f.write("=" * 70 + "\n")
        f.write(f"Generated: {datetime.now().isoformat()}\n")
        f.write(f"Total Products: {analysis_results['total_products']}\n")
        f.write(f"Successful Extractions: {analysis_results['successful']}\n")
        f.write(f"Failed Extractions: {analysis_results['failed']}\n")
        f.write(f"Success Rate: {analysis_results['success_rate']*100:.1f}%\n")
        f.write(f"Total Processing Time: {analysis_results['total_time']:.2f}s\n")
        f.write(f"Average Time per Product: {analysis_results['avg_time_per_product']:.2f}s\n\n")

        # Standardization Impact Analysis
        f.write("STANDARDIZATION IMPACT ANALYSIS\n")
        f.write("-" * 40 + "\n")

        f.write("AGE FIELD:\n")
        f.write(f"  - Empty before standardization: {empty_ages_before}/{total_products}\n")
        f.write(f"  - Empty after standardization: {empty_ages_after}/{total_products}\n")
        f.write(f"  - Improvement: {improvement:.0f}% reduction in empty fields\n\n")

        # Field completion rates
        f.write("FIELD COMPLETION RATES (Successful Extractions):\n")
        if analysis_results['successful'] > 0:
            successful_results = [r for r in batch_results if 'standardized_response' in r]

            # These fields keep their value under an 'EN' sub-key
            localized_fields = ('Ages involved', 'Key ingredients', 'Category')
            fields = ['Price (euros)', 'Quantity (ml)', 'Ages involved', 'Key ingredients', 'Category']
            for field in fields:
                completed = 0
                for result in successful_results:
                    extraction = result['standardized_response']
                    product_sheet = extraction.get('Product Info', {}).get('Product Sheet', {})

                    value = product_sheet.get(field)
                    if field in localized_fields:
                        value = value.get('EN') if isinstance(value, dict) else None
                    if value:
                        completed += 1

                percentage = (completed / analysis_results['successful']) * 100
                f.write(f"  - {field}: {completed}/{analysis_results['successful']} ({percentage:.1f}%)\n")

        f.write("\nPRODUCT DETAILS:\n")
        f.write("-" * 40 + "\n")
        for result in batch_results:
            status = "SUCCESS" if 'standardized_response' in result else "FAILED"
            time_str = f"{result.get('extraction_time', 0):.2f}s" if 'extraction_time' in result else "N/A"
            f.write(f"{result['product_index']:2d}. {result['brand']:15} - {result['name']:30} [{status}] - {time_str}\n")

        f.write("\nEXPORTED FILES:\n")
        f.write("-" * 40 + "\n")
        f.write(f"Detailed JSON: {export_files.get('detailed_json', 'N/A')}\n")
        f.write(f"Summary CSV: {export_files.get('summary_csv', 'N/A')}\n")
        f.write(f"Individual Files: {export_files.get('individual_files_count', 0)} files in {export_files.get('individual_files_dir', 'N/A')}\n")

    print(f"üìã Comprehensive report: {report_filepath}")
    return report_filepath

## Complete Pipeline Execution

In [57]:
# === COMPLETE PIPELINE EXECUTION ===
def run_complete_pipeline(products_to_process=None):
    """
    Run extraction, export and reporting on the selected products.

    Products are processed with ``process_product_batch`` using the
    module-level ``client``; the results are then written with
    ``export_batch_results`` and ``generate_standardization_report``.

    Args:
        products_to_process: Products to run; defaults to the module-level
            ``all_products`` list when None. An empty list is allowed.

    Returns:
        tuple: ``(batch_results, summary)`` where ``summary`` holds
        'batch_file', 'report_file', 'total_time' (seconds, exports
        included) and 'success_rate' (fraction between 0 and 1; 0.0 for an
        empty selection).
    """
    if products_to_process is None:
        products_to_process = all_products

    product_count = len(products_to_process)

    print("üöÄ STARTING ENHANCED INDIVIDUAL PRODUCT PIPELINE")
    print(f"üì¶ Processing {product_count} products individually")
    print("=" * 60)

    start_time = time.time()

    # Process batch
    batch_results = process_product_batch(products_to_process, client)

    # Export results
    batch_file = export_batch_results(batch_results)
    report_file = generate_standardization_report(batch_results)

    total_time = time.time() - start_time

    print("\nüéØ PIPELINE EXECUTION COMPLETE")
    print("=" * 60)

    # Summary statistics
    successful = sum(1 for r in batch_results if 'standardized_response' in r)
    failed = sum(1 for r in batch_results if 'error' in r)

    # Guard the ratios: an empty selection must not raise ZeroDivisionError
    avg_time = total_time / product_count if product_count else 0.0
    success_rate = successful / product_count if product_count else 0.0

    print(f"‚úÖ Successful extractions: {successful}/{product_count}")
    print(f"‚ùå Failed extractions: {failed}/{product_count}")
    print(f"‚è±Ô∏è  Total processing time: {total_time:.2f}s")
    print(f"üìä Average time per product: {avg_time:.2f}s")
    print(f"üìÅ Results exported to: {batch_file}")
    print(f"üìã Report generated: {report_file}")

    return batch_results, {
        'batch_file': batch_file,
        'report_file': report_file,
        'total_time': total_time,
        'success_rate': success_rate
    }

# Announce that the pipeline helpers are defined and list the entry points
print("\n".join((
    "üéØ ENHANCED PIPELINE READY",
    "Available functions:",
    "1. run_test_batch() - Process first 3 products for testing",
    "2. run_complete_pipeline() - Process all 10 products",
    "3. process_single_product(product_1, client) - Process individual product",
)))

üéØ ENHANCED PIPELINE READY
Available functions:
1. run_test_batch() - Process first 3 products for testing
2. run_complete_pipeline() - Process all 10 products
3. process_single_product(product_1, client) - Process individual product


In [58]:
# === MAIN EXECUTION FUNCTION FOR ALL 10 PRODUCTS ===
def execute_complete_10_product_pipeline():
    """
    Run the whole batch pipeline end to end using the module-level ``client``.

    Chains ``process_all_products_together``, ``analyze_batch_results``,
    ``export_all_products_together`` and ``generate_comprehensive_report``.

    Returns:
        dict | None: A dict with 'batch_results', 'analysis_results',
        'export_files' and 'report_file', or None when any stage raised
        (the error message is printed).
    """
    print("üöÄ STARTING COMPLETE 10-PRODUCT PIPELINE")
    print("=" * 70)

    try:
        # Stage 1: extraction
        extracted, elapsed_total = process_all_products_together(client)

        # Stage 2: statistics
        stats = analyze_batch_results(extracted, elapsed_total)

        # Stage 3: file exports
        exported = export_all_products_together(extracted, stats)

        # Stage 4: text report
        report_path = generate_comprehensive_report(extracted, stats, exported)

        print("\nüéØ 10-PRODUCT PIPELINE EXECUTION COMPLETE!")
        print("=" * 70)
        print(f"üìä SUCCESS RATE: {stats['success_rate']*100:.1f}%")
        print(f"‚è±Ô∏è  TOTAL TIME: {elapsed_total:.2f}s")
        print(f"üìÅ MAIN EXPORT: {exported['detailed_json']}")
        print(f"üìã COMPREHENSIVE REPORT: {report_path}")

        return dict(
            batch_results=extracted,
            analysis_results=stats,
            export_files=exported,
            report_file=report_path,
        )

    except Exception as exc:
        print(f"‚ùå Pipeline execution failed: {exc}")
        return None

# Announce that the end-to-end entry point is defined and how to start it
print("\n".join((
    "üéØ 10-PRODUCT PIPELINE READY FOR EXECUTION",
    "To run the complete pipeline, uncomment the line below:",
    "# results = execute_complete_10_product_pipeline()",
)))

üéØ 10-PRODUCT PIPELINE READY FOR EXECUTION
To run the complete pipeline, uncomment the line below:
# results = execute_complete_10_product_pipeline()


## Execution

In [59]:
# Smoke test: run the batch pipeline on the first three entries of all_products
run_test_batch()

üß™ STARTING TEST BATCH PROCESSING
üì¶ Processing 3 products

üì¶ PRODUCT 1/3

üîÑ PROCESSING: EvasHair - Haitian Black Castor Oil (Lwil Maskriti)
‚è±Ô∏è  LLM extraction: 3.92s
‚úÖ All critical fields populated
‚úÖ Successfully processed: EvasHair - Haitian Black Castor Oil (Lwil Maskriti)

üì¶ PRODUCT 2/3

üîÑ PROCESSING: Chebhair - ChebElixir Traditional Hair Oil Bath - 200ml
‚è±Ô∏è  LLM extraction: 4.95s
‚úÖ All critical fields populated
‚úÖ Successfully processed: Chebhair - ChebElixir Traditional Hair Oil Bath - 200ml

üì¶ PRODUCT 3/3

üîÑ PROCESSING: Soarn - Shea Whipped Butter ‚Äì Tropical Juicy
‚è±Ô∏è  LLM extraction: 4.73s
‚úÖ All critical fields populated
‚úÖ Successfully processed: Soarn - Shea Whipped Butter ‚Äì Tropical Juicy

üéØ TEST BATCH COMPLETE
‚úÖ Successful: 3/3
‚ùå Failed: 0/3


[{'brand': 'EvasHair',
  'name': 'Haitian Black Castor Oil (Lwil Maskriti)',
  'url': 'https://evashair.fr/fr/boutique/22-40-huile-de-ricin-noire-dhaiti-0745114560452.html#/1-taille-120ml',
  'raw_response': '{\n  "Product Info": {\n    "Product Sheet": {\n      "Brand": "EvasHair",\n      "Product name": "Haitian Black Castor Oil (Lwil Maskriti)",\n      "Marketing Description": {\n        "EN": "Haitian Black Castor Oil is known locally as ‚ÄúLwil Maskriti‚Äù in Haitian Creole, or ‚ÄúHuile de Carapate‚Äù in French Antilles. This oil is produced using a traditional ancestral method. The castor seeds are roasted, then ground and boiled in water until a rich amber-colored oil with a roasted hazelnut scent is obtained. This process preserves omega 3, 6, and 9 fatty acids, vitamin E, and ricinoleic acid ‚Äî making it a unique oil worldwide."\n      },\n      "Key ingredients": {\n        "EN": [\n          "Ricinus Communis"\n        ]\n      },\n      "Price (euros)": "14.40",\n      "Qu

In [60]:
# Full run: with no argument the pipeline processes every entry of all_products,
# then writes the batch export and the standardization report
run_complete_pipeline()

üöÄ STARTING ENHANCED INDIVIDUAL PRODUCT PIPELINE
üì¶ Processing 10 products individually

üì¶ PRODUCT 1/10

üîÑ PROCESSING: EvasHair - Haitian Black Castor Oil (Lwil Maskriti)
‚è±Ô∏è  LLM extraction: 3.95s
‚úÖ All critical fields populated
‚úÖ Successfully processed: EvasHair - Haitian Black Castor Oil (Lwil Maskriti)

üì¶ PRODUCT 2/10

üîÑ PROCESSING: Chebhair - ChebElixir Traditional Hair Oil Bath - 200ml
‚è±Ô∏è  LLM extraction: 4.64s
‚úÖ All critical fields populated
‚úÖ Successfully processed: Chebhair - ChebElixir Traditional Hair Oil Bath - 200ml

üì¶ PRODUCT 3/10

üîÑ PROCESSING: Soarn - Shea Whipped Butter ‚Äì Tropical Juicy
‚è±Ô∏è  LLM extraction: 4.79s
‚úÖ All critical fields populated
‚úÖ Successfully processed: Soarn - Shea Whipped Butter ‚Äì Tropical Juicy

üì¶ PRODUCT 4/10

üîÑ PROCESSING: Mango Butterfull - Moisturizing & Nourishing Milk ‚Äì Nourish
‚è±Ô∏è  LLM extraction: 3.71s
‚úÖ All critical fields populated
‚úÖ Successfully processed: Mango Butterfull - M

([{'brand': 'EvasHair',
   'name': 'Haitian Black Castor Oil (Lwil Maskriti)',
   'url': 'https://evashair.fr/fr/boutique/22-40-huile-de-ricin-noire-dhaiti-0745114560452.html#/1-taille-120ml',
   'raw_response': '{\n  "Product Info": {\n    "Product Sheet": {\n      "Brand": "EvasHair",\n      "Product name": "Haitian Black Castor Oil (Lwil Maskriti)",\n      "Marketing Description": {\n        "EN": "Haitian Black Castor Oil is known locally as ‚ÄúLwil Maskriti‚Äù in Haitian Creole, or ‚ÄúHuile de Carapate‚Äù in French Antilles. This oil is produced using a traditional ancestral method. The castor seeds are roasted, then ground and boiled in water until a rich amber-colored oil with a roasted hazelnut scent is obtained. This process preserves omega 3, 6, and 9 fatty acids, vitamin E, and ricinoleic acid ‚Äî making it a unique oil worldwide."\n      },\n      "Key ingredients": {\n        "EN": [\n          "Ricinus Communis"\n        ]\n      },\n      "Price (euros)": "14.40",\n     

In [61]:
# Single-product run on product_1; the returned record is shown as the cell output
process_single_product(product_1, client)


üîÑ PROCESSING: EvasHair - Haitian Black Castor Oil (Lwil Maskriti)
‚è±Ô∏è  LLM extraction: 4.53s
‚úÖ All critical fields populated
‚úÖ Successfully processed: EvasHair - Haitian Black Castor Oil (Lwil Maskriti)


{'brand': 'EvasHair',
 'name': 'Haitian Black Castor Oil (Lwil Maskriti)',
 'url': 'https://evashair.fr/fr/boutique/22-40-huile-de-ricin-noire-dhaiti-0745114560452.html#/1-taille-120ml',
 'raw_response': '{\n  "Product Info": {\n    "Product Sheet": {\n      "Brand": "EvasHair",\n      "Product name": "Haitian Black Castor Oil (Lwil Maskriti)",\n      "Marketing Description": {\n        "EN": "Haitian Black Castor Oil is known locally as ‚ÄúLwil Maskriti‚Äù in Haitian Creole, or ‚ÄúHuile de Carapate‚Äù in French Antilles. This oil is produced using a traditional ancestral method. The castor seeds are roasted, then ground and boiled in water until a rich amber-colored oil with a roasted hazelnut scent is obtained. This process preserves omega 3, 6, and 9 fatty acids, vitamin E, and ricinoleic acid ‚Äî making it a unique oil worldwide."\n      },\n      "Key ingredients": {\n        "EN": [\n          "Ricinus Communis"\n        ]\n      },\n      "Price (euros)": "14.40",\n      "Quanti

In [62]:
# End-to-end run; final_results holds the results dict, or None if a stage raised
final_results = execute_complete_10_product_pipeline()

üöÄ STARTING COMPLETE 10-PRODUCT PIPELINE
üöÄ PROCESSING ALL 10 PRODUCTS TOGETHER

üì¶ PROCESSING PRODUCT 1/10: EvasHair - Haitian Black Castor Oil (Lwil Maskriti)
--------------------------------------------------
‚è±Ô∏è  LLM extraction: 3.75s
‚úÖ All critical fields populated
‚úÖ Successfully processed: EvasHair

üì¶ PROCESSING PRODUCT 2/10: Chebhair - ChebElixir Traditional Hair Oil Bath - 200ml
--------------------------------------------------
‚è±Ô∏è  LLM extraction: 3.83s
‚úÖ All critical fields populated
‚úÖ Successfully processed: Chebhair

üì¶ PROCESSING PRODUCT 3/10: Soarn - Shea Whipped Butter ‚Äì Tropical Juicy
--------------------------------------------------
‚è±Ô∏è  LLM extraction: 3.59s
‚úÖ All critical fields populated
‚úÖ Successfully processed: Soarn

üì¶ PROCESSING PRODUCT 4/10: Mango Butterfull - Moisturizing & Nourishing Milk ‚Äì Nourish
--------------------------------------------------
‚è±Ô∏è  LLM extraction: 3.88s
‚úÖ All critical fields populated
‚úÖ Suc

## FOCUSED STANDARDIZATION FUNCTIONS

In [63]:
def standardize_price_quantity_only(extraction_dict):
    """
    Focused standardization - ONLY price and quantity formatting
    Applied AFTER LLM extraction, BEFORE validation

    Reads 'Price (euros)' and 'Quantity (ml)' from
    extraction_dict['Product Info']['Product Sheet'] and rewrites them in a
    deep copy; the input dictionary is never modified.

    Price: everything except digits and separators is dropped, a range keeps its
    first value, and the decimal separator is normalised to a dot
    ("1.200,50" and "1,200.50" both become "1200.50").

    Quantity: the first number (decimals allowed) is kept, litres are converted
    to millilitres, and the result is written as "<number> <unit>". When no
    ml/l unit is recognised the unit defaults to 'g'.

    Args:
        extraction_dict: parsed extraction result (nested dicts).

    Returns:
        A new dictionary with the two fields standardized. Fields that are
        missing or empty are left untouched.
    """
    import copy  # local import: not part of the notebook's top-level import cell

    # A shallow copy would share the nested 'Product Sheet' dict with the caller,
    # so every write below would leak into the original extraction.
    standardized = copy.deepcopy(extraction_dict)

    product_info = standardized.get('Product Info')
    product_sheet = product_info.get('Product Sheet') if isinstance(product_info, dict) else None
    if not isinstance(product_sheet, dict):
        # Nothing to standardize without a product sheet
        return standardized

    # 1. FIX PRICE FORMATTING ONLY
    price = product_sheet.get('Price (euros)', '')
    if price:
        # Remove currency symbols, keep numbers, dots, commas, hyphens
        cleaned_price = re.sub(r'[^\d.,-]', '', str(price))

        # Handle price ranges (take the first non-empty value only)
        if '-' in cleaned_price:
            segments = [segment for segment in cleaned_price.split('-') if segment]
            cleaned_price = segments[0] if segments else ''

        # Drop a dangling separator left over from sentence punctuation ("14.40.")
        cleaned_price = cleaned_price.rstrip('.,')

        # The separator that appears last is the decimal one
        last_comma = cleaned_price.rfind(',')
        last_dot = cleaned_price.rfind('.')
        if last_comma != -1 and last_dot != -1:
            if last_comma > last_dot:
                # European format like "1.200,50" -> "1200.50"
                cleaned_price = cleaned_price.replace('.', '').replace(',', '.')
            else:
                # US format like "1,200.50" -> "1200.50"
                cleaned_price = cleaned_price.replace(',', '')
        elif last_comma != -1:
            # Format like "24,90" -> "24.90"
            cleaned_price = cleaned_price.replace(',', '.')

        # Final cleanup - keep only numbers and dot
        cleaned_price = re.sub(r'[^\d.]', '', cleaned_price)

        product_sheet['Price (euros)'] = cleaned_price
        if price != cleaned_price:
            print(f"üí∞ Fixed price: '{price}' ‚Üí '{cleaned_price}'")

    # 2. FIX QUANTITY FORMATTING ONLY
    quantity = product_sheet.get('Quantity (ml)', '')
    if quantity:
        quantity_text = str(quantity)
        number_match = re.search(r'\d+(?:[.,]\d+)?', quantity_text)
        if number_match:
            amount = float(number_match.group(0).replace(',', '.'))

            # Detect and standardize units. The look-behind only forbids a preceding
            # letter, so a unit glued to the number ("120ml", "1L") is still recognised;
            # a plain \b would fail there because digits are word characters too.
            if re.search(r'(?<![A-Za-z])(?:ml|millilit(?:er|re)s?)\b', quantity_text, re.IGNORECASE):
                unit_part = 'ml'
            elif re.search(r'(?<![A-Za-z])(?:l|lit(?:er|re)s?)\b', quantity_text, re.IGNORECASE):
                # Convert liters to ml
                amount *= 1000
                unit_part = 'ml'
            else:
                # Default to grams for creams/powders
                unit_part = 'g'

            # Render whole numbers without a trailing ".0"
            amount = round(amount, 3)
            number_part = str(int(amount)) if amount == int(amount) else str(amount)

            standardized_quantity = f"{number_part} {unit_part}"
            product_sheet['Quantity (ml)'] = standardized_quantity

            if quantity != standardized_quantity:
                print(f"‚öñÔ∏è  Fixed quantity: '{quantity}' ‚Üí '{standardized_quantity}'")

    return standardized

def validate_price_quantity_formatting(extraction_dict):
    """
    Validate ONLY price and quantity formatting

    Checks the 'Price (euros)' and 'Quantity (ml)' fields found under
    extraction_dict['Product Info']['Product Sheet'] and prints the outcome.

    A price is flagged when it contains a currency symbol, a comma, or more than
    one dot. A quantity is flagged when it carries no standalone ml/g/l unit.
    Missing or empty fields are not checked.

    Args:
        extraction_dict: parsed (normally already standardized) extraction result.

    Returns:
        True when no formatting issue was found, False otherwise.
    """
    issues = []

    product_sheet = extraction_dict.get('Product Info', {}).get('Product Sheet', {})
    price = product_sheet.get('Price (euros)', '')
    quantity = product_sheet.get('Quantity (ml)', '')

    # Check price formatting
    if price:
        # str() keeps numeric JSON values from raising TypeError on the `in` tests
        price_text = str(price)
        if any(symbol in price_text for symbol in ['‚Ç¨', '$', '¬£']):
            issues.append("Price contains currency symbols")
        # Either a leftover comma or several dots means the decimal part is ambiguous
        if ',' in price_text or price_text.count('.') > 1:
            issues.append("Price has inconsistent decimal formatting")

    # Check quantity formatting
    if quantity:
        quantity_text = str(quantity)
        # A unit counts only as its own token (optionally glued to the number, e.g. "120ml"),
        # not as a letter inside another word such as "large" or "kg".
        has_unit = re.search(r'(?<![A-Za-z])(?:ml|g|l)\b', quantity_text, re.IGNORECASE)
        if not has_unit:
            if re.search(r'\d', quantity_text):
                issues.append("Quantity has number but no units")
            else:
                issues.append("Quantity missing units")

    if issues:
        print(f"‚ö†Ô∏è  Formatting issues: {', '.join(issues)}")
        return False

    print("‚úÖ Price & quantity formatting: OK")
    return True

## FOCUSED PROCESSING FUNCTION

In [64]:
def process_single_product_focused(product, client, completion_model="gpt-4o-mini"):
    """
    Run the focused price/quantity extraction for one product.

    Builds the prompt from `enhanced_prompt_template`, sends it through
    `completeChat`, parses the reply as JSON, standardizes price and quantity,
    and validates the result.

    Args:
        product: dict with 'brand', 'name', 'url' and 'product_information'.
        client: client object handed to `completeChat`.
        completion_model: model name handed to `completeChat`.

    Returns:
        On success, a dict with the product identity, 'raw_response',
        'standardized_response', 'extraction_time', 'price_quantity_valid' and
        'processed_at'. If any step raises, a dict with the product identity,
        'error' (the exception text) and 'processed_at'.
    """
    separator = '=' * 60
    brand = product['brand']
    name = product['name']

    print(f"\n{separator}")
    print(f"üîÑ PROCESSING: {brand} - {name}")
    print(f"{separator}")

    try:
        # Fill the focused template with the shared context and this product's text
        product_prompt = enhanced_prompt_template.format(
            hair_type_dict_en=hair_type_dict_en,
            questionnaire=questionnaire,
            product_info=product['product_information']
        )

        # Time only the LLM round-trip
        started_at = time.time()
        raw_response = completeChat(product_prompt, style, client, completion_model)
        extraction_time = time.time() - started_at

        print(f"‚è±Ô∏è  LLM extraction: {extraction_time:.2f}s")

        # Parse, then standardize price & quantity only
        standardized_dict = standardize_price_quantity_only(json.loads(raw_response))

        # Check the standardized price & quantity
        formatting_ok = validate_price_quantity_formatting(standardized_dict)

        outcome = {
            'brand': brand,
            'name': name,
            'url': product['url'],
            'raw_response': raw_response,
            'standardized_response': standardized_dict,
            'extraction_time': extraction_time,
            'price_quantity_valid': formatting_ok,
            'processed_at': datetime.now().isoformat()
        }

        print(f"‚úÖ Successfully processed: {brand} - {name}")
        return outcome

    except Exception as e:
        # Best-effort: report the failure and hand back an error record instead of raising
        print(f"‚ùå Failed to process {brand}: {e}")
        return {
            'brand': brand,
            'name': name,
            'url': product['url'],
            'error': str(e),
            'processed_at': datetime.now().isoformat()
        }

## FOCUSED RESULTS ANALYSIS

In [65]:
def analyze_price_quantity_results(batch_results, total_time):
    """
    Analyze and display focused results for price & quantity formatting

    A result counts as successful when it has a 'standardized_response' key and
    as failed when it has an 'error' key. For successful results the
    standardized price is flagged if it still contains a currency symbol or a
    comma, and the quantity is flagged if it contains neither 'ml' nor 'g'.

    Args:
        batch_results: list of per-product result dicts.
        total_time: total processing time in seconds (printed and returned).

    Returns:
        Dict with 'total_products', 'successful', 'failed', 'price_issues',
        'quantity_issues', 'formatting_success_rate' (0 when nothing succeeded)
        and 'total_time'.
    """
    print("\n" + "="*70)
    print("üìä FOCUSED ANALYSIS - PRICE & QUANTITY FORMATTING")
    print("="*70)

    successful = [r for r in batch_results if 'standardized_response' in r]
    failed = [r for r in batch_results if 'error' in r]

    # Basic statistics
    print(f"üì¶ TOTAL PRODUCTS PROCESSED: {len(batch_results)}")
    print(f"‚úÖ SUCCESSFUL EXTRACTIONS: {len(successful)}")
    print(f"‚ùå FAILED EXTRACTIONS: {len(failed)}")
    print(f"‚è±Ô∏è  TOTAL PROCESSING TIME: {total_time:.2f}s")

    # Initialised up front so the summary and return value are defined
    # even when no product was extracted successfully.
    price_issues = 0
    quantity_issues = 0
    price_examples = []
    quantity_examples = []

    # Price & Quantity specific analysis
    if successful:
        print(f"\nüí∞ PRICE FORMATTING ANALYSIS:")
        print("-" * 40)

        for result in successful:
            extraction = result['standardized_response']
            product_sheet = extraction.get('Product Info', {}).get('Product Sheet', {})

            price = product_sheet.get('Price (euros)', '')
            quantity = product_sheet.get('Quantity (ml)', '')

            # Check price formatting (str() tolerates numeric values)
            if price and any(char in str(price) for char in ['‚Ç¨', '$', '¬£', ',']):
                price_issues += 1
                price_examples.append(f"{result['brand']}: '{price}'")

            # Check quantity formatting
            if quantity and not any(unit in str(quantity).lower() for unit in ['ml', 'g']):
                quantity_issues += 1
                quantity_examples.append(f"{result['brand']}: '{quantity}'")

        print(f"  Price formatting issues: {price_issues}/{len(successful)}")
        print(f"  Quantity formatting issues: {quantity_issues}/{len(successful)}")

        if price_examples:
            print(f"  Problematic prices: {', '.join(price_examples[:3])}")
        if quantity_examples:
            print(f"  Problematic quantities: {', '.join(quantity_examples[:3])}")

    # Formatting success rate (guarded: an all-failed batch must not divide by zero)
    valid_formatting = sum(1 for r in successful if r.get('price_quantity_valid', False))
    formatting_success_rate = valid_formatting / len(successful) if successful else 0
    print(f"\nüéØ FORMATTING SUCCESS RATE: {valid_formatting}/{len(successful)} ({formatting_success_rate*100:.1f}%)")

    return {
        'total_products': len(batch_results),
        'successful': len(successful),
        'failed': len(failed),
        'price_issues': price_issues,
        'quantity_issues': quantity_issues,
        'formatting_success_rate': formatting_success_rate,
        # Returned so report code reading analysis_results.get('total_time') sees the real value
        'total_time': total_time
    }

In [66]:
# === MODIFIED COMPLETE EXECUTION WITH PRICE/QUANTITY ANALYSIS ===
def execute_focused_10_product_pipeline():
    """
    Execute focused pipeline for all products in `all_products` with price/quantity analysis

    Processes each product with `process_single_product_focused`, pausing two
    seconds between calls, then analyzes, exports and reports the batch and
    prints a summary.

    Returns:
        Dict with 'batch_results', 'analysis_results', 'export_files' and
        'report_file', or None if any step raised (the error is printed).
    """
    print("üöÄ STARTING FOCUSED 10-PRODUCT PIPELINE (PRICE & QUANTITY)")
    print("=" * 70)

    try:
        # Process all products with focused approach
        all_results = []
        # Use the real batch size everywhere instead of a hard-coded 10
        total_products = len(all_products)
        total_start_time = time.time()

        for i, product in enumerate(all_products, 1):
            print(f"\nüì¶ PROCESSING PRODUCT {i}/{total_products}: {product['brand']} - {product['name']}")
            print("-" * 50)

            # Use focused processing function
            product_result = process_single_product_focused(product, client)
            product_result['product_index'] = i
            all_results.append(product_result)

            # Small delay to avoid rate limiting (skipped after the last product)
            if i < total_products:
                time.sleep(2)

        total_time = time.time() - total_start_time

        # Analyze focused results (price & quantity)
        analysis_results = analyze_price_quantity_results(all_results, total_time)

        # Export results
        export_files = export_all_products_together(all_results, analysis_results, export_dir="focused_price_quantity_exports")

        # Generate focused report
        report_file = generate_focused_price_quantity_report(all_results, analysis_results, export_files)

        # Guarded so an empty product list still reaches the summary
        success_rate = analysis_results['successful'] / total_products * 100 if total_products else 0.0

        print("\nüéØ FOCUSED PIPELINE EXECUTION COMPLETE!")
        print("=" * 70)
        print(f"üìä SUCCESS RATE: {analysis_results['successful']}/{total_products} ({success_rate:.1f}%)")
        print(f"üí∞ PRICE FORMATTING ISSUES: {analysis_results.get('price_issues', 0)}/{analysis_results['successful']}")
        print(f"‚öñÔ∏è  QUANTITY FORMATTING ISSUES: {analysis_results.get('quantity_issues', 0)}/{analysis_results['successful']}")
        print(f"üéØ FORMATTING SUCCESS RATE: {analysis_results.get('formatting_success_rate', 0)*100:.1f}%")
        print(f"‚è±Ô∏è  TOTAL TIME: {total_time:.2f}s")
        print(f"üìÅ MAIN EXPORT: {export_files['detailed_json']}")
        print(f"üìã FOCUSED REPORT: {report_file}")

        return {
            'batch_results': all_results,
            'analysis_results': analysis_results,
            'export_files': export_files,
            'report_file': report_file
        }

    except Exception as e:
        # Best-effort entry point: report the failure and signal it with None
        print(f"‚ùå Focused pipeline execution failed: {e}")
        return None

In [67]:
# === FOCUSED PRICE/QUANTITY REPORT ===
def _categorize_price_format(price_text):
    """Return the report label describing how a price string is formatted."""
    if any(char in price_text for char in ['‚Ç¨', '$', '¬£']):
        return "Contains currency symbols"
    if ',' in price_text:
        return "European format (comma)"
    if re.match(r'^\d+\.\d{2}$', price_text):
        return "Standard (XX.XX)"
    if re.match(r'^\d+$', price_text):
        return "Integer only"
    return "Other format"


def _categorize_quantity_format(quantity_text):
    """Return the report label describing which unit a quantity string carries."""
    lowered = quantity_text.lower()
    if 'ml' in lowered:
        return "Milliliters"
    if 'g' in lowered:
        return "Grams"
    if 'l' in lowered:
        return "Liters"
    # No ml/g/l anywhere at this point, so a digit means a bare number
    if re.search(r'\d+', quantity_text):
        return "Number only (missing units)"
    return "Other format"


def generate_focused_price_quantity_report(batch_results, analysis_results, export_files, export_dir="focused_reports"):
    """
    Generate focused report specifically for price & quantity formatting

    Writes a timestamped UTF-8 text file into `export_dir` (created if needed)
    containing overall counts, a breakdown of price and quantity formats across
    successful results, a per-product status list and improvement metrics.

    Args:
        batch_results: list of per-product result dicts; a result is treated as
            successful when it has a 'standardized_response' key.
        analysis_results: dict with 'total_products', 'successful' and 'failed';
            'total_time' is used when present.
        export_files: accepted for interface compatibility; not read here.
        export_dir: directory the report is written to.

    Returns:
        Path of the written report file.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    os.makedirs(export_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_filepath = os.path.join(export_dir, f"{timestamp}_focused_price_quantity_report.txt")

    total_products = analysis_results['total_products']
    # Guarded so an empty batch still produces a report instead of ZeroDivisionError
    success_rate = analysis_results['successful'] / total_products * 100 if total_products else 0.0

    successful_results = [r for r in batch_results if 'standardized_response' in r]
    success_count = len(successful_results)
    price_formats = {}
    quantity_formats = {}

    with open(report_filepath, 'w', encoding='utf-8') as f:
        f.write("FOCUSED PRODUCT DATA EXTRACTION REPORT - PRICE & QUANTITY\n")
        f.write("=" * 70 + "\n")
        f.write(f"Generated: {datetime.now().isoformat()}\n")
        f.write(f"Total Products: {total_products}\n")
        f.write(f"Successful Extractions: {analysis_results['successful']}\n")
        f.write(f"Failed Extractions: {analysis_results['failed']}\n")
        f.write(f"Success Rate: {success_rate:.1f}%\n")
        f.write(f"Total Processing Time: {analysis_results.get('total_time', 0):.2f}s\n\n")

        # Price & Quantity Specific Analysis
        f.write("PRICE & QUANTITY FORMATTING ANALYSIS\n")
        f.write("-" * 50 + "\n")

        if successful_results:
            # Price Analysis
            f.write("PRICE FORMATTING:\n")
            for result in successful_results:
                product_sheet = result['standardized_response'].get('Product Info', {}).get('Product Sheet', {})
                price = product_sheet.get('Price (euros)', '')
                if price:
                    # str() tolerates numeric values coming straight from JSON
                    format_type = _categorize_price_format(str(price))
                    price_formats[format_type] = price_formats.get(format_type, 0) + 1

                    # Log specific examples for problematic formats
                    if format_type != "Standard (XX.XX)":
                        f.write(f"  ‚ö†Ô∏è  {result['brand']}: '{price}' ‚Üí {format_type}\n")

            # Add summary
            f.write(f"\n  PRICE FORMAT SUMMARY:\n")
            for fmt, count in price_formats.items():
                percentage = (count / success_count) * 100
                f.write(f"    {fmt}: {count}/{success_count} ({percentage:.1f}%)\n")

            # Quantity Analysis
            f.write("\nQUANTITY FORMATTING:\n")
            for result in successful_results:
                product_sheet = result['standardized_response'].get('Product Info', {}).get('Product Sheet', {})
                quantity = product_sheet.get('Quantity (ml)', '')
                if quantity:
                    format_type = _categorize_quantity_format(str(quantity))
                    quantity_formats[format_type] = quantity_formats.get(format_type, 0) + 1

                    # Log specific examples for problematic formats
                    if format_type == "Number only (missing units)":
                        f.write(f"  ‚ö†Ô∏è  {result['brand']}: '{quantity}' ‚Üí {format_type}\n")

            # Add summary
            f.write(f"\n  QUANTITY FORMAT SUMMARY:\n")
            for fmt, count in quantity_formats.items():
                percentage = (count / success_count) * 100
                f.write(f"    {fmt}: {count}/{success_count} ({percentage:.1f}%)\n")

        # Product-by-product formatting status
        f.write("\nPRODUCT FORMATTING STATUS:\n")
        f.write("-" * 50 + "\n")
        for position, result in enumerate(batch_results, 1):
            # Fall back to the list position when the caller did not tag the result
            product_index = result.get('product_index', position)
            status = "‚úÖ SUCCESS" if 'standardized_response' in result else "‚ùå FAILED"
            time_str = f"{result.get('extraction_time', 0):.2f}s" if 'extraction_time' in result else "N/A"
            formatting_status = "‚úÖ VALID" if result.get('price_quantity_valid') else "‚ö†Ô∏è  INVALID"

            if 'standardized_response' in result:
                product_sheet = result['standardized_response'].get('Product Info', {}).get('Product Sheet', {})
                price = product_sheet.get('Price (euros)', 'N/A')
                quantity = product_sheet.get('Quantity (ml)', 'N/A')
                f.write(f"{product_index:2d}. {result['brand']:15} - {formatting_status}\n")
                f.write(f"     Price: {price}, Quantity: {quantity}\n")
            else:
                f.write(f"{product_index:2d}. {result['brand']:15} - {status} - {time_str}\n")

        # Improvement metrics
        f.write("\nIMPROVEMENT METRICS:\n")
        f.write("-" * 50 + "\n")
        if successful_results:
            standard_price_count = price_formats.get("Standard (XX.XX)", 0)
            standard_quantity_count = quantity_formats.get("Milliliters", 0) + quantity_formats.get("Grams", 0)

            f.write(f"Standard price format (XX.XX): {standard_price_count}/{success_count} ({standard_price_count/success_count*100:.1f}%)\n")
            f.write(f"Standard quantity format (with units): {standard_quantity_count}/{success_count} ({standard_quantity_count/success_count*100:.1f}%)\n")

            # Calculate improvement potential
            price_improvement = success_count - standard_price_count
            quantity_improvement = success_count - standard_quantity_count

            f.write(f"Price formatting improvements needed: {price_improvement} products\n")
            f.write(f"Quantity formatting improvements needed: {quantity_improvement} products\n")

    print(f"üìã Focused price/quantity report: {report_filepath}")
    return report_filepath

In [68]:
# Run the focused price/quantity pipeline. `focused_results` is a dict with
# 'batch_results', 'analysis_results', 'export_files' and 'report_file',
# or None when the pipeline caught an exception.
focused_results = execute_focused_10_product_pipeline()

üöÄ STARTING FOCUSED 10-PRODUCT PIPELINE (PRICE & QUANTITY)

üì¶ PROCESSING PRODUCT 1/10: EvasHair - Haitian Black Castor Oil (Lwil Maskriti)
--------------------------------------------------

üîÑ PROCESSING: EvasHair - Haitian Black Castor Oil (Lwil Maskriti)
‚è±Ô∏è  LLM extraction: 3.73s
‚úÖ Price & quantity formatting: OK
‚úÖ Successfully processed: EvasHair - Haitian Black Castor Oil (Lwil Maskriti)

üì¶ PROCESSING PRODUCT 2/10: Chebhair - ChebElixir Traditional Hair Oil Bath - 200ml
--------------------------------------------------

üîÑ PROCESSING: Chebhair - ChebElixir Traditional Hair Oil Bath - 200ml
‚è±Ô∏è  LLM extraction: 3.95s
‚úÖ Price & quantity formatting: OK
‚úÖ Successfully processed: Chebhair - ChebElixir Traditional Hair Oil Bath - 200ml

üì¶ PROCESSING PRODUCT 3/10: Soarn - Shea Whipped Butter ‚Äì Tropical Juicy
--------------------------------------------------

üîÑ PROCESSING: Soarn - Shea Whipped Butter ‚Äì Tropical Juicy
‚è±Ô∏è  LLM extraction: 3.45s
‚úÖ

### Downloading files

In [69]:
# Zip the contents of the current directory, excluding 'sample_data'
import os
import time
import shutil
import zipfile

folder = '.'  # Current directory
output_filename = f"{os.path.basename(os.getcwd())}_exports_{time.strftime('%Y%m%d_%H%M%S')}"
zip_filepath = os.path.abspath(f"{output_filename}.zip")
excluded_dirs = {'sample_data'}

# The archive is written inside the folder being zipped, so it is skipped explicitly
# to keep it from being added to itself while it is still growing.
with zipfile.ZipFile(zip_filepath, 'w', zipfile.ZIP_DEFLATED) as archive:
    for current_dir, dir_names, file_names in os.walk(folder):
        # Prune in place so os.walk never descends into excluded directories
        dir_names[:] = [name for name in dir_names if name not in excluded_dirs]
        for file_name in file_names:
            file_path = os.path.join(current_dir, file_name)
            if os.path.abspath(file_path) == zip_filepath:
                continue
            # Store paths relative to `folder` so the archive has no leading './'
            archive.write(file_path, arcname=os.path.relpath(file_path, folder))

print(f"‚úÖ Zipped contents to: {zip_filepath}")

# Code to download the file
from google.colab import files
print("\nReady to download the zip file.")
files.download(zip_filepath)

OSError: [Errno 28] No space left on device

In [71]:
# Zip specific directories containing reports and exports
import os
import time
import shutil

# Define the directories to include in the zip file
dirs_to_zip = ['batch_exports', 'reports', 'focused_price_quantity_exports', 'focused_reports', 'all_products_export']

# Create a temporary directory to stage the files
temp_dir = f"temp_zip_stage_{time.strftime('%Y%m%d_%H%M%S')}"
os.makedirs(temp_dir, exist_ok=True)

try:
    # Copy the directories to the temporary staging area
    for dir_name in dirs_to_zip:
        if os.path.exists(dir_name):
            destination = os.path.join(temp_dir, dir_name)
            shutil.copytree(dir_name, destination)
            print(f"üìÅ Staged directory: {dir_name}")
        else:
            print(f"‚ö†Ô∏è Directory not found, skipping: {dir_name}")

    # Create the zip archive from the temporary directory
    output_filename = f"{os.path.basename(os.getcwd())}_reports_exports_{time.strftime('%Y%m%d_%H%M%S')}"
    zip_filepath = shutil.make_archive(output_filename, 'zip', root_dir=temp_dir)
finally:
    # Always remove the staging copy, even when copying or archiving fails,
    # so a failed run does not leave a duplicate of the exports on disk.
    shutil.rmtree(temp_dir, ignore_errors=True)

print(f"‚úÖ Zipped contents to: {zip_filepath}")

# Code to download the file
from google.colab import files
print("\nReady to download the zip file.")
files.download(zip_filepath)

üìÅ Staged directory: batch_exports
üìÅ Staged directory: reports
üìÅ Staged directory: focused_price_quantity_exports
‚ö†Ô∏è Directory not found, skipping: focused_reports
üìÅ Staged directory: all_products_export
‚úÖ Zipped contents to: /content/content_reports_exports_20251103_191518.zip

Ready to download the zip file.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>