In [151]:
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
import re
import pandas as pd
import asyncio
from playwright.sync_api import sync_playwright

In [169]:
def get_links(link):
    with httpx.Client() as client:
        response = client.get(link)
        soup = BeautifulSoup(response.text, 'html.parser')
        urls = []

        main = soup.find('div', class_='container p-0')

        if main:
            for a_tag in main.find_all('a', href=True):
                full_url = urljoin(link, a_tag['href'])
                urls.append(full_url)

    return urls

def product_links(link):
    with httpx.Client() as client:
        response = client.get(link)
        soup = BeautifulSoup(response.text, 'html.parser')
        urls = []
        
        # Find all "viewMore" sections on the page
        view_more_sections = soup.find_all('div', class_='viewMore')
        
        for section in view_more_sections:
            # Extract links from each "viewMore" section
            for a_tag in section.find_all('a', href=True):
                full_url = urljoin(link, a_tag['href'])
                urls.append(full_url)
                
    return urls


def find_other_ingredients(soup):
    p_tags = soup.find_all('p')
    extracted_text = []
    in_target_section = False

    for p in p_tags:
        text = p.get_text(strip=True)
        if 'Other Ingredients' in text:
            in_target_section = True
            # Remove "Other Ingredients" label and add the following text
            extracted_text.append(text.replace("Other Ingredients", "").strip())
        elif 'Ingredients' in text:
            in_target_section = True
            # Remove "Other Ingredients" label and add the following text
            extracted_text.append(text.replace("Ingredients:", "").strip())
        elif 'Precaution' in text or 'Storage' in text:
            # Stop extraction once "Storage" is found
            break
        elif in_target_section:
            extracted_text.append(text)


    cleaned_text = ' '.join(extracted_text).replace('&nbsp;', ' ').strip().lower()
    return cleaned_text

def get_best_for(soup, label):
    # Find the element containing the label
    label_element = soup.find(string=lambda text: text and label in text)
    
    if label_element:
        # Find the parent of the label element to start navigating from
        parent = label_element.find_parent()
        
        # Iterate through siblings of the parent
        for sibling in parent.find_all_next():
            if sibling.name == 'ul':
                # Capture and return the concatenated text from all list items in the unordered list
                return ' '.join(li.get_text(strip=True) for li in sibling.find_all('li'))
    
    return ''



def get_text_after_label(soup, label):
    # Find the element containing the label
    label_element = soup.find(string=lambda text: text and label in text)
    
    if label_element:
        # Find the parent of the label element to start navigating from
        parent = label_element.find_parent()
        
        # Iterate through siblings of the parent
        for sibling in parent.find_all_next():
            if sibling.name == 'p':
                # Return text from the first relevant sibling found
                return sibling.get_text(strip=True)
            elif sibling.name == 'hr':
                # Stop if an 'hr' tag is encountered (end of section)
                break
    
    return ''

def extract_manufactured_nation(soup):
    manufactured_in_text = ""
    p_tags = soup.find_all('p')

    for p in p_tags:
        text = p.get_text(strip=True)
        if 'Manufactured in' in text:
            # Extract everything after "Manufactured in"
            manufactured_in_text = text.split('Manufactured in')[-1].strip()
            break

    return manufactured_in_text if manufactured_in_text else ''

In [85]:
beauty = 'https://www.lac.com/en/everyday-wellness.html/'
urls = get_links(beauty)
all_links = []

In [86]:
for i in urls:
    links_from_product = product_links(i)
    all_links.extend(links_from_product)  # Flatten the list of links
print(f"Total links found: {len(all_links)}")

Total links found: 467


In [157]:
with open('links.txt', 'w') as file:
    for link in all_links:
        file.write(link + '\n') 

In [170]:
all_details = []  # List to store details of each product

try:
    with httpx.Client() as client:
        for i, link in enumerate(all_links[:10]):  # Loop through the first 10 links
            source = client.get(link)
            soup = BeautifulSoup(source.text, 'html.parser')
            detail = {}

            content = soup.find('div', class_='fade show')

            try:
                id_num = content.find('span', class_='product-id text--bold')
                detail['id'] = id_num.text if id_num else None
            except Exception as e:
                print(f"Error finding product ID for link {i + 1}: {e}")

            try:
                name = soup.find('div', class_='d-none d-sm-block').h1.text
                detail['product_name'] = name
            except Exception as e:
                print(f"Error finding product name for link {i + 1}: {e}")

            try:
                brand = soup.find('div', class_='d-none d-sm-block').a.text
                detail['brand'] = brand
            except Exception as e:
                print(f"Error finding brand for link {i + 1}: {e}")

            try:
                breadcrumb = soup.find('ol', class_='breadcrumb text-size--small col', itemprop='category')
                model = breadcrumb.find_all('li', class_='breadcrumb-element')
                product_type = re.sub(r'\s+', ' ', model[-2].text.strip())
                detail['type'] = product_type
            except Exception as e:
                print(f"Error finding product type for link {i + 1}: {e}")

            try:
                price = soup.find('span', itemprop='price', class_='d-none').text
                detail['usual_price'] = price
            except Exception as e:
                print(f"Error finding usual price for link {i + 1}: {e}")

            try:
                vip_price = soup.find('span', itemprop='sale_price', class_='d-none').text
                detail['vip_price'] = vip_price
            except Exception as e:
                print(f"Error finding VIP price for link {i + 1}: {e}")

            divs1 = soup.find_all('div', class_='col-sm-12 col-md-4 col-lg-2 d-flex d-md-block justify-content-between pb-2')

            for div in divs1:
                try:
                    label = div.find('span', class_='d-block product-overview--label')
                    if label and label.text == "Form":
                        bold_span = div.find('span', class_='text--bold')
                        if bold_span and bold_span.a:
                            form = bold_span.a.text.strip()
                            detail['form'] = form
                except Exception as e:
                    print(f"Error finding form for link {i + 1}: {e}")

            
            try:
                servings = soup.find('div', class_='product-uom d-none').text
                detail['servings_per_container'] = servings
            except Exception as e:
                print(f"Error finding servings per container for link {i + 1}: {e}")

            
            try:
                usage = soup.find('div', id='product-usagedirection').text
                detail['usage_direction'] = usage
            except Exception as e:
                print(f"Error finding usage direction for link {i + 1}: {e}")

            try:
                key_ingredient_list = soup.find_all('div', class_='ingredient')
                key_ingredient = [div.find('a').text.strip() for div in key_ingredient_list]
                # Join the list into a single string, separating each ingredient with a comma
                key_ingredient = ', '.join(key_ingredient)
                key_ingredient = re.sub(r'\s+', ' ', key_ingredient.strip())
                detail['key_ingredient'] = key_ingredient
            except Exception as e:
                print(f"Error finding key ingredient for link {i + 1}: {e}")

            try:
                other_ingredient_section = soup.find('div', id='product-supplements')
                other_ingredients = find_other_ingredients(other_ingredient_section)
                detail['other_ingredients'] = other_ingredients
            except Exception as e:
                print(f"Error finding other ingredients for link {i + 1}: {e}")

           

            try:
                detail['storage'] = get_text_after_label(soup, "Storage")
                detail['precaution'] = get_text_after_label(soup, "Precaution")
                detail['best_for'] = get_best_for(soup, 'Best for people with')
                detail['manufacturer'] = extract_manufactured_nation(soup)
            except Exception as e:
                print(f"Error finding additional details for link {i + 1}: {e}")

            try:
                usp_list = soup.find_all('div', class_='usp-list-label')
                usp_texts = [div.find('span').text.strip() for div in usp_list]
                joined_usp_texts = ', '.join(usp_texts)
                detail['feature'] = joined_usp_texts
            except Exception as e:
                print(f"Error finding USP texts for link {i + 1}: {e}")
            
            try:
                detail['link'] = all_links[i]
            except:
                print(f"Error finding link {i + 1}: {e}")
            all_details.append(detail)  # Add the detail dictionary to the list

except Exception as e:
    print(f"General error occurred: {e}")

# Display all collected details
all_details


Error finding brand for link 1: 'NoneType' object has no attribute 'text'
Error finding usual price for link 1: 'NoneType' object has no attribute 'text'
Error finding VIP price for link 1: 'NoneType' object has no attribute 'text'
Error finding servings per container for link 1: 'NoneType' object has no attribute 'text'
Error finding usage direction for link 1: 'NoneType' object has no attribute 'text'
Error finding other ingredients for link 1: 'NoneType' object has no attribute 'find_all'


[{'id': None,
  'product_name': '16-day Transformation',
  'type': 'Anti Ageing',
  'key_ingredient': '',
  'storage': '',
  'precaution': '',
  'best_for': '',
  'manufacturer': '',
  'feature': '',
  'link': 'https://www.lac.com/en/promotions_bundles/16-day-transformation-BUNDLE1_taut16daystransformation.html?catId=everyday-wellness_anti-ageing'},
 {'id': '01400940',
  'product_name': 'French Pine Bark Extract - The Powerful Antioxidant',
  'brand': "LAC MASQUELIER's®",
  'type': 'Anti Ageing',
  'usual_price': '35.38',
  'vip_price': '28.30',
  'form': 'Tablet',
  'servings_per_container': '25 tablets',
  'usage_direction': 'Take 1 – 2 tablets per day with water or fruit juice, or at least 1 tablet for every 100lb (45 kg) of body weight.',
  'key_ingredient': 'Oligomeric proanthocyanidin complexes (OPCs)',
  'other_ingredients': 'microcrystalline cellulose, magnesium stearate  colour of tablet may vary from batch to batch due to natural ingredients used  certified free of yeast, whe

In [171]:
df = pd.DataFrame(all_details)
df

Unnamed: 0,id,product_name,type,key_ingredient,storage,precaution,best_for,manufacturer,feature,link,brand,usual_price,vip_price,form,servings_per_container,usage_direction,other_ingredients
0,,16-day Transformation,Anti Ageing,,,,,,,https://www.lac.com/en/promotions_bundles/16-d...,,,,,,,
1,1400940.0,French Pine Bark Extract - The Powerful Antiox...,Anti Ageing,Oligomeric proanthocyanidin complexes (OPCs),"Store in a cool, dry place away from direct su...","If you are pregnant, nursing, taking any medic...",,The Netherlands,"Premium Grade, Vegetarian, No Preservatives, N...",https://www.lac.com/en/anti-ageing/lac-masquel...,LAC MASQUELIER's®,35.38,28.3,Tablet,25 tablets,Take 1 – 2 tablets per day with water or fruit...,"microcrystalline cellulose, magnesium stearate..."
2,1400030.0,French Pine Bark Extract - The Powerful Antiox...,Anti Ageing,Oligomeric proanthocyanidin complexes (OPCs),"Store in a cool, dry place away from direct su...","If you are pregnant, nursing, taking any medic...",,The Netherlands,"Premium Grade, Vegetarian, No Preservatives, N...",https://www.lac.com/en/anti-ageing/lac-masquel...,LAC MASQUELIER's®,73.04,58.43,Tablet,50 tablets,Take 1 – 2 tablets per day with water or fruit...,"microcrystalline cellulose, magnesium stearate..."
3,1400240.0,French Pine Bark Extract - The Powerful Antiox...,Anti Ageing,Oligomeric proanthocyanidin complexes (OPCs),"Store in a cool, dry place away from direct su...","If you are pregnant, nursing, taking any medic...",,The Netherlands,"Premium Grade, Vegetarian, No Preservatives, N...",https://www.lac.com/en/anti-ageing_beauty/lac-...,LAC MASQUELIER's®,243.81,195.05,Tablet,175 tablets,Take 1 – 2 tablets per day with water or fruit...,"microcrystalline cellulose, magnesium stearate..."
4,1400140.0,French Pine Bark Extract - The Powerful Antiox...,Anti Ageing,Oligomeric proanthocyanidin complexes (OPCs),"Store in a cool, dry place. Avoid direct sunli...","If you are pregnant, nursing, taking any medic...",,The Netherlands,"Premium Grade, Vegetarian, No Preservatives, N...",https://www.lac.com/en/anti-ageing/lac-masquel...,LAC MASQUELIER's®,391.92,313.54,Tablet,300 tablets,"Directions: As a dietary supplement, take 1 – ...","microcrystalline cellulose, magnesium stearate..."
5,1406620.0,Miracle Intensive Age-Defying Serum,Anti Ageing,"Collagen, Glycerin, Sodium Hyaluronate, Tocoph...","Store in a cool, dry place away from direct su...",For external use only. Avoid contact with eyes...,,Japan,"Premium Grade, Men and Women of all Ages, No A...",https://www.lac.com/en/beauty/lac-taut%C2%AE/m...,LAC TAUT®,72.27,0.0,Topical,40 ml,Pump once or twice onto palm. Rub palms togeth...,"water, butylene glycol, glycerin, citric acid,..."
6,1406400.0,NMN 300mg - Ultimate NAD+ Booster,Anti Ageing,"NMN (Nicotinamide Mononucleotide), Grape Seed ...","Store in a cool, dry place away from direct su...","If you are pregnant, nursing, taking any medic...",A sedentary lifestyle Age-related weight conce...,Japan,"No Artifical Colours, No Artificial Flavours, ...",https://www.lac.com/en/anti-ageing_beauty/lac-...,LAC Anti-Ageing,295.8,295.77,Softgel,30 softgels,Take 1 softgel daily.,"grape seed oil, glycerol, silicon dioxide, gel..."
7,1403970.0,NMN 450mg - Ultimate NAD+ Booster,Anti Ageing,NMN (Nicotinamide Mononucleotide),"Store in a cool, dry place away from direct su...","If you are pregnant, nursing, taking any medic...",A sedentary lifestyle Age-related weight conce...,Japan,"Vegetarian, No Artificial Flavours, No Preserv...",https://www.lac.com/en/anti-ageing/lac-anti-ag...,LAC Anti-Ageing,385.77,308.62,,1.5g x 30 powder sticks,Take 1 stick daily with juice or your favourit...,"indigestible dextrin, orange juice powder, mon..."
8,1400670.0,Radiance+ Premium Collagen Mask,Anti Ageing,"Collagen, Glycerin, Sodium Hyaluronate, Tocoph...","Store in a cool, dry place away from direct su...",For external use only. Avoid direct contact wi...,,Japan,"Premium Grade, Men and Women of all Ages",https://www.lac.com/en/beauty/lac-taut%C2%AE/r...,LAC Taut®,38.85,31.08,Topical,5 sheets per box,Use on cleansed face. Gently fit mask over eye...,"ingredients water, butylene glycol, glycerin, ..."
9,1403960.0,"Rejuvenate+ Premium Collagen 13,000mg plus Pla...",Anti Ageing,"Collagen, Mangosteen Extract, Placenta, Hyalur...","Store in a cool, dry place. Avoid direct sunli...","If you are allergic to seafood, consult your d...",,Japan,"Premium Grade, RDS® Premium Collagen, No Artif...",https://www.lac.com/en/beauty_collagen/lac-tau...,LAC Taut®,77.65,62.12,Liquid,50ml x 8 bottles,"Take 1 bottle, preferably before bedtime, at l...","citric acid, malic acid, sodium benzoate, oran..."


In [160]:
df.columns

Index(['id', 'product_name', 'type', 'key_ingredient', 'storage', 'precaution',
       'best_for', 'manufacturer', 'feature', 'link', 'brand', 'usual_price',
       'vip_price', 'form', 'servings_per_container', 'usage_direction',
       'other_ingredients'],
      dtype='object')

In [164]:
df_merged = df.groupby('id').agg({
    'product_name': 'first',
    'type': ', '.join,
    'key_ingredient': 'first',
    'storage': 'first',
    'precaution': 'first',
    'best_for': 'first',
    'manufacturer': 'first',
    'feature': 'first',
    'link': ', '.join,
    'brand': 'first',
    'usual_price': 'first',
    'vip_price': 'first',
    'form': 'first',
    'servings_per_container': 'first',
    'usage_direction': 'first',
    'other_ingredients': 'first',
}).reset_index()

In [165]:
df_merged

Unnamed: 0,id,product_name,type,key_ingredient,storage,precaution,best_for,manufacturer,feature,link,brand,usual_price,vip_price,form,servings_per_container,usage_direction,other_ingredients
0,1400030,French Pine Bark Extract - The Powerful Antiox...,Anti Ageing,Oligomeric proanthocyanidin complexes (OPCs),"Store in a cool, dry place away from direct su...","If you are pregnant, nursing, taking any medic...",,The Netherlands,"Premium Grade, Vegetarian, No Preservatives, N...",https://www.lac.com/en/anti-ageing/lac-masquel...,LAC MASQUELIER's®,73.04,58.43,Tablet,50 tablets,Take 1 – 2 tablets per day with water or fruit...,"microcrystalline cellulose, magnesium stearate..."
1,1400140,French Pine Bark Extract - The Powerful Antiox...,Anti Ageing,Oligomeric proanthocyanidin complexes (OPCs),"Store in a cool, dry place. Avoid direct sunli...","If you are pregnant, nursing, taking any medic...",,The Netherlands,"Premium Grade, Vegetarian, No Preservatives, N...",https://www.lac.com/en/anti-ageing/lac-masquel...,LAC MASQUELIER's®,391.92,313.54,Tablet,300 tablets,"Directions: As a dietary supplement, take 1 – ...","microcrystalline cellulose, magnesium stearate..."
2,1400240,French Pine Bark Extract - The Powerful Antiox...,Anti Ageing,Oligomeric proanthocyanidin complexes (OPCs),"Store in a cool, dry place away from direct su...","If you are pregnant, nursing, taking any medic...",,The Netherlands,"Premium Grade, Vegetarian, No Preservatives, N...",https://www.lac.com/en/anti-ageing_beauty/lac-...,LAC MASQUELIER's®,243.81,195.05,Tablet,175 tablets,Take 1 – 2 tablets per day with water or fruit...,"microcrystalline cellulose, magnesium stearate..."
3,1400670,Radiance+ Premium Collagen Mask,Anti Ageing,"Collagen, Glycerin, Sodium Hyaluronate, Tocoph...","Store in a cool, dry place away from direct su...",For external use only. Avoid direct contact wi...,,Japan,"Premium Grade, Men and Women of all Ages",https://www.lac.com/en/beauty/lac-taut%C2%AE/r...,LAC Taut®,38.85,31.08,Topical,5 sheets per box,Use on cleansed face. Gently fit mask over eye...,"ingredients water, butylene glycol, glycerin, ..."
4,1400940,French Pine Bark Extract - The Powerful Antiox...,Anti Ageing,Oligomeric proanthocyanidin complexes (OPCs),"Store in a cool, dry place away from direct su...","If you are pregnant, nursing, taking any medic...",,The Netherlands,"Premium Grade, Vegetarian, No Preservatives, N...",https://www.lac.com/en/anti-ageing/lac-masquel...,LAC MASQUELIER's®,35.38,28.3,Tablet,25 tablets,Take 1 – 2 tablets per day with water or fruit...,"microcrystalline cellulose, magnesium stearate..."
5,1403960,"Rejuvenate+ Premium Collagen 13,000mg plus Pla...",Anti Ageing,"Collagen, Mangosteen Extract, Placenta, Hyalur...","Store in a cool, dry place. Avoid direct sunli...","If you are allergic to seafood, consult your d...",,Japan,"Premium Grade, RDS® Premium Collagen, No Artif...",https://www.lac.com/en/beauty_collagen/lac-tau...,LAC Taut®,77.65,62.12,Liquid,50ml x 8 bottles,"Take 1 bottle, preferably before bedtime, at l...","citric acid, malic acid, sodium benzoate, oran..."
6,1403970,NMN 450mg - Ultimate NAD+ Booster,Anti Ageing,NMN (Nicotinamide Mononucleotide),"Store in a cool, dry place away from direct su...","If you are pregnant, nursing, taking any medic...","[A sedentary lifestyle, Age-related weight con...",Japan,"Vegetarian, No Artificial Flavours, No Preserv...",https://www.lac.com/en/anti-ageing/lac-anti-ag...,LAC Anti-Ageing,385.77,308.62,,1.5g x 30 powder sticks,Take 1 stick daily with juice or your favourit...,"indigestible dextrin, orange juice powder, mon..."
7,1406400,NMN 300mg - Ultimate NAD+ Booster,Anti Ageing,"NMN (Nicotinamide Mononucleotide), Grape Seed ...","Store in a cool, dry place away from direct su...","If you are pregnant, nursing, taking any medic...","[A sedentary lifestyle, Age-related weight con...",Japan,"No Artifical Colours, No Artificial Flavours, ...",https://www.lac.com/en/anti-ageing_beauty/lac-...,LAC Anti-Ageing,295.8,295.77,Softgel,30 softgels,Take 1 softgel daily.,"grape seed oil, glycerol, silicon dioxide, gel..."
8,1406620,Miracle Intensive Age-Defying Serum,Anti Ageing,"Collagen, Glycerin, Sodium Hyaluronate, Tocoph...","Store in a cool, dry place away from direct su...",For external use only. Avoid contact with eyes...,,Japan,"Premium Grade, Men and Women of all Ages, No A...",https://www.lac.com/en/beauty/lac-taut%C2%AE/m...,LAC TAUT®,72.27,0.0,Topical,40 ml,Pump once or twice onto palm. Rub palms togeth...,"water, butylene glycol, glycerin, citric acid,..."


In [127]:
df.to_csv('product_details.csv', index=False)
print("Data has been written to 'product_details.csv'")


Data has been written to 'product_details.csv'
