#### **Objective:** <br>
Gather data on skincare products from Sephora for product analysis. <br>

#### **Method:** <br>
1. Retrieve the URLs of all skincare products.
2. Store each product's webpage as an HTML file.
3. Gather details about each individual product.
4. Export the product information into a CSV file.

#### **Summary:** <br>
When the skincare product links were obtained, there were 2894 skincare products on Sephora. However, information for only 2560 products could be obtained, for the following reasons:<br> 1. Some products have been removed from the website and are no longer accessible. <br> 2. Critical product information could not be extracted successfully.

In [204]:
#import libraries
from bs4 import BeautifulSoup
import requests
import re
import json
import pandas as pd
from urllib.parse import urlencode
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
import os
from urllib.parse import urlparse, parse_qs
from requests.exceptions import ReadTimeout
 

##### **Section 1. Define Functions to Extract Product Information** <br>
Retrieve information about: <br>
 * product's hierarchy, 
 * brand, 
 * product name, 
 * product rating, 
 * number of love count, 
 * number of reviews, 
 * skin concern, 
 * skin type, 
 * item number, 
 * similar items, 
 * different item variations  


In [205]:
#get skincare hierarchy of each product such as: Skincare > Cleansers > Face Wash & Cleansers
def get_product_breadcrumbs(json_dict):
    """Return the product's category trail as a list of names.

    The trail is stored as a JSON-LD string under
    ``json_dict["page"]["product"]["breadcrumbsSeoJsonLd"]``; the ``item.name``
    of every entry in its ``itemListElement`` array is collected in order.

    Returns ``None`` when the breadcrumb string is missing or empty, or when
    any of the expected keys is absent.
    """
    try:
        raw_breadcrumbs = json_dict["page"]["product"].get("breadcrumbsSeoJsonLd")
        if not raw_breadcrumbs:
            return None

        names = []
        for crumb in json.loads(raw_breadcrumbs)["itemListElement"]:
            names.append(crumb["item"]["name"])
        return names
    except KeyError:
        return None


In [206]:
#get the brand of the product
def get_brand(json_dict):
    """Return the brand's ``displayName``, or ``None`` if that key is absent.

    Raises ``KeyError`` when the ``page`` / ``product`` / ``productDetails`` /
    ``brand`` path does not exist in ``json_dict``.
    """
    product_details = json_dict["page"]["product"]["productDetails"]
    brand_info = product_details['brand']
    return brand_info.get('displayName')

In [207]:
#get product name
def get_product_name(json_dict):
    """Return ``productDetails['displayName']``, or ``None`` if it is absent.

    Raises ``KeyError`` when ``page`` / ``product`` / ``productDetails`` is
    missing from ``json_dict``.
    """
    product_details = json_dict["page"]["product"]["productDetails"]
    return product_details.get('displayName')

In [208]:
#get product rating
def get_rating(json_dict):
    """Return ``productDetails['rating']``, or ``None`` if it is absent.

    Raises ``KeyError`` when ``page`` / ``product`` / ``productDetails`` is
    missing from ``json_dict``.
    """
    product_details = json_dict["page"]["product"]["productDetails"]
    return product_details.get('rating')

In [209]:
#get the number of times consumers loved this product
def get_lovesCount(json_dict):
    """Return ``productDetails['lovesCount']``, or ``None`` if it is absent.

    Raises ``KeyError`` when ``page`` / ``product`` / ``productDetails`` is
    missing from ``json_dict``.
    """
    product_details = json_dict["page"]["product"]["productDetails"]
    return product_details.get('lovesCount')

In [210]:
#get the number of reviews
def get_review_num(json_dict):
    """Return ``productDetails['reviews']``, or ``None`` if it is absent.

    Raises ``KeyError`` when ``page`` / ``product`` / ``productDetails`` is
    missing from ``json_dict``.
    """
    product_details = json_dict["page"]["product"]["productDetails"]
    return product_details.get('reviews')

In [211]:
#get the skin type the product is suitable for
def get_skin_type(json_dict):
    """Return the text that follows the ``<b>Skin Type:</b>`` label.

    The label is searched for in the HTML stored under
    ``productDetails['longDescription']``. The node immediately after the
    label is stripped of surrounding whitespace and returned.

    Returns ``None`` when there is no long description, no such label, no
    following node, or when stripping the following node fails (the error is
    printed in that case).
    """
    description_html = json_dict["page"]["product"]["productDetails"].get('longDescription', None)
    if description_html is None:
        return None

    description_soup = BeautifulSoup(description_html, 'html.parser')
    label = description_soup.find('b', string='Skin Type:')
    if label is None:
        return None

    following_node = label.next_sibling
    if following_node is None:
        return None

    try:
        # The sibling may be a tag rather than plain text, in which case
        # .strip() is not a usable string method and the call fails.
        return following_node.strip()
    except Exception as e:
        print("Error occurred while processing skin type:", e)
        return None



In [212]:
#get the skincare issue targeted by the product.
def get_skin_concern(json_dict):
    """Return the text that follows the ``<b>Skincare Concerns:</b>`` label.

    The label is searched for in the HTML stored under
    ``productDetails['longDescription']``. The node immediately after the
    label is stripped of surrounding whitespace and returned.

    Returns ``None`` when there is no long description, no such label, no
    following node, or when stripping the following node fails (the error is
    printed in that case).
    """
    description_html = json_dict["page"]["product"]["productDetails"].get('longDescription', None)
    if description_html is None:
        return None

    description_soup = BeautifulSoup(description_html, 'html.parser')
    label = description_soup.find('b', string='Skincare Concerns:')
    if label is None:
        return None

    following_node = label.next_sibling
    if following_node is None:
        return None

    try:
        # The sibling may be a tag rather than plain text, in which case
        # .strip() is not a usable string method and the call fails.
        return following_node.strip()
    except Exception as e:
        # The message previously said "skin type" (copy-paste from
        # get_skin_type), which pointed debugging at the wrong field.
        print("Error occurred while processing skin concern:", e)
        return None

In [213]:
#get the sku of each product. this is the item number on the Sephora webpage
def get_sku(json_dict):
    """Return ``currentSku['skuId']``, or ``None`` if it is absent.

    Raises ``KeyError`` when ``page`` / ``product`` / ``currentSku`` is
    missing from ``json_dict``.
    """
    current_sku = json_dict["page"]["product"]["currentSku"]
    return current_sku.get("skuId")

In [214]:
#get the size of each product
def get_size(json_dict):
    """Return ``currentSku['size']``, or ``None`` if it is absent.

    Raises ``KeyError`` when ``page`` / ``product`` / ``currentSku`` is
    missing from ``json_dict``.
    """
    current_sku = json_dict["page"]["product"]["currentSku"]
    return current_sku.get("size")

In [215]:
#get the product's price (USD)
def get_price(json_dict):
    """Return ``currentSku['listPrice']`` without its first character.

    The first character is dropped unconditionally (presumably the currency
    symbol, e.g. ``"$35.00"`` -> ``"35.00"`` -- confirm against real data).
    Returns ``None`` when ``listPrice`` is absent.
    """
    list_price = json_dict["page"]["product"]["currentSku"].get("listPrice", None)
    return None if list_price is None else list_price[1:]


In [216]:
#get the product's "children", which can be the same product in a different size or different versions of the product (e.g. scent)
def get_child_sku(json_dict):
    """Return the SKU ids of the product's other variants.

    Reads ``product['regularChildSkus']`` and returns every ``skuId`` in it
    that differs from ``product['currentSku']['skuId']``. Returns an empty
    list when the product has no ``regularChildSkus`` entry.

    Raises ``KeyError`` when ``currentSku`` / ``skuId`` is missing.
    """
    product = json_dict["page"]["product"]
    variants = product.get("regularChildSkus", None)
    own_sku = product["currentSku"]["skuId"]

    if variants is None:
        return []
    return [variant['skuId'] for variant in variants if variant['skuId'] != own_sku]


In [217]:
#get the product's unique item identifier. This is similar to the sku id 
def get_item_id(json_dict):
    """Return ``productDetails['productId']``, or ``None`` if it is absent.

    Raises ``KeyError`` when ``page`` / ``product`` / ``productDetails`` is
    missing from ``json_dict``.
    """
    product_details = json_dict['page']["product"]["productDetails"]
    return product_details.get('productId')

In [218]:
#get the sku_id of similar products 
def get_similar_products(product_id, headers):
    """Return the SKU ids recommended as similar to ``product_id``.

    Queries the recommendations endpoint with ``product_id`` appended as the
    ``item_id`` query parameter and collects ``data.currentSku.skuId`` from
    every entry of ``response.results``.

    Args:
        product_id: Product identifier to look up. ``None`` or an empty value
            short-circuits to an empty list without any network call.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A list of SKU id strings. ``['']`` when the payload carries no
        ``results`` (kept from the previous behaviour so a ``','.join`` of the
        result still yields an empty string); ``[]`` when the request fails,
        times out, or the body is not valid JSON.
    """
    # A missing product id used to raise TypeError during URL concatenation.
    if not product_id:
        return []

    similar_product_url = 'https://sephora.cnstrc.com/recommendations/v1/pods/similar-products-test?c=ciojs-client-2.41.0&key=u7PNVQx-prod-en-us&i=0c7be738-3d64-4f03-b5fd-7c5df9afefdc&s=23&num_results=5&item_id=' + str(product_id)

    try:
        response = requests.get(similar_product_url, headers=headers, timeout=20)
        payload = json.loads(response.text)
    except requests.exceptions.Timeout:
        # Covers both connect and read timeouts (previously only ReadTimeout).
        print(f"Request for product {product_id} timed out. Skipping this product.")
        return []
    except requests.exceptions.RequestException as e:
        print(f"Request for product {product_id} failed: {e}. Skipping this product.")
        return []
    except ValueError as e:
        # json.JSONDecodeError is a ValueError: the body was not JSON
        # (e.g. an HTML error page).
        print(f"Response for product {product_id} was not valid JSON: {e}. Skipping this product.")
        return []

    response_section = payload.get("response") if isinstance(payload, dict) else None
    results = response_section.get("results") if isinstance(response_section, dict) else None
    if results is None:
        return ['']

    products_list = []
    for item in results:
        current_sku = (item.get("data") or {}).get("currentSku") or {}
        sku_id = current_sku.get("skuId")
        # Entries without a skuId are dropped so callers can safely join
        # the list into a string.
        if sku_id is not None:
            products_list.append(sku_id)
    return products_list


##### **Section 2. Define Functions to Extract Product URLs and Save Product Information** <br>

In [219]:
#get all the product links
def get_product_links(url, total_pg, api, headers):
    """Collect product-page URLs from the paginated listing at ``url``.

    Every page ``url + str(pg)`` for ``pg`` in ``1..total_pg`` is opened in a
    Selenium-driven Chrome, scrolled down in 800px steps (presumably to make
    lazily loaded product tiles render), and the ``href`` of every element
    matching the ``.css-klx76`` selector is collected.

    Args:
        url: Listing URL prefix; the page number is appended to it.
        total_pg: Number of listing pages to visit. Values below 1 return an
            empty list without starting a browser.
        api: Unused. Retained for backward compatibility; the previous
            implementation sent a ScraperAPI request with it but never read
            the response.
        headers: Unused. Retained for backward compatibility (see ``api``).

    Returns:
        A list of product URLs in page order. Anchors without an ``href``
        are skipped.
    """
    all_products_list = []
    if total_pg < 1:
        return all_products_list

    # One browser is reused for all pages and always closed; previously a
    # new Chrome instance was launched per page and never quit.
    driver = webdriver.Chrome()
    try:
        for pg in range(1, total_pg + 1):
            driver.get(url + str(pg))

            scroll_height = 0
            while scroll_height < 10000:
                scroll_height += 800
                driver.execute_script(f'window.scrollTo(0, {scroll_height});')
                time.sleep(0.5)

            # NOTE: ".css-klx76" looks like a generated class name and may
            # change when the site is redeployed.
            product_anchors = driver.find_elements(By.CSS_SELECTOR, ".css-klx76")
            page_urls = (anchor.get_attribute("href") for anchor in product_anchors)
            all_products_list.extend(href for href in page_urls if href is not None)
    finally:
        driver.quit()

    return all_products_list

In [220]:
#determine which product pages are already saved as html files. This will help avoid getting the same data twice.
def get_products_saved(folder_path):
    """Return the SKU ids of the product pages already saved in ``folder_path``.

    The SKU id is taken to be the first run of digits in each file name
    (pages are saved as ``product_<sku>.html``). Entries whose name contains
    no digits (e.g. ``.DS_Store``) are ignored instead of raising
    ``AttributeError``.

    Returns an empty list when ``folder_path`` does not exist yet, i.e.
    nothing has been saved so far.
    """
    if not os.path.isdir(folder_path):
        return []

    sku_saved_list = []
    for file_name in os.listdir(folder_path):
        digits = re.search(r'\d+', file_name)
        if digits is not None:
            sku_saved_list.append(digits.group())
    return sku_saved_list


In [221]:
#for each product link, save the webpage information into a html file
def save_html_info(text_file, saved_sku_pages):
    """Download every product page listed in ``text_file`` into ``skincare_products/``.

    ``text_file`` holds one product URL per line. The SKU id is read from each
    URL's ``skuId`` query parameter and the rendered page source is written to
    ``skincare_products/product_<sku>.html``.

    Args:
        text_file: Path of the text file containing the product URLs.
        saved_sku_pages: SKU ids whose pages are already on disk; those URLs
            are skipped.

    Returns:
        ``(failed_to_load, no_sku)``: URLs the browser failed to load, and
        URLs that carry no ``skuId`` query parameter.
    """
    # Imported here so this function brings the Selenium exception type into
    # scope itself; the file's top-level imports do not include it.
    from selenium.common.exceptions import WebDriverException

    folder_path = 'skincare_products'
    os.makedirs(folder_path, exist_ok=True)

    failed_to_load = []
    no_sku = []
    # Set for O(1) membership tests; also extended below so a URL that
    # appears twice in the list is only downloaded once.
    already_saved = set(saved_sku_pages)

    with open(text_file) as f:
        all_product_urls = f.read().strip().split('\n')

    options = webdriver.ChromeOptions()
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(options=options)
    try:
        for num, url in enumerate(all_product_urls):
            print(num, url)
            counter = num + 1

            # parse_qs omits absent parameters, so default to an empty list
            # (calling len() on the None returned before raised TypeError).
            sku_values = parse_qs(urlparse(url).query).get('skuId', [])
            if not sku_values:
                no_sku.append(url)
                continue

            sku_id = sku_values[0]
            if sku_id in already_saved:
                continue

            # Random pause between requests, plus a 10-minute pause on every
            # 500th URL, to avoid being blocked by the site.
            time.sleep(random.uniform(5, 15))
            if counter % 500 == 0:
                time.sleep(600)

            try:
                driver.get(url)
                # Read the source before opening the output file so a failed
                # load does not leave an empty file that later looks "saved".
                page_source = driver.page_source
            except WebDriverException:
                # Selenium never raises requests' exceptions, so the previous
                # handler could not catch a failed page load.
                failed_to_load.append(url)
                continue

            file_path = os.path.join(folder_path, f"product_{sku_id}.html")
            with open(file_path, 'w') as f:
                f.write(page_source)
            already_saved.add(sku_id)
    finally:
        # Always release the browser, even if the loop aborts.
        driver.quit()

    return failed_to_load, no_sku

        

In [224]:
#get the skincare information for all products and save the information in a dictionary
def get_skincare_info(folder, headers):
    """Extract product information from every saved HTML page in ``folder``.

    For each ``*.html`` file the JSON stored in the page's
    ``<script id="linkStore">`` element is parsed and passed to the ``get_*``
    helpers; the results are stored per SKU.

    Args:
        folder: Directory containing the saved product pages.
        headers: HTTP headers forwarded to ``get_similar_products``.

    Returns:
        ``(skincare_dict, no_page_json_lst)``. ``skincare_dict`` maps each SKU
        id to a dict with the keys ``hierarchy``, ``brand``, ``product``,
        ``rating``, ``loves_count``, ``reviews_num``, ``size``, ``price``,
        ``child_sku``, ``item_id``, ``similar_products``, ``skin_concern`` and
        ``skin_type``. ``no_page_json_lst`` lists the SKU ids (first run of
        digits in the file name, or the file name itself if it has none) of
        pages from which no product data could be extracted.
    """
    skincare_dict = {}
    no_page_json_lst = []

    for counter, html_file in enumerate(os.listdir(folder), start=1):
        # Skip stray entries such as .DS_Store or checkpoint directories.
        if not html_file.endswith('.html'):
            continue

        # The SKU id comes from the file name. Parsing the file name as a URL
        # query string, as was done before, always produced None.
        sku_match = re.search(r'\d+', html_file)
        sku_id = sku_match.group() if sku_match else html_file

        print(counter, html_file)
        # Read and close the file before any network request is made.
        with open(os.path.join(folder, html_file), 'r') as file:
            html_content = file.read()

        product_soup = BeautifulSoup(html_content, 'html.parser')
        link_store = product_soup.find("script", {"id": "linkStore"})
        if link_store is None:
            no_page_json_lst.append(sku_id)
            continue

        try:
            # JSON parsing ignores surrounding whitespace, so the page does
            # not need to be prettified and re-parsed first.
            json_dict = json.loads(link_store.text)
        except json.JSONDecodeError:
            no_page_json_lst.append(sku_id)
            continue

        page_data = json_dict.get("page") if isinstance(json_dict, dict) else None
        if not page_data or page_data.get("product", None) is None:
            no_page_json_lst.append(sku_id)
            continue

        try:
            hierarchy = get_product_breadcrumbs(json_dict)
            brand = get_brand(json_dict)
            product = get_product_name(json_dict)
            rating = get_rating(json_dict)
            loves_count = get_lovesCount(json_dict)
            reviews_num = get_review_num(json_dict)
            sku = get_sku(json_dict)
            size = get_size(json_dict)
            price = get_price(json_dict)
            child_sku = get_child_sku(json_dict)
            item_id = get_item_id(json_dict)
            skin_concern = get_skin_concern(json_dict)
            skin_type = get_skin_type(json_dict)
        except (KeyError, TypeError) as e:
            # One malformed product must not abort the whole batch.
            print(f"Could not extract product info from {html_file}: {e!r}")
            no_page_json_lst.append(sku_id)
            continue

        # Without an item id there is nothing to look up.
        if item_id is not None:
            similar_products = get_similar_products(product_id=item_id, headers=headers)
        else:
            similar_products = []

        skincare_dict[sku] = {
            'hierarchy': ','.join(hierarchy) if hierarchy is not None else None,
            'brand': brand,
            'product': product,
            'rating': rating,
            'loves_count': loves_count,
            'reviews_num': reviews_num,
            'size': size,
            'price': price,
            'child_sku': ','.join(child_sku),
            'item_id': item_id,
            # Drop missing ids so the join cannot fail on a None entry.
            'similar_products': ','.join(str(s) for s in similar_products if s is not None),
            'skin_concern': skin_concern,
            'skin_type': skin_type,
        }

    return skincare_dict, no_page_json_lst
        
    

##### **Section 3. Function Calls** <br>
1. Loop through 49 pages of skincare products to get all the product URLs
2. For each URL, open and save the webpage information.  
3. Extract information from each html file and save the extracted information as a CSV <br>

To access approximately 3000 webpages without being blocked by the website, it's advisable to use either an API or proxies. Initially, I opted for an API due to the scarcity of reliable proxies, but I exhausted my free API credits. Consequently, I chose to lengthen the interval between URL requests to evade website detection. However, this approach is slower. I saved the product webpage files and conducted the scraping in batches.

In [None]:
#get all the skincare URLs from 49 pages
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
#the ScraperAPI key is read from the SCRAPERAPI_KEY environment variable so the secret is not stored in the notebook
#NOTE: the key that was previously hard-coded here has been exposed and should be rotated
all_product_urls = get_product_links(
    url='https://www.sephora.com/shop/skincare?currentPage=',
    total_pg=49,
    headers=headers,
    api=os.environ.get("SCRAPERAPI_KEY", ""),
)

In [None]:
#write every collected link to a txt file, one link per line
with open("Sephora_URL.txt", "w") as url_file:
    url_file.writelines(link + '\n' for link in all_product_urls)

In [223]:
#count how many product pages have already been saved to disk
saved_sku_pages = get_products_saved(folder_path='./skincare_products')
saved_page_count = len(saved_sku_pages)
print("Number of urls saved:", saved_page_count)


Number of urls saved: 2894


In [None]:
#download and save the webpage behind every product link that is not on disk yet
save_html_info(
    text_file="Sephora_URL.txt",
    saved_sku_pages=saved_sku_pages,
)

In [None]:
#extract the product information from all saved webpages
request_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
skincare_info_dict, no_page_lst = get_skincare_info(
    folder="skincare_products",
    headers=request_headers,
)

In [226]:
#build a table with one row per SKU and write it out (tab-separated, despite the .csv extension)
products_by_column = pd.DataFrame.from_dict(skincare_info_dict)
all_products_df = products_by_column.transpose()
all_products_df.to_csv('SephoraSkincareData_v2.csv', sep='\t')

In [227]:
#report how many products ended up in the table
extracted_count = len(all_products_df)
print("Number of products with extracted info: ", extracted_count)

Number of products with extracted info:  2560


In [228]:
#report how many saved pages yielded no product data
failed_count = len(no_page_lst)
print("Number of urls in which data extraction failed: ", failed_count)


Number of urls in which data extraction failed:  317
