Everybody start talking about SHEIN in recent days. Thus, I would like to try to scrape the website of this emerging online fast fashion brand.

In [None]:
# This is the homepage of SHEIN
url = 'https://us.shein.com/'

In [77]:
def get_product(url):
    'This is the function used to extract product details of desired products.'
    import requests
    from datetime import date 
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver import Chrome
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.common.by import By

    response = requests.get(url)

    # Make sure we get the homepage
    if response.status_code == 200:
        print("Success")
    else:
        print("Failure")

    # Here we get a soup object of SHEIN homepage
    results_page = BeautifulSoup(response.content,'lxml')

    # I will only extract dress products for the purpose of this data challenge only.
    dress_url = results_page.find("a",{"title":"DRESSES"})['href'] # a link to dress category

    # Repeat and get a soup object for dress category
    new_response = requests.get(dress_url)
    if new_response.status_code == 200:
        print("Success")
    else:
        print("Failure")
    dress_page = BeautifulSoup(new_response.content,'lxml')

    # Then we will get a list of dress products on the first page.
    dress_list = dress_page.findAll("section",{"role":"listitem"})
    product_link = list()
    for dress in dress_list:
        product_link.append(dress.find('a')['href'])

    # For demonstration purpose, I will only extract product information for first ten dresses
    product_dic = dict() # Initilize empty to store product information
    for i in range(10):
        url = 'https://us.shein.com' + product_link[i]
        service = Service('/Applications/chromedriver')
        driver = webdriver.Chrome(service=service)
        driver.get(url)

        # display_name (str)
        try:
            display_name = driver.find_element(By.CLASS_NAME, 'product-intro__head-name')
            product_dic[i]= {'display_name':display_name.text}
        except:
            product_dic[i]= {'display_name':'N/A'}

        # color list (each product has multiple avaliable colors)
        try:
            colors  = driver.find_elements(By.CLASS_NAME, 'product-intro__color-radio')
            color_list = [elem.get_attribute('aria-label') for elem in colors]
            product_dic[i]['color'] = color_list
        except:
            product_dic[i]['color'] = []

        # size (list)
        try:
            size = driver.find_elements(By.CLASS_NAME, 'product-intro__size-radio')
            size_list = [elem.get_attribute('aria-label') for elem in size]
            product_dic[i]['size'] = size_list
        except:
            product_dic[i]['size'] = []

        # price (str)
        try:
            price = driver.find_element(By.CLASS_NAME, 'original')
            product_dic[i]['price'] = price.text
        except:
            product_dic[i]['price'] = 'N/A'

        # product_url (str)
        product_dic[i]['product_url'] = url

        # image_links (list)
        try:
            img = driver.find_element(By.CLASS_NAME, 'product-intro__thumbs')
            imgs = img.find_elements(By.TAG_NAME, 'img')
            image_links = [elem.get_attribute('src') for elem in imgs]
            product_dic[i]['image_links'] = image_links
        except:
            product_dic[i]['image_links'] = []

        # brand_name (str)
        product_dic[i]['brand_name'] = "SHEIN"

        # description (str)
        # Since SHEIN updates its SKU very quickly, it doesn't provide a description for those products
        product_dic[i]['description'] = 'N/A'

        # scrapped_date (date)
        product_dic[i]['date'] = date.today()

        # low_level (str)
        product_dic[i]['low_level'] = 'dress' #As I decided early.

        # gender (str)
        product_dic[i]['gender'] = 'Women' #SHEIN seems to assume buyers' default gender is female

        # secondhand (bool)
        product_dic[i]['secondhand'] = False #Given SHEIN only sells firsthand products
        
    return product_dic
    

In [78]:
test = get_product(url)

Success
Success


In [79]:
# Based on the output, this function works!
test

{0: {'display_name': 'SHEIN Ruffle Armhole Ditsy Floral Dress',
  'color': [],
  'size': [None, 'XS(2)', 'S(4)', 'M(6)', 'L(8/10)'],
  'price': 'US$11.00',
  'product_url': 'https://us.shein.com/SHEIN-Ruffle-Armhole-Ditsy-Floral-Dress-p-2765897-cat-1727.html?scici=navbar_WomenHomePage~~tab01navbar07~~7~~webLink~~~~0',
  'image_links': ['//img.ltwebstatic.com/images3_pi/2021/06/10/16232928774476371acd50e07061068425f3301bab_thumbnail_220x293.webp',
   '//img.ltwebstatic.com/images3_pi/2021/06/10/1623292881a63e2085e5f90d2d6d4841ef49232ed8_thumbnail_220x293.webp',
   '//img.ltwebstatic.com/images3_pi/2021/06/10/1623292886de9bcd201614127621b2c5bac6eabd9e_thumbnail_220x293.webp',
   '//img.ltwebstatic.com/images3_pi/2021/06/10/1623292890c84e5227b546808ffde999c62fb7df07_thumbnail_220x293.webp',
   '//img.ltwebstatic.com/images3_pi/2021/06/10/16232928951eb11f61edc4c942f0f5cd626f2ac1ee_thumbnail_220x293.webp'],
  'brand_name': 'SHEIN',
  'description': 'N/A',
  'date': datetime.date(2022, 6, 3)

In [80]:
import pandas as pd
import sqlalchemy

In [82]:
df = pd.DataFrame.from_dict(test,orient='index')
df

Unnamed: 0,display_name,color,size,price,product_url,image_links,brand_name,description,date,low_level,gender,secondhand
0,SHEIN Ruffle Armhole Ditsy Floral Dress,[],"[None, XS(2), S(4), M(6), L(8/10)]",US$11.00,https://us.shein.com/SHEIN-Ruffle-Armhole-Dits...,[//img.ltwebstatic.com/images3_pi/2021/06/10/1...,SHEIN,,2022-06-03,dress,Women,False
1,SHEIN SXY Keyhole Back Halter Bodycon Dress,"[Apricot, Khaki, Dusty Pink, Dusty Blue, Navy ...","[XS(2), S(4), M(6), L(8/10), None]",US$14.00,https://us.shein.com/SHEIN-SXY-Keyhole-Back-Ha...,[//img.ltwebstatic.com/images3_pi/2022/03/10/1...,SHEIN,,2022-06-03,dress,Women,False
2,Random Allover Floral Print Ruched Bust Ruffle...,"[Pink, Pink, Grey, Apricot, Purple, Mint Green...","[S(4), M(6), L(8/10)]",US$17.00,https://us.shein.com/Random-Allover-Floral-Pri...,[//img.ltwebstatic.com/images3_pi/2021/12/20/1...,SHEIN,,2022-06-03,dress,Women,False
3,SHEIN SXY Solid Criss-cross Backless Bodycon D...,"[Black, Burgundy, Army Green, Red, Blue, Grey,...","[XS(2), S(4), M(6), L(8/10)]",US$10.00,https://us.shein.com/SHEIN-SXY-Solid-Criss-cro...,[//img.ltwebstatic.com/images3_pi/2021/08/03/1...,SHEIN,,2022-06-03,dress,Women,False
4,SHEIN SXY Tie Dye Cami Bodycon Dress,"[Multicolor, Multicolor]","[XS(2), S(4), M(6), L(8/10), XL(12)]",,https://us.shein.com/SHEIN-SXY-Tie-Dye-Cami-Bo...,[//sheinsz.ltwebstatic.com/she_dist/images/bg-...,SHEIN,,2022-06-03,dress,Women,False
5,Graphic Print V-Neck A Line Cami Dress,[],"[XS(2), S(4), M(6), L(8/10), None]",US$7.00,https://us.shein.com/Graphic-Print-V-Neck-A-Li...,[//img.ltwebstatic.com/images3_pi/2021/07/29/1...,SHEIN,,2022-06-03,dress,Women,False
6,SHEIN Polka Dot Belted Tunic Dress,[],"[XS(2), S(4), M(6), L(8/10)]",US$7.00,https://us.shein.com/SHEIN-Polka-Dot-Belted-Tu...,[//img.ltwebstatic.com/images3_pi/2022/04/16/1...,SHEIN,,2022-06-03,dress,Women,False
7,Allover Floral Knot Split Thigh A-line Dress,"[White, Multicolor, Multicolor, Multicolor, Bl...","[XS(2), S(4), M(6), L(8/10), XL(12), None]",US$18.00,https://us.shein.com/Allover-Floral-Knot-Split...,[//img.ltwebstatic.com/images3_pi/2021/11/19/1...,SHEIN,,2022-06-03,dress,Women,False
8,Floral Print Scoop Neck Dress,"[Multicolor, Multicolor, Multicolor]","[None, S(4), M(6), L(8/10), XL(12)]",US$11.00,https://us.shein.com/Floral-Print-Scoop-Neck-D...,[//img.ltwebstatic.com/images3_pi/2021/12/17/1...,SHEIN,,2022-06-03,dress,Women,False
9,Floral Print Shirred Waist Cut Out Tie Backles...,[],"[XS(2), S(4), M(6), L(8/10)]",US$18.00,https://us.shein.com/Floral-Print-Shirred-Wais...,[//img.ltwebstatic.com/images3_pi/2022/04/24/1...,SHEIN,,2022-06-03,dress,Women,False


In [83]:
# Store product result in excel
df.to_excel('SHEIN_Products.xlsx', index=False)

PostgreSQL Part

In [None]:
# Change passwd and localhost address for your part.
engine = sqlalchemy.create_engine('postgre://postgres:12345@localhost:5432')
df.to_sql('SHEIN_Product_Results', engine)