In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import bs4
import time
import re

# Selenium on web

In [3]:
def selenium_shopee(keyword, username, password):
    """Open Shopee Thailand in Firefox, search for a keyword and log in.

    The steps run in this order: load https://shopee.co.th, click the
    language button, close the banner pop-up, type ``keyword`` into the
    search box and submit it, fill in and submit the login form, click one
    more button and finally wait 30 seconds.
    NOTE(review): the last click and the 30 s pause look like they start a
    manual verification step -- confirm against the live site.

    All elements are located by absolute XPath, so any change in the page
    layout surfaces as ``NoSuchElementException``. Errors are reported with
    ``print`` and are not re-raised; the browser is left open either way.

    Args:
        keyword: Text typed into the search box.
        username: Value typed into the first login field.
        password: Value typed into the second login field.

    Returns:
        The Firefox WebDriver instance, or ``None`` when the browser itself
        could not be started.
    """
    # Bound before the try block so the final `return` never hits an unbound
    # local when launching Firefox fails.
    driver = None
    try:
        # Open browser using Selenium
        driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
        driver.get('https://shopee.co.th')
        time.sleep(3)

        # Select Thai language
        thai_button = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[1]/div[1]/div/div[3]/div[1]/button')
        thai_button.click()

        # Close ads: the close button sits inside a shadow root, so it is
        # fetched through JavaScript rather than find_element.
        time.sleep(3)
        close_ads = driver.execute_script(
            'return document.querySelector("shopee-banner-popup-stateful").shadowRoot.querySelector("div.shopee-popup__close-btn")')
        close_ads.click()

        # Search bar
        search = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/header/div[2]/div/div[1]/form/div/div[1]/input')
        search.send_keys(keyword)
        search.send_keys(Keys.ENTER)

        # Login to Shopee
        time.sleep(5)
        mail = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/div/div/div/div[2]/form/div/div[2]/div[2]/div[1]/input')
        mail.send_keys(username)
        pw = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/div/div/div/div[2]/form/div/div[2]/div[3]/div[1]/input')
        pw.send_keys(password)
        time.sleep(1)
        log = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/div/div/div/div[2]/form/div/div[2]/button')
        log.click()
        time.sleep(5)
        aut = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/div/div/div/div/div[1]/div/div/div/button')
        aut.click()
        time.sleep(30)

    except NoSuchElementException:
        print("Element not found. Handle the error or add appropriate code.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

    return driver


In [4]:
# Start the browser session. The three values are placeholders: replace them
# before running, and avoid committing real credentials with the notebook.
driver = selenium_shopee(
    keyword='Your Keyword',
    username='Your Username',
    password='password',
)

# Scraping Process

In [14]:

def shopee_scraping(len_page,web_path):
    """Scrape product name, price text, sold text and location from result pages.

    Uses the module-level ``driver`` (the WebDriver returned by
    ``selenium_shopee``) to load ``web_path`` followed by each page index
    from 0 to ``len_page - 1``, waits 7 seconds, then parses the page source
    with BeautifulSoup.

    Each field is collected by its own CSS class, independently of the other
    fields. If a page yields a different number of matches per field, the
    shorter fields are padded at the end with empty strings and a warning is
    printed; rows from that page may then be misaligned and should be checked.

    Args:
        len_page: Number of pages to visit (passed to ``range``, so an int).
        web_path: URL prefix; the page index is appended to it as text.

    Returns:
        A DataFrame with the columns ``Product_list``, ``product_all_price``,
        ``Product_sold`` and ``Location``, all holding raw text.
    """
    # CSS class of the <div> holding each output column.
    # NOTE(review): these class names look auto-generated; verify them
    # against the current page markup before a run.
    field_classes = {
        'Product_list': 'ie3A+n bM+7UW Cve6sh',
        'product_all_price': 'hpDKMN',
        'Product_sold': 'ZnrnMl',
        'Location': 'zGGwiV',
    }
    columns = {field: [] for field in field_classes}

    for page in range(0,len_page):

        # Load the page for this index
        driver.get(web_path+f'{page}')

        # Scale the page body down to 10%.
        # NOTE(review): presumably so that every product card is rendered
        # without scrolling -- confirm.
        driver.execute_script("document.body.style.MozTransform='scale(0.1)';")
        driver.execute_script('document.body.style.MozTransformOrigin = "0 0";')

        # Give the page time to render, then parse it with an explicit
        # parser so the result does not depend on which parsers are installed.
        time.sleep(7)
        soup = bs4.BeautifulSoup(driver.page_source, 'html.parser')

        page_values = {
            field: [tag.text for tag in soup.find_all('div',{'class':css_class})]
            for field, css_class in field_classes.items()
        }

        # Keep all columns the same length so the DataFrame can be built
        # even when some cards lack a field.
        longest = max(len(values) for values in page_values.values())
        counts = {field: len(values) for field, values in page_values.items()}
        if any(count != longest for count in counts.values()):
            print(f'Warning: page {page+1} has uneven field counts {counts}; '
                  'padding with empty strings, rows may be misaligned')
        for field, values in page_values.items():
            columns[field].extend(values + [''] * (longest - len(values)))

        print(f'End page {page+1}')

    df = pd.DataFrame(data=columns)

    return df


In [None]:
# Run the scraper. Both arguments are placeholders: len_page is passed to
# range() inside shopee_scraping, so it has to be replaced with an integer.
df = shopee_scraping(len_page='Number of page', web_path='Your web path')

# Cleaning Data Process


In [7]:
# Work on an independent copy so the scraped frame `df` stays untouched.
df_clean = df.copy(deep=True)

In [8]:

# --- Price columns -------------------------------------------------------
# First "฿<number>" figure found in the raw price text.
df_clean['original_price'] = df_clean['product_all_price'].str.extract(r'฿([\d,]+)')

# Two "฿<number>" figures back to back are captured as Full_price and discount_price.
df_clean[['Full_price', 'discount_price']] = df_clean['product_all_price'].str.extract(r'฿([\d,]+)฿([\d,]+)')

# "฿<number> - ฿<number>" is captured as Starting_price and Highest_price.
df_clean[['Starting_price', 'Highest_price']] = df_clean['product_all_price'].str.extract(r'฿([\d,]+) - ฿([\d,]+)')

# Strip the thousands separators and cast to int; missing or unparsable values become 0.
for price_col in ['Full_price', 'discount_price', 'Starting_price', 'Highest_price', 'original_price']:
    without_commas = df_clean[price_col].str.replace(',', '')
    df_clean[price_col] = pd.to_numeric(without_commas, errors='coerce').fillna(0).astype(int)

# Zero out original_price wherever it only repeats Starting_price.
mask = df_clean['original_price'].eq(df_clean['Starting_price'])
df_clean['original_price'] = df_clean['original_price'].mask(mask, 0)

# Discard the two-figure Full_price and let original_price take over that name.
df_clean = (
    df_clean
    .drop(columns=['Full_price'])
    .rename(columns={'original_price': 'Full_price'})
)

# Function to clean the product_sold column
def clean_sales(sale_str):
    """Expand figures given in thousands ('พัน') into plain integers.

    Every ``<number>พัน`` in the text, with optional whitespace before the
    unit, is replaced by the number multiplied by 1000; the unit word is
    removed. For example '1.2พัน' becomes '1200' and '3 พัน' becomes '3000'.
    Text without the unit is returned unchanged.

    The multiplication is done on the digit string rather than with floats:
    the fractional part is padded (or truncated) to three digits and joined
    to the whole part. This keeps the result exact.

    Args:
        sale_str: Raw sold-count text. Non-string values (for example NaN
            for a missing entry) are returned as they are.

    Returns:
        The text with thousand-unit figures expanded, or the input itself
        when it is not a string.
    """
    if not isinstance(sale_str, str):
        return sale_str

    def _expand(match):
        # '12.5' -> whole '12', fraction '5' -> '12' + '500' -> '12500'
        whole, _, fraction = match.group(1).partition('.')
        return str(int(whole + fraction.ljust(3, '0')[:3]))

    return re.sub(r'(\d+(?:\.\d+)?)\s*พัน', _expand, sale_str)

# Expand thousand-unit figures in the raw 'Product_sold' text via clean_sales
df_clean['Product_sold'] = df_clean['Product_sold'].apply(clean_sales)

# Keep the first run of digits/dots from each entry
df_clean['Product_sold'] = df_clean['Product_sold'].str.extract(r'([\d.]+)')[0]
# Remove the dots as literal characters (regex=False makes this explicit,
# independent of the pandas version's default) and cast to int; missing -> 0
df_clean['Product_sold'] = pd.to_numeric(df_clean['Product_sold'].str.replace('.', '', regex=False), errors='coerce').fillna(0).astype(int)


## Rearrange the columns into their final order (columns not listed are dropped)
df_clean = df_clean.loc[:, ['Product_list','Full_price','discount_price','Starting_price','Highest_price','Product_sold','Location']]