In [1]:
# Web Scraping
from selenium import webdriver
from selenium.common.exceptions import *

# Data manipulation
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
webdriver_path = 'env/bin/chromedriver'
lazada_url = 'https://www.lazada.com.my/'
search_item = 'Nescafe Gold refill 170g'

In [3]:
import time
from random import randint

# Select custom Chrome options
options = webdriver.ChromeOptions()
# options.add_argument('--headless') 
options.add_argument('start-maximized') 
options.add_argument('disable-infobars')
options.add_argument('--disable-extensions')

# Open the Chrome browser
browser = webdriver.Chrome(webdriver_path, options=options)
browser.get(lazada_url)
time.sleep(randint(3, 5))

In [4]:
search_bar = browser.find_element_by_xpath('/html/body/div[2]/div/div[1]/div/div/div[2]/div/div[2]/form/div/div[1]/input[1]')
search_bar.send_keys(search_item)

In [5]:
search_bar.submit()

In [7]:
item_titles = browser.find_elements_by_class_name('c16H9d')
item_prices = browser.find_elements_by_class_name('c13VH6')
item_links = browser.find_elements_by_class_name('cRjKsc')
item_reviews = browser.find_elements_by_class_name('c3XbGJ')

In [8]:
# Initialize empty lists
lazada_titles_list = []
lazada_prices_list = []
lazada_links_list = []

# Loop over the item_titles and item_prices
for title in item_titles:
    lazada_titles_list.append(title.text)
for price in item_prices:
    lazada_prices_list.append(price.text)
for links in item_links:
    a = links.find_element_by_tag_name('a')
    lazada_links_list.append(a.get_attribute('href'))

In [9]:
try:
    browser.find_element_by_xpath('//*[@class=”ant-pagination-next” and not(@aria-disabled)]')
    #more work here but I'am lazy
    browser.quit()
    
except NoSuchElementException: 
    browser.quit()

In [10]:
df_lazada = pd.DataFrame(
    zip(lazada_titles_list, lazada_prices_list, lazada_links_list), 
    columns=['Item Name', 'Price', 'URL'])


In [11]:
df_lazada['Price'] = df_lazada['Price'].str.replace('RM', '').astype(float)

In [12]:
df_lazada = df_lazada[df_lazada['Item Name'].str.contains('x2') == False]

In [13]:
df_lazada

Unnamed: 0,Item Name,Price,URL
0,NESCAFE Gold Coffee Refill 170g,25.0,https://www.lazada.com.my/products/nescafe-gol...
1,Nescafe Gold Blend Refill Pack (170g),25.5,https://www.lazada.com.my/products/nescafe-gol...
2,NESCAFE GOLD Refill 170g x 2 packs,26.7,https://www.lazada.com.my/products/nescafe-gol...
3,NESCAFE GOLD Refill Twin Pack(170g x 2 Packs),54.99,https://www.lazada.com.my/products/nescafe-gol...
4,NESCAFE GOLD Refill 170g,60.0,https://www.lazada.com.my/products/nescafe-gol...
5,SHOPPA Nescafe Gold Refill Pack - Rich & Smoot...,48.5,https://www.lazada.com.my/products/shoppa-nesc...
6,Nescafe Gold Refill Packs (170g x 2 Packs),24.9,https://www.lazada.com.my/products/nescafe-gol...
8,NESCAFE GOLD Refill 170g x 2 Packs,56.0,https://www.lazada.com.my/products/nescafe-gol...
9,Nescafe Gold Refill Pack - 170G,49.0,https://www.lazada.com.my/products/nescafe-gol...
11,NESCAFE GOLD Refill 170g x3 packs,49.0,https://www.lazada.com.my/products/nescafe-gol...


In [14]:
import requests

shopee_url = 'https://shopee.com.my'
keyword_search = 'Nescafe Gold refill 170g'

headers = {
 'User-Agent': 'Chrome',
 'Referer': '{}search?keyword={}'.format(shopee_url, keyword_search)
}

url = 'https://shopee.com.my/api/v2/search_items/?by=relevancy&keyword={}&limit=100&newest=0&order=desc&page_type=search'.format(keyword_search)

# Shopee API request
r = requests.get(url, headers = headers).json()

# Shopee scraping script
shopee_titles_list = []
shopee_prices_list = []
shopee_historical_list = []
shopee_rating_list = []

for item in r['items']:
    shopee_titles_list.append(item['name'])
    shopee_prices_list.append(item['price_min'])
    shopee_historical_list.append(item['historical_sold'])
    shopee_rating_list.append(item['item_rating']['rating_star'])

In [15]:
df_shopee = pd.DataFrame(
    zip(shopee_titles_list, shopee_prices_list, shopee_historical_list, shopee_rating_list), 
    columns=['Item Name', 'Price', 'Sold', 'Rating'])

In [16]:
import re

# Remove the ‘RM’ string from Price and change column type to float
df_shopee['Price'] = df_shopee['Price'] / 100000

# Remove false entries i.e. those which are not actually Nescafe Gold Refill 170g
# df_shopee = df_shopee[df_shopee['Item Name'].str.contains('170g') == True]

# Some of the items are actually x2 packs. Remove them too
df_shopee = df_shopee[df_shopee['Item Name'].str.contains(
    '[2x\s]{3}|twin', 
    flags=re.IGNORECASE, 
    regex=True) == False]

In [17]:
# Add column [‘Platform’] for each platforms
df_lazada['Platform'] = 'Lazada'
df_shopee['Platform'] = 'Shopee'

# Concatenate the Dataframes
df = pd.concat([df_lazada,df_shopee])

In [18]:
print(df.groupby(['Platform']).describe())

         Price                                                          Sold  \
         count       mean        std   min   25%   50%      75%    max count   
Platform                                                                       
Lazada    33.0  53.447273  37.866901  21.0  26.8  49.0  56.0000  175.0   0.0   
Shopee    56.0  29.230179  11.252784   5.5  25.9  26.9  29.7725   76.0  56.0   

                      ...                 Rating                           \
                mean  ...    75%      max  count      mean       std  min   
Platform              ...                                                   
Lazada           NaN  ...    NaN      NaN    0.0       NaN       NaN  NaN   
Shopee    592.482143  ...  80.25  23469.0   56.0  3.040307  2.392355  0.0   

                                   
          25%       50%  75%  max  
Platform                           
Lazada    NaN       NaN  NaN  NaN  
Shopee    0.0  4.849206  5.0  5.0  

[2 rows x 24 columns]


In [19]:
df.to_csv('result.csv')

In [21]:
# json format transform for API purpose
# json_output = df.to_json(orient='split')
# print('json ready')

json ready


'{"columns":["Item Name","Price","URL","Platform","Sold","Rating"],"index":[0,1,2,3,4,7,8,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,36,0,1,2,6,12,14,16,17,18,20,21,22,24,25,26,27,30,31,32,34,35,36,37,38,39,43,44,45,47,48,50,51,52,54,55,57,59,62,63,64,68,69,70,71,72,76,77,79],"data":[["NESCAFE Gold Coffee Refill 170g",25.0,"https:\\/\\/www.lazada.com.my\\/products\\/nescafe-gold-coffee-refill-170g-i217777180-s275840790.html?search=1","Lazada",null,null],["Nescafe Gold Blend Refill Pack (170g)",25.5,"https:\\/\\/www.lazada.com.my\\/products\\/nescafe-gold-blend-refill-pack-170g-i1497814392-s4959894747.html?search=1","Lazada",null,null],["NESCAFE GOLD Refill 170g x 2 packs",26.7,"https:\\/\\/www.lazada.com.my\\/products\\/nescafe-gold-refill-170g-x-2-packs-i422852521-s616466681.html?search=1","Lazada",null,null],["Nescafe Gold Refill Pack - 170G",54.99,"https:\\/\\/www.lazada.com.my\\/products\\/nescafe-gold-refill-pack-170g-i1132424475-s3210902542.html?se