In [2]:
# import libraries

import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

import time
import requests
import pandas as pd
import numpy as np
import re

### From each product listing page, collect the product page URLs, then visit each product page.

In [3]:
def get_product_links(website, pages=10, wait_seconds=5):
    """Collect product-page URLs from a paginated listing.

    Pages are requested as ``website + "?p=<n>"`` for n = 1..pages, and the
    ``href`` of every element with class ``product-item-link`` is recorded.

    Parameters
    ----------
    website : str
        Base URL of the listing (without the ``?p=`` query string).
    pages : int, optional
        Number of listing pages to walk, starting at page 1. Defaults to 10.
    wait_seconds : float, optional
        Fixed pause after each navigation before elements are read.
        Defaults to 5.

    Returns
    -------
    list
        Every ``href`` found, in page order. Duplicates are NOT removed.
    """
    links = []

    # A single browser session serves every page. Starting a new Chrome per
    # page is slow, and close() without try/finally leaves a browser and
    # chromedriver process behind whenever a page raises.
    wd = webdriver.Chrome('chromedriver.exe')
    try:
        for page in range(1, pages + 1):
            print("Processing page : {}".format(page))
            wd.get(website + "?p={}".format(page))
            # Fixed pause, presumably so dynamically rendered items are present.
            time.sleep(wait_seconds)
            links.extend(
                item.get_attribute('href')
                for item in wd.find_elements_by_class_name('product-item-link')
            )
    finally:
        # quit() ends the whole driver session, not just the current window.
        wd.quit()

    print("Finish")
    return links

In [4]:
# Gather the product-page links from the Senheng home-entertainment listing.
product_link = get_product_links("https://www.senheng.com.my/all-products/home-entertainment.html")

Processing page : 1
Processing page : 2
Processing page : 3
Processing page : 4
Processing page : 5
Processing page : 6
Processing page : 7
Processing page : 8
Processing page : 9
Processing page : 10
Finish


In [5]:
# Number of links collected, duplicates included.
len(product_link)

433

In [6]:
# De-duplicate while keeping first-seen order. dict.fromkeys preserves
# insertion order, whereas list(set(...)) yields an arbitrary order that can
# differ between kernel runs and so changes the row order of the scraped table.
pd_links = list(dict.fromkeys(product_link))

In [7]:
# Number of distinct product links left after de-duplication.
len(pd_links)

252

In [15]:
# Parallel lists: index i of each list describes the product at pd_links[i].
SKU = []
category = []
status = []
price = []
rating = []
comments = []

# One browser session is reused for every product page. Launching a fresh
# Chrome per URL is slow, and without try/finally any lookup failure leaves
# the browser and chromedriver process running.
wd = webdriver.Chrome('chromedriver.exe')
try:
    for web in pd_links:
        wd.get(web)
        time.sleep(5)  # fixed pause after navigation before reading elements

        # Read every field into locals first and append only once all lookups
        # succeed, so an exception part-way through a product cannot leave the
        # six lists with different lengths.
        sku_text = wd.find_element_by_class_name('base').text
        # NOTE(review): the absolute XPaths below depend on the exact page
        # layout and will break if the site template changes.
        category_text = wd.find_element_by_xpath('/html/body/div[2]/div[4]/ul/li[5]').text
        status_text = wd.find_element_by_xpath('//*[@id="maincontent"]/div[2]/div/div/div/div[3]/div[2]/div/div[2]').text
        price_text = wd.find_element_by_xpath('//*[@id="maincontent"]/div[2]/div/div/div/div[3]/div[1]/div[1]/span[1]').text
        # NOTE(review): only the first character of the second
        # 'cont__count-rating' element is kept, so a fractional rating would
        # be truncated -- confirm against the page's rating format.
        rating_text = wd.find_elements_by_class_name('cont__count-rating')[1].text[0]
        review_texts = [
            review.text
            for review in wd.find_elements_by_class_name('review-content')
        ]

        SKU.append(sku_text)
        category.append(category_text)
        status.append(status_text)
        price.append(price_text)
        rating.append(rating_text)
        # '-' is the placeholder for "no reviews"; otherwise store the list.
        comments.append(review_texts if review_texts else '-')
finally:
    # quit() ends the whole driver session, not just the current window.
    wd.quit()

In [16]:
# Sanity check: the six per-product lists should all have the same length.
print(len(SKU), len(category), len(status), len(price), len(rating), len(comments))

252 252 252 252 252 252


### Data Cleaning and labelling

In [19]:
# Build the table from named columns. A dict of lists raises ValueError when
# the lists differ in length, whereas the list-of-columns + .T form silently
# pads the shorter ones with missing values and misaligns rows.
product_info = pd.DataFrame({
    'Product_Name': SKU,
    'Category': category,
    'Status': status,
    'Price_RM': price,
    'Rating': rating,
    'Comment': comments,
})

In [21]:
# Label the six columns with descriptive names.
product_info.columns = ['Product_Name', 'Category', 'Status', 'Price_RM', 'Rating', 'Comment']

In [22]:
def clean_text(text):
    """Reduce ``text`` to the characters that can form a plain number.

    Commas are dropped first; afterwards every character other than the
    digits 0-9, a dot or a hyphen is removed. The result is returned as a
    string (possibly empty) -- no numeric conversion happens here.
    """
    return re.sub(r'[^0-9.\-]', '', text.replace(',', ''))

In [23]:
# Convert the scraped text columns to their working types: price text is
# stripped to its numeric characters and cast to float, the rating is cast to
# float, and the 'BUY NOW' status label is renamed to 'AVAILABLE'.
product_info = product_info.assign(
    Price_RM=lambda frame: frame['Price_RM'].apply(clean_text).astype(float),
    Rating=lambda frame: frame['Rating'].astype(float),
    Status=lambda frame: frame['Status'].replace('BUY NOW', 'AVAILABLE'),
)

In [28]:
def get_brand(product):
    """Return the first whitespace-separated word of ``product``.

    The first word of the product name is used as the brand. An empty or
    whitespace-only name yields ``''`` instead of raising ``IndexError``,
    so a single blank scraped name cannot abort a whole ``.apply`` pass.

    NOTE(review): brands written as more than one word are cut to their
    first word -- confirm this is acceptable for the catalogue.
    """
    words = product.split(maxsplit=1)
    return words[0] if words else ''

In [29]:
# Derive the brand from the first word of each product name.
product_info['Brand'] = product_info['Product_Name'].apply(get_brand)

In [30]:
def comment_count(comment):
    """Return how many reviews ``comment`` holds.

    ``'-'`` is the placeholder for "no reviews" and counts as 0; for any
    other value the result is ``len(comment)``.
    """
    return 0 if comment == '-' else len(comment)

In [31]:
# Count the reviews per product (0 where the '-' placeholder marks no reviews).
product_info['Comment_num'] = product_info['Comment'].apply(comment_count)

In [32]:
# Show only the rows whose Comment is not the '-' no-review placeholder.
product_info[product_info['Comment'] != '-']

Unnamed: 0,Product_Name,Category,Status,Price_RM,Rating,Comment,Brand,Comment_num
11,LG UN73 Series 55” 4K Active HDR Smart UHD TV ...,TV,AVAILABLE,2799.0,5.0,"[Good#SenhengKOC,#31Anniversary, Good service ...",LG,10
16,Samsung 65 Inches Q60T QLED 4K UHD HDR Smart T...,TV,AVAILABLE,6299.0,5.0,"[Helpful staff #31Anniversary,#SenhengKOC]",Samsung,1
18,Samsung 2.1 Channel HW-T550 Soundbar,Audio,AVAILABLE,949.0,5.0,"[service baik#31Anniversary,#SenhengKOC, Fast ...",Samsung,2
20,LG 75 Inch UN72 Series Active HDR Smart UHD TV...,TV,AVAILABLE,6666.0,5.0,[I bought this television about a month ago. H...,LG,1
22,Samsung Q60T QLED 4K UHD HDR Smart TV 2020,TV,AVAILABLE,3149.0,5.0,[Great reccomrndations by salesperson#31Annive...,Samsung,2
31,LG NANO95 65” NanoCell 8K Smart TV with AI Thi...,TV,OUT OF STOCK,15999.0,5.0,[The remote is easy to use and was able to con...,LG,2
32,LG NANO86 65’’ NanoCell 4K Smart TV with AI Th...,TV,AVAILABLE,7399.0,5.0,"[I. Love. This. TV. It is, by far, the best TV...",LG,1
38,Sony Microphone SNY-FV120,Audio,AVAILABLE,32.0,5.0,[Good service and staff friendly #Grandsenheng...,Sony,1
46,LG UN71 Series 60” 4K Active HDR Smart UHD TV ...,TV,AVAILABLE,3199.0,5.0,[The Feature that crowns it all for me is the ...,LG,1
48,Samsung TU8000 Crystal UHD 4K Smart TV (2020),TV,OUT OF STOCK,2719.0,5.0,[best price form senheng #senhengPOKOKMANGGA#3...,Samsung,7


In [33]:
# Inspect the current column order.
product_info.columns

Index(['Product_Name', 'Category', 'Status', 'Price_RM', 'Rating', 'Comment',
       'Brand', 'Comment_num'],
      dtype='object')

In [34]:
# Reorder the columns so Brand sits right after Product_Name.
product_info = product_info[['Product_Name', 'Brand', 'Category', 'Status', 'Price_RM', 'Rating', 'Comment', 'Comment_num']]

In [35]:
# Display the reordered table.
product_info

Unnamed: 0,Product_Name,Brand,Category,Status,Price_RM,Rating,Comment,Comment_num
0,MasterLink Adjustable HDMI Cable 2M MSL-HD02-360,MasterLink,Mobile Accessories,AVAILABLE,165.0,0.0,-,0
1,Sharp 32-Inch Full HD TV 2TC32BD1X,Sharp,TV,AVAILABLE,779.0,0.0,-,0
2,Belkin 1 Meters High Speed HDMI® Cable with Et...,Belkin,Accessories,OUT OF STOCK,46.0,0.0,-,0
3,MasterLink HDMI Cable 2M MSL-HD02SM,MasterLink,Mobile Accessories,AVAILABLE,165.0,0.0,-,0
4,Haier 32-Inch Full HD LED TV LE32B9600T,Haier,TV,OUT OF STOCK,699.0,0.0,-,0
...,...,...,...,...,...,...,...,...
247,Sony Extra Bass In-Ear Headphones (Red) - SNY-...,Sony,Audio,OUT OF STOCK,169.0,0.0,-,0
248,LG NANO86 55’’ NanoCell 4K Smart TV with AI Th...,LG,TV,AVAILABLE,5199.0,5.0,[I've purchased my TV for my family. This TV c...,1
249,Sony 2.1ch Soundbar with powerful wireless sub...,Sony,Audio,AVAILABLE,699.0,5.0,[Good service #grandsenhengtemerloh#31Annivers...,4
250,Hisense 43 Inch Full HD Android A6000F Series ...,Hisense,TV,OUT OF STOCK,1199.0,0.0,-,0


### Save to CSV format.

In [36]:
# Write the table to disk; index=False leaves the row index out of the file.
# Note: list-valued Comment cells are written as their string representation.
product_info.to_csv('senheng_home_entertainment.csv', index=False)

In [37]:
# Reload the saved table from disk.
# NOTE(review): after this round-trip the Comment column holds plain strings
# (the text form of each list), not Python lists -- len() on a cell would count
# characters. Parse the column if list semantics are needed downstream.
product_info = pd.read_csv('senheng_home_entertainment.csv')