In [1]:
import requests
import os
import time
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

In [5]:
class Best_buy_scrap:
    def __init__(self , product_name):
        self.search_term = product_name
        
    ### request search page information and return bs4 object
    def get_search_pages(self , search_term , p_number):
        ### URL
        search_page_url = "https://www.bestbuy.com/site/searchpage.jsp?st="
        ### search product
        search_product = search_term
        
        ### page_number
        page_number = 0
        if p_number == 0:
            page_number = 1
        else:
            page_number = p_number

        ### page_number search term
        search_page_encoding = f"&cp={page_number}"

        ### User agent
        User_Agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'

        ### request access
        r = requests.get(search_page_url + search_product + search_page_encoding , headers = {'User-Agent' : User_Agent})

        ### Assign bs4 object to soup
        soup = BeautifulSoup(r.content, 'lxml')

        return soup
    
    ### handle price text
    def handle_price(self , price):
        try:
            price = price.replace("$" , "")
            price = float(price)
        except:
            pass
        return price

    ### exrtract brand name from header
    def get_brand(self , header):
        try:
            brand = header.split("-")[0].strip(" ")
        except:
            brand = ""
        return brand
    
    ### get product info from bs4 object and return dict of info
    def get_product_info(self , product):
        return_dict = {}

        ### get header
        header = product.find('h4').text
        return_dict.update({'header' : header})

        ### get brand
        brand = self.get_brand(header)
        return_dict.update({'brand' : brand})

        ### get labels
        try:
            labels = product.find('div' , attrs = {'class' : 'lv-stacked-carousel'}).find_all('button')
            labels = [lab.text for lab in labels]
        except:
            labels = []
        return_dict.update({'labels' : labels})


        ### get model
        model = product.find_all('span' , attrs = {'class' : 'sku-value'})[0].text
        return_dict.update({'model' : model})

        ### get sku
        sku = product.find_all('span' , attrs = {'class' : 'sku-value'})[1].text
        return_dict.update({'sku' : sku})

        ### get stars
        try:
            stars = product.find('p').text.split(' ')[1]
            return_dict.update({'stars' : float(stars)})
        except:
            pass 

        ### get number of reviews
        number_reviews = product.find('p').text.split(' ')[-2]
        return_dict.update({'number_of_reivews' : number_reviews})

        ### get price
        try:
            price = product.find("div" , attrs = {"class" : "priceView-hero-price priceView-customer-price"}).find('span').text
            return_dict.update({"price" : handle_price(price)})
        except:
            pass

        ### get old price
        try:
            old_price = product.find("div" , attrs = {"class" : "pricing-price__regular-price"}).text.split(' ')[1]
            return_dict.update({'original_price' : handle_price(old_price)})
        except:
            pass

        ### price diff
        try:
            reduction = handle_price(old_price) - handle_price(price)
            return_dict.update({'price_reduction' : reduction})
        except:
            pass

        return return_dict
    
    ### execute whole process
    def execute_scrapping(self):
        re_list = []
        stopper = False
        counter = 0

        while stopper != True:
            counter = counter + 1
            page_n = self.get_search_pages(self.search_term , counter)
            page_source = page_n.find_all('div' , attrs = {'class' : 'shop-sku-list-item'})
            
            if len(page_source) == 0:
                stopper = True
            else:
                for product in page_source:
                    re_list.append(self.get_product_info(product))
                print(f"page {counter} done!")
            
            ### pause after request
            time.sleep(5)
        ### return dict of products
        return re_list

In [3]:
if __name__ == '__main__': 
    
    ### initiate object
    bb_init = Best_buy_scrap("TV")
    
    ### start scrapping
    dict_info = bb_init.execute_scrapping()

page 1 done!
page 2 done!
page 3 done!
page 4 done!
page 5 done!
page 6 done!
page 7 done!
page 8 done!
page 9 done!
page 10 done!
page 11 done!
page 12 done!
page 13 done!
page 14 done!
page 15 done!
page 16 done!
page 17 done!
page 18 done!
page 19 done!
page 20 done!
page 21 done!
page 22 done!
page 23 done!
page 24 done!
page 25 done!
page 26 done!
page 27 done!
page 28 done!


In [4]:
### check results
dict_info

[{'header': 'Samsung - 65" Class 7 Series LED 4K UHD Smart Tizen TV',
  'brand': 'Samsung',
  'labels': ['4K UHD', 'Direct Lit', 'Smart'],
  'model': 'UN65TU7000FXZA',
  'sku': '6401722',
  'stars': 4.6,
  'number_of_reivews': '12656'},
 {'header': 'Samsung - 55" Class 7 Series LED 4K UHD Smart Tizen TV',
  'brand': 'Samsung',
  'labels': ['4K UHD', 'Direct Lit', 'Smart'],
  'model': 'UN55TU7000FXZA',
  'sku': '6401735',
  'stars': 4.6,
  'number_of_reivews': '8765'},
 {'header': 'Insignia™ - 55" Class F30 Series LED 4K UHD Smart Fire TV',
  'brand': 'Insignia™',
  'labels': ['4K UHD', 'Direct Lit', 'Smart', 'Voice Assist'],
  'model': 'NS-55F301NA22',
  'sku': '6450247',
  'stars': 4.6,
  'number_of_reivews': '3063'},
 {'header': 'Samsung - 85" Class 7 Series LED 4K UHD Smart Tizen TV',
  'brand': 'Samsung',
  'labels': ['4K UHD', 'Direct Lit', 'Smart'],
  'model': 'UN85TU7000FXZA',
  'sku': '6485132',
  'stars': 4.7,
  'number_of_reivews': '484'},
 {'header': 'Samsung - 75" Class 7 S