In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import os

# Requests

In [3]:
def get_req(term):
    """Fetch a Google Shopping search page for ``term`` and parse it.

    Args:
        term: Free-text search query, e.g. ``"samsung android phone"``.

    Returns:
        BeautifulSoup: the response body parsed with the ``lxml`` parser.
        The body is parsed whatever the HTTP status was; the status code is
        only printed.

    Raises:
        requests.RequestException: propagated from ``requests.get``, e.g. on
            connection failure or when the 30 second timeout expires.
    """
    params = {
        # Pass the raw term: requests URL-encodes query values itself (a space
        # becomes "+"). Replacing spaces with "+" beforehand made requests
        # encode the plus signs as "%2B", i.e. literal "+" characters in the
        # query text.
        "q": term,
        "hl": "en",     # language
        "gl": "us",     # country of the search, US -> USA
        "tbm": "shop",   # google search shopping
        "pagesize": 10,
        # NOTE(review): presumably "sort by review score, new items only" --
        # confirm against Google's tbs parameter behaviour.
        "tbs": "p_ord:rv,new:1",
    }

    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
    }

    response = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)

    print(f"Request Status code: {response.status_code}")
    return BeautifulSoup(response.text, 'lxml')

def get_brands(device):
    """Return the brand names to query for a device category.

    Args:
        device: Device category / search term, e.g. ``"android phone"``.

    Returns:
        list[str]: ``["Apple"]`` when the first five characters of ``device``
        are "apple" or "smart" (case-insensitive); otherwise the texts of the
        entries listed under the "Brand" section of the search page. An empty
        list is returned when the page has no "Brand" section.
    """
    # Decide the hard-coded case before touching the network: the original
    # issued a search request whose result was then thrown away.
    # NOTE(review): the "smart" prefix also maps 'smart tv' / 'smart watch'
    # to Apple only -- confirm this is intended.
    if device[:5].lower() in ["apple", "smart"]:
        return ["Apple"]

    soup = get_req(device)
    brand_headers = [node for node in soup.select('.ELcVZ') if node.text == "Brand"]
    if not brand_headers:
        # Page layout changed or the request was blocked: report it instead of
        # failing with an opaque IndexError.
        print(f"No 'Brand' filter section found for {device!r}")
        return []

    # The first '.lg3aE' match is skipped -- presumably it is not a brand
    # entry (e.g. a heading); confirm against the page markup.
    brand_nodes = brand_headers[0].parent.select('.lg3aE')[1:]
    return [node.text for node in brand_nodes]

def match(title, brands):
    """Tell whether ``title`` starts with any of the given brand names.

    The comparison is case-insensitive and literal: brand names are compared
    as plain text, not interpreted as regular expressions, so names containing
    characters such as ``(``, ``+`` or ``.`` are handled correctly.

    Args:
        title: Product title to test.
        brands: Iterable of brand name strings.

    Returns:
        bool: True if the lower-cased title begins with at least one
        lower-cased brand name, False otherwise (including for no brands).
    """
    # Lower-case the title once instead of once per brand.
    lowered_title = title.lower()
    return any(lowered_title.startswith(brand.lower()) for brand in brands)

In [17]:

# Device categories to scrape, taken from the "Devices" column of devices.csv
# (e.g. 'android phone', 'android tablet', 'apple phone', 'apple tablet',
# 'smart watch', 'smart tv', 'voice assistant').
devices = pd.read_csv('devices.csv').loc[:, "Devices"].tolist()

def get_data(devices, devices_data_all=None, first_n_items=10):
    """Scrape shopping listings for every brand of every device category.

    For each device, the brand list comes from ``get_brands``; for each brand
    a search for ``"<brand> <device>"`` is issued through ``get_req`` and the
    first ``first_n_items`` result cards are parsed. Progress (device name,
    brand list, parse failures) is printed.

    Args:
        devices: Iterable of device category strings.
        devices_data_all: Optional dict to add results to. A fresh dict is
            created when omitted, so separate calls no longer share state
            through a mutable default argument.
        first_n_items: Maximum number of result cards parsed per brand search.

    Returns:
        dict: ``devices_data_all`` mapping each device to a list of record
        dicts with keys ``title``, ``brand``, ``brand2``, ``price``,
        ``rating``, ``num_of_reviews`` and ``seller``.
    """
    if devices_data_all is None:
        devices_data_all = {}

    for device in devices:
        print(device)

        brands = get_brands(device)
        print(brands)

        device_cat_candidates = []
        for brand in brands:
            soup = get_req(brand + " " + device)
            cards = soup.select(".sh-dgr__content")

            for item in cards[:first_n_items]:
                # Reset per card so the error message below can never refer to
                # an undefined name or to the previous card's title.
                title = None
                try:
                    title = item.select('.tAxDx')[0].text
                    price = item.select('.a8Pemb')[0].text
                    rating = item.select(".Rsc7Yb")[0].text
                    # The first three characters of the review-count text are
                    # dropped before parsing -- presumably a fixed prefix;
                    # confirm against the page markup.
                    num_of_reviews = int(item.select(".NzUzee")[0].select("div > span")[0].text[3:].replace(",", ""))
                    seller = item.select('.aULzUe.IuHnof')[0].text

                    device_cat_candidates.append({
                        "title": title,
                        'brand': brand,
                        # First whitespace-separated word of the title.
                        "brand2": title.split(" ", 1)[0],
                        "price": price,
                        "rating": float(rating),
                        "num_of_reviews": num_of_reviews,
                        'seller': seller
                    })
                except Exception as err:
                    # Best effort: skip cards that lack a field or hold
                    # unparsable numbers, but let KeyboardInterrupt through
                    # (a bare ``except:`` swallowed it).
                    print("Error on ", title, "-", repr(err))

        devices_data_all[device] = device_cat_candidates

    return devices_data_all

In [None]:
# Run the scrape for every device category loaded from devices.csv; this
# issues live search requests through get_brands / get_req.
devices_data_all = get_data(devices=devices)

# Save data

In [20]:
filename = "device_data_120522.xlsx"


def _write_device_sheets(writer, data_by_device):
    """Write one worksheet per device, named after the device key."""
    for device, records in data_by_device.items():
        pd.DataFrame(records).to_excel(writer, sheet_name=f"{device}", index=False)


# Start from a clean workbook so sheets from an earlier run cannot linger.
if os.path.exists(filename):
    os.remove(filename)

with pd.ExcelWriter(filename) as f:
    _write_device_sheets(f, devices_data_all)

# `append_data1` is not defined anywhere in the visible notebook, so on a fresh
# kernel this step raised NameError after the workbook had been written.
# Append only when the variable actually exists.
# NOTE(review): confirm where append_data1 is supposed to come from.
if "append_data1" in globals():
    # Append mode is only supported by the openpyxl engine, so request it
    # explicitly rather than relying on the default writer.
    with pd.ExcelWriter(filename, mode='a', engine="openpyxl") as f:
        _write_device_sheets(f, append_data1)
else:
    print("append_data1 is not defined; skipping the append step")

In [14]:
# Inspect the records collected for the 'smart tv' category.
# NOTE(review): the saved output below labels LG televisions with brand
# 'Apple' (get_brands returns ["Apple"] for any device starting with "smart")
# and has no "brand2" key, so it appears to predate the current get_data --
# re-run after confirming the brand shortcut is intended.
devices_data_all["smart tv"]

[{'title': 'LG 77" Oledc2 4K UHD Ai ThinQ Smart TV w/ 5-Year Coverage',
  'brand': 'Apple',
  'price': '$2,987.59',
  'rating': 4.9,
  'num_of_reviews': 2944,
  'seller': 'Truegether - CrazyGoodeal'},
 {'title': 'LG OLED B1 Series 65” Alexa Built-in 4K Smart TV, 120Hz Refresh Rate, AI-Powered ...',
  'brand': 'Apple',
  'price': '$1,729.39',
  'rating': 4.8,
  'num_of_reviews': 3209,
  'seller': 'Amazon.com'},
 {'title': 'LG C2 Series 77-inch 4K Smart OLED Evo TV',
  'brand': 'Apple',
  'price': '$2,699.99',
  'rating': 4.8,
  'num_of_reviews': 7420,
  'seller': 'Best Buy'},
 {'title': 'LG BX 65" 4K Smart OLED TV-OLED65BXPUA',
  'brand': 'Apple',
  'price': '$1,599.00',
  'rating': 4.8,
  'num_of_reviews': 3905,
  'seller': 'B&H Photo Video Audio'},
 {'title': 'LG 77" 4K Smart OLED TV - OLED77CXPUA',
  'brand': 'Apple',
  'price': '$1,199.00',
  'rating': 4.8,
  'num_of_reviews': 17816,
  'seller': 'kidospace'},
 {'title': 'LG - 55" Class A2 Series OLED 4K UHD Smart webOS TV',
  'brand