# Code to extract TV info from Good Guys

In [1]:
# Dependencies
from pathlib import Path
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from selenium.webdriver import ActionChains
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from webdriver_manager.chrome import ChromeDriverManager

There are 136 televisions listed on the Good Guys website, spread across 3 pages (up to 60 per page), so we will use Splinter to automate navigation through the pages.

In [2]:
# Listing page that shows every television
url = "https://www.thegoodguys.com.au/televisions/all-tvs"

# webdriver_manager returns the path of the chromedriver binary; Splinter
# receives it as the `executable_path` keyword argument.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', headless=False, **executable_path)
browser.visit(url)

[WDM] - Current google-chrome version is 84.0.4147
[WDM] - Get LATEST driver version for 84.0.4147


 


[WDM] - Driver [C:\Users\foong\.wdm\drivers\chromedriver\win32\84.0.4147.30\chromedriver.exe] found in cache


## Automate Browser Navigation

In [3]:
# Create function to automate browser navigation

def scroll_and_next(y=None, pause=2.0):
    """Scroll to the bottom of the current page and click its "Next" link.

    Args:
        y: Unused. Kept (and now optional) so existing calls such as
            ``scroll_and_next(2)`` keep working.
        pause: Seconds to wait after the click so the next page has time to
            load before ``browser.html`` is read again.

    Raises:
        ElementDoesNotExist: Raised by Splinter when no element with the text
            "Next" is found (presumably the case on the last page - verify).
    """
    # Scroll by the real document height instead of a fixed pixel offset so
    # the link is brought into view however long the page is.
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')

    # Find Next button and click to navigate to next page
    browser.find_by_text("Next").first.click()

    # A near-zero pause lets the caller parse the old page; wait for the load.
    sleep(pause)

In [6]:
# Create function to read the HTML of the page currently open in the browser,
# parse it with Beautiful Soup and return the results for the loop.

def get_results(page_no):
    """Return every ``<li>`` inside the first ``#product_listing_tab`` element.

    The HTML is taken from the page currently loaded in ``browser``;
    ``page_no`` is accepted but not used.
    """
    page_soup = bs(browser.html, "html.parser")

    # select() returns a list; the first match is the listing container
    listing = page_soup.select("#product_listing_tab")[0]

    return listing.find_all("li")

# Example: count the results found on the page currently open in the browser
print(f"There are {len(get_results(1))} results on this page")

There are 60 results on this page


In [None]:
# Create function to retrieve url page, create Beautiful Soup object, 
# parse and get results for loop.

def get_results(page_no, urls=None):
    """Return every ``<li>`` inside the first ``#product_listing_tab`` element.

    This definition replaces any earlier ``get_results`` when the cell runs,
    so it must work without extra setup.

    Args:
        page_no: Index into ``urls``. Ignored when ``urls`` is not given.
        urls: Optional sequence of page URLs. When provided, the page
            ``urls[page_no]`` is downloaded with ``requests``. When omitted,
            the HTML of the page currently open in ``browser`` is parsed
            instead, so the function no longer depends on an undefined
            ``urls`` global.

    Raises:
        requests.HTTPError: If the download returns an error status.
        IndexError: If the page has no ``#product_listing_tab`` element.
    """
    if urls is None:
        html = browser.html
    else:
        # A timeout stops a stalled connection from hanging the notebook.
        response = requests.get(urls[page_no], timeout=30)
        # Fail loudly rather than parsing an error page.
        response.raise_for_status()
        html = response.text

    # Create a Beautiful Soup object; parse with 'html.parser'
    soup = bs(html, "html.parser")

    # results are returned as an iterable list
    results = soup.select("#product_listing_tab")[0].find_all("li")

    return results

# Example: count the results found on the page
print(f"There are {len(get_results(1))} results on this page")

In [35]:
# Click through to the next listing page in the browser
scroll_and_next(2)

In [36]:
# Re-parse the listing and store it in `results`, the global read by the finder functions
results = get_results(3)

In [37]:
# Number of <li> results parsed from the page
len(results)

16

## Find Product Category

In [10]:
# Create function to get product category for loop

def category_finder(x):
    """Return the level-1 product category of ``results[x]``.

    The category is the text of the first link's ``onclick`` attribute that
    lies between the last ``"Product Category L1 :"`` marker and the last
    ``"','TGGCATLPEventAction':'Product Category L2 :"`` marker.

    Args:
        x: Index of the product in the global ``results`` list.

    Returns:
        The category string, or ``""`` when either marker is missing.
    """
    start_marker = "Product Category L1 :"
    end_marker = "','TGGCATLPEventAction':'Product Category L2 :"

    onclick = results[x].a["onclick"]
    marker_at = onclick.rfind(start_marker)
    end = onclick.rfind(end_marker)

    # rfind() returns -1 when a marker is absent; slicing with that value
    # would return unrelated text, so report "no category" instead.
    if marker_at == -1 or end == -1:
        return ""

    return onclick[marker_at + len(start_marker):end]

# Example: to find the Product Category for the first result
print(category_finder(0))

Televisions


## Find Currency Code

In [11]:
# Create function to get currency code for loop

def currency_finder(x):
    """Return the three-character currency code of ``results[x]``.

    The code is read from the text of the ``<script>`` element that follows
    the product's first ``<script>`` element, starting four characters after
    the last occurrence of ``"currencyCode"``.

    Args:
        x: Index of the product in the global ``results`` list.

    Returns:
        The three characters after the marker, or ``""`` when the marker does
        not occur in the script text.
    """
    marker = "currencyCode"

    scripts = results[x].script.find_next("script").string
    marker_at = scripts.rfind(marker)

    # rfind() returns -1 when the marker is absent; the offsets below would
    # then slice unrelated text, so report "no currency" instead.
    if marker_at == -1:
        return ""

    # +4 skips the characters between the key and its value
    # (assumed to be a 4-character separator - TODO confirm against the page).
    start = marker_at + len(marker) + 4
    return scripts[start:start + 3]

# Example: to find the Currency Code for the first result
print(currency_finder(0))

AUD


## Find Brand

In [12]:
# Create function to get brand for loop

def brand_finder(x):
    """Return the brand of ``results[x]``.

    The brand is the ``value`` attribute of the fourth hidden ``<input>``
    element found inside the product's list item.
    """
    hidden_inputs = results[x].select('input[type="hidden"]')
    return hidden_inputs[3]["value"]

# Example: brand of the first result
print(brand_finder(0))

Sony


## Find Model Number

In [13]:
# Create function to get model number for loop

def model_finder(x):
    """Return the model number of ``results[x]``.

    The model number is the text of the first ``<div>`` with class
    ``product-tile-model`` inside the product's list item.
    """
    model_tag = results[x].find("div", class_="product-tile-model")
    return model_tag.text

# Example: model number of the first result
print(model_finder(0))

KDL32W660E


## Find Name

In [14]:
# Create function to get name for loop

def name_finder(x):
    """Return the display name of ``results[x]`` as "<brand> <name>".

    Both parts are ``value`` attributes of hidden ``<input>`` elements inside
    the product's list item: the fourth one (brand) and the fifth one (name).
    """
    hidden_inputs = results[x].select('input[type="hidden"]')
    brand = hidden_inputs[3]["value"]
    name = hidden_inputs[4]["value"]
    return " ".join((brand, name))

# Example: product name of the first result
print(name_finder(0))

Sony 32"(81cm) FHD LED LCD Smart TV


## Find Screen Size

In [15]:
# Create function to get screen size for loop

def size_finder(x):
    """Return the screen size of ``results[x]``.

    The size is the first whitespace-separated token of the ``value``
    attribute of the fifth hidden ``<input>`` element in the list item.
    """
    description = results[x].select('input[type="hidden"]')[4]["value"]
    # Only the leading token is needed, so split once at most
    return description.split(None, 1)[0]

# Example: screen size of the first result
print(size_finder(0))

32"(81cm)


## Find Price

In [16]:
# Create function to get price for loop

def price_finder(x):
    """Return the price of ``results[x]``.

    The price is the ``value`` attribute of the sixth hidden ``<input>``
    element found inside the product's list item.
    """
    hidden_inputs = results[x].select('input[type="hidden"]')
    return hidden_inputs[5]["value"]

# Example: price of the first result
print(price_finder(0))

$595.00


## Find Link to Product Image

In [17]:
# Create function to get image link for loop

def image_finder(x):
    """Return the image link of ``results[x]``.

    The link is the ``data-src`` attribute of the first ``<img>`` element
    inside the product's list item.
    """
    thumbnail = results[x].img
    return thumbnail["data-src"]

# Example: image link of the first result
print(image_finder(0))

//thegoodguys.sirv.com/products/50048259/50048259_511653.PNG?scale.height=215&scale.width=215&canvas.height=215&canvas.width=215&canvas.opacity=0


## The Loop

In [18]:
# Create lists to hold values

# One independent, initially empty list per collected column
(categories, currencies, brands, models,
 names, sizes, prices, images) = ([] for _ in range(8))

In [38]:
# Loop through first set of returned results

# Rows are appended to the column lists, so running this cell once per
# listing page accumulates the products of every page.

for x in range(len(results)):

    try:
        row = (
            category_finder(x),
            currency_finder(x),
            brand_finder(x),
            model_finder(x),
            name_finder(x),
            size_finder(x),
            price_finder(x),
            image_finder(x),
        )
    except Exception as error:
        # A result that lacks the expected markup is skipped, but the real
        # reason is reported instead of assuming the price is missing.
        try:
            label = name_finder(x)
        except Exception:
            # The name itself may be what is missing; fall back to the index
            label = f"result {x}"
        print("Could not extract: ", label, f"({type(error).__name__}: {error})")
        continue

    # Keep the row only when every field is non-empty
    if all(row):
        columns = (categories, currencies, brands, models,
                   names, sizes, prices, images)
        for column, value in zip(columns, row):
            column.append(value)

Price not available:  Sony 65" A8H 4K UHD ANDROID BRAVIA OLED TV
Price not available:  Sony 55" A8H 4K UHD ANDROID BRAVIA OLED TV
Price not available:  Hisense 24"(60cm) HD LED LCD TV


In [39]:
# Check number of results - site lists 136 products in total

# Every column list, including `models`, should report the same length
print(len(categories))
print(len(currencies))
print(len(brands))
print(len(models))
print(len(names))
print(len(sizes))
print(len(prices))
print(len(images))

131
131
131
131
131
131
131


In [21]:
# Number of <li> results held in `results`
len(results)

60

In [22]:
# Spot-check collected names at positions 0 and 57
# (presumably the first and last rows taken from page 1 - verify)
print(names[0])
print(names[57])

Sony 32"(81cm) FHD LED LCD Smart TV
Sony 49" X8000H 4K UHD ANDROID BRAVIA LED TV


In [32]:
# Spot-check collected names at positions 58 and 117
# (presumably the first and last rows taken from page 2 - verify)
print(names[58])
print(names[117])

Sony 55" X8000H 4K UHD ANDROID BRAVIA LED TV
Hisense 85" Q8 4K UHD SMART ULED TV


In [40]:
# Spot-check collected names at positions 118 and 130
# (presumably the first and last rows taken from page 3 - verify)
print(names[118])
print(names[130])

TCL 40" S615 FHD ANDROID LED TV
TCL 65" P615 4K UHD ANDROID LED TV


In [41]:
# pandas is already imported as `pd` in the dependencies cell at the top.
# The scalar "retailer" and "category" values are repeated on every row;
# the scraped `categories` list is not used for the "category" column.
df = pd.DataFrame({
        "retailer": "Good Guys",
        "category": "Televisions",
        "currency": currencies,
        "brand": brands,
        "model": models,
        "name": names,
        "size": sizes,
        "price": prices,
        "image": images
})

df

Unnamed: 0,retailer,category,currency,brand,model,name,size,price,image
0,Good Guys,Televisions,AUD,Sony,KDL32W660E,"Sony 32""(81cm) FHD LED LCD Smart TV","32""(81cm)",$595.00,//thegoodguys.sirv.com/products/50048259/50048...
1,Good Guys,Televisions,AUD,TCL,40D3000F,"TCL 40""(101cm) FHD LED LCD TV","40""(101cm)",$445.00,//thegoodguys.sirv.com/products/50052593/50052...
2,Good Guys,Televisions,AUD,Hitachi,32HDSM8,"Hitachi 32"" HD Smart LED TV","32""",$299.00,//thegoodguys.sirv.com/products/50064547/50064...
3,Good Guys,Televisions,AUD,Hitachi,65UHDSM8,"Hitachi 65"" UHD Smart LED TV","65""",$999.00,//thegoodguys.sirv.com/products/50064544/50064...
4,Good Guys,Televisions,AUD,Hitachi,55UHDSM8,"Hitachi 55""(140cm) UHD HDR LED LCD Smart TV","55""(140cm)",$695.00,//thegoodguys.sirv.com/products/50063085/50063...
...,...,...,...,...,...,...,...,...,...
126,Good Guys,Televisions,AUD,Hitachi,50QLEDSM20,"Hitachi 50"" SM20 4K UHD ANDROID QLED TV","50""",$899.00,//thegoodguys.sirv.com/products/50071408/50071...
127,Good Guys,Televisions,AUD,Hitachi,55QLEDSM20,"Hitachi 55"" SM20 4K UHD ANDROID QLED TV","55""",$1099.00,//thegoodguys.sirv.com/products/50071409/50071...
128,Good Guys,Televisions,AUD,Hitachi,65QLEDSM20,"Hitachi 65"" SM20 4K UHD ANDROID QLED TV","65""",$1499.00,//thegoodguys.sirv.com/products/50071410/50071...
129,Good Guys,Televisions,AUD,Samsung,QA32LS03TBWXXY,"Samsung 32"" LS03T 4K UHD SMART FRAME QLED TV","32""",$915.00,//thegoodguys.sirv.com/products/50072034/50072...


In [42]:
# Per-column summary (count / unique / top / freq)
df.describe()

Unnamed: 0,retailer,category,currency,brand,model,name,size,price,image
count,131,131,131,131,131,131,131,131,131
unique,1,1,1,11,131,131,21,68,131
top,Good Guys,Televisions,AUD,Samsung,OLED55BXPTA,"Samsung 75"" Q70T 4K UHD SMART QLED TV","65""",$4495.00,//thegoodguys.sirv.com/products/50071569/50071...
freq,131,131,131,40,1,1,29,5,1


In [43]:
# Show rows that exactly repeat an earlier row (an empty frame means there are none).
# TODO: page not refreshing when a new URL is input - still needs a fix.
df[df.duplicated()]

Unnamed: 0,retailer,category,currency,brand,model,name,size,price,image


In [44]:
# to_csv does not create missing folders, so make sure "output/" exists first
output_path = Path("output/good_guys.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)