In [14]:
import os
from zenrows import ZenRowsClient
from bs4 import BeautifulSoup
from utils.helpers import dill_save, dill_load 
from dotenv import load_dotenv
import pandas as pd
import numpy as np 

In [2]:
# Load variables from a local .env file into the process environment, then read
# the scraping key from it. API_KEY is None when the variable is not set.
load_dotenv()
API_KEY = os.environ.get('API_KEY')

In [3]:
# Shared ZenRows client built from API_KEY; get_main_page and get_product_details
# both issue their requests through it.
client = ZenRowsClient(API_KEY)

In [4]:
def get_main_page(pagenumber):
    """Download one men's-watches listing page and cache its HTML.

    The page is requested through the module-level ``client`` and the response
    body is stored with ``dill_save`` under ``data/web`` as ``page_<pagenumber>``.

    Args:
        pagenumber: Value substituted into the ``p`` query parameter of the
            listing URL; also used in the cache file name.

    Raises:
        requests.HTTPError: If the response carries a 4xx/5xx status
            (assumes ``client.get`` returns a ``requests.Response``, as the
            ZenRows Python SDK does).
    """
    url = f"https://www.houseofwatches.co.uk/mens-watches/?p={pagenumber}"
    response = client.get(url)
    # Refuse to cache error bodies: an error page saved as page_N would later
    # parse to zero products without any visible failure.
    response.raise_for_status()
    dill_save(response.text, 'data/web', f'page_{pagenumber}')

# Scrape Main Information

In [100]:
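# One-off scrape of listing pages 1..117. Presumably left disabled because the
# parsing cell below reads the copies already cached under data/web; uncomment to refresh.
# for i in range(117):
#     get_main_page(i+1)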

# Get Main Information

In [37]:
# Build one row per product from the cached listing pages.
N_PAGES = 117  # listing pages expected in the data/web cache (page_1 .. page_117)
cols = ['Brand', 'Name', 'Original Price', 'Discounted Price', 'Link']

# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end. Concatenating a one-row frame per product re-copies the whole table on
# every iteration and leaves every row with index 0.
records = []
for pagenumber in range(1, N_PAGES + 1):
    page_html = dill_load('data/web', f'page_{pagenumber}')
    soup = BeautifulSoup(page_html, 'html.parser')

    # One "details" element per product tile on the listing page.
    for item in soup.find_all(class_='product-grid-item__details'):
        # A tile missing any of these elements raises here, so a markup change
        # surfaces immediately instead of producing partial rows.
        brand = item.find(class_='product-brand').text.strip()
        name = item.find(class_='product-grid-item__name').text.strip()

        # Split on any run of whitespace so newlines or indentation between the
        # two prices cannot hide the discounted one. Exactly two tokens means
        # "original discounted"; anything else keeps only the first token.
        price_tokens = item.find(class_='price-final_price').text.split()
        original_price = price_tokens[0] if price_tokens else ''
        discounted_price = price_tokens[1] if len(price_tokens) == 2 else ''

        link = item.find('a')['href']

        records.append([brand, name, original_price, discounted_price, link])

dataframe = pd.DataFrame(records, columns=cols)

  soup = BeautifulSoup(testing, 'html.parser')


In [38]:
# Give the rows a fresh 0..n-1 index, then persist the table.
# to_csv writes the index as an unnamed first column by default, so read the
# file back with index_col=0 to avoid an extra "Unnamed: 0" column.
dataframe = dataframe.reset_index(drop=True)
dataframe.to_csv('data/main.csv')

# Get Image and Description

In [64]:
def get_product_details(url, index = 0):
    """Fetch a product page and extract its description text and main image URL.

    The page is requested through the module-level ``client``. Whatever could be
    extracted is stored with ``dill_save`` under ``data/watchinfo`` as
    ``watch_<index>`` in the form ``{'description': ..., 'image': ...}``.

    Args:
        url: Product page URL.
        index: Number used in the saved file name.

    Returns:
        ``(product_details, image_link)``. Either element is ``None`` when the
        corresponding element is absent from the page. ``(None, None)`` is
        returned, and nothing is saved, when neither is found or when parsing
        or saving fails.

    Note:
        The request itself is made outside the ``try`` block, so errors raised
        by ``client.get`` propagate to the caller.
    """
    html_content = client.get(url)
    try:
        soup = BeautifulSoup(html_content.text, 'html.parser')

        # Image: the first <img> inside the main gallery, if both exist.
        image_link = None
        gallery = soup.find('div', class_='main-gallery')
        image_tag = gallery.find('img') if gallery is not None else None
        if image_tag is not None:
            image_link = image_tag.get('src')

        # Description: stripped text of the product-details container.
        product_details = None
        details_div = soup.find('div', class_='product-details-text')
        if details_div is not None:
            product_details = details_div.text.strip()

        if product_details is None and image_link is None:
            return None, None

        dill_save({'description': product_details, 'image': image_link}, 'data/watchinfo', f'watch_{index}')

        return product_details, image_link
    except Exception as exc:
        # Best effort per product: report and move on. Catching Exception rather
        # than using a bare except lets a kernel interrupt stop a long crawl.
        print(f"watch_{index}: failed to process {url}: {exc!r}")
        return None, None

In [58]:
# Preview the first rows of the product table.
# NOTE(review): the recorded output of this cell has an "Unnamed: 0" column, which
# suggests `dataframe` was reloaded from a CSV in a cell not shown here — confirm.
dataframe.head()

Unnamed: 0,Brand,Name,Original Price,Discounted Price,Link
0,Casio,Vintage Silver Watch AQ-230A-7AMQYES,£44.90,£37.95,https://www.houseofwatches.co.uk/casio-vintage...
1,Citizen,Mens Chronograph Bracelet Watch AT2121-50E,£329.00,£196.99,https://www.houseofwatches.co.uk/citizen-mens-...
2,Seiko,Presage Cocktail Time Mojito Automatic Strap W...,£400.00,£379.95,https://www.houseofwatches.co.uk/seiko-presage...
3,Swatch,What If Mint Bioceramic Watch SO34G701,£96.00,£91.95,https://www.houseofwatches.co.uk/swatch-what-i...
4,Tissot,Mens T-Sport Chrono Xl Classic Blue Dial Brace...,£385.00,£354.95,https://www.houseofwatches.co.uk/tissot-mens-c...


In [66]:
# Position in dataframe['Link'] at which fetching starts; presumably a resume
# point after an earlier partial run — confirm. Defined once so the slice and
# the saved file numbering cannot drift apart.
START_INDEX = 42
for index, link in enumerate(dataframe['Link'].values[START_INDEX:], start=START_INDEX):
    details, image = get_product_details(url=link, index=index)

  soup = BeautifulSoup(html_content.text, 'html.parser')
