## This notebook focuses on collecting as many links as possible from the websites that do not have a sitemap (keeping only the ones that are still reachable)

In [27]:

import csv # importing the csv module
import operator
import json

import requests
from tqdm import tqdm # we use this to keep track of the progress of a loop
from bs4 import BeautifulSoup

from concurrent.futures import ThreadPoolExecutor
import concurrent
from urllib.parse import urlparse
import random

In [28]:
def get_base_url(url):
    """Return the ``scheme://netloc`` part of ``url``.

    Args:
        url: the URL to parse.

    Returns:
        The base URL built from the parsed scheme and network location,
        or ``None`` if parsing or formatting raised any exception.
    """
    try:
        parts = urlparse(url)
        return "{}://{}".format(parts.scheme, parts.netloc)
    except Exception:
        # unparsable input is reported as "no base URL" instead of failing
        return None
    
# Pool of browser User-Agent header values; check_website picks one at random
# for every request it sends.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
]

def check_website(url):
    """Report whether a GET request to ``url`` answers with a 2xx status.

    A random entry of ``USER_AGENTS`` is sent as the User-Agent header and the
    request is given a timeout of 5 seconds.

    Args:
        url: the address to request.

    Returns:
        bool: ``True`` when the status code is between 200 and 299,
        ``False`` for any other status code or when ``requests`` raises a
        ``RequestException`` (e.g., domain error, timeout, etc.).
    """
    request_headers = {"User-Agent": random.choice(USER_AGENTS)}  # rotate the user-agent
    try:
        reply = requests.get(url, headers=request_headers, timeout=5)
    except requests.RequestException:
        # the request itself failed, so the website counts as not accessible
        return False
    return 200 <= reply.status_code < 300

def check_websites_concurrently(links):
    """Check many URLs in parallel and return the accessible ones.

    Every link is handed to ``check_website`` on a pool of 30 worker threads,
    with a tqdm progress bar tracking completed checks.

    Args:
        links: iterable of URLs. It may be a list or a one-shot iterable such
            as a generator; it is only iterated once and never passed to
            ``len()``.

    Returns:
        list: the links for which ``check_website`` returned a truthy value,
        in the order in which their checks completed (not the input order).
        A link whose check raised an unexpected exception is reported through
        ``tqdm.write`` and left out of the result.
    """
    accessible_links = []
    with ThreadPoolExecutor(max_workers=30) as executor:
        # maps each future back to the link it is checking
        futures = {executor.submit(check_website, link): link for link in links}
        # the total is taken from the submitted futures so that `links` does not need to support len()
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Checking links"):
            link = futures[future]
            try:
                is_accessible = future.result()
            except Exception as exc:
                # future.result() re-raises whatever the worker raised; one bad link
                # must not throw away the results of the whole batch
                tqdm.write(f"Skipping {link!r}: {exc!r}")
                continue
            if is_accessible:
                accessible_links.append(link)
    return accessible_links

def trim_url(url):
    """Reduce a shop URL to the address of its collections page.

    The first occurrence of ``/collections/`` wins: the URL is cut right after
    it. Otherwise, if the URL contains ``/products/``, everything from that
    segment on is replaced by ``/collections/``; from the author's testing the
    collections page is the best entry point to reach all products of a shop.
    URLs containing neither segment are returned unchanged.

    Args:
        url (str): the URL to trim.

    Returns:
        str: the trimmed URL ending in ``/collections/``, or ``url`` itself
        when neither ``/collections/`` nor ``/products/`` occurs in it.
    """
    collections_index = url.find('/collections/')
    if collections_index != -1:
        # keep everything up to and including the "/collections/" segment
        return url[:collections_index + len('/collections/')]

    products_index = url.find('/products/')
    if products_index != -1:
        # swap the products path for the collections path; the previous code
        # added len('/products/') (an int) to the sliced string, which raised
        # a TypeError for every URL containing "/products/"
        return url[:products_index] + '/collections/'

    return url  # neither "collections" nor "products" is in the URL

In [25]:
# collecting the websites for which no sitemap was found (second column of the results file)
with open('../sitemap_results.csv', mode='r', newline='') as file:
    no_sitemaps = [row[0] for row in csv.reader(file) if row[1] == 'No sitemap found']

# the first two matches are dropped -- presumably entries that should not be processed; TODO confirm why
no_sitemaps = no_sitemaps[2:]

# loading the original list of store pages, without its first (header) row
with open ('../data/furniture stores pages.csv', mode='r', newline='') as file:
    basic_csv = list(csv.reader(file))[1:]

print(len(no_sitemaps)) # 320

# swapping every website for the first full page URL from the original list that contains it
for i, site in enumerate(no_sitemaps):
    for page_row in basic_csv:
        if site in page_row[0]:
            print(site, page_row[0])
            no_sitemaps[i] = page_row[0]
            break

print(len(no_sitemaps)) # 320

# Trimming all websites until the words collections or products show up in the URL
no_sitemaps = list(map(trim_url, no_sitemaps))

print(no_sitemaps)
print(len(no_sitemaps)) # 320




320
https://home-buy.com.au https://home-buy.com.au/products/bridger-pendant-larger-lamp-metal-brass
https://curiousgrace.com.au https://curiousgrace.com.au/products/cleo-desk-lamp
https://themodern.net.au https://themodern.net.au/products/hamar-plant-stand-ash
https://www.homekoncepts.com https://www.homekoncepts.com/products/furniture/tables/end-tables/
https://pinchdesign.com https://pinchdesign.com/products/yves-desk
http://www.vawayside.net http://www.vawayside.net/store/products/tag/beds
https://furnish123watertown.com https://furnish123watertown.com/products/
https://nicoyafurniture.com.au https://nicoyafurniture.com.au/products/playa-bowl
https://stationfurniture.store https://stationfurniture.store/products/milano
https://cityfurnitureshop.com https://cityfurnitureshop.com/collections/greenington/products/azara-bed
https://tyfinefurniture.com https://tyfinefurniture.com/products/enso-platform-bed
https://www.goodwoodfurniture.com.au https://www.goodwoodfurniture.com.au/product

In [29]:
# checking all trimmed links in parallel and keeping only the ones check_website reports as accessible
final_links = check_websites_concurrently(no_sitemaps)

print(len(final_links)) # 73

Checking links: 100%|██████████| 320/320 [00:16<00:00, 19.03it/s]

73





In [30]:
# writing the accessible links to a one-column csv file whose header row is "URL"

with open('../data/no_sitemap_links.csv', mode='w', newline='') as file:
    csv_writer = csv.writer(file)
    # header first, then one single-cell row per link
    csv_writer.writerows([['URL']] + [[link] for link in final_links])