# Web scraping all the sitemaps found from the CSV

- This notebook aims to search all the webpages inside `furniture stores pages.csv` for sitemaps, store them and get as many pages from each sitemap found.
- I find this method to work best for gathering a lot of training data fast. Sitemaps are predictable and easier to scrape than going through all the `a` tags of each website manually. The only downside of this method is that you lose some diversity in your data (because websites that have no sitemap, or an inaccessible one, are dropped), but the number of training entries gathered from the websites that do have a sitemap more than offsets this loss.

In [None]:
# importing libraries
import csv 
import operator
import requests
from tqdm import tqdm 
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import concurrent
from urllib.parse import urljoin
from urllib.parse import urlparse
import time
import random

### Method for Retrieving Data from a Website
- This method will be used across multiple notebooks of mine to extract data while minimizing the risk of IP bans. It will also handle potential website status errors that may occur during the process.

In [None]:
# Pool of desktop browser User-Agent strings (Chrome, Firefox and Safari on
# Windows, macOS and Linux). get_data() picks one at random for every request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
]

def get_data(url, max_retries=5):
    """Download *url* and return the raw response body.

    Every attempt sends a User-Agent picked at random from USER_AGENTS, which
    helps to avoid IP bans when the same sites are scraped several times.

    Args:
        url: Address to fetch with a GET request (3 second timeout).
        max_retries: How many times a rate-limited request (HTTP 429) is
            retried before giving up.

    Returns:
        The response content (bytes) when the server answers 200, otherwise
        None (any other status, a network error, or still rate-limited after
        ``max_retries`` retries).
    """
    # A loop instead of recursion: a host that answers 429 forever used to
    # recurse without bound until RecursionError escaped to the caller.
    for attempt in range(max_retries + 1):
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            response = requests.get(url, headers=headers, timeout=3)
        except requests.RequestException as e:
            tqdm.write(f"FROM GET_DATA: Error fetching {url}: {e}")
            return None

        if response.status_code == 200:
            return response.content  # return content if successful

        if response.status_code != 429:
            tqdm.write(f"FROM GET_DATA: Failed to retrieve {url}, Status Code: {response.status_code}")
            return None

        # handle rate-limiting (HTTP 429) by pausing and retrying
        if attempt < max_retries:
            tqdm.write(f"FROM GET_DATA: Rate limit reached. Sleeping before retrying {url}")
            time.sleep(random.uniform(4, 8))  # random delay to avoid detection

    tqdm.write(f"FROM GET_DATA: Still rate limited after {max_retries} retries, giving up on {url}")
    return None

### Methods for getting the base url and for checking if the website has a sitemap
- One thing to note in my approach here is that I combined two methods of finding sitemaps: brute-forcing the location by appending some common sitemap paths to the base url, and reading the robots.txt file if it exists.
- I learned about the existence of robots.txt after implementing the brute force approach so I combined both (individually from using just the brute force method I got 384 sitemaps and from the robots.txt method only I got 418).
- Another thing to note is the number of workers given to the ThreadPoolExecutor: the sitemap check below uses 16 and the link-gathering step later in this notebook uses 32. These are the numbers that worked best for me (I have 8 cores), but anyone who wants to use this code should adjust them for their own machine.

In [None]:

# Common sitemap locations, written relative to a site's base URL.
# check_sitemap() joins each one onto the base URL and probes them in this order.
SITEMAP_PATHS = [
    "sitemap.xml",
    "sitemap_index.xml",
    ".sitemap.xml",
    "sitemap/sitemap.xml",
    "sitemap_index/sitemap.xml"
]

def get_base_url(url):
    """Return the ``scheme://host`` part of *url*.

    Args:
        url: An absolute URL such as ``https://shop.example/products/chair``.

    Returns:
        The base URL without path, query or fragment, or None when *url*
        cannot be parsed or lacks a scheme or a host (empty string, CSV
        header text, ``mailto:`` links, ...).
    """
    try:
        parsed_url = urlparse(url)  # parse the url using urllib
    except (ValueError, TypeError, AttributeError):
        return None

    # Without this check an empty or scheme-less value produced the truthy
    # string "://", which callers then treated as a real website.
    if not parsed_url.scheme or not parsed_url.netloc:
        return None

    return f"{parsed_url.scheme}://{parsed_url.netloc}"

def check_sitemap(base_url):
    """Brute-force search for a sitemap under *base_url*.

    Each entry of SITEMAP_PATHS is joined onto the base URL and probed with a
    HEAD request (10 second timeout).

    Returns:
        The first candidate URL that answers with a 2xx status code, or None
        when none of the common paths does.
    """
    candidates = (urljoin(base_url, path) for path in SITEMAP_PATHS)
    for candidate in candidates:
        try:
            status = requests.head(candidate, timeout=10).status_code
        except requests.RequestException:
            # unreachable, timed out or malformed URL: move on to the next path
            continue
        if status in range(200, 300):
            return candidate
    return None

def _sitemap_responds(sitemap_url, attempts=3):
    """Return True when a HEAD request to *sitemap_url* answers with a 2xx status."""
    for _ in range(attempts):
        try:
            response = requests.head(sitemap_url, timeout=10)
        except requests.RequestException:
            continue  # network error: try again
        # A received answer is definitive; repeating the request would not change it.
        return 200 <= response.status_code < 300
    return False


def scrape_sitemap_from_robots(base_url):
    """Look for a reachable sitemap announced in the site's robots.txt.

    Args:
        base_url: ``scheme://host`` of the website; a falsy value returns None.

    Returns:
        The URL of the first ``Sitemap:`` entry that answers a HEAD request
        with a 2xx status, or None when robots.txt is missing, lists no
        sitemap, or none of the listed sitemaps is reachable.
    """
    if not base_url:
        return None
    try:
        robots_url = base_url.rstrip("/") + "/robots.txt"
        print(f"Checking robots.txt at {robots_url}")
        data = get_data(robots_url)
        if not data:
            return None

        # robots.txt is plain text, so decode it directly instead of running it
        # through an HTML parser; "utf-8-sig" also drops a leading BOM.
        text = data.decode("utf-8-sig", errors="replace") if isinstance(data, bytes) else str(data)

        for line in text.splitlines():
            # Split "<field>: <value>" on the first colon. Comparing the field
            # name itself skips commented-out entries ("# Sitemap: ...") and
            # tolerates leading whitespace, which the old fixed-width slice did not.
            field, separator, value = line.partition(":")
            if not separator or field.strip().lower() != "sitemap":
                continue
            sitemap_url = value.strip()
            if sitemap_url and _sitemap_responds(sitemap_url):
                return sitemap_url
    except Exception as e:
        # best effort: one broken site must not stop the whole scan
        print(f"Error scraping robots.txt: {e}")
        return None
    return None

def _find_sitemap(base_url):
    """Return a sitemap URL for *base_url*: common paths first, then robots.txt."""
    return check_sitemap(base_url) or scrape_sitemap_from_robots(base_url)


def check_sitemap_concurrently(base_urls, max_workers=16):
    """Search for the sitemap of every base URL using a thread pool.

    Args:
        base_urls: Iterable of ``scheme://host`` strings.
        max_workers: Size of the thread pool (16 was enough in my runs).

    Returns:
        A list with one ``[base_url, status, sitemap_url]`` entry per input,
        in completion order. ``status`` is "Sitemap found" or
        "No sitemap found"; in the latter case ``sitemap_url`` is "".
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # The robots.txt fallback runs inside the worker as well. It used to
        # run in this loop, i.e. one site at a time while the pool sat idle.
        futures = {executor.submit(_find_sitemap, base_url): base_url for base_url in base_urls}
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Checking sitemaps"):
            base_url = futures[future]
            try:
                sitemap_url = future.result()
            except Exception as e:
                # one misbehaving site must not abort the whole scan
                tqdm.write(f"Error checking sitemap for {base_url}: {e}")
                sitemap_url = None

            if sitemap_url:
                results.append([base_url, "Sitemap found", sitemap_url])
            else:
                results.append([base_url, "No sitemap found", ""])
    return results

### Creating a CSV that will store the result of computing all the links from the input CSV
- This is where the file with the sitemap results is created. All files I compute in these notebooks are stored in Data/InUseData (the .csv files that are being used by my notebooks throughout the project).

In [None]:
# read the csv file and store the base URL (scheme + host) of every row in the links list
links = []
with open('../../Data/MiscData/furniture stores pages.csv', mode='r', newline='') as file:
    for row in csv.reader(file):
        if not row:  # blank line in the CSV (this replaces the old catch-all try/except)
            continue
        link = get_base_url(row[0].strip())
        # keep only fetchable websites; a header row or a value without a
        # scheme does not yield an http(s) base URL
        if link and link.startswith(("http://", "https://")):
            links.append(link)

# get the sitemaps
sitemap_results = check_sitemap_concurrently(links)

# write the results to the output CSV file
output_csv = "../../Data/InUseData/sitemap_results.csv"
with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["URL", "Status", "Sitemap URL"])  # header row
    csvwriter.writerows(sitemap_results)

print(f"Results saved to {output_csv}.")

### Testing the results
- The combined method gets us 443 sitemaps from the 705 websites in the input CSV file which is better than any method on its own.
- They will be used for gathering as much training data as possible.

In [15]:
# Collect the third column ("Sitemap URL") of every row whose status is not
# 'No sitemap found'. The header row passes this filter too, so its label
# 'Sitemap URL' is the first element of the list.
with open("../../Data/InUseData/sitemap_results.csv" , mode='r', newline='') as file:
    sitemaps = [row[2] for row in csv.reader(file) if row[1] != 'No sitemap found']

print(sitemaps, len(sitemaps)) # 443

['Sitemap URL', 'https://cane-line.co.uk/sitemap.xml', 'https://edenliving.online/sitemap.xml', 'https://vastinterior.com.au/sitemap.xml', 'https://www.factorybuys.com.au/sitemap.xml', 'https://dhfonline.com/sitemap.xml', 'https://dunlin.com.au/sitemap.xml', 'https://www.tandemarbor.com/sitemap.xml', 'https://www.perchfurniture.com/sitemap.xml', 'https://www.knoll.com/sitemap.xml', 'https://modshop1.com/sitemap.xml', 'https://www.ourfurniturewarehouse.com.au/sitemap.xml', 'https://www.sofamania.com/sitemap.xml', 'https://www.hudsonfurniture.com.au/sitemap.xml', 'https://themodern.com.au/sitemap.xml', 'https://www.scandesign.com/sitemap.xml', 'https://www.fentonandfenton.com.au/sitemap.xml', 'https://pinchdesign.com/sitemap.xml', 'https://claytongrayhome.com/sitemap.xml', 'https://www.do-shop.com/sitemap.xml', 'https://acmeshelving.com/sitemap.xml', 'https://www.theinside.com/sitemap.xml', 'https://www.jseitz.com/sitemap.xml', 'https://www.kmpfurniture.com/sitemap.xml', 'https://www.fur

### Methods for getting the links from the sitemaps
- The methods below are meant to search any links from sitemaps that contain the paths '/products/' or '/collections/'. It will also catch other sitemaps from inside the sitemaps so that it can recursively get as many links as possible.
- Because the algorithm would probably take days to finish if I don't cut it short manually, I will only let it run until it gets most of the links.
- From testing, once it reaches about 300000 links it slows down significantly, to the point that hardly any new links are found.
- By looking at more sitemaps you can probably get more sitemap paths / product paths that lead to good results (apart from sitemap_products_1.xml), but that would require a lot of time. The code can be scaled to fit such needs.

In [None]:

def is_valid_product_link(url):
    """Decide whether *url* is a product page worth keeping as training data.

    A link is accepted when it contains '/products/' and is not a static
    asset (image, stylesheet, script, icon), a ``tel:``/``mailto:`` link or
    an in-page anchor ('#').
    ('/collections/' is deliberately not accepted: those pages list many
    products or none at all.)

    Args:
        url: The link to test.

    Returns:
        True for an acceptable product link, False otherwise.
    """
    unwanted_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.css', '.js', '.ico')  # add more if needed ...
    unwanted_schemes = ('tel:', 'mailto:')

    lowered = url.strip().lower()
    if lowered.startswith(unwanted_schemes) or '#' in url:
        return False

    # Test the extension on the URL *path* only. A plain substring test also
    # matched host names: '.js' is part of 'www.jseitz.com' and '.ico' of
    # 'www.icon-...', so every product of such a shop was thrown away.
    try:
        path = urlparse(lowered).path
    except ValueError:
        return False
    if path.endswith(unwanted_extensions):
        return False

    return '/products/' in url

def is_desired_site_map_link(url):
    """Return True when *url* is a nested sitemap that should be followed.

    Only links containing 'sitemap_products_1.xml' qualify for now; other
    candidates ('sitemap_collections_1.xml', 'sitemap.xml') are intentionally
    left out.
    """
    wanted_sitemap_name = 'sitemap_products_1.xml'
    return wanted_sitemap_name in url

# Every link already handed out by get_links_from_sitemap (used as a "seen" set).
# In the server code it is sent as a parameter instead of a "global" variable,
# but in the notebook it is left as is.
dict_href_links = {}


def get_links_from_sitemap(website_link):  # modified version from the one in the other notebook
    """Extract the not-yet-seen product links and nested sitemaps of a sitemap.

    Args:
        website_link: URL of the sitemap to download and parse.

    Returns:
        A dict mapping every newly discovered link to "Not-checked". It is
        empty when *website_link* is a product page, cannot be downloaded,
        or contains nothing new.
    """
    # Links collected earlier come back here because they are still marked
    # "Not-checked". A product page is HTML and has no sitemap <loc> entries,
    # so downloading it only costs time - skip it.
    if is_valid_product_link(website_link) and not is_desired_site_map_link(website_link):
        return {}

    # Prefix the extracted links must share with the sitemap: everything up
    # to (and including) the slash in front of "sitemap" / ".sitemap".
    sitemap_index = website_link.find('/sitemap')
    if sitemap_index == -1:
        sitemap_index = website_link.find('/.sitemap')
    if sitemap_index != -1:
        website_origin = website_link[:sitemap_index + 1]
    else:
        # Sitemap with an unusual name (e.g. announced in robots.txt): fall
        # back to scheme://host/. Using the full sitemap URL as the prefix,
        # as before, matched no link at all.
        parsed = urlparse(website_link)
        website_origin = f"{parsed.scheme}://{parsed.netloc}/"

    html_data = get_data(website_link)
    if not html_data:
        # download failed; BeautifulSoup(None) would raise a TypeError
        return {}

    soup = BeautifulSoup(html_data, "html.parser")
    list_links = []

    for loc in soup.find_all("loc"):  # this contains the links inside xml files
        # sitemaps are often pretty-printed with whitespace around the URL
        link = loc.text.strip()
        if not link:
            continue
        if not is_valid_product_link(link) and not is_desired_site_map_link(link):
            continue

        if link.startswith(website_origin):
            # absolute URL on the same origin
            link_to_append = link
        elif link.startswith("/"):
            # relative URL: website_origin already ends with "/"
            link_to_append = website_origin + link[1:]
        else:
            continue

        # the links are not checked for accessibility here, only de-duplicated
        if link_to_append not in dict_href_links:
            dict_href_links[link_to_append] = None  # mark it as seen
            list_links.append(link_to_append)

    # "Not-checked" is the default status of every new link
    return dict.fromkeys(list_links, "Not-checked")

def get_subpage_links(l, max_depth=3, current_depth=0, write_frequency=500, csv_filename="../../Data/InUseData/link_data.csv"):
    """Expand every "Not-checked" link of *l*, level by level.

    Args:
        l: Dict mapping link -> status; it is updated in place. Links with
            the status "Not-checked" are processed.
        max_depth: Number of levels at which processing stops.
        current_depth: Level to start from (kept for compatibility with the
            former recursive implementation).
        write_frequency: The whole dict is saved to *csv_filename* after this
            many successfully processed links.
        csv_filename: Path of the CSV file used for the intermediate saves.

    Returns:
        The same dict *l*. Processed links are marked "Checked", links whose
        processing raised an exception are marked "Failed", and newly
        discovered links are added with the status reported by
        get_links_from_sitemap.
    """
    for _ in range(current_depth, max_depth):
        pending = [link for link, status in l.items() if status == "Not-checked"]
        if not pending:
            break  # nothing left to expand, deeper levels would be empty too

        processed_links_count = 0
        with ThreadPoolExecutor(max_workers=32) as executor:
            futures = {executor.submit(get_links_from_sitemap, link): link for link in pending}

            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing subpage links"):
                link = futures[future]
                try:
                    dict_links_subpages = future.result()
                except Exception as e:
                    print(f"Error fetching {link}: {e}")
                    # A failing link used to stay "Not-checked", so it was
                    # retried at every level and the calling loop never ended.
                    l[link] = "Failed"
                    continue

                l[link] = "Checked"
                # setdefault keeps the status of links that are already known
                for new_link, status in dict_links_subpages.items():
                    l.setdefault(new_link, status)

                processed_links_count += 1

                # write to file every 'write_frequency' processed links; this
                # writes all the links, even the not checked ones
                if processed_links_count >= write_frequency:
                    write_links_to_csv(l, csv_filename)
                    processed_links_count = 0  # reset the counter

    return l

def write_links_to_csv(links_dict, csv_filename):
    """Write the keys of *links_dict* to *csv_filename*, one link per row.

    The file is overwritten. It is written as UTF-8 because the dispersal
    cell reads it back as UTF-8, and the locale default encoding is not
    guaranteed to be able to store every URL.

    Args:
        links_dict: Dict (or any iterable) whose keys are the links to save.
        csv_filename: Path of the CSV file to (re)create.
    """
    with open(csv_filename, "w", newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows([link] for link in links_dict)

    print(f"Links saved to {csv_filename}.")


### Getting the links from the sitemaps

In [None]:
# Seed the crawl with the sitemap URLs found earlier. Keeping only http(s)
# entries also drops the CSV header label ('Sitemap URL') wherever it is.
websites = [sitemap for sitemap in sitemaps if sitemap.lower().startswith(("http://", "https://"))]
# create dictionary of website
dict_links = { website : "Not-checked" for website in websites }

counter, counter2 = None, 0
csv_filename = "../../Data/InUseData/link_data.csv"
previous_state = None  # (number of links, number of "Not-checked" links) of the last iteration

while counter != 0:
    counter2 += 1
    dict_links2 = get_subpage_links(dict_links, csv_filename=csv_filename)
    counter = operator.countOf(dict_links2.values(), "Not-checked")  # number of "Not-checked" links

    print("")
    print("THIS IS LOOP ITERATION NUMBER", counter2)
    print("LENGTH OF DICTIONARY WITH LINKS =", len(dict_links2))
    print("NUMBER OF 'Not-checked' LINKS = ", counter)
    print("")

    dict_links = dict_links2

    # Stop when an iteration changed nothing: the remaining "Not-checked"
    # links keep failing and the loop would otherwise never end.
    state = (len(dict_links), counter)
    if state == previous_state:
        print("No progress since the previous iteration - stopping.")
        break
    previous_state = state

write_links_to_csv(dict_links, csv_filename)
print("Links saved to link_data.csv.")


### Removing the sitemap links from the csv file
- We don't want sitemaps inside our training data links.

In [13]:
csv_filename = "../../Data/InUseData/link_data.csv"

# Keep every link that is not a sitemap (sitemaps are recognised by '.xml').
# The file is read and written as UTF-8, like in the dispersal cell below;
# blank rows are skipped instead of raising an IndexError.
with open(csv_filename, mode='r', newline='', encoding='utf-8') as file:
    links = [row[0] for row in csv.reader(file) if row and '.xml' not in row[0]]

with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    csv.writer(file).writerows([link] for link in links)

print("Links saved to link_data.csv.")

# getting the number of unique origins (scheme + host) among the links
unique_urls = {get_base_url(link) for link in links}

print(len(unique_urls))

Links saved to link_data.csv.
302


### Dispersing the links
- I will distribute the links in such a way that if I want to get a smaller slice out of the links (a smaller dataset), I will get as much diversity in the domains as possible.
- Since a single domain can contribute thousands of pages, ensuring that I don't feed the same origin many times in a row is crucial.

In [14]:
# GETTING THE LINKS FROM 'link_data.csv'
# (read as UTF-8, like the dispersal cell below; blank rows are skipped)
with open(csv_filename, mode='r', newline='', encoding='utf-8') as file:
    urls = [row[0] for row in csv.reader(file) if row]
# optional: random.shuffle(urls) here to preview a random sample instead of the first rows
print(urls[:1000])

['https://www.hopewells.co.uk/products/beds-bedroom-furniture', 'https://www.hopewells.co.uk/products/beds-bedroom-furniture/beds', 'https://www.hopewells.co.uk/products/beds-bedroom-furniture/beds/somnus-ambassador-30500-springs', 'https://www.hopewells.co.uk/products/beds-bedroom-furniture/beds/somnus-lifestyle-12000-springs', 'https://www.hopewells.co.uk/products/beds-bedroom-furniture/beds/somnus-connaught-35500-springs', 'https://www.hopewells.co.uk/products/beds-bedroom-furniture/beds/somnus-viceroy-4550-springs', 'https://www.hopewells.co.uk/products/beds-bedroom-furniture/beds/somnus-diplomat-6550-springs', 'https://www.hopewells.co.uk/products/beds-bedroom-furniture/beds/somnus-legend-22500-springs', 'https://www.hopewells.co.uk/products/beds-bedroom-furniture/beds/somnus-viscount-18000-springs', 'https://www.hopewells.co.uk/products/beds-bedroom-furniture/beds/somnus-marquis-21000-springs', 'https://www.hopewells.co.uk/products/beds-bedroom-furniture/bedroom-furniture', 'http

In [None]:
# Round-robin dispersal: the links are grouped by website, then the output takes
# the 1st link of every website, then the 2nd link of every website, and so on.
# Any prefix of the result therefore covers as many different websites as possible.
import itertools
from collections import defaultdict
from urllib.parse import urlparse

# read all URLs from the CSV file; blank rows are skipped and duplicates are
# dropped (first occurrence wins). The previous version looped forever on a
# duplicated row because it waited for len(used_urls) to reach len(urls).
with open(csv_filename, mode='r', newline='', encoding='utf-8') as file:
    urls = list(dict.fromkeys(row[0] for row in csv.reader(file) if row))

# group URLs by their base URL (the groups keep the order of first appearance)
url_groups = defaultdict(list)
for url in urls:
    url_groups[get_base_url(url)].append(url)

# zip_longest yields one tuple per round with the next URL of every group;
# groups that have run out are padded with a sentinel that is filtered out
exhausted = object()
dispersed_urls = [
    url
    for round_urls in itertools.zip_longest(*url_groups.values(), fillvalue=exhausted)
    for url in round_urls
    if url is not exhausted
]

# save the dispersed URLs back to a CSV file
# this file will be used in the future notebook
with open('../../Data/InUseData/dispersed_link_data.csv', mode='w', newline='', encoding='utf-8') as file:
    csv_writer = csv.writer(file)
    for url in dispersed_urls:
        csv_writer.writerow([url])

print(f"Dispersed {len(dispersed_urls)} URLs successfully!")