# Web scraping the sites that contain site maps

This notebook aims to gather all the data from multiple websites that contain site maps. The target is to get as many product websites from different pages with as many different furniture types as possible.

In [36]:
# importing libraries

import csv 
import operator

import sys

import requests
from tqdm import tqdm 
from bs4 import BeautifulSoup

from concurrent.futures import ThreadPoolExecutor
import concurrent

from urllib.parse import urlparse
from urllib.parse import urljoin

### Methods for getting the base url and for checking if the website has a sitemap

In [8]:



SITEMAP_PATHS = [
    "sitemap.xml",
    "sitemap_index.xml",
    ".sitemap.xml",
    "sitemap/sitemap.xml",
    "sitemap_index/sitemap.xml"
]

def get_base_url(url):
    try:
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        return base_url
    except Exception as e:
        # print(f"Error parsing URL {url}: {e}")
        return None

def check_sitemap(base_url):
    """
    Check if the given base URL has a sitemap in one of the common paths.
    Returns the URL of the sitemap if found, otherwise None.
    """
    for sitemap_path in SITEMAP_PATHS:
        sitemap_url = urljoin(base_url, sitemap_path)
        try:
            response = requests.head(sitemap_url, timeout=10)
            # Check if the URL exists and returns a successful status code (200)
            if 200 <= response.status_code < 300:
                return sitemap_url
        except requests.RequestException as e:
            # print(f"Error checking {sitemap_url}: {e}")
            continue
    return None

def check_sitemap_concurrently(base_urls):
    """
    Checks sitemaps for a list of base URLs concurrently.
    Returns a list of results with the base URL and sitemap URL.
    """
    results = []
    with ThreadPoolExecutor(max_workers=16) as executor:
        futures = {executor.submit(check_sitemap, base_url): base_url for base_url in base_urls}
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Checking sitemaps"):
            base_url = futures[future]
            sitemap_url = future.result()
            if sitemap_url:
                results.append([base_url, "Sitemap found", sitemap_url])
            else:
                results.append([base_url, "No sitemap found", ""])
    return results

### Filtering out the websites that are not accessible that have a site map

In [None]:
# reading the csv file and storing the links in the links list
links = []
with open('Data/furniture stores pages.csv', mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        try:
            link = get_base_url(row[0])
            links.append(link)
        except Exception as e:
            continue
        

# Check sitemaps concurrently
sitemap_results = check_sitemap_concurrently(links)

# Write the results to the output CSV file
output_csv = "sitemap_results.csv"
with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["URL", "Status", "Sitemap URL"])  # Header row
    csvwriter.writerows(sitemap_results)  # Write all results

print(f"Results saved to {output_csv}.")

### Getting the sitemaps from the csv file

In [3]:
# getting only the sitemaps from the csv file
sitemaps = []
with open('sitemap_results.csv', mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        if row[1] != 'No sitemap found':
            sitemaps.append(row[2])

print(sitemaps[1:], len(sitemaps))

['https://cane-line.co.uk/sitemap.xml', 'https://edenliving.online/sitemap.xml', 'https://dunlin.com.au/sitemap.xml', 'https://vastinterior.com.au/sitemap.xml', 'https://hemisphereliving.com.au/sitemap.xml', 'https://www.factorybuys.com.au/sitemap.xml', 'https://dhfonline.com/sitemap.xml', 'https://www.tandemarbor.com/sitemap.xml', 'https://www.perchfurniture.com/sitemap.xml', 'https://modshop1.com/sitemap.xml', 'https://www.ourfurniturewarehouse.com.au/sitemap.xml', 'https://www.hudsonfurniture.com.au/sitemap.xml', 'https://www.scandesign.com/sitemap.xml', 'https://www.sofamania.com/sitemap.xml', 'https://www.fentonandfenton.com.au/sitemap.xml', 'https://4-chairs.com/sitemap.xml', 'https://www.knoll.com/sitemap.xml', 'https://acmeshelving.com/sitemap.xml', 'https://claytongrayhome.com/sitemap.xml', 'https://www.do-shop.com/sitemap.xml', 'https://premiumpatio.com.au/sitemap.xml', 'https://www.theinside.com/sitemap.xml', 'https://www.kmpfurniture.com/sitemap.xml', 'https://www.jseitz.co

### Methods for getting the links from the sitemaps

In [9]:
def get_data(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return response.text

def is_valid_product_link(url):
    # Exclude common unwanted patterns
    unwanted_patterns = ['.jpg', '.jpeg', '.png', '.gif', '.svg', '.css', '.js', '.ico', 'tel:', 'mailto:', '#']
    for pattern in unwanted_patterns:
        if pattern in url:
            return False
    # Only accept URLs that contain "collections" or "products"
    if '/products/' in url: # or  'collections' in url: - right now I will only focus on the products path
        return True
    return False

def is_desired_site_map_link(url): #  
    # Only accept URLs that contain "sitemap"
    if 'sitemap_products_1.xml' in url: # most website have this in the url that I am looking for
        return True
    return False


dict_href_links = {}

def get_links_from_sitemap(website_link): # modified version from the one in the other notebook
    # Set the base of the URL depending on whether "collections" or "products" is in the link
    website_origin = website_link
    sitemap_index = website_link.find('/sitemap')
    if sitemap_index == -1:
        sitemap_index = website_link.find('/.sitemap')
    if sitemap_index != -1:
        website_origin = website_link[:sitemap_index + 1] 

    html_data = get_data(website_link)
    soup = BeautifulSoup(html_data, "html.parser")
    list_links = []

    for link in soup.find_all("loc"): # this contains the links inside xml files
        link = link.text
        # Filter out invalid links (non-product/collection pages)
        
        if not is_valid_product_link(link) and not is_desired_site_map_link(link): 
            continue
        
        link_to_append = None

        # Handle absolute URLs that start with the origin
        if link.startswith(str(website_origin)):
            link_to_append = link
        
        # Handle relative URLs that start with "/"
        elif link.startswith("/"):
            #print(href)
            link_with_www = website_origin + link[1:]
            #print("adjusted link =", link_with_www)
            link_to_append = link_with_www
        

        
        # If link_to_append is not None, check if it's already in dict_href_links and if it's accessible
        if link_to_append is not None:
            if link_to_append not in dict_href_links: #  and check_website(link_to_append) - I will not check the links here, I will check them after I get all the links
                dict_href_links[link_to_append] = None  # Mark it as seen

                list_links.append(link_to_append)

    # Convert list of links to a dictionary with "Not-checked" as the default value for each
    dict_links = dict.fromkeys(list_links, "Not-checked")
    return dict_links

def get_subpage_links(l, max_depth=3, current_depth=0, write_frequency=500, csv_filename="link_data.csv"):
    processed_links_count = 0
    
    if current_depth >= max_depth:
        return l

    with ThreadPoolExecutor(max_workers=32) as executor:
        futures = {executor.submit(get_links_from_sitemap, link): link for link in l if l[link] == "Not-checked"}
        
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing subpage links"):
            link = futures[future]
            try:
                dict_links_subpages = future.result()
                # print(f"Processed {link} with {len(dict_links_subpages)} subpages.")
            except Exception as e:
                print(f"Error fetching {link}: {e}")
                continue
            l[link] = "Checked"
            l.update(dict_links_subpages)

            processed_links_count += 1
            
            # Write to file every 'write_frequency' processed links
            if processed_links_count >= write_frequency: # this actually writes all the links to the csv file - even the not checked ones but in my case it is sufficient
                write_links_to_csv(l, csv_filename)
                processed_links_count = 0  # Reset the counter

    # Recursively call the function for the next depth level
    return get_subpage_links(l, max_depth, current_depth + 1, write_frequency, csv_filename)

def write_links_to_csv(links_dict, csv_filename):
    """Writes the current state of the links dictionary to a CSV file."""
    with open(csv_filename, "w", newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        for link in links_dict.keys():
            csvwriter.writerow([link])

    print(f"Links saved to {csv_filename}.")


### Getting the links from the sitemaps


In [4]:
# we gather all the links from some pages

# we test with the first page inside the final_links csv

websites = sitemaps[1:]  # exclude the first row
# create dictionary of website
dict_links = { website : "Not-checked" for website in websites }

counter, counter2 = None, 0
csv_filename = "link_data.csv"

while counter != 0:
    counter2 += 1
    dict_links2 = get_subpage_links(dict_links, csv_filename=csv_filename)
    counter = operator.countOf(dict_links2.values(), "Not-checked")  # Number of "Not-checked" links
    
    # Print some statements for debugging
    print("")
    print("THIS IS LOOP ITERATION NUMBER", counter2)
    print("LENGTH OF DICTIONARY WITH LINKS =", len(dict_links2))
    print("NUMBER OF 'Not-checked' LINKS = ", counter)
    print("")
    
    dict_links = dict_links2
    

write_links_to_csv(dict_links, csv_filename)

print("Links saved to link_data.csv.")

# removing any link that points to a sitemap



NameError: name 'sitemaps' is not defined

### Removing the sitemap links from the csv file

In [10]:
links = []

with open('link_data.csv', mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        if '.xml' not in row[0]:
            links.append(row[0])

with open('link_data.csv', mode='w', newline='') as file:
    csv_writer = csv.writer(file)
    for link in links:
        csv_writer.writerow([link])
        
print("Links saved to link_data.csv.")

Links saved to link_data.csv.


# DECIDED TO NOT USE THIS METHOD 
### Creating a new csv file with the links content 

First csv that will contain only raw html data (from h1, p tags etc.)
- URL,Source,title,h1,h2,h3,p,span

Second csv will contain the links will contain a more finallized version of the data
- URL,Source,Product_Name,Description,Price

In [None]:

# tags_to_extract = ['h1', 'p'] # change this in order to get information from different tags
# 
# output_file = 'raw_content.csv'
# 
# def extract_text_from_url(url):
#     html_data = get_data(url) # this is "None" if the link is not accessible
#     if html_data is None:
#         return None
#     soup = BeautifulSoup(html_data, "html.parser")
#     
#     extracted_text = { tag: [] for tag in tags_to_extract }
#     
#     for tag in tags_to_extract:
#         elements = soup.find_all(tag)
#         for element in elements:
#             text = element.get_text(strip=True)
#             if text:
#                 extracted_text[tag].append(text)
#     for tag in extracted_text:
#         extracted_text[tag] = ' ### '.join(extracted_text[tag])
#         
#     return extracted_text
# 
# def process_url(url):
#     extracted_text = extract_text_from_url(url)
#     if extracted_text:
#         row = [url, get_base_url(url)]  # Replace 'Source Name' with your actual source
#         for tag in tags_to_extract:
#             row.append(extracted_text.get(tag, ''))  # Append text for each tag
#         return row
#     return None
# 
# def save_text_to_csv(data):
#     with open(output_file, mode='w', newline='', encoding='utf-8') as file:
#         writer = csv.writer(file)
#         headers = ['URL', 'Source'] + tags_to_extract
#         writer.writerow(headers)
#         for row in data:
#             if row[2] and row[3]:
#                 writer.writerow(row)

### Extracting all relevant text from the links and maintaining its structure

In this code block I will take a different approach and get all relevant data from the page and not altering its ordering (like I did above by separating the tags content apart). If this proves inefficient I will go back to the previous method but this makes more sense logically as long as the text segments are short enough for the model to understand, yet not to small for the model to not be able to understand the context (even though the context is some random text from the page like hyperlink text etc.)

In [39]:
def extract_text_from_url(url):
    html_data = get_data(url)
    if html_data is None:
        return None

    soup = BeautifulSoup(html_data, "html.parser")

    # Remove scripts, styles, and irrelevant content
    for script in soup(["script", "style", "footer", "nav", "header", "noscript"]):
        script.extract()

    # Extract text while keeping the structure
    text_blocks = set()  # Use a set to store unique text blocks
    for element in soup.find_all(['h1', 'h2', 'h3', 'p', 'a', 'li', 'span', 'div']):
        text = element.get_text(separator=" ", strip=True)
        if text:
            text_blocks.add(text)

    # Join the text blocks to maintain structure, using ' ### ' as separator for clarity
    return ' ### '.join(text_blocks)

# Process a single URL and return a row with extracted text
def process_url(url):
    extracted_text = extract_text_from_url(url)
    if extracted_text:
        return [url, get_base_url(url), extracted_text]
    return None

# Save extracted text data into a CSV file
import pandas as pd

def save_text_to_csv(data, output_file='raw_contents2.csv'):
    # Convert the list of rows into a pandas DataFrame
    df = pd.DataFrame(data, columns=['URL', 'Source', 'Content'])
    
    # Ensure that all values are treated as strings (this handles potential large integers)
    df = df.astype(str)
    
    # Drop any rows where the 'Content' column is empty or null
    df = df[df['Content'].notnull() & (df['Content'] != '')]
    
    # Save the DataFrame to a CSV file
    df.to_csv(output_file, index=False, encoding='utf-8')

    print(f"Data saved to {output_file}")


In [12]:
# GETTING THE LINKS FROM 'link_data.csv'

urls = []

with open('link_data.csv', mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        urls.append(row[0])
        
import random
random.shuffle(urls)

print(urls[:1000])


['https://www.antoninimodernliving.com/products/neon-stroke-i', 'https://www.collectioni.com/products/fluffy-bear-pouf-trompette-1', 'https://vincentdesign.com.au/products/heller-max-dinnerware-maxmug', 'https://mossgardenhome.com/products/cardinal-shoes-toss-cushion-18sq', 'https://dkmodernfurniture.com/products/maze-sideboard', 'https://midcenturymasters.com/products/bamboo-twin-bed-frame-from-the-new-york-apartment-of-barbara-streisand', 'https://designkollective.com/stores/mixfurniture/products/146283', 'https://www.tandemarbor.com/products/rivington-sofa-alabaster-crushed-velvet', 'https://www.willowcreekteak.com/products/two-tone-rope-handwoven-indoor-outdoor-rug', 'https://www.loungesplus.com.au/products/orlando-bedside-table', 'https://www.laura-james.co.uk/products/rattan-corner-sofa-set-grey-weave', 'https://designkollective.com/stores/mixfurniture/products/148191', 'https://shophorne.com/products/ginger-table-lamp-plug-and-cord', 'https://eurolivingfurniture.com/products/spi

In [13]:
from collections import defaultdict
extracted_data = []
    

with ThreadPoolExecutor(max_workers=32) as executor:
    future_to_url = {executor.submit(process_url, url): url for url in urls[:5000]}
    for future in tqdm(concurrent.futures.as_completed(future_to_url), total=len(future_to_url), desc="Extracting content from URLs (p, h1, h2 etc.)"): 
        result = future.result()
        if result:
            extracted_data.append(result)
            
print(extracted_data)


Extracting content from URLs (p, h1, h2 etc.):  12%|█▏        | 611/5000 [00:49<04:05, 17.87it/s]

Error fetching https://www.scotmeachamwoodhome.com/products/heather-twill-clay: 404 Client Error: Not Found for url: https://www.scotmeachamwoodhome.com/products/heather-twill-clay


Extracting content from URLs (p, h1, h2 etc.):  40%|███▉      | 1978/5000 [02:41<02:19, 21.63it/s]

Error fetching https://designkollective.com/stores/de-cor-globally-inspired/products/173513: 404 Client Error: Not Found for url: https://designkollective.com/stores/de-cor-globally-inspired/products/173513


Extracting content from URLs (p, h1, h2 etc.):  45%|████▌     | 2251/5000 [03:02<03:41, 12.43it/s]

Error fetching https://www.quaysideinteriors.co.uk/products/rio-ottoman-bed-ruby-headbord: 404 Client Error: Not Found for url: https://www.quaysideinteriors.co.uk/products/rio-ottoman-bed-ruby-headbord


Extracting content from URLs (p, h1, h2 etc.):  53%|█████▎    | 2627/5000 [03:32<03:49, 10.34it/s]

Error fetching https://yoyo.co.nz/products/emerson-floor-rug: 404 Client Error: Not Found for url: https://yoyo.co.nz/products/emerson-floor-rug


Extracting content from URLs (p, h1, h2 etc.):  75%|███████▍  | 3748/5000 [05:01<02:09,  9.71it/s]

Error fetching https://homeresource.com/products/imperial: 404 Client Error: Not Found for url: https://homeresource.com/products/imperial


Extracting content from URLs (p, h1, h2 etc.): 100%|██████████| 5000/5000 [06:42<00:00, 12.43it/s]
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [41]:
# Save the extracted content to a CSV file


save_text_to_csv(extracted_data)
print(f"Extraction completed. Data saved to {'raw_content.csv'}.")

Data saved to raw_contents2.csv
Extraction completed. Data saved to raw_content.csv.


In [45]:
data = []
counter = 0


# Increase the field size limit
csv.field_size_limit(10**7)


with open('raw_contents3.csv', mode='r', newline='', encoding='utf-8') as file: 
    csv_reader = csv.reader(file)
    for row in csv_reader:
        counter += 1
        data.append(row)
        
        


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)




row number 4968 Popular Styles Custom Made Leather Sofas Nordic Recliners ### TV Units ### Daybeds ### Our bestsellers shop Seville  > ### shop tv cabinets > ### Hanging & Egg Chairs ### new > ### Rattan Patio range ### Account ### > Find out more ### Recycled & Sustainable shop Dining Tables > ### Artful storage shop tv cabinets > ### Old Door ### The timber we use is sustainably sourced from a variety of recycled ### Seville Buffet 4 Door - Waterwood Sale price $1,799 ### Chat > Jump on our website chat during business hours. ### Leather Sofas ### Seville Dining Table (220 x 100) Sale price $1,299 Size: 220X100X78 220X100X78 220X100X78 Colour: Waterwood Waterwood Quantity: Decrease quantity Increase quantity Add to cart Need help? Need help? Got a question or product query? Email > click here to get in touch. Chat > Jump on our website chat during business hours. Phone > (07) 3160 7555 Got a question or product query? Email > click here to get in touch. Chat > Jump on our website ch

In [48]:
print(data[203][0], data[203][2])

