# Web scraping the sites that contain site maps

This notebook aims to gather all the data from multiple websites that contain site maps. The target is to get as many product websites from different pages with as many different furniture types as possible.

In [1]:
# importing libraries

import csv 
import operator
import re

import sys

import requests
from tqdm import tqdm 
from bs4 import BeautifulSoup

from concurrent.futures import ThreadPoolExecutor
import concurrent

from urllib.parse import urlparse
from urllib.parse import urljoin

from collections import defaultdict

### Methods for getting the base url and for checking if the website has a sitemap

In [13]:



SITEMAP_PATHS = [
    "sitemap.xml",
    "sitemap_index.xml",
    ".sitemap.xml",
    "sitemap/sitemap.xml",
    "sitemap_index/sitemap.xml"
]

def get_base_url(url):
    try:
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        return base_url
    except Exception as e:
        # print(f"Error parsing URL {url}: {e}")
        return None

def check_sitemap(base_url):
    """
    Check if the given base URL has a sitemap in one of the common paths.
    Returns the URL of the sitemap if found, otherwise None.
    """
    for sitemap_path in SITEMAP_PATHS:
        sitemap_url = urljoin(base_url, sitemap_path)
        try:
            response = requests.head(sitemap_url, timeout=10)
            # Check if the URL exists and returns a successful status code (200)
            if 200 <= response.status_code < 300:
                return sitemap_url
        except requests.RequestException as e:
            # print(f"Error checking {sitemap_url}: {e}")
            continue
    return None

def check_sitemap_concurrently(base_urls):
    """
    Checks sitemaps for a list of base URLs concurrently.
    Returns a list of results with the base URL and sitemap URL.
    """
    results = []
    with ThreadPoolExecutor(max_workers=16) as executor:
        futures = {executor.submit(check_sitemap, base_url): base_url for base_url in base_urls}
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Checking sitemaps"):
            base_url = futures[future]
            sitemap_url = future.result()
            if sitemap_url:
                results.append([base_url, "Sitemap found", sitemap_url])
            else:
                results.append([base_url, "No sitemap found", ""])
    return results

### Filtering out the websites that are not accessible that have a site map

In [None]:
# reading the csv file and storing the links in the links list
links = []
with open('Data/furniture stores pages.csv', mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        try:
            link = get_base_url(row[0])
            links.append(link)
        except Exception as e:
            continue
        

# Check sitemaps concurrently
sitemap_results = check_sitemap_concurrently(links)

# Write the results to the output CSV file
output_csv = "sitemap_results.csv"
with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["URL", "Status", "Sitemap URL"])  # Header row
    csvwriter.writerows(sitemap_results)  # Write all results

print(f"Results saved to {output_csv}.")

### Getting the sitemaps from the csv file

In [3]:
# getting only the sitemaps from the csv file
sitemaps = []
with open('sitemap_results.csv', mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        if row[1] != 'No sitemap found':
            sitemaps.append(row[2])

print(sitemaps[1:], len(sitemaps))

['https://cane-line.co.uk/sitemap.xml', 'https://edenliving.online/sitemap.xml', 'https://dunlin.com.au/sitemap.xml', 'https://vastinterior.com.au/sitemap.xml', 'https://hemisphereliving.com.au/sitemap.xml', 'https://www.factorybuys.com.au/sitemap.xml', 'https://dhfonline.com/sitemap.xml', 'https://www.tandemarbor.com/sitemap.xml', 'https://www.perchfurniture.com/sitemap.xml', 'https://modshop1.com/sitemap.xml', 'https://www.ourfurniturewarehouse.com.au/sitemap.xml', 'https://www.hudsonfurniture.com.au/sitemap.xml', 'https://www.scandesign.com/sitemap.xml', 'https://www.sofamania.com/sitemap.xml', 'https://www.fentonandfenton.com.au/sitemap.xml', 'https://4-chairs.com/sitemap.xml', 'https://www.knoll.com/sitemap.xml', 'https://acmeshelving.com/sitemap.xml', 'https://claytongrayhome.com/sitemap.xml', 'https://www.do-shop.com/sitemap.xml', 'https://premiumpatio.com.au/sitemap.xml', 'https://www.theinside.com/sitemap.xml', 'https://www.kmpfurniture.com/sitemap.xml', 'https://www.jseitz.co

### Methods for getting the links from the sitemaps

In [12]:
def get_data(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return response.text

def is_valid_product_link(url):
    # Exclude common unwanted patterns
    unwanted_patterns = ['.jpg', '.jpeg', '.png', '.gif', '.svg', '.css', '.js', '.ico', 'tel:', 'mailto:', '#']
    for pattern in unwanted_patterns:
        if pattern in url:
            return False
    # Only accept URLs that contain "collections" or "products"
    if '/products/' in url: # or  'collections' in url: - right now I will only focus on the products path
        return True
    return False

def is_desired_site_map_link(url): #  
    # Only accept URLs that contain "sitemap"
    if 'sitemap_products_1.xml' in url: # most website have this in the url that I am looking for
        return True
    return False


dict_href_links = {}

def get_links_from_sitemap(website_link): # modified version from the one in the other notebook
    # Set the base of the URL depending on whether "collections" or "products" is in the link
    website_origin = website_link
    sitemap_index = website_link.find('/sitemap')
    if sitemap_index == -1:
        sitemap_index = website_link.find('/.sitemap')
    if sitemap_index != -1:
        website_origin = website_link[:sitemap_index + 1] 

    html_data = get_data(website_link)
    soup = BeautifulSoup(html_data, "html.parser")
    list_links = []

    for link in soup.find_all("loc"): # this contains the links inside xml files
        link = link.text
        # Filter out invalid links (non-product/collection pages)
        
        if not is_valid_product_link(link) and not is_desired_site_map_link(link): 
            continue
        
        link_to_append = None

        # Handle absolute URLs that start with the origin
        if link.startswith(str(website_origin)):
            link_to_append = link
        
        # Handle relative URLs that start with "/"
        elif link.startswith("/"):
            #print(href)
            link_with_www = website_origin + link[1:]
            #print("adjusted link =", link_with_www)
            link_to_append = link_with_www
        

        
        # If link_to_append is not None, check if it's already in dict_href_links and if it's accessible
        if link_to_append is not None:
            if link_to_append not in dict_href_links: #  and check_website(link_to_append) - I will not check the links here, I will check them after I get all the links
                dict_href_links[link_to_append] = None  # Mark it as seen

                list_links.append(link_to_append)

    # Convert list of links to a dictionary with "Not-checked" as the default value for each
    dict_links = dict.fromkeys(list_links, "Not-checked")
    return dict_links

def get_subpage_links(l, max_depth=3, current_depth=0, write_frequency=500, csv_filename="link_data.csv"):
    processed_links_count = 0
    
    if current_depth >= max_depth:
        return l

    with ThreadPoolExecutor(max_workers=32) as executor:
        futures = {executor.submit(get_links_from_sitemap, link): link for link in l if l[link] == "Not-checked"}
        
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing subpage links"):
            link = futures[future]
            try:
                dict_links_subpages = future.result()
                # print(f"Processed {link} with {len(dict_links_subpages)} subpages.")
            except Exception as e:
                print(f"Error fetching {link}: {e}")
                continue
            l[link] = "Checked"
            l.update(dict_links_subpages)

            processed_links_count += 1
            
            # Write to file every 'write_frequency' processed links
            if processed_links_count >= write_frequency: # this actually writes all the links to the csv file - even the not checked ones but in my case it is sufficient
                write_links_to_csv(l, csv_filename)
                processed_links_count = 0  # Reset the counter

    # Recursively call the function for the next depth level
    return get_subpage_links(l, max_depth, current_depth + 1, write_frequency, csv_filename)

def write_links_to_csv(links_dict, csv_filename):
    """Writes the current state of the links dictionary to a CSV file."""
    with open(csv_filename, "w", newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        for link in links_dict.keys():
            csvwriter.writerow([link])

    print(f"Links saved to {csv_filename}.")


### Getting the links from the sitemaps


In [4]:
# we gather all the links from some pages

# we test with the first page inside the final_links csv

websites = sitemaps[1:]  # exclude the first row
# create dictionary of website
dict_links = { website : "Not-checked" for website in websites }

counter, counter2 = None, 0
csv_filename = "link_data.csv"

while counter != 0:
    counter2 += 1
    dict_links2 = get_subpage_links(dict_links, csv_filename=csv_filename)
    counter = operator.countOf(dict_links2.values(), "Not-checked")  # Number of "Not-checked" links
    
    # Print some statements for debugging
    print("")
    print("THIS IS LOOP ITERATION NUMBER", counter2)
    print("LENGTH OF DICTIONARY WITH LINKS =", len(dict_links2))
    print("NUMBER OF 'Not-checked' LINKS = ", counter)
    print("")
    
    dict_links = dict_links2
    

write_links_to_csv(dict_links, csv_filename)

print("Links saved to link_data.csv.")

# removing any link that points to a sitemap



NameError: name 'sitemaps' is not defined

### Removing the sitemap links from the csv file

In [3]:
links = []

with open('link_data.csv', mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        if '.xml' not in row[0]:
            links.append(row[0])

with open('link_data.csv', mode='w', newline='') as file:
    csv_writer = csv.writer(file)
    for link in links:
        csv_writer.writerow([link])
        
print("Links saved to link_data.csv.")

Links saved to link_data.csv.


# DECIDED TO NOT USE THIS METHOD 
### Creating a new csv file with the links content 

First csv that will contain only raw html data (from h1, p tags etc.)
- URL,Source,title,h1,h2,h3,p,span

Second csv will contain the links will contain a more finallized version of the data
- URL,Source,Product_Name,Description,Price

In [None]:

# tags_to_extract = ['h1', 'p'] # change this in order to get information from different tags
# 
# output_file = 'raw_content.csv'
# 
# def extract_text_from_url(url):
#     html_data = get_data(url) # this is "None" if the link is not accessible
#     if html_data is None:
#         return None
#     soup = BeautifulSoup(html_data, "html.parser")
#     
#     extracted_text = { tag: [] for tag in tags_to_extract }
#     
#     for tag in tags_to_extract:
#         elements = soup.find_all(tag)
#         for element in elements:
#             text = element.get_text(strip=True)
#             if text:
#                 extracted_text[tag].append(text)
#     for tag in extracted_text:
#         extracted_text[tag] = ' ### '.join(extracted_text[tag])
#         
#     return extracted_text
# 
# def process_url(url):
#     extracted_text = extract_text_from_url(url)
#     if extracted_text:
#         row = [url, get_base_url(url)]  # Replace 'Source Name' with your actual source
#         for tag in tags_to_extract:
#             row.append(extracted_text.get(tag, ''))  # Append text for each tag
#         return row
#     return None
# 
# def save_text_to_csv(data):
#     with open(output_file, mode='w', newline='', encoding='utf-8') as file:
#         writer = csv.writer(file)
#         headers = ['URL', 'Source'] + tags_to_extract
#         writer.writerow(headers)
#         for row in data:
#             if row[2] and row[3]:
#                 writer.writerow(row)

### Extracting all relevant text from the links and maintaining its structure

In this code block I will take a different approach and get all relevant data from the page and not altering its ordering (like I did above by separating the tags content apart). If this proves inefficient I will go back to the previous method but this makes more sense logically as long as the text segments are short enough for the model to understand, yet not to small for the model to not be able to understand the context (even though the context is some random text from the page like hyperlink text etc.)

In [4]:
PRODUCT_TITLE_KEYWORDS = ['product', 'item', 'title', 'name', 'description', 'head']


def clean_text(text):
    # Replace multiple spaces and line breaks with a single space
    cleaned_text = re.sub(r'\s+', ' ', text).strip()
    return cleaned_text

def extract_text_from_url(url):
    html_data = get_data(url)
    if html_data is None:
        return None
    
    soup = BeautifulSoup(html_data, "html.parser")

    # Remove scripts, styles, and irrelevant content
    for script in soup(["script", "style", "footer", "nav", "header", "noscript"]):
        script.extract()

    h1_tags = [clean_text(h1.get_text()) for h1 in soup.find_all('h1')]
    
    filtered_h1_tags = []

    for tag in h1_tags:
        if any(keyword in tag.lower() for keyword in PRODUCT_TITLE_KEYWORDS):
            filtered_h1_tags.append(tag)
    
    h1_tags = filtered_h1_tags
    
    page_text = clean_text(soup.get_text(separator=' '))

    # If no h1 tags are found, return None
    if not h1_tags:
        return None

    # Only take the first h1 tag and its position
    h1_tag = h1_tags[0]  # Assuming we only care about the first h1 tag
    if h1_tag in page_text:
        start_idx = page_text.index(h1_tag)
        end_idx = start_idx + len(h1_tag)
        h1_tag_position = (h1_tag, start_idx, end_idx)
        return [h1_tag_position], page_text
    return None

def tokenize_and_label(text, h1_tag_positions, token_window=30):
    tokens = text.split()
    labels = ['O'] * len(tokens)  # Default all tokens to 'O'
    
    # Create character-to-token index mapping
    char_to_token_idx = []
    current_pos = 0
    for token in tokens:
        char_to_token_idx.append(current_pos)
        current_pos += len(token) + 1  # Adding 1 for the space separator
    
    # We only care about the first h1_tag position
    h1_text, start_idx, end_idx = h1_tag_positions[0]

    # Find the token indices corresponding to the h1_tag
    start_token_idx = None
    end_token_idx = None
    for i, char_idx in enumerate(char_to_token_idx):
        if start_token_idx is None and char_idx >= start_idx:
            start_token_idx = i
        if char_idx > end_idx:
            end_token_idx = i
            break

    if start_token_idx is not None:
        # Label the h1 tag tokens
        labels[start_token_idx] = 'B-PRODUCT'
        if end_token_idx is None:  # If the tag is the last part of the text
            end_token_idx = len(tokens)
        for j in range(start_token_idx + 1, end_token_idx):
            labels[j] = 'I-PRODUCT'

        # Trim the tokens to keep a window around the product
        start_window = max(0, start_token_idx - token_window)
        end_window = min(len(tokens), end_token_idx + token_window)
        
        # Trim the tokens and labels to the window
        tokens = tokens[start_window:end_window]
        labels = labels[start_window:end_window]

    return tokens, labels

# Process a single URL and return a row with extracted text
def process_url(url):
    try:
        result = extract_text_from_url(url)
        if result is None:
            return None  # Skip processing if extraction failed

        h1_tag_positions, page_text = result

        if page_text:
            tokens, labels = tokenize_and_label(page_text, h1_tag_positions, token_window=30)
            return [url, get_base_url(url), tokens, labels]
        return None
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        return None

# Save extracted text data into a CSV file
import pandas as pd

def save_text_to_csv(data, output_file='tokenized_data.csv'):
    rows = []
    for url, source, tokens, labels in data:
        rows.append([url, source, tokens, labels])

    # Create a DataFrame and save it to a CSV
    df = pd.DataFrame(rows, columns=['URL', 'Source', 'Tokens', 'Labels'])
    
    # Save DataFrame to a CSV file
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Data saved to {output_file}")


In [19]:
# GETTING THE LINKS FROM 'link_data.csv'

urls = []

with open('link_data.csv', mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        urls.append(row[0])
        
import random
random.shuffle(urls)

print(urls[:1000])


['https://www.ledecostyle.com/products/stunning-pair-of-murano-mushroom-table-lamps-by-vistosi', 'https://www.do-shop.com/products/soho-round-pedestal-table-small', 'https://www.myconcept.com.hk/zh/products/kare-design-shelf-authentico-zick-zack-150', 'https://modshop1.com/products/mulberry-protection-83', 'https://www.stacksfurniture.co.nz/products/umbrapillarcoarrack-black', 'https://yoyo.co.nz/products/io-table-light', 'https://limitedabode.co.uk/products/lemon-vase', 'https://www.skandium.com/en-kr/products/one-step-up-bookcase', 'https://emfurn.com/products/lp-kiana-queen-metal-stainless-steel-headboard', 'https://www.gowfb.com/products/hoof-floor-lamp-in-off-white-finish', 'https://hauslondon.com/products/cache-floor-lamp-by-aurelien-barbry-for-le-klint', 'https://www.thefurnituremegastore.co.uk/products/alaska-triple-sleeper-futon-bunk-bed', 'https://cane-line.co.uk/products/cushion-set-basket-chair', 'https://lexiconhome.com/products/b181534t-1', 'https://classicwithatwist.com.

In [20]:

extracted_data = []
    

with ThreadPoolExecutor(max_workers=32) as executor:
    future_to_url = {executor.submit(process_url, url): url for url in urls[:5000]}
    for future in tqdm(concurrent.futures.as_completed(future_to_url), total=len(future_to_url), desc="Extracting content from URLs (p, h1, h2 etc.)"): 
        result = future.result()
        if result:
            extracted_data.append(result)
            
print(extracted_data)


Extracting content from URLs (p, h1, h2 etc.):   5%|▍         | 247/5000 [00:24<12:37,  6.28it/s]

Error fetching https://modernhomefurniture.com.au/products/noah-coffee-table: 404 Client Error: Not Found for url: https://modernhomefurniture.com.au/products/noah-coffee-table


Extracting content from URLs (p, h1, h2 etc.):   6%|▌         | 293/5000 [00:31<25:29,  3.08it/s]

Error fetching https://modernkomfort.ca/products/white-charcuterie-plank: 404 Client Error: Not Found for url: https://modernkomfort.ca/products/white-charcuterie-plank


Extracting content from URLs (p, h1, h2 etc.):   9%|▉         | 438/5000 [00:42<04:03, 18.72it/s]

Error fetching https://www.somedaydesigns.co.uk/products/rust-rich-velvet-ferm-living: 404 Client Error: Not Found for url: https://www.somedaydesigns.co.uk/products/rust-rich-velvet-ferm-living


Extracting content from URLs (p, h1, h2 etc.):  10%|█         | 515/5000 [00:50<06:46, 11.04it/s]

Error fetching https://thecountryfurniture.com/products/fair-warm-brown-accent-cabinet: 404 Client Error: Not Found for url: https://thecountryfurniture.com/products/fair-warm-brown-accent-cabinet


Extracting content from URLs (p, h1, h2 etc.):  11%|█         | 536/5000 [00:52<11:49,  6.29it/s]

Error fetching https://www.modernfurnituredeals.co.uk/products/small-copper-brushed-elephant-figurine: 404 Client Error: Not Found for url: https://www.modernfurnituredeals.co.uk/products/small-copper-brushed-elephant-figurine


Extracting content from URLs (p, h1, h2 etc.):  18%|█▊        | 890/5000 [01:23<03:36, 18.97it/s]

Error fetching https://designhouse.com/products/gaze-of-the-predator: 404 Client Error: Not Found for url: https://designhouse.com/products/gaze-of-the-predator


Extracting content from URLs (p, h1, h2 etc.):  22%|██▏       | 1080/5000 [01:41<08:03,  8.11it/s]

Error fetching https://designkollective.com/stores/de-cor-globally-inspired/products/175276: 404 Client Error: Not Found for url: https://designkollective.com/stores/de-cor-globally-inspired/products/175276


Extracting content from URLs (p, h1, h2 etc.):  29%|██▉       | 1471/5000 [02:18<03:06, 18.90it/s]

Error fetching https://www.prestige-affairs.com/products/soraya-chair: 404 Client Error: Not Found for url: https://www.prestige-affairs.com/products/soraya-chair


Extracting content from URLs (p, h1, h2 etc.):  32%|███▏      | 1618/5000 [02:30<02:44, 20.58it/s]

Error fetching https://bungalowfurniture.com/products/4-7770-paw-resist-pet-bowl-s-green: 404 Client Error: Not Found for url: https://bungalowfurniture.com/products/4-7770-paw-resist-pet-bowl-s-green


Extracting content from URLs (p, h1, h2 etc.):  34%|███▍      | 1711/5000 [02:35<03:04, 17.81it/s]

Error fetching https://www.shopden-la.com/products/brass-conch-ashtray: 404 Client Error: Not Found for url: https://www.shopden-la.com/products/brass-conch-ashtray


Extracting content from URLs (p, h1, h2 etc.):  35%|███▍      | 1726/5000 [02:36<02:59, 18.28it/s]

Error fetching https://magnolialane.biz/products/camps-bay-covers-white: 404 Client Error: Not Found for url: https://magnolialane.biz/products/camps-bay-covers-white


Extracting content from URLs (p, h1, h2 etc.):  35%|███▍      | 1744/5000 [02:38<03:16, 16.58it/s]

Error fetching https://www.idcmn.com/products/triplex-coffee-table: 404 Client Error: Not Found for url: https://www.idcmn.com/products/triplex-coffee-table


Extracting content from URLs (p, h1, h2 etc.):  42%|████▏     | 2090/5000 [03:03<04:09, 11.64it/s]

Error fetching https://hklivingusa.com/products/cover-for-aluminum-outdoor-sofa: 404 Client Error: Not Found for url: https://hklivingusa.com/products/cover-for-aluminum-outdoor-sofa


Extracting content from URLs (p, h1, h2 etc.):  43%|████▎     | 2134/5000 [03:05<02:38, 18.11it/s]

Error fetching https://www.georgestreet.co.uk/products/turin-crushed-mirror-nest-of-2-tables/: 404 Client Error: Not Found for url: https://www.georgestreet.co.uk/products/turin-crushed-mirror-nest-of-2-tables/


Extracting content from URLs (p, h1, h2 etc.):  44%|████▍     | 2217/5000 [03:11<02:27, 18.87it/s]

Error fetching https://www.scandesign.com/products/sassi-sofa-teal: 404 Client Error: Not Found for url: https://www.scandesign.com/products/sassi-sofa-teal


Extracting content from URLs (p, h1, h2 etc.):  50%|████▉     | 2495/5000 [03:31<02:12, 18.93it/s]

Error fetching https://www.modernfurnituredeals.co.uk/products/3-drawer-bedside-cabinet: 404 Client Error: Not Found for url: https://www.modernfurnituredeals.co.uk/products/3-drawer-bedside-cabinet


Extracting content from URLs (p, h1, h2 etc.):  51%|█████     | 2561/5000 [03:36<02:11, 18.60it/s]

Error fetching https://www.vavoom.com.au/products/terrier-wall-art-gloss-finish-with-gold-metal-frame: 404 Client Error: Not Found for url: https://www.vavoom.com.au/products/terrier-wall-art-gloss-finish-with-gold-metal-frame


Extracting content from URLs (p, h1, h2 etc.):  55%|█████▌    | 2758/5000 [03:50<01:36, 23.18it/s]

Error fetching https://designkollective.com/stores/mixfurniture/products/148089: 404 Client Error: Not Found for url: https://designkollective.com/stores/mixfurniture/products/148089


Extracting content from URLs (p, h1, h2 etc.):  63%|██████▎   | 3171/5000 [04:22<02:22, 12.81it/s]

Error fetching https://thecountryfurniture.com/products/nassau-sofa-table-dark-brown: 404 Client Error: Not Found for url: https://thecountryfurniture.com/products/nassau-sofa-table-dark-brown


Extracting content from URLs (p, h1, h2 etc.):  65%|██████▍   | 3232/5000 [04:26<01:32, 19.07it/s]

Error fetching https://hamptonsstyle.com.au/products/exotique-ceramic-plate: 404 Client Error: Not Found for url: https://hamptonsstyle.com.au/products/exotique-ceramic-plate


Extracting content from URLs (p, h1, h2 etc.):  66%|██████▋   | 3321/5000 [04:32<02:18, 12.10it/s]

Error fetching https://thebanyantree.com.au/products/globewest-normandy-twist-sofa-chair: 404 Client Error: Not Found for url: https://thebanyantree.com.au/products/globewest-normandy-twist-sofa-chair


Extracting content from URLs (p, h1, h2 etc.):  69%|██████▉   | 3447/5000 [04:43<02:29, 10.42it/s]

Error fetching https://thecountryfurniture.com/products/bayflynn-white-black-home-office-desk-clean-lined: 404 Client Error: Not Found for url: https://thecountryfurniture.com/products/bayflynn-white-black-home-office-desk-clean-lined


Extracting content from URLs (p, h1, h2 etc.):  73%|███████▎  | 3639/5000 [05:00<01:46, 12.82it/s]

Error fetching https://brisbanefurniture.com.au/products/factory-second-white-french-mirror: 404 Client Error: Not Found for url: https://brisbanefurniture.com.au/products/factory-second-white-french-mirror


Extracting content from URLs (p, h1, h2 etc.):  75%|███████▌  | 3772/5000 [05:10<01:53, 10.79it/s]

Error fetching https://eurolivingfurniture.com/products/tevo-chest: 404 Client Error: Not Found for url: https://eurolivingfurniture.com/products/tevo-chest


Extracting content from URLs (p, h1, h2 etc.):  80%|████████  | 4012/5000 [05:27<01:03, 15.68it/s]

Error fetching https://www.modernfurnituredeals.co.uk/products/black-tweed-bar-stool-solid-wood: 404 Client Error: Not Found for url: https://www.modernfurnituredeals.co.uk/products/black-tweed-bar-stool-solid-wood


Extracting content from URLs (p, h1, h2 etc.):  84%|████████▍ | 4225/5000 [05:46<02:18,  5.59it/s]

Error fetching https://limitedabode.co.uk/products/flower-names-vase: 404 Client Error: Not Found for url: https://limitedabode.co.uk/products/flower-names-vase


Extracting content from URLs (p, h1, h2 etc.):  87%|████████▋ | 4363/5000 [05:55<00:34, 18.39it/s]

Error fetching https://woodaction.com/products/modern-5-by-8-ft-sl7: 404 Client Error: Not Found for url: https://woodaction.com/products/modern-5-by-8-ft-sl7


Extracting content from URLs (p, h1, h2 etc.):  91%|█████████ | 4535/5000 [06:10<00:44, 10.54it/s]

Error fetching https://foreverfurniture.ca/products/krahn-lime-green-5-piece-deluxe-bistro-set-and-chairs: 404 Client Error: Not Found for url: https://foreverfurniture.ca/products/krahn-lime-green-5-piece-deluxe-bistro-set-and-chairs


Extracting content from URLs (p, h1, h2 etc.):  98%|█████████▊| 4894/5000 [06:38<00:05, 17.98it/s]

Error fetching https://livingbydesign.net.au/products/ecology-samara-goblet-wine-glasses-set-of-4-white: 404 Client Error: Not Found for url: https://livingbydesign.net.au/products/ecology-samara-goblet-wine-glasses-set-of-4-white


Extracting content from URLs (p, h1, h2 etc.):  98%|█████████▊| 4899/5000 [06:39<00:06, 16.11it/s]

Error fetching https://www.simplyhammocks.co.uk/products/clearance-aruba-cayenne-hammock: 404 Client Error: Not Found for url: https://www.simplyhammocks.co.uk/products/clearance-aruba-cayenne-hammock


Extracting content from URLs (p, h1, h2 etc.): 100%|██████████| 5000/5000 [06:47<00:00, 12.27it/s]

Error fetching https://midinmod.com/products/aliana-dining-set-with-ohio-light-grey-chairs: 404 Client Error: Not Found for url: https://midinmod.com/products/aliana-dining-set-with-ohio-light-grey-chairs





In [22]:
# Save the extracted content to a CSV file



save_text_to_csv(extracted_data, output_file='Data/dataset_for_training_1.csv')

Data saved to Data/dataset_for_training_1.csv


In [23]:
data = []
counter = 0


# Increase the field size limit
csv.field_size_limit(10**7)


with open('Data/dataset_for_training_1.csv', mode='r', newline='', encoding='utf-8') as file:  # 46986
    csv_reader = csv.reader(file)
    for row in csv_reader:
        counter += 1
        data.append([row[2]])
        

with open('Data/dataset_for_training_1.csv', mode='w', newline='', encoding='utf-8') as file:
    csv_writer = csv.writer(file)
    for row in data:
        row = row[0].replace('[', '').replace(']', '').replace('\'', '').replace(',', '').replace('\"', '')
        csv_writer.writerow([row])



In [22]:
print(data[200][0], data[200][2])

https://www.ambersfurniture.com/products/dc-rory DC Rory – Amber's Furniture Lock icon Rectangle 1 Rectangle 2 + Rectangle 2 Copy Shape Rectangle 1 Shopify logo Skip to content Search Sale Event Expand menu Collapse menu In Stock & On Sale Decor-rest Sale Dining Expand menu Collapse menu Tables Dining Collections Fine Dining Works Buffets & Sideboards Canadian Made Dining Chairs Bedroom Expand menu Collapse menu Bedroom Collections Magniflex Mattresses Bedroom Sale Living Room Expand menu Collapse menu Sofas Motion Leather Occasional Chairs & Ottomans Sofa Beds Popular Sectionals Decor-rest Gallery Occasional & Office Expand menu Collapse menu Occasional Tables Bookcases Desks Amish Collections Custom Design Inspiration Contact Huge Selection In-Stock & On Sale in our 40,000 SQFT Showroom! Call us at 403-291-3858 or come on in! Close Amber Imports DC Rory Sale $1,749 Regular price $2,499 Default Title - $1,749.00 CAD Quantity Add to Cart The Rory Dining Collection 48" Round Table w/ 4 

In [18]:
print(len(extracted_data))

23


In [11]:
url = 'https://www.ikea.com/us/en/p/brimnes-bed-frame-w-storage-and-headboard-white-luroey-s69216757/'

html_data = extract_text_from_url(url)

print(html_data[0])

print(html_data[1][1027:1091])

[('FRIHETEN Sleeper sectional,3 seat w/storage, Skiftebo dark gray', 1027, 1090)]
FRIHETEN Sleeper sectional,3 seat w/storage, Skiftebo dark gray 


In [21]:
print(len(extracted_data))

125
