# Web scraping and data preprocessing: tests and code attempts

In [1]:

import concurrent
import csv # importing the csv module
import json
import operator
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm # we use this to keep track of the progress of a loop


In [14]:



def check_website(url):
    """Return True when a GET request to *url* answers with a 2xx status code.

    The request is given up after 5 seconds. Any ``requests.RequestException``
    (e.g. domain error, timeout) is treated as "not accessible" and yields
    False, as does any status code outside the 200-299 range.
    """
    try:
        status = requests.get(url, timeout=5).status_code
    except requests.RequestException:
        # the request itself failed, so the site counts as not accessible
        return False
    return 200 <= status < 300

def check_websites_concurrently(links):
    """Return the entries of *links* for which ``check_website`` is truthy.

    The checks run on a pool of 30 worker threads. Results are gathered as
    the futures complete, so the returned list is in completion order rather
    than input order.
    """
    with ThreadPoolExecutor(max_workers=30) as pool:
        # map every future back to the link it was submitted for
        link_by_future = {pool.submit(check_website, candidate): candidate for candidate in links}
        progress = tqdm(
            concurrent.futures.as_completed(link_by_future),
            total=len(links),
            desc="Checking links",
        )
        reachable = [link_by_future[done] for done in progress if done.result()]
    return reachable



def trim_url(url):
    """Cut *url* down to its ``/collections/`` root.

    * If ``/collections/`` occurs in the URL, everything after its first
      occurrence is dropped.
    * Otherwise, if ``/products/`` occurs, the URL is cut just before its
      first occurrence and ``/collections/`` is appended instead (the
      collections page is used as the entry point for listing products).
    * Otherwise the URL is returned unchanged.
    """
    collections_marker = '/collections/'

    head, found, _ = url.partition(collections_marker)
    if found:
        return head + found

    head, found, _ = url.partition('/products/')
    if found:
        # swap the products path for the collections path
        return head + collections_marker

    return url
    


In [None]:
# Read the store URLs from the first column of the CSV, trim each one to its
# "/collections/" root and collect them in `links`.
links = []
with open('Data/furniture stores pages.csv', mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        # csv.reader yields [] for blank lines; row[0] would raise IndexError
        if not row or not row[0].strip():
            continue
        trimmed_url = trim_url(row[0].strip())  # the url is located in the first column
        links.append(trimmed_url)

# Trimming maps many pages of the same store to the same URL; drop the
# duplicates (keeping first-seen order) so each site is checked and written once.
links = list(dict.fromkeys(links))

# we delete links that are not accessible
final_links = check_websites_concurrently(links)

# write the accessible links to a new csv file, one link per row
with open('final_links.csv', mode='w', newline='') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerows([link] for link in final_links)

print(final_links, len(final_links))

In [2]:

# getting each page's links and going through them
# major props to this medium post: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db113

def get_data(url, timeout=10):
    """Fetch *url* with a GET request and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would block a
            crawler worker thread on a single stalled server.

    Returns:
        The decoded response body (``response.text``). The HTTP status code
        is not inspected, so error pages are returned as well.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    return response.text

def is_valid_product_link(url):
    """Tell whether *url* looks like a collection or product page link.

    A link is rejected when it contains any of the unwanted fragments below
    (image/asset extensions, ``tel:``/``mailto:`` schemes, ``#`` anchors);
    the check is a plain substring test anywhere in the URL. Of the remaining
    links only those containing ``/collections/`` or ``/products/`` pass.
    """
    blocked_fragments = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.css', '.js', '.ico', 'tel:', 'mailto:', '#')
    if any(fragment in url for fragment in blocked_fragments):
        return False
    return '/collections/' in url or '/products/' in url


# Module-level registry of every link get_links() has already returned.
# It is shared by all calls so the same link is never reported twice.
dict_href_links = {}

def get_links(website_link):
    """Collect the not-yet-seen collection/product links found on one page.

    The page at *website_link* is downloaded with ``get_data`` and every
    ``<a href=...>`` that passes ``is_valid_product_link`` is considered.
    Links are kept only when they belong to the same site, i.e. they are
    either absolute URLs starting with the site origin or hrefs starting
    with ``/`` that resolve to the site origin.

    The site origin is ``scheme://host/`` of *website_link*. Deriving it from
    the parsed URL (instead of cutting at ``/collections/``) keeps relative
    links correct when the crawled page is a ``/products/...`` page, where
    the old logic glued the href onto the full page URL.

    Args:
        website_link: Absolute URL of the page to scan.

    Returns:
        dict mapping each newly discovered link to ``"Not-checked"``. Links
        already present in ``dict_href_links`` are omitted; new ones are
        recorded there as a side effect.
    """
    parts = urlsplit(website_link)
    website_origin = f"{parts.scheme}://{parts.netloc}/"

    html_data = get_data(website_link)
    soup = BeautifulSoup(html_data, "html.parser")
    list_links = []

    for link in soup.find_all("a", href=True):
        href = link["href"]

        # Filter out invalid links (non-product/collection pages)
        if not is_valid_product_link(href):
            continue

        if href.startswith(website_origin):
            # Absolute URL on the same site: keep as is
            link_to_append = href
        elif href.startswith("/"):
            # Root-relative ("/path") or protocol-relative ("//host/path") URL
            link_to_append = urljoin(website_origin, href)
        else:
            continue

        # A protocol-relative href may point at another host; skip those
        if not link_to_append.startswith(website_origin):
            continue

        # Links are not checked for accessibility here; that happens after
        # all the links have been gathered
        if link_to_append not in dict_href_links:
            dict_href_links[link_to_append] = None  # Mark it as seen
            list_links.append(link_to_append)

    # Convert list of links to a dictionary with "Not-checked" as the default value for each
    dict_links = dict.fromkeys(list_links, "Not-checked")
    return dict_links

def get_subpage_links(l, max_depth=3, current_depth=0):
    """Crawl the "Not-checked" links in *l*, level by level, and grow *l*.

    For each level between *current_depth* and *max_depth*, every link whose
    value is ``"Not-checked"`` is passed to ``get_links`` on a pool of 16
    threads. A link that was fetched is marked ``"Checked"`` and the links it
    yielded are added to *l*.

    Args:
        l: dict mapping link -> status string; modified in place.
        max_depth: Level at which crawling stops.
        current_depth: Level to start from (kept for backward compatibility
            with the former recursive implementation).

    Returns:
        The same dict object *l*, updated.

    Notes:
        * A link whose fetch raised is marked ``"Failed"`` instead of being
          left ``"Not-checked"``; otherwise it would be retried on every
          level and a caller looping until no ``"Not-checked"`` entries
          remain would never finish.
        * Discovered links are added only when absent, so an entry that is
          already ``"Checked"`` is never reset to ``"Not-checked"``.
    """
    for _ in range(current_depth, max_depth):
        pending = [link for link, status in l.items() if status == "Not-checked"]
        if not pending:
            # nothing left to crawl; deeper levels would be no-ops
            break

        with ThreadPoolExecutor(max_workers=16) as executor:
            futures = {executor.submit(get_links, link): link for link in pending}

            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing subpage links"):
                link = futures[future]
                try:
                    dict_links_subpages = future.result()
                except Exception as e:
                    print(f"Error fetching {link}: {e}")
                    l[link] = "Failed"
                    continue
                l[link] = "Checked"
                for new_link, status in dict_links_subpages.items():
                    l.setdefault(new_link, status)

    return l

# old version of the function
# def get_subpage_links(l, max_depth=3, current_depth=0):
#     if current_depth >= max_depth:
#         return l
# 
#     for link in tqdm(l):
#         if l[link] == "Not-checked":
#             dict_links_subpages = get_links(link) 
#             l[link] = "Checked"
#         else:
#             dict_links_subpages = {}
# 
#         l = {**dict_links_subpages, **l}
#     
#     return get_subpage_links(l, max_depth, current_depth + 1)


In [None]:
# Gather all the collection/product links reachable from one store page and
# save them to a CSV file.

# we test with the first page inside the final_links csv
website = "https://www.factorybuys.com.au/collections/"
# create dictionary of website: every link maps to its crawl status
dict_links = {website: "Not-checked"}

# single source of truth for the output file, used for writing and reporting
output_path = "link_dat.csv"

counter, counter2 = None, 0
previous_state = None  # (number of links, number of "Not-checked") after the last pass

while counter != 0:
    counter2 += 1
    dict_links2 = get_subpage_links(dict_links)
    counter = operator.countOf(dict_links2.values(), "Not-checked")  # Number of "Not-checked" links

    # Print some statements for debugging
    print("")
    print("THIS IS LOOP ITERATION NUMBER", counter2)
    print("LENGTH OF DICTIONARY WITH LINKS =", len(dict_links2))
    print("NUMBER OF 'Not-checked' LINKS = ", counter)
    print("")

    dict_links = dict_links2

    # If a whole pass neither found new links nor checked any pending one,
    # the remaining links keep failing; stop instead of looping forever.
    current_state = (len(dict_links), counter)
    if current_state == previous_state:
        print("No progress in the last pass; stopping with", counter, "'Not-checked' links left.")
        break
    previous_state = current_state

# Write only the links to a CSV file after the loop completes
with open(output_path, "w", newline='') as csvfile:
    csvwriter = csv.writer(csvfile)

    # Write each link as a new row in the CSV file
    csvwriter.writerows([link] for link in dict_links)

print(f"Links saved to {output_path}.")

Processing subpage links: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
Processing subpage links:   7%|▋         | 40/579 [00:44<10:05,  1.12s/it]
