## This notebook focuses on fetching as many links as possible from the websites that do not have a sitemap (and that are still reachable)

In [None]:

import csv # importing the csv module
import operator
import json

import requests
from tqdm import tqdm # we use this to keep track of the progress of a loop
from bs4 import BeautifulSoup

from concurrent.futures import ThreadPoolExecutor
import concurrent
from urllib.parse import urlparse
import random
import time

In [None]:
def get_base_url(url):
    """Return the ``scheme://netloc`` prefix of *url*.

    The URL is split with ``urlparse`` and only the scheme and network
    location are kept. If parsing raises for any reason, ``None`` is
    returned instead of propagating the error.
    """
    try:
        parts = urlparse(url)
        return "{}://{}".format(parts.scheme, parts.netloc)
    except Exception:
        # parsing failed; signal it to the caller with None
        return None
    
# Pool of browser User-Agent strings; the request helpers in this notebook
# pick one at random per request (see the `random.choice(USER_AGENTS)` calls).
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
]    

def check_website(url):
    """Tell whether *url* answers a GET request with a 2xx status code.

    A random entry of ``USER_AGENTS`` is sent as the User-Agent header and
    the request times out after 5 seconds. Any ``requests.RequestException``
    (connection error, timeout, invalid URL, ...) counts as "not accessible".

    Returns:
        True if the status code is in the range 200-299, otherwise False.
    """
    request_headers = {"User-Agent": random.choice(USER_AGENTS)}  # rotate user-agent
    try:
        status = requests.get(url, headers=request_headers, timeout=5).status_code
    except requests.RequestException:
        # the request itself failed, so the site is treated as unreachable
        return False
    return 200 <= status < 300

def check_websites_concurrently(links):
    """Run ``check_website`` on every entry of *links* using a thread pool.

    Up to 30 worker threads are used and a tqdm progress bar tracks the
    completed checks. Results are gathered in completion order, so the
    returned list is not guaranteed to follow the order of *links*.

    Returns:
        A list with the links for which ``check_website`` returned a truthy
        value.
    """
    with ThreadPoolExecutor(max_workers=30) as pool:
        # map each future back to the link it is checking
        link_by_future = {pool.submit(check_website, candidate): candidate for candidate in links}
        finished = tqdm(
            concurrent.futures.as_completed(link_by_future),
            total=len(links),
            desc="Checking links",
        )
        accessible_links = [link_by_future[done] for done in finished if done.result()]
    return accessible_links


In [None]:
# collect the sites whose sitemap lookup result is 'No sitemap found'
no_sitemaps = []
with open('../sitemap_results.csv', mode='r', newline='') as file:
    no_sitemaps.extend(row[0] for row in csv.reader(file) if row[1] == 'No sitemap found')

# the first two matches are discarded
no_sitemaps = no_sitemaps[2:]

# load the original list of store pages
basic_csv = []
with open('../data/furniture stores pages.csv', mode='r', newline='') as file:
    basic_csv.extend(csv.reader(file))

# drop the first row (presumably the header -- confirm against the file)
basic_csv = basic_csv[1:]

print(len(no_sitemaps)) # 320

# replace each entry with the first full URL from basic_csv that contains it
for position, site in enumerate(no_sitemaps):
    for entry in basic_csv:
        full_url = entry[0]
        if site in full_url:
            print(site, full_url)
            no_sitemaps[position] = full_url
            break

print(len(no_sitemaps)) # 320

# trimming all websites until the words collections or products show up in the URL
# NOTE(review): trim_url is not defined in the visible part of this notebook -- confirm it exists
no_sitemaps = [trim_url(url) for url in no_sitemaps]

print(no_sitemaps)
print(len(no_sitemaps)) # 320




In [None]:
# probe every candidate site concurrently and keep only the ones that
# answered with a 2xx status code
final_links = check_websites_concurrently(no_sitemaps)

print(len(final_links)) # 73

In [None]:
# persist the reachable links: a 'URL' header row followed by one link per row

with open('../data/no_sitemap_links.csv', mode='w', newline='') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(['URL'])
    csv_writer.writerows([link] for link in final_links)

In [None]:

# getting each pages links and going through them
# major props to this medium post: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db113

# function to get page content
def get_data(url, max_retries=3):
    """Download *url* and return the raw response body.

    A random entry of ``USER_AGENTS`` is sent as the User-Agent header on
    every attempt and each request times out after 3 seconds.

    When the server answers HTTP 429 (rate limited) the call sleeps for a
    random 4-8 seconds and tries again, at most *max_retries* times. The
    retry is a bounded loop rather than unbounded recursion, so a host that
    keeps rate-limiting cannot stall a worker forever or exhaust the stack.

    Args:
        url: The page to fetch.
        max_retries: How many times a rate-limited request is retried
            before giving up.

    Returns:
        The response content (bytes) when the status code is 200, otherwise
        ``None`` (non-200 status, request error, or retries exhausted).
    """
    for attempt in range(max_retries + 1):
        headers = {"User-Agent": random.choice(USER_AGENTS)}  # Rotate user-agent

        try:
            response = requests.get(url, headers=headers, timeout=3)
        except requests.RequestException as e:
            tqdm.write(f"FROM GET_DATA: Error fetching {url}: {e}")
            return None

        # Handle rate-limiting (HTTP 429) by pausing and retrying
        if response.status_code == 429:
            if attempt == max_retries:
                break  # retry budget spent; report and give up below
            tqdm.write(f"FROM GET_DATA: Rate limit reached. Sleeping before retrying {url}")
            time.sleep(random.uniform(4, 8))  # Random delay to avoid detection
            continue

        if response.status_code == 200:
            return response.content  # Return HTML content if successful

        tqdm.write(f"FROM GET_DATA: Failed to retrieve {url}, Status Code: {response.status_code}")
        return None

    tqdm.write(f"FROM GET_DATA: Still rate limited after {max_retries} retries, giving up on {url}")
    return None

def is_valid_link(url, require_shop_path=False):
    """Decide whether *url* is worth following while crawling.

    A URL is rejected when it contains any of the unwanted substrings
    (image/asset extensions, ``tel:``, ``mailto:`` or a ``#`` fragment).

    The original code also tested for ``/collections/`` or ``/products/``
    but returned True on both paths, so that test had no effect. The
    default behaviour (accept everything that is not unwanted) is kept;
    pass ``require_shop_path=True`` to actually enforce the restriction.

    Args:
        url: The href or URL to test.
        require_shop_path: When True, only URLs containing
            ``/collections/`` or ``/products/`` are accepted.

    Returns:
        True if the link should be kept, False otherwise.
    """
    # exclude common unwanted patterns (plain substring match)
    unwanted_patterns = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.css', '.js', '.ico', 'tel:', 'mailto:', '#')
    if any(pattern in url for pattern in unwanted_patterns):
        return False

    if require_shop_path:
        # only accept URLs that contain "collections" or "products"
        return '/collections/' in url or '/products/' in url
    return True


# Registry of every link get_links has already returned, shared by all calls
# so that the same link is handed out only once across the whole crawl.
dict_href_links = {}

def get_links(website_link):
    """Fetch *website_link* and return the new same-site links found on it.

    The page is downloaded with ``get_data`` and every ``<a href>`` that
    passes ``is_valid_link`` is considered:

    * absolute hrefs are kept when they start with the page's origin (the
      part of *website_link* up to and including the slash that precedes
      ``collections/``, or the whole link when there is no such segment);
    * root-relative hrefs (``/path``) are resolved against the
      ``scheme://host`` of *website_link*;
    * protocol-relative hrefs (``//host/path``) are kept only when they
      point at the same host.

    Links already present in the module-level ``dict_href_links`` are
    skipped; new ones are recorded there.

    Returns:
        A dict mapping each newly discovered link to ``"Not-checked"``.
        An empty dict is returned when the page could not be downloaded.
    """
    html_data = get_data(website_link)
    if html_data is None:
        # download failed: there is nothing to parse (BeautifulSoup raises on None)
        return {}

    # set the base of the URL depending on whether "collections" is in the link
    website_origin = website_link
    collections_index = website_link.find('/collections/')
    if collections_index != -1:
        website_origin = website_link[:collections_index + 1]

    # root-relative hrefs are always relative to scheme://host, regardless of
    # how deep the current page is
    page_url = urlparse(website_link)
    site_root = f"{page_url.scheme}://{page_url.netloc}"

    soup = BeautifulSoup(html_data, "html.parser")
    list_links = []

    for link in soup.find_all("a", href=True):
        href = link["href"]

        # filter out invalid links (assets, mailto:, tel:, fragments)
        if not is_valid_link(href):
            continue

        link_to_append = None

        # handle absolute URLs that start with the origin
        if href.startswith(str(website_origin)):
            link_to_append = href

        # protocol-relative URLs: keep only those that stay on the same host
        elif href.startswith("//"):
            candidate = f"{page_url.scheme}:{href}"
            if urlparse(candidate).netloc == page_url.netloc:
                link_to_append = candidate

        # handle relative URLs that start with "/"
        elif href.startswith("/"):
            link_to_append = site_root + href

        # links are not checked for accessibility here; that happens after all links are gathered
        if link_to_append is not None and link_to_append not in dict_href_links:
            dict_href_links[link_to_append] = None  # mark it as seen
            list_links.append(link_to_append)

    # convert list of links to a dictionary with "Not-checked" as the default value for each
    dict_links = dict.fromkeys(list_links, "Not-checked")
    return dict_links

def get_subpage_links(l, max_depth=3, current_depth=0, write_frequency=500, csv_filename="no_sitemap_data.csv"):
    """Crawl the "Not-checked" links in *l* breadth-first, level by level.

    For each level, every link whose status is ``"Not-checked"`` is passed to
    ``get_links`` on a pool of 32 threads. A processed link is marked
    ``"Checked"`` and the links it yielded are added to *l*. Links that are
    already present keep their current status, so a finished link is never
    reset to ``"Not-checked"``. A link whose worker raised is marked
    ``"Failed"`` so that it is not retried on every level and cannot keep a
    caller that waits for zero ``"Not-checked"`` entries looping forever.

    Args:
        l: Dict mapping link -> status string; it is updated in place.
        max_depth: Level at which crawling stops.
        current_depth: Level to start from (levels ``current_depth`` up to
            ``max_depth - 1`` are processed).
        write_frequency: After this many successfully processed links the
            whole dict is written out with ``write_links_to_csv``. The
            counter restarts at every level.
        csv_filename: Destination passed to ``write_links_to_csv``.

    Returns:
        The same dict *l*, after crawling.
    """
    for _ in range(current_depth, max_depth):
        pending = [link for link, status in l.items() if status == "Not-checked"]
        if not pending:
            break  # nothing left to crawl, deeper levels would be no-ops

        processed_links_count = 0

        with ThreadPoolExecutor(max_workers=32) as executor:
            futures = {executor.submit(get_links, link): link for link in pending}

            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing subpage links"):
                link = futures[future]
                try:
                    dict_links_subpages = future.result()
                except Exception as e:
                    print(f"Error fetching {link}: {e}")
                    l[link] = "Failed"  # do not leave it "Not-checked" forever
                    continue

                l[link] = "Checked"
                # only add links that are new; never overwrite an existing status
                for new_link, status in dict_links_subpages.items():
                    l.setdefault(new_link, status)

                processed_links_count += 1

                # write to file every 'write_frequency' processed links
                # (this writes every link in the dict, including the not yet checked ones)
                if processed_links_count >= write_frequency:
                    write_links_to_csv(l, csv_filename)
                    processed_links_count = 0  # reset the counter

    return l

def write_links_to_csv(links_dict, csv_filename):
    """Dump every key of *links_dict* to *csv_filename*, one link per row.

    The file is overwritten on each call and no header row is written.
    A confirmation line is printed once the file has been written.
    """
    single_column_rows = ([url] for url in links_dict.keys())
    with open(csv_filename, "w", newline='') as out_file:
        csv.writer(out_file).writerows(single_column_rows)

    print(f"Links saved to {csv_filename}.")

In [None]:
# we gather all the links from some pages

# the seed websites are the reachable links saved in no_sitemap_links.csv

websites = []
with open('../data/no_sitemap_links.csv', mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        # skip blank rows and the 'URL' header row so they are not crawled as websites
        if not row or row[0] == 'URL':
            continue
        websites.append(row[0])

# create dictionary of websites
dict_links = dict.fromkeys(websites, "Not-checked")

counter, counter2 = None, 0
previous_state = None  # (number of links, number of "Not-checked" links) after the previous pass

while counter != 0:
    counter2 += 1
    dict_links2 = get_subpage_links(dict_links)
    counter = operator.countOf(dict_links2.values(), "Not-checked")  # number of "Not-checked" links

    # print some statements for debugging
    print("")
    print("THIS IS LOOP ITERATION NUMBER", counter2)
    print("LENGTH OF DICTIONARY WITH LINKS =", len(dict_links2))
    print("NUMBER OF 'Not-checked' LINKS = ", counter)
    print("")

    dict_links = dict_links2

    # if a whole pass neither found new links nor finished any pending one,
    # the remaining links keep failing; stop instead of looping forever
    current_state = (len(dict_links), counter)
    if current_state == previous_state:
        print("No progress since the previous iteration, stopping.")
        break
    previous_state = current_state

# write only the links to a CSV file after the loop completes
# (the file name was an empty string, which makes open() fail; the message
# printed below names link_data.csv as the intended target)
with open("link_data.csv", "w", newline='') as csvfile:
    csvwriter = csv.writer(csvfile)

    # write each link as a new row in the CSV file
    csvwriter.writerows([link] for link in dict_links.keys())

print("Links saved to link_data.csv.")

Processing subpage links:   6%|▌         | 7219/130497 [09:44<3:51:19,  8.88it/s]

FROM GET_DATA: Error fetching https://themodern.net.au/collections/weave-by-warwick/products/weave-ava-cushion-50-x-50cm: HTTPSConnectionPool(host='themodern.net.au', port=443): Read timed out. (read timeout=3)
Error fetching https://themodern.net.au/collections/weave-by-warwick/products/weave-ava-cushion-50-x-50cm: object of type 'NoneType' has no len()


Processing subpage links:   6%|▌         | 7285/130497 [09:49<2:46:11, 12.36it/s]
