# Generica web scraping notebook

In [None]:
# importing libraries

import csv 
import operator
import re
import threading

import sys

import requests
from tqdm import tqdm 
from bs4 import BeautifulSoup

from concurrent.futures import ThreadPoolExecutor
import concurrent

from urllib.parse import urlparse
from urllib.parse import urljoin

import spacy # we use this for word similarity

from collections import defaultdict
import random
import time

In [None]:
def get_base_url(url):
    try:
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        return base_url
    except Exception as e:
        # print(f"Error parsing URL {url}: {e}")
        return None

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
]    

def get_data(url):
    headers = {"User-Agent": random.choice(USER_AGENTS)}  # Rotate user-agent

    try:
        response = requests.get(url, headers=headers, timeout=3)

        # Handle rate-limiting (HTTP 429) by pausing and retrying
        if response.status_code == 429:
            tqdm.write(f"FROM GET_DATA: Rate limit reached. Sleeping before retrying {url}")
            # print(f"FROM GET_DATA: Rate limit reached. Sleeping before retrying {url}")
            time.sleep(random.uniform(4, 8))  
            return get_data(url)

        if response.status_code == 200:
            return response.content  

        tqdm.write(f"FROM GET_DATA: Failed to retrieve {url}, Status Code: {response.status_code}")
        # print(f"FROM GET_DATA: Failed to retrieve {url}, Status Code: {response.status_code}")
        return None

    except requests.RequestException as e:
        tqdm.write(f"FROM GET_DATA: Error fetching {url}: {e}")
        # print(f"FROM GET_DATA: Error fetching {url}: {e}")
        return None
        


In [18]:
# reading the csv file and storing the links in the links list
links = [] # link
with open('../Data/furniture stores pages.csv', mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        origin = get_base_url(row[0])  # the trimmed url (the url is located in the first column)
        origin = origin + '/robots.txt'
        links.append((origin, row[0]))  
        

link_and_sitemap = []
for link in links:
    data = get_data(link[0])
    if data is not None:
        soup = BeautifulSoup(data, 'html.parser')
        for line in soup.get_text().split('\n'):
            if 'sitemap:' in line.lower():
                line = line[len("sitemap:"):]
                print(link[0], line)
                link_and_sitemap.append((link[0], line))
                break

print(len(link_and_sitemap))


FROM GET_DATA: Error fetching :///robots.txt: No connection adapters were found for ':///robots.txt'
https://www.factorybuys.com.au/robots.txt  https://www.factorybuys.com.au/sitemap.xml
https://dunlin.com.au/robots.txt  https://dunlin.com.au/sitemap.xml
https://themodern.net.au/robots.txt  https://themodern.com.au/sitemap.xml


  soup = BeautifulSoup(data, 'html.parser')


https://hemisphereliving.com.au/robots.txt  https://hemisphereliving.com.au/sitemap_index.xml
FROM GET_DATA: Error fetching https://home-buy.com.au/robots.txt: HTTPSConnectionPool(host='home-buy.com.au', port=443): Max retries exceeded with url: /robots.txt (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'home-buy.com.au'. (_ssl.c:1000)")))
FROM GET_DATA: Error fetching https://beckurbanfurniture.com.au/robots.txt: HTTPSConnectionPool(host='beckurbanfurniture.com.au', port=443): Max retries exceeded with url: /robots.txt (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000204CA4203B0>: Failed to resolve 'beckurbanfurniture.com.au' ([Errno 11001] getaddrinfo failed)"))
https://livingedge.com.au/robots.txt  https://www.livingedge.com.au/sitemap_index.xml
https://edenliving.online/robots.txt  https://edenliving.online/sitemap.xml
https://www.ourfurn

In [19]:
# writing the sitemaps to a csv file
with open('../Data/sitemaps_robots.csv', mode='w', newline='') as file: # If I decide to use this (probably not this late and its only 30 urls better than what I have for training)
    csv_writer = csv.writer(file)
    for link in link_and_sitemap:
        csv_writer.writerow(link)

## Methods that will be used for scraping the sitemaps / the websites themselves