# Web scraping for most of the sitemap websites

In [3]:
# importing libraries

import csv 
import operator
import re
import threading
import random

import sys

import requests
from tqdm import tqdm 
from bs4 import BeautifulSoup

from concurrent.futures import ThreadPoolExecutor
import concurrent

from urllib.parse import urlparse
from urllib.parse import urljoin

import spacy # we use this for word similarity

from collections import defaultdict
import time


### Loading the links for scraping

In [4]:
# Load the URLs to scrape: the first column of every row in the input CSV.
# newline='' is the mode the csv module expects for files it reads itself.
with open('../dispersed_link_data.csv', mode='r', newline='', encoding='utf-8') as file:
    # csv.reader yields an empty list for a blank line; skip those rows
    # instead of failing with IndexError on row[0].
    links = [row[0] for row in csv.reader(file) if row]

### Additional methods for storing data

In [5]:
def write_data_to_csv(data_to_write, csv_filename, encoding='utf-8'):
    """Append rows to a CSV file, creating the file if it does not exist.

    Args:
        data_to_write: Iterable of rows, each row being an iterable of cell
            values (for example a list of tuples).
        csv_filename: Path of the CSV file to append to.
        encoding: Text encoding used for the file. Defaults to UTF-8 so that
            non-ASCII text is written the same way on every platform rather
            than depending on the locale's default encoding.
    """
    # Append mode keeps earlier rows; newline='' lets the csv module control
    # line endings itself.
    with open(csv_filename, "a", newline='', encoding=encoding) as csvfile:
        csv.writer(csvfile).writerows(data_to_write)

### Methods that will be used for scraping


In [None]:
def get_base_url(url):
    """Return the ``scheme://host`` prefix of *url*, or None if it has none.

    Args:
        url: The URL string to inspect.

    Returns:
        A string such as ``"https://www.example.com"``, or None when *url* is
        not a string, cannot be parsed, or lacks a scheme or a network
        location (so a meaningless ``"://"`` is never returned).
    """
    if not isinstance(url, str):
        return None
    try:
        parsed_url = urlparse(url)
    except ValueError:
        # urlparse rejects malformed network locations, e.g. an unbalanced
        # IPv6 bracket.
        return None
    if not parsed_url.scheme or not parsed_url.netloc:
        return None
    return f"{parsed_url.scheme}://{parsed_url.netloc}"


def clean_text(text):
    """Normalize whitespace in *text*.

    Every run of whitespace (spaces, tabs, line breaks, ...) becomes a single
    space, and leading/trailing whitespace is removed.
    """
    whitespace_run = re.compile(r'\s+')
    single_spaced = whitespace_run.sub(' ', text)
    return single_spaced.strip()

# Pool of browser User-Agent strings; get_data sends one chosen at random
# with each request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
]

# List of proxy servers (this can be dynamically updated with proxy pools or services)
# Each entry maps a URL scheme to the proxy URL to use for that scheme.
# NOTE(review): nothing in the visible code passes PROXIES to requests.get,
# so this list appears to be unused at the moment — confirm before relying on it.
PROXIES = [
    {"http": "http://178.48.68.61:18080",  "https": "http://178.48.68.61:18080"},    
]

# Function to get page content
def get_data(url, max_retries=3, timeout=3):
    """Fetch *url* and return the raw response body, or None on failure.

    Each attempt sends a User-Agent picked at random from USER_AGENTS.
    An HTTP 429 (rate limited) response is retried after a random pause of
    4-8 seconds, but only up to *max_retries* times, so a host that keeps
    rate-limiting cannot hold a worker forever or exhaust the recursion limit.

    Args:
        url: Address of the page to download.
        max_retries: Maximum number of retries after HTTP 429 responses.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        ``response.content`` (bytes) when the server answers 200; None for
        any other status code, when the retry budget is used up, or when
        ``requests`` raises a ``RequestException``. Failures are reported
        through ``tqdm.write``.
    """
    for attempt in range(max_retries + 1):
        headers = {"User-Agent": random.choice(USER_AGENTS)}  # Rotate user-agent

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            tqdm.write(f"FROM GET_DATA: Error fetching {url}: {e}")
            return None

        if response.status_code == 200:
            return response.content  # Return HTML content if successful

        # Handle rate-limiting (HTTP 429) by pausing and retrying, as long as
        # there are retries left.
        if response.status_code == 429 and attempt < max_retries:
            tqdm.write(f"FROM GET_DATA: Rate limit reached. Sleeping before retrying {url}")
            time.sleep(random.uniform(4, 8))  # Random delay to avoid detection
            continue

        # Any other status, or a 429 with no retries left, is a failure.
        tqdm.write(f"FROM GET_DATA: Failed to retrieve {url}, Status Code: {response.status_code}")
        return None

    # Only reached when max_retries is negative and no attempt was made.
    return None


def has_letters(input_string):
    """Return True if *input_string* contains at least one alphabetic character."""
    for char in input_string:
        if char.isalpha():
            return True
    return False

def extract_data(url):
    """Download *url* and extract the fields stored for each page.

    Args:
        url: Address of the page to scrape.

    Returns:
        None when the page could not be fetched. Otherwise a 4-tuple
        ``(h1_tag_position, title, url_last_path, page_text)``:

        * h1_tag_position: ``(h1_text, start, end)`` for the first non-empty
          ``<h1>``, where start/end are character offsets of that text inside
          page_text; None when there is no such heading or its text does not
          occur in page_text.
        * title: text of the ``<title>`` element, or None if there is none.
        * url_last_path: last non-empty segment of the URL path with ``-``
          replaced by spaces, or None when it contains no letters.
        * page_text: whitespace-normalized text of the page after scripts,
          styles and page chrome have been removed.
    """
    html_data = get_data(url)
    if html_data is None:
        return None

    soup = BeautifulSoup(html_data, "html.parser")

    # Read <title> first: it lives in <head>, which is stripped just below.
    title_tag = soup.find('title')
    title = title_tag.get_text() if title_tag is not None else None

    # Remove scripts, styles, and irrelevant content
    for element in soup(["script", "style", "footer", "nav", "header", "noscript", "head"]):
        element.extract()

    # Use the same separator as for page_text below; otherwise a heading with
    # nested tags ("Foo<span>Bar</span>") yields "FooBar" and can never be
    # located inside the page text ("Foo Bar").
    h1_texts = (clean_text(h1.get_text(separator=' ')) for h1 in soup.find_all('h1'))
    # First heading that actually has text; None if there is no such heading.
    h1_tag = next((text for text in h1_texts if text), None)

    page_text = clean_text(soup.get_text(separator=' '))

    # Last path segment of the URL. Going through urlparse keeps query strings
    # and fragments out of it, and stripping the trailing slash keeps the slug
    # of URLs such as ".../products/footstool/".
    last_segment = urlparse(url).path.rstrip('/').rpartition('/')[2]
    url_last_path = last_segment.replace('-', ' ')
    if not has_letters(url_last_path):
        url_last_path = None

    h1_tag_position = None
    if h1_tag:
        start_idx = page_text.find(h1_tag)
        if start_idx != -1:
            h1_tag_position = (h1_tag, start_idx, start_idx + len(h1_tag))

    return h1_tag_position, title, url_last_path, page_text
    

# Rows collected by the worker threads; process_url appends one
# [link, h1_tag, title, url_last_path, page_text] list per scraped page.
final_data = []
# Limit this run to at most the first 100000 links.
links = links[:100000]  

final_data_lock = threading.Lock()  # A lock to ensure thread-safe appending to the list

# Worker run once per URL by the thread pool
def process_url(link):
    """Scrape *link* and append its row to the shared ``final_data`` list.

    Pages for which ``extract_data`` returns None are skipped. Any exception
    is reported through ``tqdm.write`` rather than raised, so one bad page
    does not stop the remaining work. Always returns None.
    """
    try:
        extracted = extract_data(link)
        if extracted is None:
            tqdm.write(f"FROM:PROCESS_URL: Skipped processing {link} because extract_data returned None")
            return

        h1_info, page_title, last_segment, body_text = extracted
        record = [link, h1_info, page_title, last_segment, body_text]
        # final_data is shared between all worker threads.
        with final_data_lock:
            final_data.append(record)
    except Exception as e:
        tqdm.write(f"FROM:PROCESS_URL: Error processing {link}: {e}")


# Process all links in parallel. `links` is already in memory, so the input
# CSV is not re-opened here (the handle was never read and only stayed open
# for the whole crawl).
# Use ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor(max_workers=32) as executor:  # Adjust max_workers as needed
    # Submit one task per link to the thread pool
    futures = [executor.submit(process_url, link) for link in links]

    # Track progress with tqdm
    for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing URLs"):
        pass  # We don't need the result here since process_url appends directly to final_data

                          


FROM GET_DATA: Failed to retrieve https://www.comfortfurniture.com.sg/category/products/bundle-sale, Status Code: 202
FROM:PROCESS_URL: Skipped processing https://www.comfortfurniture.com.sg/category/products/bundle-sale because extract_data returned None


Processing URLs:   0%|          | 25/100000 [00:01<52:07, 31.96it/s]  

FROM GET_DATA: Failed to retrieve https://magnolialane.biz/products/malawi-tub-chair, Status Code: 404


Processing URLs:   0%|          | 29/100000 [00:02<1:24:21, 19.75it/s]

FROM GET_DATA: Failed to retrieve https://kokocollective.com.au/products/pom-pom-blanket-navy, Status Code: 404


Processing URLs:   0%|          | 29/100000 [00:02<1:24:21, 19.75it/s]

FROM:PROCESS_URL: Skipped processing https://magnolialane.biz/products/malawi-tub-chair because extract_data returned None


Processing URLs:   0%|          | 31/100000 [00:04<4:53:00,  5.69it/s]

FROM:PROCESS_URL: Skipped processing https://kokocollective.com.au/products/pom-pom-blanket-navy because extract_data returned None


Processing URLs:   0%|          | 113/100000 [00:13<5:46:12,  4.81it/s]

FROM GET_DATA: Failed to retrieve https://brownandbeam.com/products/karis-slipcovered-sofa, Status Code: 404


                                                                        

FROM:PROCESS_URL: Skipped processing https://brownandbeam.com/products/karis-slipcovered-sofa because extract_data returned None


Processing URLs:   0%|          | 114/100000 [00:15<12:04:34,  2.30it/s]

FROM GET_DATA: Failed to retrieve https://arkaliving.com/products/5738deb1f85082cb3f27ff9a, Status Code: 404


                                                                        

FROM GET_DATA: Failed to retrieve https://www.stylemyhome.com.au/products/plain-abigail-upholstered-storage-box, Status Code: 404


Processing URLs:   0%|          | 115/100000 [00:16<16:55:24,  1.64it/s]

FROM:PROCESS_URL: Skipped processing https://arkaliving.com/products/5738deb1f85082cb3f27ff9a because extract_data returned None


Processing URLs:   0%|          | 117/100000 [00:17<23:17:25,  1.19it/s]

FROM:PROCESS_URL: Skipped processing https://www.stylemyhome.com.au/products/plain-abigail-upholstered-storage-box because extract_data returned None


Processing URLs:   0%|          | 325/100000 [00:34<3:46:19,  7.34it/s] 

FROM GET_DATA: Failed to retrieve https://www.knoll.com/design-plan/products/by-brand/filzfelt-design, Status Code: 404


                                                                       

FROM GET_DATA: Failed to retrieve https://www.comfortfurniture.com.sg/category/products/childrens-furniture, Status Code: 202


Processing URLs:   0%|          | 327/100000 [00:35<6:13:49,  4.44it/s]

FROM:PROCESS_URL: Skipped processing https://www.knoll.com/design-plan/products/by-brand/filzfelt-design because extract_data returned None


Processing URLs:   0%|          | 328/100000 [00:36<11:53:19,  2.33it/s]

FROM:PROCESS_URL: Skipped processing https://www.comfortfurniture.com.sg/category/products/childrens-furniture because extract_data returned None


Processing URLs:   0%|          | 404/100000 [00:42<2:31:08, 10.98it/s] 

FROM GET_DATA: Failed to retrieve https://thebanyantree.com.au/products/lynettesoftpinkvelvetcushion, Status Code: 404


Processing URLs:   0%|          | 406/100000 [00:43<3:00:05,  9.22it/s]

FROM:PROCESS_URL: Skipped processing https://thebanyantree.com.au/products/lynettesoftpinkvelvetcushion because extract_data returned None


Processing URLs:   0%|          | 438/100000 [00:47<3:11:53,  8.65it/s]

FROM GET_DATA: Failed to retrieve https://arkaliving.com/products/5738df90e321409007766985, Status Code: 404


Processing URLs:   0%|          | 440/100000 [00:48<3:15:39,  8.48it/s]

FROM GET_DATA: Failed to retrieve https://www.stylemyhome.com.au/products/fabric-sample, Status Code: 404


Processing URLs:   0%|          | 440/100000 [00:49<3:15:39,  8.48it/s]

FROM:PROCESS_URL: Skipped processing https://arkaliving.com/products/5738df90e321409007766985 because extract_data returned None


Processing URLs:   0%|          | 443/100000 [00:50<10:32:12,  2.62it/s]

FROM:PROCESS_URL: Skipped processing https://www.stylemyhome.com.au/products/fabric-sample because extract_data returned None


Processing URLs:   1%|          | 628/100000 [01:04<1:37:53, 16.92it/s] 

FROM GET_DATA: Failed to retrieve https://www.comfortfurniture.com.sg/category/products/tv-consoles, Status Code: 202


Processing URLs:   1%|          | 631/100000 [01:05<2:36:10, 10.60it/s]

FROM:PROCESS_URL: Skipped processing https://www.comfortfurniture.com.sg/category/products/tv-consoles because extract_data returned None


Processing URLs:   1%|          | 757/100000 [01:20<3:31:10,  7.83it/s]

FROM GET_DATA: Failed to retrieve https://arkaliving.com/products/5738dccfab48de6e3b5413db, Status Code: 404


                                                                       

FROM GET_DATA: Failed to retrieve https://www.stylemyhome.com.au/products/joshua-oak-glass-cross-leg-coffee-table, Status Code: 404


Processing URLs:   1%|          | 758/100000 [01:22<5:42:45,  4.83it/s]

FROM:PROCESS_URL: Skipped processing https://arkaliving.com/products/5738dccfab48de6e3b5413db because extract_data returned None


Processing URLs:   1%|          | 760/100000 [01:23<14:15:00,  1.93it/s]

FROM:PROCESS_URL: Skipped processing https://www.stylemyhome.com.au/products/joshua-oak-glass-cross-leg-coffee-table because extract_data returned None


Processing URLs:   1%|          | 950/100000 [01:38<3:01:33,  9.09it/s] 

FROM GET_DATA: Failed to retrieve https://www.comfortfurniture.com.sg/category/products/lights-and-bulbs, Status Code: 202


                                                                       

FROM GET_DATA: Failed to retrieve https://magnolialane.biz/products/cliffton-beach-coffee-table, Status Code: 404


Processing URLs:   1%|          | 952/100000 [01:40<4:06:41,  6.69it/s]

FROM:PROCESS_URL: Skipped processing https://www.comfortfurniture.com.sg/category/products/lights-and-bulbs because extract_data returned None


Processing URLs:   1%|          | 954/100000 [01:41<9:23:08,  2.93it/s]

FROM GET_DATA: Failed to retrieve https://dhfonline.com/products/rachele-tray, Status Code: 404


Processing URLs:   1%|          | 954/100000 [01:42<9:23:08,  2.93it/s]

FROM:PROCESS_URL: Skipped processing https://magnolialane.biz/products/cliffton-beach-coffee-table because extract_data returned None


Processing URLs:   1%|          | 955/100000 [01:43<16:17:19,  1.69it/s]

FROM:PROCESS_URL: Skipped processing https://dhfonline.com/products/rachele-tray because extract_data returned None


Processing URLs:   1%|          | 1069/100000 [01:53<3:47:38,  7.24it/s]

FROM GET_DATA: Failed to retrieve https://www.stylemyhome.com.au/products/joshua-oak-glass-cross-leg-side-table, Status Code: 404


Processing URLs:   1%|          | 1071/100000 [01:54<10:52:05,  2.53it/s]

FROM:PROCESS_URL: Skipped processing https://www.stylemyhome.com.au/products/joshua-oak-glass-cross-leg-side-table because extract_data returned None


                                                                         

FROM GET_DATA: Failed to retrieve https://www.comfortfurniture.com.sg/category/products/natural-signature, Status Code: 202


Processing URLs:   1%|          | 1244/100000 [02:09<3:22:02,  8.15it/s]

FROM:PROCESS_URL: Skipped processing https://www.comfortfurniture.com.sg/category/products/natural-signature because extract_data returned None


                                                                        

FROM GET_DATA: Failed to retrieve https://www.georgestreet.co.uk/products/footstool/, Status Code: 404


Processing URLs:   1%|          | 1247/100000 [02:10<5:13:53,  5.24it/s]

FROM:PROCESS_URL: Skipped processing https://www.georgestreet.co.uk/products/footstool/ because extract_data returned None


Processing URLs:   1%|▏         | 1390/100000 [02:26<2:56:03,  9.33it/s]

FROM GET_DATA: Failed to retrieve https://www.stylemyhome.com.au/products/hugo-deluxe-bedhead, Status Code: 404


Processing URLs:   1%|▏         | 1392/100000 [02:27<3:02:20,  9.01it/s]

FROM:PROCESS_URL: Skipped processing https://www.stylemyhome.com.au/products/hugo-deluxe-bedhead because extract_data returned None


Processing URLs:   2%|▏         | 1566/100000 [02:42<1:40:09, 16.38it/s]

FROM GET_DATA: Failed to retrieve https://www.comfortfurniture.com.sg/category/products/matte-black, Status Code: 202


Processing URLs:   2%|▏         | 1570/100000 [02:42<3:50:45,  7.11it/s]

FROM:PROCESS_URL: Skipped processing https://www.comfortfurniture.com.sg/category/products/matte-black because extract_data returned None


Processing URLs:   2%|▏         | 1702/100000 [02:58<2:47:53,  9.76it/s]

FROM GET_DATA: Failed to retrieve https://www.stylemyhome.com.au/products/indigo-coral-i-framed, Status Code: 404


Processing URLs:   2%|▏         | 1706/100000 [02:59<6:02:50,  4.52it/s]

FROM:PROCESS_URL: Skipped processing https://www.stylemyhome.com.au/products/indigo-coral-i-framed because extract_data returned None


                                                                        

FROM GET_DATA: Failed to retrieve https://www.comfortfurniture.com.sg/category/products/banquet-furniture, Status Code: 202


Processing URLs:   2%|▏         | 1880/100000 [03:14<1:40:31, 16.27it/s]

FROM:PROCESS_URL: Skipped processing https://www.comfortfurniture.com.sg/category/products/banquet-furniture because extract_data returned None


Processing URLs:   2%|▏         | 1906/100000 [03:16<2:32:47, 10.70it/s]

In [None]:
# Write everything collected in final_data to one CSV file, header row first.
with open('../data/preprocessed_data_from_all_sitemaps.csv', mode='w', newline='', encoding='utf-8') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(['URL', 'h1', 'title', 'url_last_path', 'page_text'])
    # One output row per scraped page.
    csv_writer.writerows(final_data)

print("Processing completed!")