# Web scraping for most of the sitemap websites
- This notebook picks up the dispersed_link_data.csv file produced by the previous notebook and gathers the important raw data from each link (or from as many links as possible) into a new CSV file, which is passed on to the next notebook for tokenization and labeling.

In [None]:
# importing libraries
import csv 
import operator
import re
import threading
import random
import sys
import requests
from tqdm import tqdm 
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import concurrent
from urllib.parse import urlparse
from urllib.parse import urljoin
from collections import defaultdict
import time


### Loading the links for scraping

In [None]:
# Load the URLs to scrape: the first column of every row in the link CSV.
with open('../dispersed_link_data.csv', mode='r', newline='', encoding='utf-8') as file:
    # csv.reader yields an empty list for a blank line; skip those rows
    # instead of failing with IndexError on row[0]
    links = [row[0] for row in csv.reader(file) if row]

### Additional method for storing data

In [None]:
def write_data_to_csv(data_to_write, csv_filename):
    """Append rows to a CSV file, creating the file if it does not exist.

    Args:
        data_to_write: Iterable of rows; each row is an iterable of cell values.
        csv_filename: Path of the CSV file to append to.
    """
    # explicit UTF-8 so non-ASCII text is written the same way on every
    # platform, matching how the other CSV files in this notebook are opened
    with open(csv_filename, "a", newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows(data_to_write)

### Methods that will be used for scraping
- The same get_data method is used alongside the get_base_url.
- The code goes through the links and extracts the relevant sections (h1, title, url_last_path, page_text) from the HTML content.
- It also strips out the header, footer, and navigation content, as well as scripts, styles, etc., before extracting the page text.

In [None]:
def get_base_url(url):
    """Return the ``scheme://netloc`` prefix of *url*, or None if parsing fails."""
    try:
        parts = urlparse(url)
        scheme, netloc = parts.scheme, parts.netloc
        return f"{scheme}://{netloc}"
    except Exception:
        # best effort: any URL that cannot be parsed simply has no base URL
        return None


def clean_text(text):
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()

# Pool of desktop browser User-Agent strings; get_data picks one at random for every request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
]

def get_data(url, max_retries=3):
    """Fetch *url* and return the raw response body, or None on failure.

    Args:
        url: Address to request.
        max_retries: Number of additional attempts made when the server
            answers HTTP 429 (rate limited) before giving up.

    Returns:
        ``response.content`` for an HTTP 200 answer; None for any other final
        status code or when ``requests`` raises a ``RequestException``.
    """
    # Bounded loop instead of unbounded recursion: a host that keeps answering
    # 429 can no longer stall a worker forever or hit the recursion limit.
    for attempt in range(max_retries + 1):
        # a fresh random user agent per attempt, intended to make repeated
        # scraping runs less likely to get blocked
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            response = requests.get(url, headers=headers, timeout=3)
        except requests.RequestException as e:
            tqdm.write(f"FROM GET_DATA: Error fetching {url}: {e}")
            return None

        # handle rate-limiting (HTTP 429) by pausing and retrying while attempts remain
        if response.status_code == 429 and attempt < max_retries:
            tqdm.write(f"FROM GET_DATA: Rate limit reached. Sleeping before retrying {url}")
            time.sleep(random.uniform(4, 8))  # random delay to avoid detection
            continue

        if response.status_code == 200:
            return response.content  # return content if successful

        # any other status, or a 429 once the retries are used up
        tqdm.write(f"FROM GET_DATA: Failed to retrieve {url}, Status Code: {response.status_code}")
        return None

    # only reached when max_retries is negative and no request was attempted
    return None


def has_letters(input_string):
    """Return True if at least one character of *input_string* is alphabetic."""
    for char in input_string:
        if char.isalpha():
            return True
    return False

def extract_data(url):
    """Download *url* and extract the fields stored for each page.

    Returns None when get_data yields no HTML. Otherwise returns the tuple
    ``(h1_tag_position, title, url_last_path, page_text)`` where:

    - h1_tag_position is ``(h1_text, start, end)`` locating the first <h1>
      inside page_text, or None when there is no <h1>, its text is empty, or
      the text does not occur in page_text;
    - title is the text of the <title> element, or None when there is none;
    - url_last_path is whatever follows the last '/' of the URL with '-' and
      '_' replaced by spaces, or None when it contains no letters;
    - page_text is the whitespace-normalised text left after the unwanted
      elements have been removed.
    """
    raw_html = get_data(url)
    if raw_html is None:
        return None
    soup = BeautifulSoup(raw_html, "html.parser")

    # read <title> first: <head> is among the elements removed just below
    title_element = soup.find('title')
    title = title_element.get_text() if title_element else None

    # drop scripts, styles and other irrelevant elements before reading text
    for unwanted in soup(["script", "style", "footer", "nav", "header", "noscript", "head"]):
        unwanted.extract()

    # the first <h1> is the label for the model; it stays None when the page has none
    h1_elements = soup.find_all('h1')
    h1_tag = clean_text(h1_elements[0].get_text()) if h1_elements else None
    page_text = clean_text(soup.get_text(separator=' '))

    # last path segment of the URL, with separators turned into spaces
    url_last_path = url.rpartition('/')[2].replace('-', ' ').replace('_', ' ')
    if not has_letters(url_last_path):
        # purely numeric/symbolic segments carry no signal for the model
        url_last_path = None

    # locate the h1 text inside the page text as (text, start, end)
    h1_tag_position = None
    if h1_tag and page_text:
        start_idx = page_text.find(h1_tag)
        if start_idx != -1:
            h1_tag_position = (h1_tag, start_idx, start_idx + len(h1_tag))

    return h1_tag_position, title, url_last_path, page_text

In [None]:
# Shared state for the scraping run: the capped list of links, the rows
# collected by the workers, and the lock that guards appends to those rows.
links = links[:100_000]  # cap the number of links to process (more than 100000 took way too long to compute)
final_data = list()
final_data_lock = threading.Lock()  # ensures thread-safe appending to final_data

# function to process a single URL
def process_url(link):
    """Scrape one link and append its row to final_data.

    Nothing is returned: a successful result is stored as
    ``[link, h1, title, url_last_path, page_text]`` under final_data_lock,
    while a skipped link or any exception is only reported through tqdm.write.
    """
    try:
        extracted = extract_data(link)
        if extracted is None:
            tqdm.write(f"FROM:PROCESS_URL: Skipped processing {link} because extract_data returned None")
            return
        h1, page_title, last_path, text = extracted
        with final_data_lock:
            final_data.append([link, h1, page_title, last_path, text])
    except Exception as e:
        tqdm.write(f"FROM:PROCESS_URL: Error processing {link}: {e}")


# read from CSV and process in parallel
# Process the already-loaded links in parallel. The link CSV is not re-opened
# here: nothing was ever read from that handle, so it only kept the file open
# for the whole run.
with ThreadPoolExecutor(max_workers=32) as executor:
    futures = [executor.submit(process_url, link) for link in links]
    # as_completed is consumed only to drive the progress bar; process_url
    # appends its own result to final_data and reports its own errors
    for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing URLs"):
        pass

### Saving the data to a CSV file for the next notebook
- Note that this file gets big very quickly: at 100,000 entries Excel was already struggling to open it (and used up all my RAM), and the scraping took about 2 hours to run.

In [None]:
# Write the header row followed by every collected row to the output CSV.
with open('../data/preprocessed_data_from_all_sitemaps.csv', mode='w', newline='', encoding='utf-8') as file:
    csv_writer = csv.writer(file)
    header = ['URL', 'h1', 'title', 'url_last_path', 'page_text']
    csv_writer.writerow(header)
    csv_writer.writerows(final_data)

print("Processing completed!")