In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse
import time
import random

# HTTP request headers passed to requests.get() by scrape_urls_from_domain.
# The User-Agent value is a desktop Chrome-on-Windows browser string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def _is_within_domain(hostname, domain):
    """Return True when *hostname* is *domain* itself or one of its subdomains."""
    if not hostname:
        return False
    hostname = hostname.lower().rstrip('.')
    domain = domain.lower().rstrip('.')
    # Require a label boundary so "evilgoogle.com" does not match "google.com".
    return hostname == domain or hostname.endswith('.' + domain)


def scrape_urls_from_domain(domain, max_urls=10000, retries=3):
    """Collect same-site links found on a domain's home page.

    The home page is requested over HTTPS first and, if every attempt fails,
    over HTTP. Only that single page is fetched; its ``<a href>`` targets are
    resolved to absolute URLs and kept when they use http/https and point at
    ``domain`` or one of its subdomains.

    Args:
        domain: Bare host name, e.g. ``"example.com"`` (no scheme).
        max_urls: Upper bound on the number of URLs collected.
        retries: Number of request attempts per scheme.

    Returns:
        A list of unique absolute URLs (unordered). Empty when every attempt
        for both schemes failed or no matching links were found.
    """
    urls = set()
    base_urls = [f"https://{domain}", f"http://{domain}"]

    for base_url in base_urls:
        for attempt in range(retries):
            try:
                response = requests.get(base_url, headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                for link in soup.find_all('a', href=True):
                    # Check the cap before adding so it is never exceeded,
                    # including for max_urls <= 0.
                    if len(urls) >= max_urls:
                        break
                    try:
                        absolute_url = urljoin(base_url, link['href'])
                        parsed_url = urlparse(absolute_url)
                    except ValueError:
                        # Malformed href (e.g. unbalanced IPv6 brackets):
                        # skip this link instead of aborting the scrape.
                        continue
                    # Compare on hostname (no port/userinfo, lower-cased)
                    # rather than the raw netloc.
                    if (parsed_url.scheme in ('http', 'https')
                            and _is_within_domain(parsed_url.hostname, domain)):
                        urls.add(absolute_url)
                print(f"Scraped {len(urls)} URLs from {domain}")
                # First successful fetch wins; the other scheme is not tried.
                return list(urls)
            except requests.RequestException as e:
                print(f"Attempt {attempt + 1} failed for {base_url}: {e}")
                if attempt == retries - 1:
                    print(f"Failed to scrape {domain} after {retries} attempts.")
                # Random back-off before the next attempt / next scheme.
                time.sleep(random.uniform(1, 3))
    return list(urls)

# --- Driver: build a CSV of "legitimate" URLs from the top-ranked domains ---

# The ranking file has no header row, so column names are supplied here.
alexa_file = './data/top-1m.csv'
df = pd.read_csv(alexa_file, names=['rank', 'domain'])

# Keep only the first 100 domains of the ranking.
top_domains = df['domain'].iloc[:100].tolist()

# Scrape each domain in turn, pausing a random 1-3 s between domains.
legitimate_urls = []
for domain in top_domains:
    print(f"Scraping {domain}...")
    urls = scrape_urls_from_domain(domain, max_urls=1000)
    legitimate_urls += urls
    time.sleep(random.uniform(1, 3))

# Persist everything that was collected as a single-column CSV.
output_df = pd.DataFrame(data=legitimate_urls, columns=['url'])
output_df.to_csv('./data/legitimate_urls.csv', index=False)
print(f"Collected {len(legitimate_urls)} legitimate URLs. Saved to ./data/legitimate_urls.csv")

Scraping google.com...
Scraped 23 URLs from google.com
Scraping microsoft.com...
Scraped 94 URLs from microsoft.com
Scraping mail.ru...
Attempt 1 failed for https://mail.ru: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Attempt 2 failed for https://mail.ru: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Attempt 3 failed for https://mail.ru: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Failed to scrape mail.ru after 3 attempts.
Attempt 1 failed for http://mail.ru: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Attempt 2 failed for http://mail.ru: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forci