In [5]:
# Import necessary packages
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import pandas as pd

In [6]:
# Entry points from which the crawl starts
seed_urls = [
    "https://www.yonsei.ac.kr/sc/",
    "https://portal.yonsei.ac.kr/portal/MainCtr/index.do",
    "https://library.yonsei.ac.kr/",
    "https://ys.learnus.org/",
]

# Crawl state shared with the later cells
edges = pd.DataFrame(columns=['FromNodeLink', 'ToNodeLink'])  # one row per discovered link
visited_urls = set()                                          # URLs already processed
urls_to_visit = list(seed_urls)                               # work queue; a copy, so seed_urls stays intact

In [7]:
def is_valid_url(url):
    """Decide whether ``url`` is inside the crawl scope.

    A URL is accepted when all of the following hold:

    * it is non-empty;
    * it does not contain "musiclib" anywhere (case-insensitive);
    * it can be parsed and has a hostname;
    * the lower-cased hostname contains "yonsei" or starts with "ys.".

    Parameters
    ----------
    url : str or None
        Absolute URL to test.

    Returns
    -------
    bool
        True if the URL should be crawled / recorded, False otherwise.
        Malformed URLs that ``urlparse`` rejects (it raises ``ValueError``,
        e.g. for an unbalanced "[" in the authority) yield False rather
        than propagating the exception to the caller.
    """
    # Empty string or None
    if not url:
        return False

    # Anything mentioning "musiclib" is excluded from the crawl
    if "musiclib" in url.lower():
        return False

    # urlparse raises ValueError on some malformed authorities; treat as invalid
    try:
        hostname = urlparse(url).hostname
    except ValueError:
        return False

    # Schemes such as mailto: or javascript: have no hostname
    if not hostname:
        return False

    hostname = hostname.lower()

    # Hostname contains "yonsei" or starts with "ys."
    return "yonsei" in hostname or hostname.startswith("ys.")

In [10]:
# Crawl tuning knobs
crawl_delay = 0.025     # Seconds to pause before each request
max_urls = 100_000_000  # Upper bound on visited URLs; lower it for a smaller crawl

# Failure handling
failure_wait_time = 30    # Seconds to wait after a 429 response
consecutive_failures = 0  # Failure counter, starts at zero

In [None]:
# Start crawling
#
# Breadth-first crawl driven by the notebook-level state `urls_to_visit`,
# `visited_urls` and `edges`. Because the state lives in those variables, the
# cell can be interrupted and run again to resume where it stopped.
#
# `visited_urls` holds every URL that has been attempted, including ones that
# failed, so a broken link is fetched once instead of every time another page
# links to it. `max_urls` therefore bounds the number of attempted URLs.

max_429_retries = 5  # Consecutive 429 responses tolerated before a URL is dropped

# URLs that have already been put on the queue; prevents duplicate queue entries
queued_urls = set(urls_to_visit)

# (from, to) pairs found during this run. They are appended to `edges` once,
# in the `finally` block, because concatenating DataFrames on every page
# copies the whole edge table each time (quadratic in the number of edges).
edge_rows = []

try:
    while urls_to_visit and len(visited_urls) < max_urls:
        # Get the next URL (the queue is the plain list created in an earlier cell)
        current_url = urls_to_visit.pop(0)

        # Skip if invalid or already visited
        if not current_url or current_url in visited_urls:
            continue

        # Fetch the page
        print(f"Visiting: {current_url}")
        time.sleep(crawl_delay)  # Politeness delay

        try:
            response = requests.get(current_url, timeout=10)
            status_code = response.status_code

            # Handle 429 Too Many Requests: back off, then retry the same URL
            if status_code == 429:
                consecutive_failures += 1
                print(f"Received 429 Too Many Requests for {current_url}")
                if consecutive_failures > max_429_retries:
                    # Avoid looping forever on a URL that is always throttled
                    print(f"Giving up on {current_url} after {max_429_retries} retries")
                    consecutive_failures = 0
                    visited_urls.add(current_url)
                    continue
                print(f"Waiting for {failure_wait_time} seconds...")
                time.sleep(failure_wait_time)
                # The URL was popped above, so it must be put back to be retried
                urls_to_visit.insert(0, current_url)
                continue

            # Any other response ends the 429 streak and counts as an attempt
            consecutive_failures = 0
            visited_urls.add(current_url)

            # Proceed only if the request was successful
            if status_code != 200:
                print(f"Failed to retrieve {current_url} with status code {status_code}")
                continue

            page_content = response.text
            # After redirects the page lives at response.url; relative links
            # must be resolved against that address, not the requested one
            base_url = response.url or current_url
        except requests.RequestException as e:
            print(f"Failed to read: {current_url}")
            print(f"Error: {e}")
            visited_urls.add(current_url)  # Do not fetch a failing URL again
            continue  # Skip to the next URL

        # Parse the page content (best effort: a page that cannot be parsed is skipped)
        try:
            soup = BeautifulSoup(page_content, 'html.parser')
            links = [a.get('href') for a in soup.find_all('a', href=True)]
        except Exception as e:
            print(f"Error extracting links from: {current_url}")
            print(f"Error message: {e}")
            continue  # Skip to the next URL

        # Proceed only if links were successfully extracted
        if not links:
            continue

        # Standardize URLs
        absolute_links = []
        for link in links:
            try:
                absolute_links.append(urljoin(base_url, link))
            except Exception as e:
                print(f"Error in urljoin with link: {link}")
                continue  # Skip invalid links

        # Keep in-scope URLs, dropping duplicates while preserving page order
        # (dict.fromkeys keeps first-seen order, so the crawl is reproducible)
        valid_links = list(dict.fromkeys(
            link for link in absolute_links if is_valid_url(link)
        ))

        if valid_links:
            # Record one edge per distinct in-scope link on this page
            edge_rows.extend((current_url, link) for link in valid_links)

            # Queue URLs that are neither visited nor already waiting
            for link in valid_links:
                if link not in visited_urls and link not in queued_urls:
                    queued_urls.add(link)
                    urls_to_visit.append(link)
finally:
    # Runs on normal completion and when the cell is interrupted, so the edges
    # gathered so far are not lost
    if edge_rows:
        new_edges = pd.DataFrame(edge_rows, columns=['FromNodeLink', 'ToNodeLink'])
        edges = pd.concat([edges, new_edges], ignore_index=True)


In [12]:
# Assign Node IDs
#
# Every distinct URL appearing in either link column gets an integer ID
# starting at 1. The IDs are derived from the two link columns only and the
# result is rebuilt from scratch, so running this cell more than once gives
# the same four-column `edges` instead of stacking duplicate ID columns.
edge_links = edges[['FromNodeLink', 'ToNodeLink']]

all_urls = pd.unique(edge_links.values.ravel('K'))
url_to_id = pd.DataFrame({
    'URL': all_urls,
    'NodeID': range(1, len(all_urls) + 1)
})

# URL -> NodeID lookup; `all_urls` is unique, so the index is unique as well
node_id_by_url = url_to_id.set_index('URL')['NodeID']

# Row order and the link columns are unchanged; only the ID columns are added
edges = edge_links.assign(
    FromNodeId=edge_links['FromNodeLink'].map(node_id_by_url),
    ToNodeId=edge_links['ToNodeLink'].map(node_id_by_url),
)

In [13]:
# Export the ID-only edge list (no index column in the file)
final_id_edges = edges.loc[:, ['FromNodeId', 'ToNodeId']]
final_id_edges.to_csv(path_or_buf='web_yonsei_72h_id.csv', index=False)



In [14]:
# Export the full edge list: both URLs plus their node IDs (no index column in the file)
final_edges = edges.loc[:, ['FromNodeLink', 'ToNodeLink', 'FromNodeId', 'ToNodeId']]
final_edges.to_csv(path_or_buf='web_yonsei_72h.csv', index=False)