# Week 2: Exercise Solutions

**Web and Social Network Analytics**

---

This notebook contains complete solutions for all exercises. **Try to solve them yourself first!**

## Setup

In [None]:
# Standard libraries
import os
import time
import pathlib
from urllib.request import urlopen
from urllib.parse import urljoin, urlparse
import pprint as pp

# Web scraping
from bs4 import BeautifulSoup
import requests

# Graph analysis
import networkx as nx
import matplotlib.pyplot as plt

# Data handling
import pandas as pd

print('All libraries imported successfully!')

In [None]:
# Helper function for local files
def create_local_file_address(folder, file):
    """Build a ``file://`` URL for ``folder/file`` under the current working directory.

    Args:
        folder: Directory name, joined onto the current working directory.
        file: File name inside ``folder``.

    Returns:
        The resulting absolute path rendered as a ``file://`` URI string.
    """
    absolute_location = pathlib.Path(os.path.join(os.getcwd(), folder, file))
    return absolute_location.as_uri()

---

## Exercise 1 Solution: Link Extraction Basics

**Task**: Extract all hyperlinks from the `home.html` page in the `demowebsite` folder.

In [None]:
# Solution for Exercise 1
# ========================

# Step 1: Create the file path for home.html
file_path = create_local_file_address("demowebsite", "home.html")
print(f"Opening: {file_path}")

# Step 2: Open and parse the HTML file.
# The `with` block closes the file handle as soon as parsing is done instead of
# leaving it open for the rest of the kernel session.
with urlopen(file_path) as html:
    soup = BeautifulSoup(html, 'html.parser')

# Step 3: Find all anchor (<a>) tags
links = soup.find_all('a')
print(f"\nFound {len(links)} anchor tags")

# Step 4: Extract and store href values in a list
link_urls = []
for link in links:
    href = link.get('href')  # Safer than link['href'] - returns None if missing
    if href:  # Skip anchors whose href is missing or empty
        link_urls.append(href)
        print(f"  Link text: '{link.text}' -> {href}")

# Step 5: Print the results
print(f"\nExtracted {len(link_urls)} links:")
print(link_urls)

### Alternative Solution (using list comprehension)

In [None]:
# More concise version using list comprehension
file_path = create_local_file_address("demowebsite", "home.html")

# Close the file handle once BeautifulSoup has read the document
with urlopen(file_path) as html:
    soup = BeautifulSoup(html, 'html.parser')

# Look up each href exactly once (inner generator), then keep the non-empty ones
link_urls = [href for href in (link.get('href') for link in soup.find_all('a')) if href]

print(f"Found {len(link_urls)} links: {link_urls}")

### Common Mistakes

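1. **Using `link['href']` instead of `link.get('href')`**: `link['href']` raises a `KeyError` when the tag has no `href` attribute; `link.get('href')` returns `None` instead
2. **Forgetting the `file://` scheme**: `urlopen` needs a full URL such as `file:///path/to/home.html`, not a bare file path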
3. **Getting link text instead of href**: `link.text` gives the visible text, not the URL

---

## Exercise 2 Solution: Shingling and Jaccard Similarity

**Task**: Calculate Jaccard similarity between two documents using k=2 word shingles.

In [None]:
# Solution for Exercise 2
# ========================

# Step 1: The two sentences to compare (they share some words but not all)
doc_a, doc_b = "The quick brown fox jumps", "The quick red fox leaps"

# Step 2: Implement the create_shingles function
def create_shingles(text, k=2):
    """
    Create k-word shingles from text.

    The text is lowercased and split on whitespace; every run of k
    consecutive words becomes one shingle (words joined by a single space).

    Args:
        text: Input string
        k: Number of words per shingle (must be at least 1)

    Returns:
        Set of shingles; empty when the text has fewer than k words

    Raises:
        ValueError: If k is smaller than 1
    """
    # Without this check k=0 would return {''} and a negative k would
    # produce meaningless partial slices instead of failing loudly.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Normalize: lowercase and split into words
    words = text.lower().split()

    # One shingle per start position; the range is empty when len(words) < k
    return {' '.join(words[i:i + k]) for i in range(len(words) - k + 1)}

# Step 3: Build the 2-word shingle set of each document
shingles_a, shingles_b = (create_shingles(document, k=2) for document in (doc_a, doc_b))

# Step 4: Implement Jaccard similarity function
def jaccard_similarity(set1, set2):
    """
    Jaccard similarity of two sets: shared elements divided by all elements.

    Returns 0.0 when both sets are empty, so the division is never by zero.
    """
    shared = set1 & set2
    combined = set1 | set2

    # Guard clause for the empty-union case
    if not combined:
        return 0.0

    return len(shared) / len(combined)

# Step 5: Report each document's shingles, their overlap, and the final score.
# Each print emits its lines followed by one blank separator line.
print(f"Document A: '{doc_a}'", f"Shingles A: {shingles_a}", "", sep="\n")
print(f"Document B: '{doc_b}'", f"Shingles B: {shingles_b}", "", sep="\n")

# Computed here as well so the individual counts can be shown next to the ratio
union = shingles_a | shingles_b
intersection = shingles_a & shingles_b

print(f"Intersection: {intersection}", f"  Count: {len(intersection)}", "", sep="\n")
print(f"Union: {union}", f"  Count: {len(union)}", "", sep="\n")

similarity = jaccard_similarity(shingles_a, shingles_b)
print(f"Jaccard Similarity: {len(intersection)}/{len(union)} = {similarity:.4f} ({similarity*100:.1f}%)")

### Bonus: Lecture Example ("I am Zexun")

In [None]:
# Lecture-slide example: the same three words in a different order
doc1, doc2 = "I am Zexun", "Zexun I am"

s1, s2 = create_shingles(doc1, k=2), create_shingles(doc2, k=2)

print(f"'{doc1}' shingles: {s1}")
print(f"'{doc2}' shingles: {s2}")
# Only "i am" is shared out of three distinct shingles, hence 1/3
print(f"\nJaccard Similarity: {jaccard_similarity(s1, s2):.4f}")
print(f"Expected: 1/3 = 0.3333")

---

## Exercise 3 Solution: Building a Complete Web Crawler

**Task**: Implement a crawler that maps all pages in the `demowebsite` folder.

In [None]:
# Solution for Exercise 3
# ========================

def visit_page_and_return_dictionary(page_url):
    """
    Visit a local HTML page and return structured link data.

    Args:
        page_url: Name of the HTML file (e.g., 'home.html')

    Returns:
        Dictionary with 'address' (the page_url that was passed in) and
        'links_to' (list of the non-empty href values of the page's <a>
        tags, in document order; empty if the page could not be processed)
    """
    # Create full file path
    full_path = create_local_file_address("demowebsite", page_url)
    print(f"Visiting: {page_url}")

    try:
        # The context manager closes the file handle even if parsing fails,
        # so a long crawl does not accumulate open files.
        with urlopen(full_path) as html:
            soup = BeautifulSoup(html, 'html.parser')

        # Look up each href once and drop anchors without a usable target
        hrefs = (link.get('href') for link in soup.find_all('a'))
        return {
            'address': page_url,
            'links_to': [href for href in hrefs if href]
        }

    except Exception as e:
        # Best effort: report the problem and treat the page as having no
        # links so one bad page does not abort the whole crawl.
        print(f"  Error: {e}")
        return {
            'address': page_url,
            'links_to': []
        }

# Crawl state
starting_website = "home.html"
pages_visited = []                    # pages already processed
pages_to_visit = [starting_website]   # frontier of pages still to process
pages_info = []                       # one result dictionary per processed page

# Keep crawling until the frontier is empty
while pages_to_visit:
    # pop() takes the most recently added page (last in, first out)
    current_page = pages_to_visit.pop()

    if current_page not in pages_visited:
        # Fetch the page, record its links and remember that it is done
        page_data = visit_page_and_return_dictionary(current_page)
        pages_info.append(page_data)
        pages_visited.append(current_page)

        # Queue only links that are neither processed nor already waiting
        for link in page_data['links_to']:
            if not (link in pages_visited or link in pages_to_visit):
                pages_to_visit.append(link)

# Summary banner
print("\n" + "=" * 50, "CRAWLING COMPLETE!", "=" * 50, sep="\n")
print(f"\nCrawled {len(pages_visited)} pages: {pages_visited}")

In [None]:
# Pretty-print the per-page link lists collected by the crawler
print("Complete link structure:", "-" * 50, sep="\n")
pp.pprint(pages_info)

### Common Mistakes

1. **Not checking if already visited**: Can cause infinite loops
2. **Adding duplicates to frontier**: Check both visited AND to_visit lists
3. **Not handling errors**: Pages might not exist or have issues

---

## Exercise 4 Solution: PageRank Calculation

**Task**: Build a NetworkX graph from crawler data and calculate PageRank.

In [None]:
# Solution for Exercise 4
# ========================

# Step 1: Create a directed graph from crawler data
graph = nx.DiGraph()

for page in pages_info:
    origin = page['address']
    # Register the page itself: add_edge alone would leave out a crawled page
    # that has no outgoing links and that nothing links to (for example a
    # start page without links), which would leave the graph empty and make
    # the highest/lowest lookup below fail.
    graph.add_node(origin)
    for destination in page['links_to']:
        graph.add_edge(origin, destination)

print(f"Graph created with {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges")

# Step 2: Calculate PageRank
pageranks = nx.pagerank(graph, alpha=0.85)  # alpha is damping factor

# Step 3: Print sorted PageRank scores
print("\nPageRank Scores (highest to lowest):")
print("-" * 45)

sorted_pr = sorted(pageranks.items(), key=lambda x: x[1], reverse=True)
for page, score in sorted_pr:
    print(f"{page:35} {score:.4f}")

# Identify highest and lowest (first and last entry of the descending list)
highest = sorted_pr[0]
lowest = sorted_pr[-1]

print("\n" + "-" * 45)
print(f"Highest PageRank: {highest[0]} ({highest[1]:.4f})")
print(f"Lowest PageRank:  {lowest[0]} ({lowest[1]:.4f})")

In [None]:
# Step 4: Draw the link graph, sizing and shading every node by its PageRank
plt.figure(figsize=(14, 10))

# Fixed seed so the layout comes out the same on every run
positions = nx.spring_layout(graph, seed=42, k=2)

# One PageRank value per node, in graph.nodes() order. The raw value drives
# the colour map; the same value scaled by 8000 gives a visible marker size.
colors = [pageranks[node] for node in graph.nodes()]
sizes = [score * 8000 for score in colors]

# Nodes: darker red means higher PageRank
nx.draw_networkx_nodes(
    graph,
    positions,
    node_color=colors,
    node_size=sizes,
    cmap=plt.cm.Reds,
    alpha=0.8,
)

# Edges: arrows show the direction of each hyperlink
nx.draw_networkx_edges(
    graph,
    positions,
    arrows=True,
    arrowsize=15,
    edge_color='gray',
    alpha=0.6,
)

# Page names
nx.draw_networkx_labels(graph, positions, font_size=8)

plt.title("DemoWebsite - Node Size & Color = PageRank", fontsize=14)
plt.axis('off')
plt.tight_layout()
plt.show()

### Analysis

**Which page has the highest PageRank? Why?**

The page with the highest PageRank is typically `home.html` or `shop.html` because:
1. It's linked to by multiple other pages (many incoming links)
2. The pages linking to it also have decent PageRank (quality of links matters)
3. In this demo website, `home.html` serves as the central hub

Pages with low PageRank (like `team.html`) typically:
- Have few incoming links
- Are "dead ends" with no outgoing links (though this affects others' PageRank more than their own)

---

## Exercise 5 Solution: HITS Algorithm Comparison

**Task**: Calculate HITS scores and compare with PageRank.

In [None]:
# Solution for Exercise 5
# ========================

# Step 1: Run HITS; the result unpacks into hub scores and authority scores
hubs, authorities = nx.hits(graph, max_iter=100)

# Step 2: List the hub scores from strongest to weakest
print("Hub Scores (pages that link to many good authorities):", "-" * 50, sep="\n")
sorted_hubs = sorted(hubs.items(), key=lambda item: item[1], reverse=True)
for page, score in sorted_hubs:
    print(f"{page:35} {score:.4f}")

In [None]:
# Step 3: List the authority scores from strongest to weakest
print("Authority Scores (pages linked to by many good hubs):", "-" * 50, sep="\n")
sorted_auth = sorted(authorities.items(), key=lambda item: item[1], reverse=True)
for page, score in sorted_auth:
    print(f"{page:35} {score:.4f}")

In [None]:
# Step 4: One row per page, combining every metric computed so far
comparison_data = [
    {
        'Page': page,
        'PageRank': round(pageranks[page], 4),
        'Hub Score': round(hubs[page], 4),
        'Authority': round(authorities[page], 4),
        'In-Degree': graph.in_degree(page),
        'Out-Degree': graph.out_degree(page)
    }
    for page in graph.nodes()
]

# Highest PageRank first
df = pd.DataFrame(comparison_data).sort_values('PageRank', ascending=False)

print("Comparison Table:", "=" * 80, df.to_string(index=False), sep="\n")

In [None]:
# Pick the top-scoring (page, score) pair for each HITS role
best_hub = max(hubs.items(), key=lambda item: item[1])
best_authority = max(authorities.items(), key=lambda item: item[1])

# Show each winner next to the degree that explains its score
print(
    f"\nBest Hub: {best_hub[0]} (score: {best_hub[1]:.4f})",
    f"  - Out-degree: {graph.out_degree(best_hub[0])} outgoing links",
    sep="\n",
)
print(
    f"\nBest Authority: {best_authority[0]} (score: {best_authority[1]:.4f})",
    f"  - In-degree: {graph.in_degree(best_authority[0])} incoming links",
    sep="\n",
)

### Analysis

**1. Which page is the best hub? Why?**

The best **hub** is typically `home.html` because:
- It has the most outgoing links (highest out-degree)
- A hub's score depends on the authority scores of pages it links to
- Good hubs are "directories" that point to many valuable resources

**2. Which page is the best authority? Why?**

The best **authority** is often `shop.html` or `home.html` because:
- It's linked to by many other pages (high in-degree)
- An authority's score depends on the hub scores of pages linking to it
- Good authorities are "experts" that everyone references

**3. Why might HITS rankings differ from PageRank?**

- **PageRank** assigns ONE score per page based on incoming links and their importance
- **HITS** assigns TWO scores (hub + authority) recognizing different types of importance
- A page can be a great hub (links out a lot) but poor authority (not linked to much)
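- PageRank includes teleportation (the damping factor), so every page keeps a small baseline score; HITS has no such mechanism, so a page with no incoming links gets an authority score of 0 and a dead end with no outgoing links gets a hub score of 0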
- HITS is typically query-dependent in practice (computed for search results)

---

## Bonus Challenge Solution: Remote Website Crawler

**Task**: Adapt the crawler to work with real websites.

In [None]:
# Bonus Solution: Remote Website Crawler
# =======================================

def get_domain(url):
    """Return the network-location part of ``url`` (empty string if it has none)."""
    return urlparse(url).netloc

def clean_url(base_url, link):
    """
    Convert relative URLs to absolute and clean up.

    Args:
        base_url: The page URL where the link was found
        link: The href value (may be relative or absolute)

    Returns:
        Absolute http(s) URL without query string, fragment or trailing
        slash, or None if the link does not lead to a crawlable page
        (missing/empty href, same-page anchor, non-web scheme, no host)
    """
    if not link:
        return None

    # href values sometimes carry stray whitespace around the URL
    link = link.strip()

    # Whitespace-only hrefs and same-page anchors never lead to a new page
    if not link or link.startswith('#'):
        return None

    # Convert relative to absolute
    parsed = urlparse(urljoin(base_url, link))

    # Whitelist rather than blacklist: besides mailto: and javascript: this
    # also rejects tel:, data:, ftp:, ... in any letter case (urlparse
    # lowercases the scheme), plus anything that did not resolve to a host.
    if parsed.scheme not in ('http', 'https') or not parsed.netloc:
        return None

    # Keep only scheme, host and path: query strings and fragments are
    # dropped so variants of the same page compare equal
    clean = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

    # Remove trailing slash for consistency
    return clean.rstrip('/')

def visit_remote_page(url, target_domain):
    """
    Visit a remote URL and return links (filtered to same domain).

    Args:
        url: URL to visit
        target_domain: Only return links within this domain

    Returns:
        Dictionary with 'address' (the url that was passed in) and
        'links_to' (cleaned, de-duplicated same-domain URLs in document
        order; empty if the request or the parsing failed)
    """
    print(f"Visiting: {url}")

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Educational Web Crawler)'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Relative links must be resolved against the URL that actually served
        # the page. clean_url strips trailing slashes, so a directory page is
        # requested as ".../blog" and usually redirected to ".../blog/";
        # resolving "post.html" against the un-redirected URL would land one
        # level too high. Fall back to the requested URL when the redirect
        # left the domain, so the same-domain filter keeps working as before.
        final_url = response.url
        base_url = final_url if final_url and get_domain(final_url) == get_domain(url) else url

        valid_links = []
        seen = set()  # constant-time duplicate check; the list keeps the order

        for link in soup.find_all('a'):
            clean = clean_url(base_url, link.get('href'))

            if clean and clean not in seen and get_domain(clean) == target_domain:
                seen.add(clean)
                valid_links.append(clean)

        return {
            'address': url,
            'links_to': valid_links
        }

    except Exception as e:
        # Best effort: report the failure and treat the page as having no
        # links so a single bad page does not abort the crawl.
        print(f"  Error: {e}")
        return {
            'address': url,
            'links_to': []
        }

def crawl_website(start_url, max_pages=10, delay=1):
    """
    Crawl a website breadth-first, starting from start_url.

    Args:
        start_url: URL to start crawling from
        max_pages: Maximum number of pages to crawl
        delay: Seconds to pause between two consecutive requests
            (politeness towards the server); 0 disables the pause

    Returns:
        List of page info dictionaries, one per visited page, in visiting
        order; empty if start_url cannot be turned into a crawlable URL
    """
    target_domain = get_domain(start_url)
    print(f"Crawling domain: {target_domain}")
    print(f"Max pages: {max_pages}")
    print("=" * 50)

    # Clean start URL
    start_url = clean_url(start_url, start_url)
    if start_url is None:
        # Nothing sensible to request; say so instead of crawling "None"
        print("  Error: start_url is not a crawlable URL")
        return []

    pages_visited = []
    pages_to_visit = [start_url]
    pages_info = []

    while pages_to_visit and len(pages_visited) < max_pages:
        current = pages_to_visit.pop(0)  # FIFO for breadth-first

        if current in pages_visited:
            continue

        page_data = visit_remote_page(current, target_domain)
        pages_info.append(page_data)
        pages_visited.append(current)

        # Add new links to frontier
        for link in page_data['links_to']:
            if link not in pages_visited and link not in pages_to_visit:
                pages_to_visit.append(link)

        # Be polite, but only pause when another request will actually
        # follow; sleeping after the final page just wastes time.
        if delay > 0 and pages_to_visit and len(pages_visited) < max_pages:
            time.sleep(delay)

    print("\n" + "=" * 50)
    print(f"Crawled {len(pages_visited)} pages")

    return pages_info

In [None]:
# Try the crawler on quotes.toscrape.com (a site designed for scraping practice)
results = crawl_website("https://quotes.toscrape.com", max_pages=5)

# One short entry per crawled page: its address and how many links it has
print("\nCrawl Results:")
for page in results:
    print(f"\n{page['address']}", f"  Links to {len(page['links_to'])} pages", sep="\n")

In [None]:
# Turn the crawl results into a graph, rank the pages and draw the result
if results:
    remote_graph = nx.DiGraph()

    for page in results:
        for dest in page['links_to']:
            # Drop the site prefix so node labels stay short; the bare site
            # root would become an empty string, so show it as '/'
            origin_short = page['address'].replace('https://quotes.toscrape.com', '') or '/'
            dest_short = dest.replace('https://quotes.toscrape.com', '') or '/'
            remote_graph.add_edge(origin_short, dest_short)

    # Skip ranking and plotting when the crawl produced no links at all
    if remote_graph.number_of_nodes() > 0:
        pr = nx.pagerank(remote_graph)

        plt.figure(figsize=(12, 8))
        pos = nx.spring_layout(remote_graph, seed=42)
        # Marker size proportional to PageRank
        sizes = [pr[n] * 5000 for n in remote_graph.nodes()]

        nx.draw(
            remote_graph,
            pos,
            node_size=sizes,
            node_color='lightgreen',
            with_labels=True,
            font_size=8,
            arrows=True,
        )
        plt.title("quotes.toscrape.com Link Structure")
        plt.show()

        # Five highest-ranked pages
        print("\nPageRank Scores:")
        for page, score in sorted(pr.items(), key=lambda item: item[1], reverse=True)[:5]:
            print(f"  {page}: {score:.4f}")

---

## Summary

In these exercises, you practiced:

1. **Link Extraction**: Using BeautifulSoup to parse HTML and extract hyperlinks
2. **Shingling**: Breaking text into overlapping pieces for comparison
3. **Jaccard Similarity**: Measuring document similarity using set operations
4. **Web Crawling**: Managing a frontier and avoiding revisits
5. **PageRank**: Measuring page importance based on link structure
6. **HITS**: Understanding hubs vs authorities

These are fundamental concepts for search engines and web analytics!