# Week 1: Exercise Solutions

**Web and Social Network Analytics**

---

This notebook contains complete solutions for all exercises. Try to solve each exercise yourself before looking at these solutions!

**Disclaimer**: This educational content is provided for instructional purposes only. Always respect website terms of service and legal requirements when scraping.

---

## Setup

In [None]:
# Standard libraries
import os
import time

# Web scraping
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

# Data handling
import pandas as pd

print('Libraries imported successfully!')

---

# Exercise 1 Solution: BeautifulSoup Basics

**Task**: Extract the SCQF Level from a DRPS course page.

In [None]:
# Solution for Exercise 1

# Step 1: Fetch the page.
# The context manager closes the HTTP connection as soon as the body has
# been read, even if the download fails part-way through.
url = 'http://www.drps.ed.ac.uk/24-25/dpt/cxcmse11427.htm'
with urlopen(url) as response:
    html = response.read()

# Step 2: Parse with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Step 3: Find the table containing course information
table = soup.find('table', {'class': 'sitstablegrid'})

# Step 4: Search for the cell containing 'SCQF Level'
if table:
    for cell in table.find_all('td'):
        if 'SCQF Level' in cell.text:
            print('Found:', cell.text.strip())
            break
    else:
        # for/else: this branch runs only when the loop ended without `break`,
        # i.e. the table exists but no cell mentions the SCQF Level.
        print('SCQF Level not found in table')
else:
    print('Table not found')

### Alternative Solution: More Specific Search

In [None]:
# Alternative: Find both SCQF Level and Credits

url = 'http://www.drps.ed.ac.uk/24-25/dpt/cxcmse11427.htm'

# Read the body inside a context manager so the connection is always closed
with urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

# Get course name (find() returns None when the page has no <h1>)
course_name = soup.find('h1')
if course_name:
    print(f'Course: {course_name.text.strip()}')

# Find all cells and extract specific info
table = soup.find('table', {'class': 'sitstablegrid'})
if table:
    for cell in table.find_all('td'):
        text = cell.text.strip()
        if 'SCQF Level' in text:
            print(f'SCQF Level: {text}')
        elif 'SCQF Credits' in text:
            print(f'Credits: {text}')

### Common Mistakes to Avoid

1. **Not checking if element exists**: Always check `if table:` before using `.find_all()`
2. **Forgetting to strip whitespace**: Use `.strip()` to clean text
3. **Using wrong class name**: Inspect the page carefully to get exact class names

---

# Exercise 2 Solution: Multi-Item Scraping

**Task**: Scrape quotes from multiple pages of quotes.toscrape.com.

In [None]:
# Solution for Exercise 2

all_quotes = []

# Loop through pages 1 to 3
for page_num in range(1, 4):
    url = f'https://quotes.toscrape.com/page/{page_num}/'
    print(f'Scraping page {page_num}...')

    # Fetch and parse the page. The context manager closes each connection
    # before the next page is requested, so no sockets are left open.
    with urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Find all quote containers
    quotes = soup.find_all('div', {'class': 'quote'})

    # Extract data from each quote
    for quote in quotes:
        all_quotes.append({
            # Quote text
            'text': quote.find('span', {'class': 'text'}).text,
            # Author name
            'author': quote.find('small', {'class': 'author'}).text,
            # All tags as a list of strings
            'tags': [tag.text for tag in quote.find_all('a', {'class': 'tag'})]
        })

    print(f'  Found {len(quotes)} quotes')

    # Be respectful - wait between requests
    time.sleep(1)

print(f'\nTotal quotes collected: {len(all_quotes)}')

In [None]:
# Load the scraped quote records into a DataFrame (one row per quote)
df = pd.DataFrame(all_quotes)

# Report the dimensions, then preview the first ten rows
n_rows, n_cols = df.shape
print(f'DataFrame shape: {(n_rows, n_cols)}')
df.head(10)

In [None]:
# Bonus: Analyze the data

# How many quotes each author contributed (top five)
print('\nQuotes by author:')
author_counts = df['author'].value_counts()
print(author_counts.head())

# Each row holds a list of tags; merge them into one flat list before counting
all_tags = []
for tags_list in df['tags']:
    all_tags.extend(tags_list)

print('\nMost common tags:')
tag_counts = pd.Series(all_tags).value_counts()
print(tag_counts.head())

### Alternative: Using requests instead of urlopen

In [None]:
# Alternative solution using requests library

import requests

all_quotes_v2 = []

for page_num in range(1, 4):
    url = f'https://quotes.toscrape.com/page/{page_num}/'

    # Use requests.get() instead of urlopen().
    # requests has no default timeout, so without one an unresponsive
    # server would block this cell indefinitely.
    response = requests.get(url, timeout=10)

    # Check if request was successful
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        for quote in soup.find_all('div', {'class': 'quote'}):
            all_quotes_v2.append({
                'text': quote.find('span', {'class': 'text'}).text,
                'author': quote.find('small', {'class': 'author'}).text,
                'tags': [t.text for t in quote.find_all('a', {'class': 'tag'})]
            })
    else:
        # Report the failure instead of silently dropping the page
        print(f'  Page {page_num} failed with status {response.status_code}')

    # Be respectful - wait between requests
    time.sleep(1)

print(f'Collected {len(all_quotes_v2)} quotes using requests')

---

# Exercise 3 Solution: Dynamic Content with Playwright

**Task**: Scrape the JavaScript-rendered quotes page.

In [None]:
# First, demonstrate that BeautifulSoup alone doesn't work
url = 'https://quotes.toscrape.com/js/'

# Read the raw (un-rendered) HTML; the context manager closes the connection
with urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

quotes_bs = soup.find_all('div', {'class': 'quote'})
print(f'BeautifulSoup alone finds: {len(quotes_bs)} quotes')
print('(Expected: 0, because quotes are loaded by JavaScript)')

In [None]:
# Solution using Playwright
# ============================================
# ASYNC API (for JupyterLab/Notebook)
# ============================================
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://quotes.toscrape.com/js/'

async def scrape_js_quotes():
    """
    Load the JavaScript quotes page in headless Chromium.

    Reads the module-level `url`, waits until an element matching
    '.quote' is present, and captures the page content at that point.

    Returns:
        The rendered HTML returned by page.content()
    """
    async with async_playwright() as playwright:
        # Headless: no browser window is opened
        chromium = await playwright.chromium.launch(headless=True)
        tab = await chromium.new_page()

        print('Loading page...')
        await tab.goto(url)

        # The quotes are injected by JavaScript, so wait until one exists
        await tab.wait_for_selector('.quote')
        print('Quotes loaded!')

        rendered = await tab.content()

        await chromium.close()

    return rendered

# Run the async function in JupyterLab/Notebook
html = await scrape_js_quotes()

# Now parse with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
quotes = soup.find_all('div', {'class': 'quote'})

print(f'\nPlaywright + BeautifulSoup finds: {len(quotes)} quotes')

# Extract data
js_quotes = []
for quote in quotes:
    js_quotes.append({
        'text': quote.find('span', {'class': 'text'}).text,
        'author': quote.find('small', {'class': 'author'}).text
    })

# Create DataFrame
js_df = pd.DataFrame(js_quotes)
js_df

# ============================================
# SYNC API (for .py scripts) - Uncomment to use
# ============================================
# from playwright.sync_api import sync_playwright
#
# with sync_playwright() as p:
#     browser = p.chromium.launch(headless=True)
#     page = browser.new_page()
#     page.goto(url)
#     page.wait_for_selector('.quote')
#     html = page.content()
#     browser.close()
#
# soup = BeautifulSoup(html, 'html.parser')
# quotes = soup.find_all('div', {'class': 'quote'})
# print(f'Found: {len(quotes)} quotes')

### Alternative: Scrape Multiple Pages with Playwright

In [None]:
# Advanced: Scrape multiple pages using Playwright
# ============================================
# ASYNC API (for JupyterLab/Notebook)
# ============================================
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import asyncio

async def scrape_multiple_js_pages(max_pages=3,
                                   start_url='https://quotes.toscrape.com/js/'):
    """
    Scrape quotes from consecutive JavaScript-rendered pages.

    Loads the first page in headless Chromium, then follows the
    'Next' link until `max_pages` pages have been parsed or no
    'Next' link is present.

    Args:
        max_pages: Maximum number of pages to scrape (default: 3)
        start_url: URL of the first page to load

    Returns:
        List of dictionaries with 'text' and 'author' keys
    """
    all_js_quotes = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Start at first page
        await page.goto(start_url)
        await page.wait_for_selector('.quote')

        for page_num in range(1, max_pages + 1):
            print(f'Scraping page {page_num}...')

            # Get current page content
            html = await page.content()
            soup = BeautifulSoup(html, 'html.parser')

            quotes = soup.find_all('div', {'class': 'quote'})
            all_js_quotes.extend(
                {
                    'text': quote.find('span', {'class': 'text'}).text,
                    'author': quote.find('small', {'class': 'author'}).text
                }
                for quote in quotes
            )

            print(f'  Found {len(quotes)} quotes')

            # Stop here on the last requested page: clicking Next again
            # would load a page that is never parsed (a wasted request).
            if page_num == max_pages:
                break

            # Try to click Next button
            next_btn = await page.query_selector('li.next a')
            if next_btn is None:
                print('No more pages')
                break

            await next_btn.click()
            await page.wait_for_selector('.quote')
            # Be respectful - wait between page loads
            await asyncio.sleep(1)

        await browser.close()

    return all_js_quotes

# Run the async function in JupyterLab/Notebook
all_js_quotes = await scrape_multiple_js_pages()
print(f'\nTotal quotes from JS pages: {len(all_js_quotes)}')

# ============================================
# SYNC API (for .py scripts) - Uncomment to use
# ============================================
# from playwright.sync_api import sync_playwright
# import time
#
# all_js_quotes = []
#
# with sync_playwright() as p:
#     browser = p.chromium.launch(headless=True)
#     page = browser.new_page()
#     page.goto('https://quotes.toscrape.com/js/')
#     page.wait_for_selector('.quote')
#     
#     for page_num in range(1, 4):
#         print(f'Scraping page {page_num}...')
#         html = page.content()
#         soup = BeautifulSoup(html, 'html.parser')
#         
#         quotes = soup.find_all('div', {'class': 'quote'})
#         for quote in quotes:
#             all_js_quotes.append({
#                 'text': quote.find('span', {'class': 'text'}).text,
#                 'author': quote.find('small', {'class': 'author'}).text
#             })
#         
#         next_btn = page.query_selector('li.next a')
#         if next_btn:
#             next_btn.click()
#             page.wait_for_selector('.quote')
#             time.sleep(1)
#         else:
#             break
#     
#     browser.close()
#
# print(f'Total quotes: {len(all_js_quotes)}')

---

# Exercise 4A Solution: Weather API

**Task**: Fetch weather data for Scottish cities using Open-Meteo API.

In [None]:
# Solution for Exercise 4A

import requests

# Define cities and their coordinates as (latitude, longitude)
cities = {
    'Edinburgh': (55.95, -3.19),
    'Glasgow': (55.86, -4.25),
    'Aberdeen': (57.15, -2.11)
}

# API endpoint
api_url = 'https://api.open-meteo.com/v1/forecast'

weather_data = []

for city, (lat, lon) in cities.items():
    print(f'Fetching weather for {city}...')

    # Build request parameters
    params = {
        'latitude': lat,
        'longitude': lon,
        'current_weather': True
    }

    # Make API request.
    # The timeout stops one unresponsive request from hanging the notebook,
    # and a network failure is reported per city so the remaining cities
    # are still fetched.
    try:
        response = requests.get(api_url, params=params, timeout=10)
    except requests.RequestException as exc:
        print(f'  Error: {exc}')
        continue

    # Check if successful
    if response.status_code != 200:
        print(f'  Error: {response.status_code}')
        continue

    data = response.json()
    current = data['current_weather']

    weather_data.append({
        'city': city,
        'temperature': current['temperature'],
        'windspeed': current['windspeed'],
        'time': current['time']
    })

# Create DataFrame
weather_df = pd.DataFrame(weather_data)
print('\nWeather Data:')
weather_df

In [None]:
# Bonus: Add more weather details

detailed_weather = []

for city, (lat, lon) in cities.items():
    params = {
        'latitude': lat,
        'longitude': lon,
        'current_weather': True,
        'hourly': 'temperature_2m,precipitation_probability',
        'timezone': 'Europe/London'
    }

    # The timeout keeps an unresponsive request from hanging the notebook
    response = requests.get(api_url, params=params, timeout=10)

    # Check if successful before parsing the body
    if response.status_code != 200:
        print(f'  Error for {city}: {response.status_code}')
        continue

    data = response.json()

    # Get current weather
    current = data['current_weather']

    # Get next hour's precipitation probability.
    # The hourly values are parallel to hourly['time'], so index 0 is only
    # the first forecast slot, not "now". Both timestamps are ISO-8601
    # strings in the requested timezone, so plain string comparison orders
    # them chronologically and finds the first slot after the current time.
    hourly = data.get('hourly', {})
    hourly_times = hourly.get('time', [])
    hourly_probs = hourly.get('precipitation_probability', [])
    next_slot = next(
        (i for i, stamp in enumerate(hourly_times) if stamp > current['time']),
        None
    )
    if next_slot is not None and next_slot < len(hourly_probs):
        precip_prob = hourly_probs[next_slot]
    else:
        # No matching forecast slot: report "unknown" rather than 0 %
        precip_prob = None

    detailed_weather.append({
        'city': city,
        'temperature_c': current['temperature'],
        'windspeed_kmh': current['windspeed'],
        'precipitation_probability_%': precip_prob
    })

detailed_df = pd.DataFrame(detailed_weather)
print('Detailed Weather:')
detailed_df

---

# Exercise 4B Solution: Google Maps API (Optional)

**Task**: Fetch place details using Google Maps API.

**Note**: This requires a valid API key from Google Cloud Platform.

In [None]:
# Solution for Exercise 4B (requires API key)

# The solution below is wrapped in a triple-quoted string so that this cell
# runs without an API key: the string is a bare expression and its contents
# are never executed. To enable the code, delete the opening and closing
# ''' lines and replace the api_key placeholder.
'''
# Uncomment and add your API key to use this code

import requests

api_key = 'YOUR_API_KEY_HERE'  # Replace with your key

# Edinburgh Castle Place ID
place_id = 'ChIJ98CZIJrHh0gRWApM5esemkY'

# API endpoint
url = 'https://maps.googleapis.com/maps/api/place/details/json'

# Parameters
params = {
    'place_id': place_id,
    'fields': 'name,rating,user_ratings_total,reviews',
    'key': api_key
}

# Make request
response = requests.get(url, params=params)
data = response.json()

if data['status'] == 'OK':
    result = data['result']
    
    print(f"Place: {result['name']}")
    print(f"Rating: {result.get('rating', 'N/A')}")
    print(f"Total Reviews: {result.get('user_ratings_total', 'N/A')}")
    
    print("\nFirst 3 Reviews:")
    reviews = result.get('reviews', [])
    for i, review in enumerate(reviews[:3]):
        print(f"\n{i+1}. {review['author_name']} - {review['rating']}/5 stars")
        print(f"   {review['text'][:150]}...")
else:
    print(f"Error: {data['status']}")
'''

# Placeholder output shown while the solution above is disabled
print('Google Maps API solution (requires API key)')
print('Uncomment the code above and add your API key to run')

### Using the googlemaps Library

In [None]:
# Alternative solution using googlemaps library

# The solution below is wrapped in a triple-quoted string so that this cell
# runs without an API key or the googlemaps package: the string is a bare
# expression and its contents are never executed. To enable the code, delete
# the opening and closing ''' lines and replace the api_key placeholder.
'''
# First install: pip install googlemaps
import googlemaps

api_key = 'YOUR_API_KEY_HERE'
gmaps = googlemaps.Client(key=api_key)

# Search for Edinburgh Castle
places_result = gmaps.places('Edinburgh Castle')

if places_result['results']:
    place_id = places_result['results'][0]['place_id']
    
    # Get detailed info
    place_details = gmaps.place(place_id)
    result = place_details['result']
    
    print(f"Name: {result['name']}")
    print(f"Rating: {result.get('rating', 'N/A')}")
    
    reviews = result.get('reviews', [])
    print(f"\nNumber of reviews returned: {len(reviews)} (API limit: 5)")
    
    # Create DataFrame of reviews
    if reviews:
        reviews_df = pd.DataFrame([{
            'author': r['author_name'],
            'rating': r['rating'],
            'text': r['text'][:100] + '...',
            'time': r['relative_time_description']
        } for r in reviews])
        print(reviews_df)
'''

# Placeholder output shown while the solution above is disabled
print('googlemaps library solution (requires API key)')

---

# Bonus Challenge Solution

**Task**: Create a function that combines API and web scraping.

In [None]:
# Bonus Solution: Combining techniques

def get_city_info(city_name, latitude, longitude):
    """
    Get comprehensive information about a city.

    Combines:
    - Open-Meteo API for weather
    - Web scraping for Wikipedia summary

    Both lookups are best-effort: a failure in one is recorded under an
    '*_error' key and does not prevent the other from running.

    Args:
        city_name: Name of the city (also used as the Wikipedia page title)
        latitude: City latitude
        longitude: City longitude

    Returns:
        Dictionary with city information. Always contains 'city'; contains
        'temperature_c', 'windspeed_kmh' and 'weather_time' on success or
        'weather_error' on failure; contains 'wikipedia_summary' on success
        or 'wikipedia_error' on failure.
    """
    from urllib.parse import quote

    # requests has no default timeout; without one a stalled server would
    # block the caller indefinitely.
    timeout_seconds = 10
    summary_min_chars = 100   # shorter paragraphs are skipped
    summary_max_chars = 300   # longer paragraphs are truncated

    info = {'city': city_name}

    # 1. Get weather from API
    try:
        weather_response = requests.get(
            'https://api.open-meteo.com/v1/forecast',
            params={
                'latitude': latitude,
                'longitude': longitude,
                'current_weather': True
            },
            timeout=timeout_seconds
        )
        # Turn HTTP errors into an exception so the recorded message names
        # the real failure instead of a later KeyError on the payload
        weather_response.raise_for_status()

        current = weather_response.json()['current_weather']
        info['temperature_c'] = current['temperature']
        info['windspeed_kmh'] = current['windspeed']
        info['weather_time'] = current['time']
    except (requests.RequestException, KeyError, ValueError) as e:
        info['weather_error'] = str(e)

    # 2. Get Wikipedia summary (scraping)
    try:
        # Wikipedia titles use underscores for spaces; quote() escapes any
        # characters that would otherwise alter the URL (e.g. '?' or '#')
        wiki_title = quote(city_name.replace(' ', '_'))
        wiki_url = f'https://en.wikipedia.org/wiki/{wiki_title}'

        # NOTE(review): Wikimedia's User-Agent policy asks clients to send a
        # descriptive User-Agent, ideally with contact details - adjust the
        # value below for real use.
        wiki_response = requests.get(
            wiki_url,
            headers={'User-Agent': 'WSNA-Week1-Exercises/1.0 (educational use)'},
            timeout=timeout_seconds
        )
        # Without this check an error page (e.g. 404) would be scraped as
        # if it were the article
        wiki_response.raise_for_status()
        wiki_soup = BeautifulSoup(wiki_response.text, 'html.parser')

        # Get first paragraph long enough to be real article text
        stripped = (p.text.strip() for p in wiki_soup.find_all('p'))
        summary = next(
            (text for text in stripped if len(text) > summary_min_chars),
            None
        )

        if summary is None:
            info['wikipedia_error'] = 'No summary paragraph found'
        elif len(summary) > summary_max_chars:
            info['wikipedia_summary'] = summary[:summary_max_chars] + '...'
        else:
            # Nothing was cut off, so no ellipsis is appended
            info['wikipedia_summary'] = summary
    except requests.RequestException as e:
        info['wikipedia_error'] = str(e)

    return info

# Try the function out on Edinburgh
edinburgh_info = get_city_info('Edinburgh', 55.95, -3.19)

print('Edinburgh City Information:')
print('=' * 50)
for key, value in edinburgh_info.items():
    # Short fields fit on one line
    if key != 'wikipedia_summary':
        print(f'{key}: {value}')
        continue

    # The summary is long, so give it its own indented paragraph
    print(f'\n{key}:')
    print(f'  {value}')

In [None]:
# Gather the combined information for several cities
cities = {
    'Edinburgh': (55.95, -3.19),
    'Glasgow': (55.86, -4.25),
    'Aberdeen': (57.15, -2.11)
}

all_city_info = []
for city, coords in cities.items():
    print(f'Processing {city}...')
    lat, lon = coords
    all_city_info.append(get_city_info(city, lat, lon))
    time.sleep(1)  # Be respectful

# Keep only the weather fields for a cleaner table; a city whose weather
# lookup failed shows 'N/A' in place of the missing value
summary_rows = []
for entry in all_city_info:
    summary_rows.append({
        'city': entry['city'],
        'temperature_c': entry.get('temperature_c', 'N/A'),
        'windspeed_kmh': entry.get('windspeed_kmh', 'N/A')
    })
weather_summary = pd.DataFrame(summary_rows)

print('\nWeather Summary:')
weather_summary

---

# Summary of Key Patterns

## BeautifulSoup Pattern
```python
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
elements = soup.find_all('tag', {'class': 'classname'})
```

## Playwright Pattern

### For JupyterLab/Notebook (Async API)
```python
from playwright.async_api import async_playwright

async def scrape_page():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url)
        await page.wait_for_selector('.selector')
        html = await page.content()
        await browser.close()
        return html

html = await scrape_page()
```

### For .py Scripts (Sync API)
```python
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto(url)
    page.wait_for_selector('.selector')
    html = page.content()
    browser.close()
```

## API Pattern
```python
import requests

response = requests.get(url, params=params)
data = response.json()
```

---

*End of Exercise Solutions*