# Lab2: Accessing Web Resources with Python

- Scrape https://visitseattle.org for events happening in Seattle
    - Scraping the list page
        - extract the URL from the `href` attribute of the `<a>` tags that link to each event detail page (e.g. https://visitseattle.org/events/glen-teriyaki/)
        - paginate the website and scrape all events from the list pages (https://visitseattle.org/events/page/1)
        - store the URLs in a list and print them
- Look up the location
- Look up the weather for the location
- Store the data as CSV
- (bonus) [Store the data in Azure Blob storage](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?tabs=managed-identity%2Croles-azure-portal%2Csign-in-azure-cli) as csv
- (bonus) [Deploy the scraper on GitHub Actions](https://www.python-engineer.com/posts/run-python-github-actions/)

In [2]:

import requests
from bs4 import  BeautifulSoup

### 1. Scrape https://visitseattle.org for events happening in Seattle

In [6]:

def extract_event_urls(url, timeout=10):
    """Return the event detail-page URLs linked from one events list page.

    Args:
        url: Address of a list page, e.g.
            https://visitseattle.org/events/page/1.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list holding the ``href`` of every matched event link. The list is
        empty when the page cannot be fetched or has no matching links.
    """
    try:
        # Without a timeout requests may block forever on a stalled server.
        res = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        print(f"Failed to retrieve the page. Error: {err}")
        return []

    if res.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {res.status_code}")
        return []

    soup = BeautifulSoup(res.text, 'html.parser')
    selector = "div.search-result-preview > div > h3 > a"
    # Skip anchors that carry no href instead of raising KeyError.
    return [a['href'] for a in soup.select(selector) if a.has_attr('href')]

def scrape_event_urls(base_url):
    """Collect event URLs from every list page under ``base_url``.

    Pages are requested as ``{base_url}/page/1``, ``{base_url}/page/2``, ...
    and the walk ends at the first page that yields no event URLs.

    Returns:
        One flat list with the URLs of all visited pages, in page order.
    """
    collected = []
    page = 1
    urls_on_page = extract_event_urls(f"{base_url}/page/{page}")

    # An empty result is the stop signal, so fetch first and test afterwards.
    while urls_on_page:
        collected += urls_on_page
        page += 1
        urls_on_page = extract_event_urls(f"{base_url}/page/{page}")

    return collected


# Root of the paginated events listing.
base_url = 'https://visitseattle.org/events'

# Walk the list pages and gather the links to the event detail pages.
event_urls = scrape_event_urls(base_url)

# Print every URL on its own line, wrapped in double quotes and followed by
# a comma.
output_list = [f'"{url}",' for url in event_urls]
if output_list:
    print("\n".join(output_list))


"https://visitseattle.org/events/amber-liu/",
"https://visitseattle.org/events/disability-justice/",
"https://visitseattle.org/events/hughes-bros-presents/",
"https://visitseattle.org/events/sarya-wu/",
"https://visitseattle.org/events/the-sweet-lillies/",
"https://visitseattle.org/events/dinosaur-jr/",
"https://visitseattle.org/events/black-dogs/",
"https://visitseattle.org/events/blue-elephant-and-the-seven-snakes/",
"https://visitseattle.org/events/brock-lanzetti-ogawa/",
"https://visitseattle.org/events/fact-and-fiction-the-lord-of-the-rings/",
"https://visitseattle.org/events/groundation/",
"https://visitseattle.org/events/kayla-min-andrews/",
"https://visitseattle.org/events/ol-doris/",
"https://visitseattle.org/events/rosetan/",
"https://visitseattle.org/events/untold-stories-wire-life-and-art-with-kristin-tollefson/",
"https://visitseattle.org/events/y-la-bamba/",
"https://visitseattle.org/events/dead-bars/",
"https://visitseattle.org/events/emotional-a-night-for-the-heartbroke

### 2. Scraping the detail pages
- From step1, loop through the detail page urls you extracted
- HTTP GET all the detail page HTMLs
- Extract
    - Name
    - Date
    - Location
    - Type
    - Region
- Store it as CSV (`events.csv`)

In [9]:
import requests
from bs4 import BeautifulSoup
import csv


def extract_event_details(event_url, timeout=10):
    """Fetch one event detail page and extract its headline fields.

    Args:
        event_url: Address of an event detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``Name``, ``Date``, ``Location``, ``Type`` and
        ``Region``. A field whose element is not found on the page is set to
        ``None``. Returns ``None`` when the page cannot be fetched.
    """
    try:
        # Without a timeout requests may block forever on a stalled server.
        res = requests.get(event_url, timeout=timeout)
    except requests.RequestException as err:
        print(f"Failed to retrieve the page. Error: {err}")
        return None

    if res.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {res.status_code}")
        return None

    soup = BeautifulSoup(res.text, 'html.parser')

    # Every field lives under the same header container; only the tail of
    # the selector differs.
    header = (
        '#body > div.global-wrapper'
        ' > div.container-event-detail.padding-top-bottom'
        ' > div:nth-child(1) > div.medium-6.columns.event-top'
    )
    field_selectors = {
        'Name': 'h1',
        'Date': 'h4 > span:nth-child(1)',
        'Location': 'h4 > span:nth-child(2)',
        'Type': 'a:nth-child(3)',
        'Region': 'a:nth-child(4)',
    }

    details = {}
    for field, tail in field_selectors.items():
        # select_one yields None on no match, so one page with a missing
        # element no longer aborts the whole crawl with an IndexError.
        element = soup.select_one(f"{header} > {tail}")
        details[field] = element.text if element is not None else None

    missing = [field for field, value in details.items() if value is None]
    if missing:
        print(f"Missing fields {missing} on {event_url}")

    return details

def scrape_and_save_event_details(event_urls, csv_file_path='events.csv'):
    """Scrape each event detail page and write the results to a CSV file.

    Args:
        event_urls: Iterable of event detail-page URLs.
        csv_file_path: Destination file; defaults to ``events.csv`` in the
            current working directory. An existing file is overwritten.

    The file always starts with the header row
    ``Name,Date,Location,Type,Region``. URLs for which
    ``extract_event_details`` returns a falsy value are left out.
    """
    fields = ['Name', 'Date', 'Location', 'Type', 'Region']

    event_details_list = []
    for event_url in event_urls:
        event_details = extract_event_details(event_url)
        if event_details:
            event_details_list.append(event_details)

    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent some event names, and pandas reads CSV files as UTF-8 by
    # default.
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        writer.writerows(event_details_list)

    print(f"Event details have been saved to {csv_file_path}")


# NOTE(review): this repeats the crawl already done in step 1 -- presumably
# so the cell can be run on its own; confirm before removing.
base_url = 'https://visitseattle.org/events'
event_urls = scrape_event_urls(base_url)


# Scrape every detail page and write the extracted fields to events.csv.
scrape_and_save_event_details(event_urls)


Event details have been saved to events.csv


### 3. Look up the location
- https://nominatim.openstreetmap.org

In [10]:
# Convert a location name to latitude/longitude with the Nominatim search API.
# Note: base_url is rebound here; earlier cells used it for the events site.

# Option 1: build the query string by hand (values must be URL-encoded
#           manually). ful_url is only built for illustration; it is not
#           requested below.
base_url = "https://nominatim.openstreetmap.org/search.php"
query_params_str = "?q=Wallingford%2C+Seattle&format=jsonv2"
ful_url = base_url + query_params_str

# Option 2: describe the query parameters as a dictionary and pass it via
#           requests.get(url, params=query_params), which appends it to the
#           URL as the query string.
query_params = {
    "q": "Wallingford, Seattle",
    "format": "jsonv2"
}

res = requests.get(base_url, params=query_params)
# Last expression of the cell: the notebook displays the decoded JSON.
res.json()

[{'place_id': 312844859,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
  'osm_type': 'node',
  'osm_id': 150973716,
  'lat': '47.6594631',
  'lon': '-122.3343417',
  'category': 'place',
  'type': 'suburb',
  'place_rank': 19,
  'importance': 0.3131791431958073,
  'addresstype': 'suburb',
  'name': 'Wallingford',
  'display_name': 'Wallingford, Seattle, King County, Washington, 98015, United States',
  'boundingbox': ['47.6394631', '47.6794631', '-122.3543417', '-122.3143417']}]

In [11]:
import requests
import pandas as pd

def get_lat_lon(location_name,
                user_agent="lab2-seattle-events/1.0 (course project)",
                timeout=10):
    """Look up the coordinates of a place name with Nominatim.

    Args:
        location_name: Free-form search text, e.g. "Wallingford, Seattle".
        user_agent: Value sent in the ``User-Agent`` header. Nominatim's
            usage policy asks clients to identify their application.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(lat, lon)`` tuple taken from the first search result, with the
        values exactly as the API returns them. Returns ``(None, None)``
        when there is no result or the request fails.
    """
    base_url = "https://nominatim.openstreetmap.org/search.php"
    query_params = {
        "q": location_name,
        "format": "jsonv2"
    }

    try:
        res = requests.get(
            base_url,
            params=query_params,
            headers={"User-Agent": user_agent},
            timeout=timeout,
        )
        # An error page is not JSON; fail here with a clear message instead.
        res.raise_for_status()
        data = res.json()
    except (requests.RequestException, ValueError) as err:
        print(f"Geocoding failed for {location_name!r}: {err}")
        return None, None

    if not data:
        return None, None
    return data[0]['lat'], data[0]['lon']

import time  # imported here so this cell stays runnable on its own

# Read events.csv into a DataFrame
df = pd.read_csv('events.csv')

# Create new columns for latitude and longitude
df['Latitude'] = None
df['Longitude'] = None

# Remember each lookup so a location that appears in several rows is
# requested only once.
geocode_cache = {}

# Iterate through each row and update latitude and longitude
for index, row in df.iterrows():
    # An empty Location cell is read back as NaN; there is nothing to look
    # up, so the coordinates stay None.
    if pd.isna(row['Location']):
        continue

    location_name = f"{row['Location']}, Seattle"
    if location_name not in geocode_cache:
        if geocode_cache:
            # Nominatim's usage policy allows at most one request per second.
            time.sleep(1)
        geocode_cache[location_name] = get_lat_lon(location_name)

    latitude, longitude = geocode_cache[location_name]
    df.at[index, 'Latitude'] = latitude
    df.at[index, 'Longitude'] = longitude

# Save the updated DataFrame to events_with_lat_lon.csv
df.to_csv('events_with_lat_lon.csv', index=False)


### 4. Look up the weather for the location
- https://www.weather.gov/documentation/services-web-api
- Choose the daytime forecast for the day