In [1]:
# Cell 1: Imports
import os
import re
import time
import pandas as pd
import logging
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup

# Set your own path or default fallback
TEMP_DIR = "./tmp_data"


In [None]:
pip install selenium webdriver-manager

In [2]:
# Cell 2: Website settings and logger configuration
WEBSITE_NAME = "boston_gov"
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


In [7]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

def setup_chromium_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-software-rasterizer")
    options.add_argument("--disable-extensions")
    
    # DO NOT manually specify binary paths unless absolutely necessary
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)


In [9]:
# Cell 4: Scrape listing pages
BASE_URL = "https://www.boston.gov/events?page="
event_list = []

driver = setup_chromium_driver()
for page in range(3):
    url = BASE_URL + str(page)
    logger.info(f"Scraping page {page + 1}: {url}")
    driver.get(url)
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    articles = soup.find_all("article", class_="calendar-listing-wrapper")
    
    for article in articles:
        title_tag = article.find("div", class_="title")
        title = title_tag.text.strip() if title_tag else "No Title"
        details_link_tag = article.find("a", class_="button")
        details_link = "https://www.boston.gov" + details_link_tag["href"] if details_link_tag and details_link_tag.has_attr("href") else "No Link"
        event_list.append({"Title": title, "Event Details Link": details_link})
        
    time.sleep(3)
driver.quit()

df_events = pd.DataFrame(event_list)
df_events.head()


INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:There is no [mac64] chromedriver "135.0.7049.84" for browser google-chrome "135.0.7049" in cache
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:WebDriver version 135.0.7049.84 selected
INFO:WDM:Modern chrome version https://storage.googleapis.com/chrome-for-testing-public/135.0.7049.84/mac-arm64/chromedriver-mac-arm64.zip
INFO:WDM:About to download new driver from https://storage.googleapis.com/chrome-for-testing-public/135.0.7049.84/mac-arm64/chromedriver-mac-arm64.zip
INFO:WDM:Driver downloading response is 200
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver has been saved in cache [/Users/ramkumarrp/.wdm/drivers/chromedriver/mac64/135.0.7049.84]
INFO:__main__:Scraping page 1: https://www.boston.gov/events?page=0
INFO:__main__:Scraping page 2: https://www.boston.gov/events?page=1
INFO:__main__:Scraping page 3: h

Unnamed: 0,Title,Event Details Link
0,Age Strong Yoga,https://www.boston.gov/node/16105821
1,West Roxbury Liaison Office Hours,https://www.boston.gov/node/16037156
2,Allston-Brighton Liaison Office Hours,https://www.boston.gov/node/15961906
3,CDL License Workshop (in collaboration with Em...,https://www.boston.gov/node/16362966
4,Charlestown Liaison Office Hours,https://www.boston.gov/node/15961761


In [11]:
df_events

Unnamed: 0,Title,Event Details Link
0,Age Strong Yoga,https://www.boston.gov/node/16105821
1,West Roxbury Liaison Office Hours,https://www.boston.gov/node/16037156
2,Allston-Brighton Liaison Office Hours,https://www.boston.gov/node/15961906
3,CDL License Workshop (in collaboration with Em...,https://www.boston.gov/node/16362966
4,Charlestown Liaison Office Hours,https://www.boston.gov/node/15961761
...,...,...
85,Climate Action Plan: Public Workshop,https://www.boston.govhttps://forms.gle/QKfSjE...
86,How to Improve Your Credit and Increase Your S...,https://www.boston.govhttps://www.zoomgov.com/...
87,Age Strong Senior Sound Bath,https://www.boston.gov/node/16361011
88,Age Strong Bowling League,https://www.boston.gov/node/16363141


In [17]:
import snowflake.connector
import logging

In [53]:
# Snowflake Settings for metrics
SNOWFLAKE_ACCOUNT_METRICS='ggtheca-odb63172'
SNOWFLAKE_USER_METRICS='EVENTSLENSSF'
SNOWFLAKE_PASSWORD_METRICS='Eventlenssnowflake123'
SNOWFLAKE_DATABASE_METRICS='eventslens'
SNOWFLAKE_SCHEMA_METRICS='edw'
SNOWFLAKE_WAREHOUSE_METRICS='EVENTSLENSMETRIC'
SNOWFLAKE_ROLE_METRICS='EVENTLENSMETRICS'


In [55]:
def get_snowflake_connection_metrics():
    try:
        connection = snowflake.connector.connect(
            user=SNOWFLAKE_USER_METRICS,
            password=SNOWFLAKE_PASSWORD_METRICS,
            account=SNOWFLAKE_ACCOUNT_METRICS,
            warehouse=SNOWFLAKE_WAREHOUSE_METRICS,
            database=SNOWFLAKE_DATABASE_METRICS,
            schema="EDW",
            role=SNOWFLAKE_ROLE_METRICS,
            client_session_keep_alive=True
        )
        return connection
    except Exception as e:
        print(f"Error connecting to Snowflake Metrics table: {str(e)}")
        raise

In [75]:
conn = get_snowflake_connection_metrics()
cursor = conn.cursor()
# Explicitly set the database and schema if not already default
cursor.execute("USE DATABASE EVENTSLENS")
cursor.execute("USE SCHEMA EDW")

<snowflake.connector.cursor.SnowflakeCursor at 0x30dac2090>

In [47]:
get_snowflake_connection_metrics()

<snowflake.connector.connection.SnowflakeConnection at 0x309d11400>