In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
##Scraping Home page - Master Details

In [3]:
# Home URL
URL = "https://www.thebostoncalendar.com/events?day=12&month=2&week=1&year=2025"

In [4]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

In [5]:
# Getting HTML Content
response = requests.get(URL, headers=HEADERS)
    
if response.status_code != 200:
    print(f"Failed to fetch page: {response.status_code}")
# Converting into Soup Object   
soup = BeautifulSoup(response.text, "lxml")

In [6]:
# In this Page all the different events are under different li component.
events = soup.find_all("li", class_="event")

In [7]:
event_data = []

In [8]:
for event in events:
    title_tag = event.find("h3").find("a")
    title = title_tag.get_text(strip=True) if title_tag else "No Title"

    link = title_tag["href"] if title_tag and title_tag.has_attr("href") else "No Link"
    full_link = f"https://www.thebostoncalendar.com{link}" if link.startswith("/") else link

    event_data.append({"Title": title, "Link": full_link})

In [9]:
df = pd.DataFrame(event_data)

In [10]:
df

Unnamed: 0,Title,Link
0,V for Vault: An Electronic Dance Music Valentines,https://www.thebostoncalendar.com/events/v-for...
1,Artz Underground Presents: For The Love of RnB,https://www.thebostoncalendar.com/events/artz-...
2,¡Miércoles Maravilloso!: free in-person Spanis...,https://www.thebostoncalendar.com/events/mierc...
3,100 things to do in Boston this weekend,https://www.thebostoncalendar.com/events/100-t...
4,91 FREE things to do in Boston this week: Feb ...,https://www.thebostoncalendar.com/events/91-fr...
...,...,...
1194,Après-ski on the Mystic at The Great American ...,https://www.thebostoncalendar.com/events/apres...
1195,Dirty Disney at Lil Chuck,https://www.thebostoncalendar.com/events/dirty...
1196,$1 Oysters at Bootleg Special,https://www.thebostoncalendar.com/events/1-oys...
1197,10PM Detention: Standup Comedy and $4 Drinks,https://www.thebostoncalendar.com/events/late-...


### Now from the event list we have to scrape through all individual events page to extract specific event related information

In [12]:
event_list=[]

In [13]:
# To retrieve information for each Event Link
for index, row in df.iterrows():
    EVENT_URL = row["Link"]
    
    # To get the HTML content for each link
    response = requests.get(EVENT_URL, headers=HEADERS)

    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "lxml")

        # To Extract event title
        title_tag = soup.find("h1", itemprop="name")
        title = title_tag.get_text(strip=True) if title_tag else "No Title"

        # To Extract event image URL
        image_tag = soup.find("a", class_="zoom_in")
        image_url = image_tag["href"] if image_tag and image_tag.has_attr("href") else "No Image"

        # To Extract event time
        start_time_tag = soup.find("span", id="startdate", itemprop="startDate")
        end_time_tag = soup.find("span", id="startdate", itemprop="endDate")
        start_time = start_time_tag["content"] if start_time_tag and start_time_tag.has_attr("content") else "No Start Time"
        end_time = end_time_tag["content"] if end_time_tag and end_time_tag.has_attr("content") else "No End Time"

        # To Extract event location
        location_name_tag = soup.find("span", itemprop="name")
        street_address_tag = soup.find("span", itemprop="streetAddress")
        city_tag = soup.find("span", itemprop="addressLocality")
        state_tag = soup.find("span", itemprop="addressRegion")
        postal_code_tag = soup.find("span", itemprop="postalCode")

        location_name = location_name_tag.get_text(strip=True) if location_name_tag else "No Location Name"
        street_address = street_address_tag.get_text(strip=True) if street_address_tag else "No Street Address"
        city = city_tag.get_text(strip=True) if city_tag else "No City"
        state = state_tag.get_text(strip=True) if state_tag else "No State"
        postal_code = postal_code_tag.get_text(strip=True) if postal_code_tag else "No Postal Code"
        full_address = f"{street_address}, {city}, {state} {postal_code}"

        # To Extract event categories
        categories_tag = soup.find("b", string="Categories:")
        categories = categories_tag.find_next_sibling(string=True).strip() if categories_tag else "No Categories"

        # To Extract event admission details
        admission_tag = soup.find("b", string="Admission:")
        admission = admission_tag.find_next_sibling("span").get_text(strip=True) if admission_tag else "No Admission Info"

        # To Extract event description
        description_tag = soup.find("div", id="event_description")
        description = description_tag.get_text(strip=True) if description_tag else "No Description"

        # To Check if any link in the description points to the same site and skip if found
        if description_tag:
            links = description_tag.find_all("a", href=True)
            for link in links:
                if "https://www.thebostoncalendar.com/events/" in link["href"]:
                    continue  # Skip this event if such a link is found


        event_data = {
            "Title": title,
            "Image URL": image_url,
            "Start Time": start_time,
            "End Time": end_time,
            "Location": location_name,
            "Full Address": full_address,
            "Categories": categories,
            "Admission": admission,
            "Description": description,
            "Event URL": EVENT_URL
        }

        event_list.append(event_data)

In [14]:
# Converting List into DataFrame
df_events = pd.DataFrame(event_list)

In [15]:
df_events

Unnamed: 0,Title,Image URL,Start Time,End Time,Location,Full Address,Categories,Admission,Description,Event URL
0,V for Vault: An Electronic Dance Music Valentines,https://media.thebostoncalendar.com/images/q_a...,2025-02-14 7:00pm EST,2025-02-15 1:00am EST,The Neal Rantoul Vault Theater,"25 Exchange St., Lynn, MA 01901","Music, Nightlife, Party, Shows",$15,V for Vault: An Electronic Dance Music Valenti...,https://www.thebostoncalendar.com/events/v-for...
1,Artz Underground Presents: For The Love of RnB,https://media.thebostoncalendar.com/images/q_a...,2025-02-15 7:00pm EST,2025-02-16 12:30am EST,The Neal Rantoul Vault Theater,"25 Exchange St., Lynn, MA 01901","Drinks, Music, Nightlife, Shows",$20,"Join us For The Love of RnB on Saturday, Febru...",https://www.thebostoncalendar.com/events/artz-...
2,¡Miércoles Maravilloso!: free in-person Spanis...,No Image,2025-02-19 7:15pm EST,2025-02-19 8:15pm EST,Boston Area Spanish Exchange (BASE),"101 Arch St., Boston, MA 02110","Classes, Date Idea, Meetup",FREE,¡Miércoles Maravilloso!Free open-level Spanish...,https://www.thebostoncalendar.com/events/mierc...
3,100 things to do in Boston this weekend,https://media.thebostoncalendar.com/images/q_a...,2025-02-14 7:00am EST,2025-02-17 11:00pm EST,Boston,"Surrounding areas, Boston, MA","Animals, Art, Business & Professional, Classes...",$Varies,Celebrate Valentine’s Day weekend and Presiden...,https://www.thebostoncalendar.com/events/100-t...
4,91 FREE things to do in Boston this week: Feb ...,https://media.thebostoncalendar.com/images/q_a...,2025-02-10 7:00am EST,2025-02-17 11:00pm EST,Boston,"Surrounding areas, Boston, MA","Animals, Art, Business & Professional, Classes...",FREE,Happy week of love! It’s time to celebrate our...,https://www.thebostoncalendar.com/events/91-fr...
...,...,...,...,...,...,...,...,...,...,...
1194,Après-ski on the Mystic at The Great American ...,https://media.thebostoncalendar.com/images/q_a...,2025-02-21 8:00pm EST,2025-02-22 12:00am EST,The Great American Beer Hall,"142 Mystic Ave, Medford, MA 02155","Drinks, Nightlife, Outside, Party",$5.00,"Après-ski on the MysticFebruary 21, 2025 8:00 ...",https://www.thebostoncalendar.com/events/apres...
1195,Dirty Disney at Lil Chuck,https://media.thebostoncalendar.com/images/q_a...,2025-02-21 9:30pm EST,2025-02-21 11:00pm EST,Lil Chuck at the Charles Playhouse,"74 Warrenton Street, Boston, MA 02116","Date Idea, Drinks, Good for Groups, Nightlife",$30,"Disney adults rejoice! We're back, baby! Revis...",https://www.thebostoncalendar.com/events/dirty...
1196,$1 Oysters at Bootleg Special,https://media.thebostoncalendar.com/images/q_a...,2025-02-21 10:00pm EST,2025-02-23 1:00am EST,Bootleg Special,"400 Tremont Street, Boston, MA","Date Idea, Food",$1+,Enjoy a late night special of $1 oysters at 10...,https://www.thebostoncalendar.com/events/1-oys...
1197,10PM Detention: Standup Comedy and $4 Drinks,https://media.thebostoncalendar.com/images/q_a...,2025-02-21 10:00pm EST,2025-02-21 11:30pm EST,Goofs Comedy Club,"432 McGrath Highway, Somerville, MA 02143","Accessible Spots, Date Idea, Drinks, Good for ...",$16-22,"Every weekend, we bring our favorite headliner...",https://www.thebostoncalendar.com/events/late-...


In [16]:
df_events.columns

Index(['Title', 'Image URL', 'Start Time', 'End Time', 'Location',
       'Full Address', 'Categories', 'Admission', 'Description', 'Event URL'],
      dtype='object')

### Now Let's Load the data into Snowflake

In [18]:
# Snowflake Credentials
SNOWFLAKE_ACCOUNT=''
SNOWFLAKE_USER=''
SNOWFLAKE_PASSWORD=''
SNOWFLAKE_DATABASE=''
SNOWFLAKE_SCHEMA=''
SNOWFLAKE_WAREHOUSE=''
SNOWFLAKE_ROLE=''

In [19]:
import snowflake.connector


conn = snowflake.connector.connect(
    user=SNOWFLAKE_USER,
    password=SNOWFLAKE_PASSWORD,
    account=SNOWFLAKE_ACCOUNT, 
    warehouse=SNOWFLAKE_WAREHOUSE,
    database=SNOWFLAKE_DATABASE,
    schema=SNOWFLAKE_SCHEMA,
    role=SNOWFLAKE_ROLE,
    client_session_keep_alive=True
)


cursor = conn.cursor()

In [20]:
df_events = df_events.rename(columns={
    "Title":"Event_Title",
    "Image URL": "Image_URL",
    "Start Time": "Start_Time",
    "End Time": "End_Time",
    "Full Address": "Full_Address",
    "Admission": "Admission",
    "Event URL": "Event_URL"
})

In [21]:
insert_query = """
INSERT INTO STAGING.BOSTON_CALENDAR_EVENTS_DETAILS 
(Event_Title, Image_URL, Start_Time, End_Time, Location, Full_Address, Categories, Admission, Description, Event_URL)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

In [22]:
data_to_insert = [tuple(row) for row in df_events.itertuples(index=False, name=None)]

In [23]:
cursor.executemany(insert_query, data_to_insert)

conn.commit()
print(f"Inserted {len(data_to_insert)} records into Snowflake.")

cursor.close()
conn.close()

Inserted 1199 records into Snowflake.
