In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import spacy
from datetime import datetime
from calendar import monthrange

In [None]:
# Load spaCy's small English pipeline ("en_core_web_sm"), which the setup cell
# downloads via `python -m spacy download en_core_web_sm`.
# NOTE(review): `nlp` is not referenced by the scraping code visible in this
# cell; presumably it is used by later cells -- confirm before removing.
nlp = spacy.load("en_core_web_sm")

def scrape_events(month, date, start_year=2009, end_year=2022, timeout=30):
    """Scrape the "On This Day" events listed for one calendar day.

    Fetches https://www.onthisday.com/events/<month>/<date>, reads the text of
    every element matching the CSS selector ``.event``, and keeps the entries
    whose text starts with a four-digit year inside the requested window.

    Args:
        month: Month name as it appears in the site's URL path
            (the caller passes lowercase names such as "january").
        date: Day of the month, as a string or int (e.g. "17").
        start_year: First event year to keep, inclusive.
        end_year: Last event year to keep, inclusive.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts, one per kept event, with keys:
        'date'  -- the string "<year>-<month>-<date>"
        'event' -- the event text with the leading year removed.
        An empty list is returned when the page answers with an HTTP error
        status (a message naming the URL and status is printed).

    Raises:
        requests.RequestException: If the request fails at the network level
            or exceeds ``timeout``.
    """
    url = f"https://www.onthisday.com/events/{month}/{date}"
    # Without a timeout a single stalled connection would hang the whole scrape.
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Make the failure visible instead of silently parsing an error page.
        print(f"Skipping {url}: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    events = []
    for event in soup.select('.event'):
        event_text = event.get_text(strip=True)
        year_text = event_text[:4]
        # isdecimal() (unlike isdigit()) only accepts characters that int()
        # can actually parse, so odd characters such as superscript digits
        # cannot raise ValueError below.
        if len(year_text) < 4 or not year_text.isdecimal():
            continue
        event_year = int(year_text)
        if start_year <= event_year <= end_year:
            events.append({
                'date': f"{event_year}-{month}-{date}",
                'event': event_text[4:].strip(),  # Drop the leading year
            })

    return events

# Calendar template year used only to enumerate (month, day) pairs.
# It must be a leap year: with a non-leap year 29 February is never requested,
# and leap-day events from 2012, 2016 and 2020 (all inside the 2009-2022
# window) would be silently missing from the dataset. Events on the
# 29 February page can only carry leap years, so the dates still parse.
LEAP_REFERENCE_YEAR = 2020

# List to store all events
all_events = []

# Loop through every day of the year, including 29 February
for month in range(1, 13):
    # The site's URL uses the lowercase English month name (e.g. "january").
    month_name = datetime(LEAP_REFERENCE_YEAR, month, 1).strftime('%B').lower()
    days_in_month = monthrange(LEAP_REFERENCE_YEAR, month)[1]
    for date in range(1, days_in_month + 1):
        events = scrape_events(month_name, str(date))
        all_events.extend(events)

# Convert the list of events to a DataFrame. The columns are named explicitly
# so that an empty scrape still yields a frame with a 'date' column instead of
# raising KeyError on the next line.
df = pd.DataFrame(all_events, columns=['date', 'event'])

# Convert the 'date' column ("<year>-<month name>-<day>") to datetime so that
# sorting is chronological rather than lexicographic
df['date'] = pd.to_datetime(df['date'], format='%Y-%B-%d')

# Sort the DataFrame by date
df = df.sort_values(by='date')

# Save the DataFrame to a CSV file
df.to_csv('/content/drive/MyDrive/fakeBN/data/on_this_day_events_2009_2022.csv', index=False)
