In [1]:
import requests
from bs4 import BeautifulSoup
import json

# Browser-like User-Agent header passed to every requests.get() call in this notebook.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"}

In [76]:
# Download the Wikipedia article on Pittsburgh and parse it with the lxml parser.
url1 = "https://en.wikipedia.org/wiki/Pittsburgh"
# A 1-second timeout is easily exceeded by a large page; use the same
# 10-second budget as the later scraping cells in this notebook.
f1 = requests.get(url1, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as article text.
f1.raise_for_status()
f1_soup = BeautifulSoup(f1.text, 'lxml')

In [77]:
# Select the element carrying the article body, then delete every
# <sup class="reference"> and <span class="mw-editsection"> inside it
# (presumably footnote markers and section edit links) so that they do not
# show up in the extracted text.
content_div = f1_soup.find(class_="mw-content-ltr mw-parser-output")
for tag_name, css_class in (('sup', 'reference'), ('span', 'mw-editsection')):
    for tag in content_div.find_all(tag_name, class_=css_class):
        tag.decompose()

In [78]:
# For each listed section id, remove the section heading and everything that
# follows it at the same level, so the article is cut off at that section.
section_ids_to_remove = ['References', 'See_also', 'Further_reading', 'External_links']

for section_id in section_ids_to_remove:
    heading_tag = content_div.find('h2', id=section_id)
    if heading_tag is None:
        continue  # section absent, or already removed together with an earlier one
    # The removal is anchored on the heading's parent (a wrapper around the
    # <h2>). If the <h2> is a direct child of content_div there is no wrapper,
    # and anchoring on the parent would decompose the whole article -- anchor
    # on the heading itself in that case.
    if heading_tag.parent is content_div:
        anchor = heading_tag
    else:
        anchor = heading_tag.parent
    # Snapshot the siblings first: decomposing while iterating the live
    # generator would cut the iteration short.
    for sibling in list(anchor.next_siblings):
        sibling.decompose()
    anchor.decompose()

In [None]:
# Flatten the cleaned article body to plain text: whitespace-stripped text
# fragments joined by newlines, then print the result for inspection.
clean_text = content_div.get_text(separator = '\n', strip = True)
print(clean_text)

In [21]:
# Start the results dictionary (source URL -> extracted text) with the first article.
data = {url1: clean_text}

In [47]:
def req(url) :
  """Fetch ``url`` and return the response body as text.

  The request is sent with the notebook-wide ``headers`` and a 2-second
  timeout.

  Args:
    url: Address of the page to download.

  Returns:
    The decoded response body, or the sentinel string "failed" when the
    request raises or the server answers with an HTTP error status.
  """
  try :
    r = requests.get(url, headers = headers, timeout = 2)
    # Treat 4xx/5xx as failures so that error pages are not scraped as content.
    r.raise_for_status()
    return r.text
  # Catch request-level errors only: a bare ``except`` would also swallow
  # KeyboardInterrupt and genuine programming mistakes.
  except requests.exceptions.RequestException :
    return "failed"

def soup(text) :
  """Parse the markup string ``text`` with the lxml parser and return the resulting BeautifulSoup object."""
  parsed = BeautifulSoup(text, 'lxml')
  return parsed

In [40]:
def content(soup, id) :
  """Find a container by class and strip reference/edit-section tags from it.

  Args:
    soup: Parsed document; must offer ``find(class_=...)``.
    id: Class string handed to ``soup.find(class_=...)``. Despite the name
      it is matched against the class attribute, not the id attribute.

  Returns:
    The matched element with every ``<sup class="reference">`` and
    ``<span class="mw-editsection">`` descendant decomposed in place, or
    ``None`` when nothing matches.
  """
  content_div = soup.find(class_ = id)
  if content_div is None :
    return None
  # find_all() returns a (possibly empty) list, never None, so no extra
  # guard is needed and each query only has to run once.
  for tag_name, css_class in (('sup', 'reference'), ('span', 'mw-editsection')) :
    for tag in content_div.find_all(tag_name, class_ = css_class) :
      tag.decompose()
  return content_div

In [41]:
def remove(section_ids, content) :
  """Cut the listed sections out of ``content`` and return the remaining text.

  For every id in ``section_ids`` that matches an ``<h2>`` inside
  ``content``, the heading (together with its wrapper element, if any) and
  all elements following it at the same level are decomposed in place.

  Args:
    section_ids: Iterable of ``id`` attribute values of the ``<h2>`` headings.
    content: Container element to clean, or ``None``.

  Returns:
    ``None`` when ``content`` is ``None``; otherwise the remaining text with
    whitespace-stripped fragments joined by newlines.
  """
  if content is None :
    return None
  for section_id in section_ids:
    heading_tag = content.find('h2', id=section_id)
    if heading_tag is None:
      continue  # section absent, or already removed together with an earlier one
    # Normally the heading's parent (a wrapper around the <h2>) is the
    # anchor. If the <h2> is a direct child of ``content`` there is no
    # wrapper, and anchoring on the parent would decompose ``content``
    # itself -- anchor on the heading in that case.
    if heading_tag.parent is content:
      anchor = heading_tag
    else:
      anchor = heading_tag.parent
    # Snapshot the siblings before decomposing them; the live generator would
    # be cut short by the removals.
    for sibling in list(anchor.next_siblings):
      sibling.decompose()
    anchor.decompose()
  return content.get_text(separator = "\n", strip = True)

In [20]:
# Additional pages to download. Later cells address them by position
# (index 0, 1, 2), so the order of this list matters.
URLS_TO_SCRAPE = [
    'https://en.wikipedia.org/wiki/History_of_Pittsburgh',
    'https://www.britannica.com/place/Pittsburgh',
    'https://www.cmu.edu/about/history.html'
]

In [48]:
# Download and parse every page in URLS_TO_SCRAPE, keeping the list order.
# A failed download still yields an entry: req() returns the string "failed",
# which is parsed like any other markup.
soups = [soup(req(url)) for url in URLS_TO_SCRAPE]

In [25]:
# Locate the article body of the first page in URLS_TO_SCRAPE and strip its
# reference / edit-section tags; the result is None if the class is not found.
content_div2 = content(soups[0], "mw-content-ltr mw-parser-output")

In [28]:
# Cut the trailing sections and flatten the article to plain text. After this
# cell content_div2 is a string (or None), so the cell above must be re-run
# before running this one a second time.
# "External_links" takes a single underscore, as in the section list used for
# the first article; the former "External__links" could never match a heading.
content_div2 = remove(["See_also", "References", "Bibliography", "External_links"], content_div2)

In [None]:
# Show the extracted text for a manual check.
print(content_div2)

In [30]:
# Store the cleaned text under its source URL.
data[URLS_TO_SCRAPE[0]] = content_div2

In [51]:
# Extract the main content of the second page in URLS_TO_SCRAPE.
# The class string is kept in one place so that the failure message always
# names the class that was actually searched for.
second_page_class = "page2ref-true topic-content topic-type-REGULAR"
content_div3 = soups[1].find('div', class_=second_page_class)
if content_div3:
    # Every <span> inside the container is discarded before text extraction.
    for tag in content_div3.find_all('span'):
        tag.decompose()
    content_div3 = content_div3.get_text(separator = "\n", strip = True)
else:
    content_div3 = None
    print(f"Could not find a div with class '{second_page_class}' in the second URL.")

Could not find a div with class 'reading-channel' in the second URL.


In [None]:
# Store whatever was extracted for the second URL. content_div3 is None when
# the container was not found, so a failed scrape still records None, while a
# successful one is no longer thrown away by a hard-coded None.
data[URLS_TO_SCRAPE[1]] = content_div3

In [53]:
# Extract the main content of the third page in URLS_TO_SCRAPE.
content_div4 = soups[2].find('div', class_ = "layout-content")
if content_div4 is not None:
    content_div4 = content_div4.get_text(separator = '\n', strip = True)
else:
    # find() returns None when the download failed or the layout changed;
    # record None instead of crashing on .get_text().
    print("Could not find a div with class 'layout-content' in the third URL.")
data[URLS_TO_SCRAPE[2]] = content_div4

In [68]:
# Event listing pages. NOTE(review): no other cell visible in this notebook
# reads this list -- confirm whether it is still needed.
events_urls = ["https://community.pghcitypaper.com/pittsburgh/EventSearch",
               ]

In [None]:
# Scrape the paginated event search results, one request per page.
BASE_URL = 'https://community.pghcitypaper.com/pittsburgh/EventSearch'

# Number of result pages to request.
TOTAL_PAGES = 26

# One entry per successfully scraped page: the text of that page's result list.
all_events_data = []

for page_num in range(1, TOTAL_PAGES + 1):
    current_url = f"{BASE_URL}?page={page_num}&v=d"
    print(f"Scraping page {page_num} of {TOTAL_PAGES}...")

    try:
        response = requests.get(current_url, headers=headers, timeout=10)
        response.raise_for_status()

        # Named page_soup so that the soup() helper defined earlier in the
        # notebook is not overwritten by a parsed document.
        page_soup = BeautifulSoup(response.content, 'html.parser')

        results_container = page_soup.find('ul', class_="pres-EventSearchRectangle uk-list uk-list-divider uk-flex@show-grid")

        if results_container is None:
            print("  -> No events found on this page. Stopping.")
            break
        all_events_data.append(results_container.get_text(separator = '\n', strip = True))

    except requests.exceptions.RequestException as e:
        # Network or HTTP failure: report it and move on to the next page.
        print(f"  [!] Could not fetch page {page_num}: {e}")
    except Exception as e:
        print(f"  [!] An error occurred while parsing page {page_num}: {e}")

# The list holds one text blob per page, so the count reported is pages, not events.
print(f"\n✅ Scraping complete. Total pages collected: {len(all_events_data)}")

Scraping page 1 of 26...
Scraping page 2 of 26...
Scraping page 3 of 26...
Scraping page 4 of 26...
Scraping page 5 of 26...
Scraping page 6 of 26...
Scraping page 7 of 26...
Scraping page 8 of 26...
Scraping page 9 of 26...
Scraping page 10 of 26...
Scraping page 11 of 26...
Scraping page 12 of 26...
Scraping page 13 of 26...
Scraping page 14 of 26...
Scraping page 15 of 26...
Scraping page 16 of 26...
Scraping page 17 of 26...
Scraping page 18 of 26...
Scraping page 19 of 26...
Scraping page 20 of 26...
Scraping page 21 of 26...
Scraping page 22 of 26...
Scraping page 23 of 26...
Scraping page 24 of 26...
Scraping page 25 of 26...
Scraping page 26 of 26...

✅ Scraping complete. Total events collected: 26


In [75]:
import re

# Patterns for lines that are listing boilerplate rather than event text.
phone_line = re.compile(r'^\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$')
# The leading "$" is mandatory here. With it optional, the trailing ".*" made
# the pattern match every line that merely starts with a digit, so any such
# line was silently dropped along with the prices.
price_line = re.compile(r'^\$\d+(\.\d{2})?.*$')

# Pages are joined with a whitespace-only separator line, which the loop
# below discards together with all other blank lines.
raw_text = "\n \n".join(all_events_data)

cleaned_lines = []
for line in raw_text.split('\n'):
    stripped_line = line.strip()
    if not stripped_line or stripped_line == "Get Tickets":
        continue  # blank line or ticket-button label
    if phone_line.match(stripped_line) or price_line.match(stripped_line):
        continue  # phone number or price
    cleaned_lines.append(stripped_line)

# Store the cleaned listing text under the search URL.
content_div5 = "\n".join(cleaned_lines)
data[BASE_URL] = content_div5

In [79]:
# Persist everything collected so far as pretty-printed JSON.
filename = 'data.json'

# Mode 'w' overwrites any existing file of the same name.
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)

print(f"✅ Dictionary successfully saved to '{filename}'")

✅ Dictionary successfully saved to 'data.json'


In [2]:
# Scrape the events calendar once per month for the years 2015-2025.
base_url = "https://downtownpittsburgh.com/events/?n=12&y=2025&cat=0"
# base_url already carries a query string (and is reused later as the key the
# events are stored under), so appending "?n=..." to it produced a malformed
# URL. Request the bare endpoint and pass the query as parameters instead.
events_endpoint = base_url.split('?', 1)[0]
text = []

for year in range(2015, 2026):
    for month in range(1, 13):
        try:
            # Assumes "n" is the month and "y" the year, as the sample query
            # in base_url suggests -- TODO confirm against the site.
            response = requests.get(
                events_endpoint,
                params={'n': month, 'y': year, 'cat': 0},
                headers=headers,
                timeout=10,
            )
            response.raise_for_status()

            # Named page_soup so that the soup() helper defined earlier in
            # the notebook is not overwritten by a parsed document.
            page_soup = BeautifulSoup(response.content, 'html.parser')
            for event in page_soup.find_all('div', class_="eventitem"):
                copy_content = event.find('div', class_='copyContent')

                # Defaults used whenever an element is missing.
                name = "N/A"
                event_time = "N/A"
                description = "N/A"

                if copy_content:
                    # Event name: text of the <a> inside the <h1>.
                    name_tag = copy_content.find('h1')
                    if name_tag and name_tag.a:
                        name = name_tag.a.get_text(strip=True)

                    # Event date/time: the div with class 'eventdate'.
                    time_tag = copy_content.find('div', class_='eventdate')
                    if time_tag:
                        event_time = time_tag.get_text(strip=True)

                        # Description: taken from the node right after the
                        # date div, but only when that node is plain text.
                        desc_node = time_tag.next_sibling
                        if desc_node and isinstance(desc_node, str):
                            description = desc_node.strip()

                text.append({
                    'event_name': name,
                    'time': event_time,
                    'description': description,
                    # No dedicated location element is read; placeholder only.
                    'location': 'See description'
                })
        except requests.exceptions.RequestException as e:
            # Report which month failed instead of an anonymous "Error...".
            print(f"Error fetching {month}/{year}: {e}")
        except Exception as e:
            print(f"Error parsing {month}/{year}: {e}")

In [3]:
filename = 'data.json'

# Remains an empty dict when the file is missing or cannot be parsed.
loaded_data = {}

try:
    # Read the JSON document back into a Python dictionary.
    with open(filename, 'r', encoding='utf-8') as f:
        loaded_data = json.load(f)
except FileNotFoundError:
    print(f"[!] Error: The file '{filename}' was not found.")
except json.JSONDecodeError:
    print(f"[!] Error: The file '{filename}' is not a valid JSON file.")
else:
    print(f"✅ JSON file '{filename}' loaded successfully!")

# Only report contents when something was actually loaded.
if loaded_data:
    history_key = 'https://en.wikipedia.org/wiki/History_of_Pittsburgh'

    print("\n--- File Contents ---")
    print(loaded_data)

    # Look up one entry by its URL key; .get() yields None if it is absent.
    print("\nAccessing a specific key:")
    print(f"Project Name: {loaded_data.get(history_key)}")

✅ JSON file 'data.json' loaded successfully!

--- File Contents ---
{'https://en.wikipedia.org/wiki/Pittsburgh': 'Second-most populous city in Pennsylvania, U.S.\nThis article is about the city in Pennsylvania. For the region, see\nGreater Pittsburgh\n. For other uses, see\nPittsburgh (disambiguation)\n.\nNot to be confused with\nPittsburg\n.\nCity in Pennsylvania, United States\nPittsburgh\nDionde:gâ\n(\nSeneca\n)\nCity\nDowntown Pittsburgh\nDuquesne Incline\nPhipps Conservatory and Botanical Gardens\nLawrenceville\nPNC Park\nCathedral of Learning\nCarnegie Museums of Pittsburgh\nFlag\nSeal\nCoat of arms\nNickname(s):\nCity of Bridges, Steel City,\nCity of Champions, The \'Burgh, The Paris of Appalachia\nMotto:\nBenigno Numine\n("With the benevolent deity")\nInteractive map of Pittsburgh\nPittsburgh\nShow map of Pennsylvania\nPittsburgh\nShow map of the United States\nCoordinates:\n40°26′23″N\n79°58′35″W\n\ufeff / \ufeff\n40.43972°N 79.97639°W\n\ufeff /\n40.43972; -79.97639\nCountry\n

In [4]:
# Continue from the dictionary read back from disk and add the current
# contents of `text` under the current `base_url`. `data` and `loaded_data`
# are the same object after this cell.
data = loaded_data
data[base_url] = text

In [None]:
# Drop the placeholder 'location' field from every event record, in place.
# The records are modified where they live, so `text` stays the list of
# records instead of being rebound to the popped placeholder strings, and the
# cell can be re-run safely (pop with a default never raises).
for event_record in text:
    event_record.pop('location', None)

In [16]:
# Write the updated dictionary back to data.json, overwriting the file.
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)

In [5]:
# Scrape the text of each listed event page under the engage/events section.
base_url = "https://www.cmu.edu/engage/events"
events_list = ["reunion-weekend", "alumni-awards", "online-events", "spring-carnival", "homecoming", "tartans-on-the-rise"]
text = []

for event in events_list:
  current_url = f"{base_url}/{event}"
  r = requests.get(current_url, headers=headers, timeout=10)
  r.raise_for_status()

  # Named page_soup so that the soup() helper defined earlier in the notebook
  # is not overwritten by a parsed document.
  page_soup = BeautifulSoup(r.content, "html.parser")
  region = page_soup.find('div', class_ = "region region-content")
  if region is None:
    # find() returns None when the page has no such container; skip the page
    # instead of failing on .get_text().
    print(f"Could not find the content region on {current_url}; skipping.")
    continue
  text.append(region.get_text(separator = '\n', strip = True))


In [8]:
filename = 'data.json'

# Remains an empty dict when the file is missing or cannot be parsed.
loaded_data = {}

try:
    # Read the JSON document back into a Python dictionary.
    with open(filename, 'r', encoding='utf-8') as f:
        loaded_data = json.load(f)
except FileNotFoundError:
    print(f"[!] Error: The file '{filename}' was not found.")
except json.JSONDecodeError:
    print(f"[!] Error: The file '{filename}' is not a valid JSON file.")
else:
    print(f"✅ JSON file '{filename}' loaded successfully!")

# Only report contents when something was actually loaded.
if loaded_data:
    history_key = 'https://en.wikipedia.org/wiki/History_of_Pittsburgh'

    print("\n--- File Contents ---")
    print(loaded_data)

    # Look up one entry by its URL key; .get() yields None if it is absent.
    print("\nAccessing a specific key:")
    print(f"Project Name: {loaded_data.get(history_key)}")

✅ JSON file 'data.json' loaded successfully!

--- File Contents ---
{'https://en.wikipedia.org/wiki/Pittsburgh': 'Second-most populous city in Pennsylvania, U.S.\nThis article is about the city in Pennsylvania. For the region, see\nGreater Pittsburgh\n. For other uses, see\nPittsburgh (disambiguation)\n.\nNot to be confused with\nPittsburg\n.\nCity in Pennsylvania, United States\nPittsburgh\nDionde:gâ\n(\nSeneca\n)\nCity\nDowntown Pittsburgh\nDuquesne Incline\nPhipps Conservatory and Botanical Gardens\nLawrenceville\nPNC Park\nCathedral of Learning\nCarnegie Museums of Pittsburgh\nFlag\nSeal\nCoat of arms\nNickname(s):\nCity of Bridges, Steel City,\nCity of Champions, The \'Burgh, The Paris of Appalachia\nMotto:\nBenigno Numine\n("With the benevolent deity")\nInteractive map of Pittsburgh\nPittsburgh\nShow map of Pennsylvania\nPittsburgh\nShow map of the United States\nCoordinates:\n40°26′23″N\n79°58′35″W\n\ufeff / \ufeff\n40.43972°N 79.97639°W\n\ufeff /\n40.43972; -79.97639\nCountry\n

In [9]:
# Continue from the dictionary read back from disk and add the current
# contents of `text` under the current `base_url`. `data` and `loaded_data`
# are the same object after this cell.
data = loaded_data
data[base_url] = text

In [10]:
# Write the updated dictionary back to data.json, overwriting the file.
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)

In [11]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-6.1.1-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.1.1-py3-none-any.whl (323 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/323.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.5/323.5 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.1.1


In [16]:
from pypdf import PdfReader
import io
from pathlib import Path

# Extract the text of the budget PDF and store it under its source URL.
pdf_url = "https://www.pittsburghpa.gov/files/assets/city/v/1/omb/documents/operating-budgets/24731_2024_operating_budget_2.pdf"
local_pdf = Path("/content/24731_2024_operating_budget_2.pdf")

if local_pdf.exists():
    # Use the previously downloaded copy when it is present.
    reader = PdfReader(str(local_pdf))
else:
    # On a fresh machine the local copy does not exist; read the document
    # straight from pdf_url instead of failing.
    pdf_response = requests.get(pdf_url, headers=headers, timeout=60)
    pdf_response.raise_for_status()
    reader = PdfReader(io.BytesIO(pdf_response.content))

all_text = []
for page in reader.pages:
    # page_text, not `text`: other cells use `text` for scraped records, and
    # rebinding it here would leak a single PDF page into them on a re-run.
    page_text = page.extract_text()
    if page_text:  # skip pages that yield no text
        all_text.append(page_text)

# Join all the page texts together into one large string.
full_text_content = "\n\n".join(all_text)
print("Text extraction complete.")

data[pdf_url] = full_text_content

Text extraction complete.


In [20]:
# Write the dictionary, now including the PDF text, back to data.json.
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)

In [22]:
# Same write as the previous cell; it rewrites data.json from the current
# contents of `data`.
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)