In [259]:
history_wiki_1 ='https://en.wikipedia.org/wiki/Pittsburgh'
history_wiki_2 ='https://en.wikipedia.org/wiki/History_of_Pittsburgh'
more =          'https://en.wikipedia.org/wiki/List_of_museums_in_Pittsburgh'
brittannica =    'https://www.britannica.com/place/Pittsburgh'
pit_gov =       ['https://pittsburghpa.gov/index.html']
downtown_cal =  ['https://downtownpittsburgh.com/events/']
event_cal =     ['https://pittsburgh.events/']
city_paper =    ['https://www.pghcitypaper.com/pittsburgh/EventSearch?v=d']
about_cmu =     ['https://www.cmu.edu/about/']
cmu_events =    ['https://events.cmu.edu/']
campus_events = ['https://www.cmu.edu/engage/alumni/events/campus/index.html']
food_fest =     ['https://www.visitpittsburgh.com/events-festivals/food-festivals/']
pickle =        ['https://www.picklesburgh.com/']
taco_fest =     ['https://www.pghtacofest.com/']
restaurant_w =  ['https://pittsburghrestaurantweek.com/']
little_italy =  ['https://littleitalydays.com/']
symphony =      ['https://www.pittsburghsymphony.org/']
visit_pit =     ['https://www.visitpittsburgh.com/']
tax_reg =       ['https://pittsburghpa.gov/finance/tax-forms']
op_budget =     ['https://apps.pittsburghpa.gov/redtail/images/23255_2024_Operating_Budget.pdf']
opera =         ['https://pittsburghopera.org/']
cultural_trust =['https://trustarts.org/']
carn_museum =   ['https://carnegiemuseums.org/']
heinz_museum =  ['https://www.heinzhistorycenter.org/']
frick_museum =  ['https://www.thefrickpittsburgh.org/']
banana_split =  ['https://bananasplitfest.com/']
visit_pitsb  =  ['https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/']
pirates =       ['https://www.mlb.com/pirates']
steelers =      ['https://www.steelers.com/']
penguins =      ['https://www.nhl.com/penguins/']




In [260]:
pip install requests beautifulsoup4 pandas tabulate

^C
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


# Scrape Wikipedia Pages

In [50]:
import requests
from bs4 import BeautifulSoup
import os

def clean_text(text):
    return ' '.join(text.split())


def truncate(text, max_len):
    return text[:max_len]


def format_table_manually(table_html, max_col_width=30):
    EXCLUDE_PHRASES = [
        "This section does not cite any",
        "This section needs additional",
        "This section needs expansion",
    ]

    rows = table_html.find_all('tr')
    table_data = []
    table_name = None

    for i, row in enumerate(rows):
        cols = row.find_all(['th', 'td'])
        cols_text = [clean_text(col.get_text()) for col in cols]

        if i == 0:

            if len(cols) == 1 and cols[0].name == 'th':
                table_name = cols_text[0]
                continue  
            else:
                table_name = "No Name"

        if any(phrase in cell for cell in cols_text for phrase in EXCLUDE_PHRASES):
            continue


        if len(cols_text) > 0:
            table_data.append(cols_text)

    if not table_data:
        return "", table_name

    num_cols = max(len(row) for row in table_data)
    non_empty_columns = [False] * num_cols

    for row in table_data:
        for i in range(num_cols):
            if i < len(row) and row[i].strip():
                non_empty_columns[i] = True

    columns_to_keep = [i for i, has_content in enumerate(non_empty_columns) if has_content]

    if not columns_to_keep:
        return "", table_name

    new_table_data = []
    for row in table_data:
        new_row = [row[i] if i < len(row) else '' for i in columns_to_keep]
        new_table_data.append(new_row)

    table_data = new_table_data
    num_cols = len(columns_to_keep)

    col_widths = [0] * num_cols
    for row in table_data:
        for i, col in enumerate(row):
            col_length = min(len(col), max_col_width)  
            col_widths[i] = max(col_widths[i], col_length)

    table_str = ""
    border_line = "+" + "+".join(["-" * width for width in col_widths]) + "+\n"
    table_str += border_line

    for row in table_data:
        formatted_row = "|".join(f"{truncate(col, col_widths[i]):<{col_widths[i]}}" for i, col in enumerate(row))
        table_str += f"|{formatted_row}|\n"
        table_str += border_line

    return table_str.strip(), table_name

def scrape_wikipedia_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    EXCLUDED_SECTIONS = [
        "See also",
        "Explanatory notes",
        "References",
        "Further reading",
        "External links",
        "Bibliography",
        "Notes"
    ]

    content = soup.find('div', {'id': 'bodyContent'})
    sections = content.find_all(['h1', 'h2', 'h3', 'p', 'table', 'ul', 'ol'])

    output = ""
    current_section = "Intro"
    section_content = ""

    for section in sections:
        if section.name in ['h1', 'h2', 'h3']:
            section_title = clean_text(section.get_text())

            if section_title in EXCLUDED_SECTIONS:
                if section_content.strip():
                    output += f"=section_start=\n=section name=\"{current_section}\"\n{section_content.strip()}\n=section_end=\n"
                current_section = None
                section_content = ""
                continue
            else:
                if section_content.strip():
                    output += f"=section_start=\n=section name=\"{current_section}\"\n{section_content.strip()}\n=section_end=\n"
                current_section = section_title
                section_content = ""
                continue

        if current_section:
            if section.name == 'p':
                paragraph = clean_text(section.get_text())
                section_content += f"{paragraph}\n"
            elif section.name in ['ul', 'ol']:
                list_items = section.find_all('li')
                for item in list_items:
                    item_text = clean_text(item.get_text())
                    section_content += f"- {item_text}\n"
            elif section.name == 'table':
                table_content_str, table_name = format_table_manually(section)
                if table_content_str:
                    section_content += f"=== Table ===\nTable Name: {table_name}\n{table_content_str}\n=== End of Table ===\n"

    if current_section and section_content.strip():
        output += f"=section_start=\n=section name=\"{current_section}\"\n{section_content.strip()}\n=section_end=\n"

    return output.strip()

def scrape_and_save(url_variable_name, url):
    """Scrape Wikipedia page and save content as 'data/{url_variable_name}.txt'."""
    scraped_content = scrape_wikipedia_page(url)
    os.makedirs("data", exist_ok=True)
    file_path = f"data/{url_variable_name}.txt"
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(scraped_content)

    print(f"Done scraping, saved to {file_path}.")

history_wiki_1 = 'https://en.wikipedia.org/wiki/Pittsburgh'
history_wiki_2 = 'https://en.wikipedia.org/wiki/History_of_Pittsburgh'
more = 'https://en.wikipedia.org/wiki/List_of_museums_in_Pittsburgh'

scrape_and_save("history_wiki_1", history_wiki_1)
scrape_and_save("history_wiki_2", history_wiki_2)
scrape_and_save("more", more)


Done scraping, saved to data/history_wiki_1.txt.
Done scraping, saved to data/history_wiki_2.txt.
Done scraping, saved to data/more.txt.


# Scrape Brittanica

Delete the garbage info produced by the adds

In [5]:
import requests
from bs4 import BeautifulSoup
import os

def clean_text(text):
    return ' '.join(text.split())

def scrape_britannica_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    content = soup.find('div', {'class': 'md-content'})
    if not content:
        print("Error: Main content not found on the page.")
        return ""

    ad_related_classes = ["ads", "ad-container", "sponsored-content", "advertisement", "breadcrumb"]

    for ad_section in content.find_all(True, class_=ad_related_classes):
        ad_section.decompose()  
    sections = content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'p', 'ul', 'ol', 'div'])

    output = ""
    current_section = None
    section_content = ""

    EXCLUDED_SECTIONS = [
        "Intro",
        "Pittsburgh",
        "Recent News",
        "See also",
        "References",
        "External Links",
        "Citations",
        "Additional Reading",
        "More Articles On This Topic",
        "Article History",
        "Contributors",
        "Edit History"
    ]

    EXCLUDED_PHRASES = [
        "Are you a student",
        "Get a special academic rate on Britannica Premium",
        "Subscribe"
    ]

    for section in sections:

        if section.find('img') or section.get('class') == ['image-caption']:
            continue


        if section.name in ['h1', 'h2', 'h3', 'h4', 'h5']:
            section_title = clean_text(section.get_text())


            if section_title in EXCLUDED_SECTIONS:
                if section_content.strip():
                    output += f"=section_start=\n=section name=\"{current_section}\"\n{section_content.strip()}\n=section_end=\n"
                current_section = None
                section_content = ""
                continue
            else:

                if section_content.strip():
                    output += f"=section_start=\n=section name=\"{current_section}\"\n{section_content.strip()}\n=section_end=\n"
                current_section = section_title
                section_content = ""
                continue

        if section.name == 'p':
            paragraph = clean_text(section.get_text())
            
            if not any(phrase in paragraph for phrase in EXCLUDED_PHRASES):

                if current_section is None:
                    current_section = "Introduction"
                    section_content = ""
                section_content += f"{paragraph}\n"

        elif section.name in ['ul', 'ol']:
            list_items = section.find_all('li')
            for item in list_items:
                item_text = clean_text(item.get_text())
                

                if not any(phrase in item_text for phrase in EXCLUDED_PHRASES):
                    if current_section is None:
                        current_section = "Introduction"
                        section_content = ""
                    section_content += f"- {item_text}\n"

        elif section.name == 'div':
            div_text = clean_text(section.get_text())
            if div_text and div_text not in EXCLUDED_SECTIONS:
                if not any(phrase in div_text for phrase in EXCLUDED_PHRASES):
                    if current_section is None:
                        current_section = "Introduction"
                        section_content = ""
                    section_content += f"{div_text}\n"
    if current_section and section_content.strip():
        output += f"=section_start=\n=section name=\"{current_section}\"\n{section_content.strip()}\n=section_end=\n"

    return output.strip()

def scrape_and_save(url_variable_name, url):
    """Scrape Britannica page and save content as 'data/{url_variable_name}.txt'."""
    scraped_content = scrape_britannica_page(url)
    os.makedirs("data", exist_ok=True)
    file_path = f"data/{url_variable_name}.txt"
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(scraped_content)

    print(f"Done scraping, saved to {file_path}.")

britannica_url = 'https://www.britannica.com/place/Pittsburgh'

scrape_and_save("britannica", britannica_url)


Done scraping, saved to data/britannica.txt.


# gov website:https://pittsburghpa.gov/index.html

In [217]:
import requests
from bs4 import BeautifulSoup
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
url = 'https://pittsburghpa.gov/index.html'
response = requests.get(url, verify=False)
soup = BeautifulSoup(response.content, 'html.parser')
links_dict = {}

for link in soup.find_all('a', href=True):
    href = link['href']
    text = link.get_text(strip=True)
    
    if not text:
        text = link.get('title', '')
    
    if not text:
        img = link.find('img', alt=True)
        if img:
            text = img['alt']
    
    if text:
        if text in links_dict:
            if isinstance(links_dict[text], list):
                links_dict[text].append(href)
            else:
                links_dict[text] = [links_dict[text], href]
        else:
            links_dict[text] = href

In [218]:
links_dict

{'PITTSBURGH': '../index.html',
 'REGISTER TO VOTE': 'https://www.votespa.com/Pages/default.aspx',
 'GUÍA DE RESIDENTES': ['https://pittsburghpa.gov/guia-para-residentes-de-la-ciudad-de-pittsburgh/introduccion',
  'https://pittsburghpa.gov/guia-para-residentes-de-la-ciudad-de-pittsburgh/introduccion'],
 '311': ['https://pittsburghpa.gov/311',
  'https://pittsburghpa.gov/311',
  'https://pittsburghpa.gov/311',
  'https://pittsburghpa.gov/311/',
  'https://pittsburghpa.gov/311/'],
 'COVID-19 UPDATES': 'https://pittsburghpa.gov/mayor/covid-updates',
 'BUILDING ACCESSIBILITY': 'https://pittsburghpa.gov/dcp/ccb-ada',
 'CONTACT US': ['https://pittsburghpa.gov/city-info/frequent-numbers',
  'https://pittsburghpa.gov/city-info/frequent-numbers'],
 'FOLLOW US': ['https://pittsburghpa.gov/city-info/socialmedia',
  'https://pittsburghpa.gov/city-info/socialmedia'],
 'RESIDENTS': ['page.html', '#'],
 'Citiparks': ['https://pittsburghpa.gov/citiparks/parks.html',
  'https://pittsburghpa.gov/citipar

In [219]:
links_dict['VIEW ALL NEWS'] = ['https://pittsburghpa.gov/inc/announcement.html?ta=mayor']
links_dict['PWSA Bills'] = ['https://www.pgh2o.com/pay']
links_dict['Right of Way Permits'] = ['https://pittsburghpa.gov/domi/right-of-way']
links_dict['Dog Licensing'] =  ['https://pittsburghpa.gov/publicsafety/dog-license']
links_dict['311 (non-emergency requests)'] = ['https://pittsburghpa.gov/311']
links_dict['Tax information'] =['https://pittsburghpa.gov/finance/tax-descriptions']
links_dict['Recycling/Trash Collection Schedule']  = ['https://pittsburghpa.gov/dpw/collection-schedule']
links_dict['Police Reports'] = ['https://goo.gl/ZPbkRa']
links_dict['Parking Information'] = ['https://www.pittsburghparking.com/']

In [220]:
del_entries = ['Close','YouTube','Social Media Hub','Twitter','Open Gov Portal','Accessibility',
               'Right to Know Policy', 'PAFR','Online Shelter Permit','Contract Bids','Housing Assistance Resource Portal',
               'Vendor Registration','Budget and Tax Receipt Simulations','Financial Audits','WPRDC.org',
                'Buildingeye','City Budget','Home Rule Charter','PGH City Ordinances Code','Expenditure Reports',
                'Current Contracts','City Buying Plan','Black Pittsburgh Matters','Boards, Authorities & Commissions','DPW PERMITS',
                'PORT AUTHORITY','MEETINGS/AGENDAS','BID OPPORTUNITIES','OPEN DATA','PRESS RELEASES',
                'MAPS: GIS, ZONING','ANIMAL CONTROL','COLLECTIONTrash & Recycling Schedule...',
                'Electronics & Household Hazardous Materials Recycling','Read More', 'Real Estate Taxes',
                'Parking Tickets','Parking Leases','Facility Rentals','Alarm Registrations','PAY',
                'Vendor Licenses','Residential Building Permits','Event & Film Permits',
                'Commercial Building Permits','Business Discontinuation','Alarm Registration','REGISTER/SUBMIT',
                'Zoning Information','Police Reports','Paving Schedule','Open Data','Interactive City Data Map',
                'ONLINE APPS','Pittsburgh Logo', 'EXPLORE','CITY INFO', 'CITY HALL', 'Boards, Authorities, Commissions',
                'BUSINESS','VISITORS', 'RESIDENTS' ,'PITTSBURGH','REGISTER TO VOTE','BUILDING ACCESSIBILITY',
                'FOLLOW US', 'City Planning', 'Engage PGH','Ethics Hearing Board', 'Finance','Voting Districts & Polling Places',
                'Winter Resource Center', 'Next Pittsburgh', 'VisitPittsburgh', 'Welcoming Pittsburgh', 'Bid Opportunities', 'Legislative Information Center','Press Releases' ]
for val in del_entries:
    del links_dict[val]



In [221]:
links_dict['Police Data Portal'] = ['https://pittsburghpa.gov/police/police-branches',
                                    'https://pittsburghpa.gov/police/police-investigations',
                                    'https://pittsburghpa.gov/police/police-administration',
                                    ]

In [222]:
links_dict['Citiparks'] = ['https://pittsburghpa.gov/citiparks/beta/our-parks.html',
                                  'https://pittsburghpa.gov/citiparks/citiparks-swimming',
                                  'https://pittsburghpa.gov/citiparks/spray-park',
                                  'https://pittsburghpa.gov/citiparks/tennis',
                                  'https://pittsburghpa.gov/park-shelter/reserve',
                                  'https://pittsburghpa.gov/dpw/field-permit',
                                  'https://pittsburghpa.gov/citiparks/farmers-market',
                                  'https://pittsburghpa.gov/citiparks/citisports',
                                  'https://pittsburghpa.gov/schenley/rink',
                                  'https://pittsburghpa.gov/citiparks/rec-centers-info',
                                  'https://pittsburghpa.gov/citiparks/senior-centers',
                                  'https://pittsburghpa.gov/publicsafety/park-rangers',
                                  'https://pittsburghpa.gov/events/',
                                  'https://pittsburghpa.gov/citiparks/rec2tech',
                                  'https://pittsburghpa.gov/citiparks/citiparks-directory',
                                  'https://pittsburghpa.gov/citiparks/pickleball',
                                  'https://pittsburghpa.gov/events/cinema',
                                  'https://pittsburghpa.gov/events/concerts',
                                  'https://pittsburghpa.gov/events/footraces'
                                  ]


In [223]:
links_dict['Job Opportunities'] = [ 'https://www.governmentjobs.com/careers/pittsburgh?page=1',
                                    'https://www.governmentjobs.com/careers/pittsburgh?page=2',
                                    'https://www.governmentjobs.com/careers/pittsburgh?page=3',
                                    'https://www.governmentjobs.com/careers/pittsburgh?page=4',
                                    'https://www.governmentjobs.com/careers/pittsburgh?page=5',
                                    'https://www.governmentjobs.com/careers/pittsburgh?page=6',
                                    'https://www.governmentjobs.com/careers/pittsburgh?page=7',
                                    'https://www.governmentjobs.com/careers/pittsburgh?page=8',
                                    'https://www.governmentjobs.com/careers/pittsburgh?page=9',
                                    'https://www.governmentjobs.com/careers/pittsburgh?page=10']

In [224]:
links_dict['Human Resources and Civil Service'] = ['https://pittsburghpa.gov/humanresources/index.html',
                                                    'https://www.governmentjobs.com/careers/pittsburgh/transferjobs?page=1',
                                                    'https://www.governmentjobs.com/careers/pittsburgh/transferjobs?page=2',
                                                    'https://pittsburghpa.gov/humanresources/civil-service-commission',
                                                    'https://pittsburghpa.gov/humanresources/csc-hearings']

In [225]:
for key, value in links_dict.items():
        if isinstance(value, str):  
            links_dict[key] = [value]

In [226]:
for key, value in links_dict.items():
    links_dict[key] = list(set(value))
links_dict


{'GUÍA DE RESIDENTES': ['https://pittsburghpa.gov/guia-para-residentes-de-la-ciudad-de-pittsburgh/introduccion'],
 '311': ['https://pittsburghpa.gov/311', 'https://pittsburghpa.gov/311/'],
 'COVID-19 UPDATES': ['https://pittsburghpa.gov/mayor/covid-updates'],
 'CONTACT US': ['https://pittsburghpa.gov/city-info/frequent-numbers'],
 'Citiparks': ['https://pittsburghpa.gov/citiparks/citisports',
  'https://pittsburghpa.gov/citiparks/beta/our-parks.html',
  'https://pittsburghpa.gov/citiparks/rec2tech',
  'https://pittsburghpa.gov/schenley/rink',
  'https://pittsburghpa.gov/events/cinema',
  'https://pittsburghpa.gov/dpw/field-permit',
  'https://pittsburghpa.gov/citiparks/farmers-market',
  'https://pittsburghpa.gov/park-shelter/reserve',
  'https://pittsburghpa.gov/citiparks/citiparks-swimming',
  'https://pittsburghpa.gov/citiparks/senior-centers',
  'https://pittsburghpa.gov/citiparks/tennis',
  'https://pittsburghpa.gov/events/',
  'https://pittsburghpa.gov/citiparks/citiparks-directo

In [232]:
# pip install  selenium
#pip install webdriver-manager

In [243]:
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup


def clean_text(text):
    return ' '.join(text.split())

def extract_table(table):
    rows = table.find_all('tr')
    table_data = []

    for row in rows:
        cols = row.find_all(['th', 'td'])
        cols_text = [clean_text(col.get_text()) for col in cols]
        if cols_text:
            table_data.append(cols_text)

    if not table_data:
        return ""

    col_widths = []
    for col in zip(*table_data):
        col_widths.append(max(len(cell) for cell in col))

    table_str = ""
    for row in table_data:
        padded_row = [cell.ljust(width) for cell, width in zip(row, col_widths)]
        table_str += ' | '.join(padded_row) + '\n'

    return table_str.strip()

def extract_content(soup):
    for script_or_style in soup(['script', 'style', 'noscript']):
        script_or_style.extract()

    content = ""
    elements = soup.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'ol', 'table'])

    for elem in elements:
        if elem.name in ['h1', 'h2', 'h3']:
            content += f"\n\n{elem.get_text(strip=True)}\n\n"
        elif elem.name == 'p':
            content += f"{elem.get_text(strip=True)}\n"
        elif elem.name in ['ul', 'ol']:
            for li in elem.find_all('li'):
                content += f"- {li.get_text(strip=True)}\n"
        elif elem.name == 'table':
            table_text = extract_table(elem)
            if table_text:
                content += f"\n=== Table ===\n{table_text}\n=== End of Table ===\n"

    return content.strip()

def scrape_page(url, key, driver):
    try:
        driver.get(url)
        time.sleep(2)

        
        try:
            
            expand_buttons = driver.find_elements(By.CSS_SELECTOR, '.expand, .more, .show-more, .accordion-toggle')
            for button in expand_buttons:
                driver.execute_script("arguments[0].click();", button)
                time.sleep(0.5)
        except Exception as e:
            pass  

        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        content = extract_content(soup)
        return content
    except WebDriverException as e:
        print(f"WebDriverException scraping URL: Key='{key}', URL='{url}', Error='{e}'")
        return ""
    except Exception as e:
        print(f"Error scraping URL: Key='{key}', URL='{url}', Error='{e}'")
        return ""


os.makedirs('data', exist_ok=True)
output_file = 'data/pit_gov.txt'


chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--log-level=3")
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

with open(output_file, 'w', encoding='utf-8') as f:
    for key, urls in links_dict.items():
        for url in urls:
            print(f"Scraping Key='{key}', URL='{url}'")
            scraped_content = scrape_page(url, key, driver)
            if scraped_content:
                f.write('=section_start=\n')
                f.write(f'=section name="{key}"\n')
                f.write(f'{scraped_content}\n')
                f.write('=section_end=\n')
            else:
                print(f"No content scraped: Key='{key}', URL='{url}'")

driver.quit()
print(f"Scraping complete. Data saved to '{output_file}'.")



Scraping Key='GUÍA DE RESIDENTES', URL='https://pittsburghpa.gov/guia-para-residentes-de-la-ciudad-de-pittsburgh/introduccion'
Scraping Key='311', URL='https://pittsburghpa.gov/311'
Scraping Key='311', URL='https://pittsburghpa.gov/311/'
Scraping Key='COVID-19 UPDATES', URL='https://pittsburghpa.gov/mayor/covid-updates'
Scraping Key='CONTACT US', URL='https://pittsburghpa.gov/city-info/frequent-numbers'
Scraping Key='Citiparks', URL='https://pittsburghpa.gov/citiparks/citisports'
Scraping Key='Citiparks', URL='https://pittsburghpa.gov/citiparks/beta/our-parks.html'
Scraping Key='Citiparks', URL='https://pittsburghpa.gov/citiparks/rec2tech'
Scraping Key='Citiparks', URL='https://pittsburghpa.gov/schenley/rink'
Scraping Key='Citiparks', URL='https://pittsburghpa.gov/events/cinema'
Scraping Key='Citiparks', URL='https://pittsburghpa.gov/dpw/field-permit'
Scraping Key='Citiparks', URL='https://pittsburghpa.gov/citiparks/farmers-market'
Scraping Key='Citiparks', URL='https://pittsburghpa.go

In [244]:
import os

def remove_block_and_clean(input_file, output_file, block_to_remove):
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()
    content = content.replace(block_to_remove, '')

    lines = content.split('\n')
    cleaned_lines = []
    previous_line_empty = False

    for line in lines:
        if line.strip() == '':
            if not previous_line_empty:
                cleaned_lines.append(line.strip())
            previous_line_empty = True
        else:
            cleaned_lines.append(line)
            previous_line_empty = False

    cleaned_content = '\n'.join(cleaned_lines)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(cleaned_content)

    print(f"All instances of the specified block have been removed.")


input_file = 'data/pit_gov.txt'
output_file = 'data/pit_gov.txt'

block_to_remove = """- PITTSBURGH
- REGISTER TO VOTE
- GUÍA DE RESIDENTES
- 311
- COVID-19 UPDATES
- BUILDING ACCESSIBILITY
- CONTACT US
- FOLLOW US
- RESIDENTS311CitiparksCitizen Police Review BoardCity PlanningCommission On Human RelationsEngage PGHEthics Hearing BoardFinanceInnovation & PerformanceHuman Resources and Civil ServiceJob OpportunitiesMobility & InfrastructureOffice of Community Health & SafetyOffice of Film & Event ManagementPermits, Licenses, & InspectionsPublic SafetyPublic WorksSchenley Skating RinkSpecial EventsVoting Districts & Polling PlacesWinter Resource Center
- 311
- Citiparks
- Citizen Police Review Board
- City Planning
- Commission On Human Relations
- Engage PGH
- Ethics Hearing Board
- Finance
- Innovation & Performance
- Human Resources and Civil Service
- Job Opportunities
- Mobility & Infrastructure
- Office of Community Health & Safety
- Office of Film & Event Management
- Permits, Licenses, & Inspections
- Public Safety
- Public Works
- Schenley Skating Rink
- Special Events
- Voting Districts & Polling Places
- Winter Resource Center
- VISITORSCitiparksEngage PGHExplore PittsburghNext PittsburghOffice of Film & Event ManagementSchenley Skating RinkSpecial EventsVisitPittsburghWelcoming Pittsburgh
- Citiparks
- Engage PGH
- Explore Pittsburgh
- Next Pittsburgh
- Office of Film & Event Management
- Schenley Skating Rink
- Special Events
- VisitPittsburgh
- Welcoming Pittsburgh
- BUSINESSBid OpportunitiesCity PlanningFinanceInnovation & PerformanceLegislative Information CenterMobility & InfrastructurePGH LabPermits, Licenses, & Inspections
- Bid Opportunities
- City Planning
- Finance
- Innovation & Performance
- Legislative Information Center
- Mobility & Infrastructure
- PGH Lab
- Permits, Licenses, & Inspections
- CITY HALLBoards, Authorities, CommissionsCity Clerk's OfficeCity CouncilCity Council Meetings, AgendasCommunity Development Block Grant ProgramComprehensive Municipal Pension Trust FundController's OfficeHuman Resources and Civil ServiceLawMayor's CabinetMayor's OfficeMunicipal Pension FundOffice of Management & BudgetOffice of Municipal InvestigationsOther Post Employment (OPEB) Trust FundPublic SafetyCity Hall History - Public Tours
- Boards, Authorities, Commissions
- City Clerk's Office
- City Council
- City Council Meetings, Agendas
- Community Development Block Grant Program
- Comprehensive Municipal Pension Trust Fund
- Controller's Office
- Human Resources and Civil Service
- Law
- Mayor's Cabinet
- Mayor's Office
- Municipal Pension Fund
- Office of Management & Budget
- Office of Municipal Investigations
- Other Post Employment (OPEB) Trust Fund
- Public Safety
- City Hall History - Public Tours
- ONLINE APPSOneStopPGHBid OpportunitiesCivicCentralBurgh's Eye ViewDashburghEngage PGHFacility ReservationsFiscal FocusLive Website TrafficOnline Alarm RegistrationOpen Book PGHPay Parking TicketsPPA GoMobile PGHPay PPAP Parking LeasePay Real Estate TaxesPGH WatchdogSnow AngelsSnow Plow TrackerFilm & Event PermitsTrash Schedule App
- OneStopPGH
- Bid Opportunities
- CivicCentral
- Burgh's Eye View
- Dashburgh
- Engage PGH
- Facility Reservations
- Fiscal Focus
- Live Website Traffic
- Online Alarm Registration
- Open Book PGH
- Pay Parking Tickets
- PPA GoMobile PGH
- Pay PPAP Parking Lease
- Pay Real Estate Taxes
- PGH Watchdog
- Snow Angels
- Snow Plow Tracker
- Film & Event Permits
- Trash Schedule App
- CITY INFOAbout PittsburghCity DirectoryPoliciesPress ReleasesPublic Safety BlotterRefuse & Recycling CollectionTax FormsCouncil Meetings & HearingsWebsite Release NotesCity CareersWomen's Suffrage CentennialCity InternshipsCitywide Event Schedule
- About Pittsburgh
- City Directory
- Policies
- Press Releases
- Public Safety Blotter
- Refuse & Recycling Collection
- Tax Forms
- Council Meetings & Hearings
- Website Release Notes
- City Careers
- Women's Suffrage Centennial
- City Internships
- Citywide Event Schedule
- 311
- Citiparks
- Citizen Police Review Board
- City Planning
- Commission On Human Relations
- Engage PGH
- Ethics Hearing Board
- Finance
- Innovation & Performance
- Human Resources and Civil Service
- Job Opportunities
- Mobility & Infrastructure
- Office of Community Health & Safety
- Office of Film & Event Management
- Permits, Licenses, & Inspections
- Public Safety
- Public Works
- Schenley Skating Rink
- Special Events
- Voting Districts & Polling Places
- Winter Resource Center
- Citiparks
- Engage PGH
- Explore Pittsburgh
- Next Pittsburgh
- Office of Film & Event Management
- Schenley Skating Rink
- Special Events
- VisitPittsburgh
- Welcoming Pittsburgh
- Bid Opportunities
- City Planning
- Finance
- Innovation & Performance
- Legislative Information Center
- Mobility & Infrastructure
- PGH Lab
- Permits, Licenses, & Inspections
- Boards, Authorities, Commissions
- City Clerk's Office
- City Council
- City Council Meetings, Agendas
- Community Development Block Grant Program
- Comprehensive Municipal Pension Trust Fund
- Controller's Office
- Human Resources and Civil Service
- Law
- Mayor's Cabinet
- Mayor's Office
- Municipal Pension Fund
- Office of Management & Budget
- Office of Municipal Investigations
- Other Post Employment (OPEB) Trust Fund
- Public Safety
- City Hall History - Public Tours
- OneStopPGH
- Bid Opportunities
- CivicCentral
- Burgh's Eye View
- Dashburgh
- Engage PGH
- Facility Reservations
- Fiscal Focus
- Live Website Traffic
- Online Alarm Registration
- Open Book PGH
- Pay Parking Tickets
- PPA GoMobile PGH
- Pay PPAP Parking Lease
- Pay Real Estate Taxes
- PGH Watchdog
- Snow Angels
- Snow Plow Tracker
- Film & Event Permits
- Trash Schedule App
- About Pittsburgh
- City Directory
- Policies
- Press Releases
- Public Safety Blotter
- Refuse & Recycling Collection
- Tax Forms
- Council Meetings & Hearings
- Website Release Notes
- City Careers
- Women's Suffrage Centennial
- City Internships
- Citywide Event Schedule"""

remove_block_and_clean(input_file, output_file, block_to_remove)


All instances of the specified block have been removed.


In [246]:
def remove_strings_from_file(input_file, output_file, strings_to_remove):
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    for string in strings_to_remove:
        content = content.replace(string, '')
        
    lines = content.split('\n')
    cleaned_lines = []
    previous_line_empty = False

    for line in lines:

        if line.strip() == '':
            if not previous_line_empty:
                cleaned_lines.append(line.strip())
            previous_line_empty = True
        else:
            cleaned_lines.append(line)
            previous_line_empty = False

    cleaned_content = '\n'.join(cleaned_lines)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(cleaned_content)

    print(f"Specified strings have been removed, and empty lines cleaned. Output saved to '{output_file}'.")


input_file = 'data/pit_gov.txt'
output_file ='data/pit_gov.txt'

strings_to_remove = [
    """CAREERS

CONTACT US

FOLLOW US

GUÍA DE RESIDENTES""",
"""- RESIDENTS311CitiparksCitizen Police Review BoardCity PlanningCommission On Human RelationsEthics Hearing BoardEngage PGHFinanceInnovation & PerformanceHuman Resources and Civil ServiceJob OpportunitiesMobility & InfrastructureOffice of Community Health & SafetyOffice of Film & Event ManagementSpecial EventsPermits, Licenses, & InspectionsPublic SafetyPublic WorksSchenley Skating RinkSpecial EventsVoting Districts & Polling PlacesWinter Resource Center
- 311
- Citiparks
- Citizen Police Review Board
- City Planning
- Commission On Human Relations
- Ethics Hearing Board
- Engage PGH
- Finance
- Innovation & Performance
- Human Resources and Civil Service
- Job Opportunities
- Mobility & Infrastructure
- Office of Community Health & Safety
- Office of Film & Event Management
- Special Events
- Permits, Licenses, & Inspections
- Public Safety
- Public Works
- Schenley Skating Rink
- Special Events
- Voting Districts & Polling Places
- Winter Resource Center
- VISITORSCitiparksEngage PGHExplore PittsburghOffice of Film & Event ManagementSchenley Skating RinkSpecial EventsVisitPittsburghWelcoming Pittsburgh
- Citiparks
- Engage PGH
- Explore Pittsburgh
- Office of Film & Event Management
- Schenley Skating Rink
- Special Events
- VisitPittsburgh
- Welcoming Pittsburgh
- BUSINESSBid OpportunitiesCity PlanningFinanceInnovation & PerformanceLegislative Information CenterMobility & InfrastructurePGH LabPermits, Licenses, & Inspections
- Bid Opportunities
- City Planning
- Finance
- Innovation & Performance
- Legislative Information Center
- Mobility & Infrastructure
- PGH Lab
- Permits, Licenses, & Inspections
- CITY HALLBoards, Authorities, CommissionsCity Clerk's OfficeCity CouncilCity Council Meetings, AgendasCommunity Development Block Grant ProgramComprehensive Municipal Pension Trust FundController's OfficeHuman Resources and Civil ServiceLawMayor's CabinetMayor's OfficeMunicipal Pension FundOffice of Management & BudgetOffice of Municipal InvestigationsOther Post Employment (OPEB) Trust FundPublic SafetyCity Hall History - Public Tours
- Boards, Authorities, Commissions
- City Clerk's Office
- City Council
- City Council Meetings, Agendas
- Community Development Block Grant Program
- Comprehensive Municipal Pension Trust Fund
- Controller's Office
- Human Resources and Civil Service
- Law
- Mayor's Cabinet
- Mayor's Office
- Municipal Pension Fund
- Office of Management & Budget
- Office of Municipal Investigations
- Other Post Employment (OPEB) Trust Fund
- Public Safety
- City Hall History - Public Tours
- ONLINE APPSBid OpportunitiesCivicCentralBurgh's Eye ViewEngage PGHFacility ReservationsFiscal FocusOnline Alarm RegistrationOneStopPGHOpen Book PGHPay Parking TicketsPPA GoMobile PGHPay PPAP Parking LeasePay Real Estate TaxesPGH WatchdogTrash Schedule AppSnow AngelsSnow Plow TrackerFilm & Event Permits
- Bid Opportunities
- CivicCentral
- Burgh's Eye View
- Engage PGH
- Facility Reservations
- Fiscal Focus
- Online Alarm Registration
- OneStopPGH
- Open Book PGH
- Pay Parking Tickets
- PPA GoMobile PGH
- Pay PPAP Parking Lease
- Pay Real Estate Taxes
- PGH Watchdog
- Trash Schedule App
- Snow Angels
- Snow Plow Tracker
- Film & Event Permits
- CITY INFOAbout PittsburghCity DirectoryPoliciesPress ReleasesPublic Safety BlotterRefuse & Recycling CollectionTax FormsCouncil Meetings & HearingsWebsite Release NotesCity CareersCity Internships
- About Pittsburgh
- City Directory
- Policies
- Press Releases
- Public Safety Blotter
- Refuse & Recycling Collection
- Tax Forms
- Council Meetings & Hearings
- Website Release Notes
- City Careers
- City Internships
- 311
- Citiparks
- Citizen Police Review Board
- City Planning
- Commission On Human Relations
- Ethics Hearing Board
- Engage PGH
- Finance
- Innovation & Performance
- Human Resources and Civil Service
- Job Opportunities
- Mobility & Infrastructure
- Office of Community Health & Safety
- Office of Film & Event Management
- Special Events
- Permits, Licenses, & Inspections
- Public Safety
- Public Works
- Schenley Skating Rink
- Special Events
- Voting Districts & Polling Places
- Winter Resource Center
- Citiparks
- Engage PGH
- Explore Pittsburgh
- Office of Film & Event Management
- Schenley Skating Rink
- Special Events
- VisitPittsburgh
- Welcoming Pittsburgh
- Bid Opportunities
- City Planning
- Finance
- Innovation & Performance
- Legislative Information Center
- Mobility & Infrastructure
- PGH Lab
- Permits, Licenses, & Inspections
- Boards, Authorities, Commissions
- City Clerk's Office
- City Council
- City Council Meetings, Agendas
- Community Development Block Grant Program
- Comprehensive Municipal Pension Trust Fund
- Controller's Office
- Human Resources and Civil Service
- Law
- Mayor's Cabinet
- Mayor's Office
- Municipal Pension Fund
- Office of Management & Budget
- Office of Municipal Investigations
- Other Post Employment (OPEB) Trust Fund
- Public Safety
- City Hall History - Public Tours
- Bid Opportunities
- CivicCentral
- Burgh's Eye View
- Engage PGH
- Facility Reservations
- Fiscal Focus
- Online Alarm Registration
- OneStopPGH
- Open Book PGH
- Pay Parking Tickets
- PPA GoMobile PGH
- Pay PPAP Parking Lease
- Pay Real Estate Taxes
- PGH Watchdog
- Trash Schedule App
- Snow Angels
- Snow Plow Tracker
- Film & Event Permits
- About Pittsburgh
- City Directory
- Policies
- Press Releases
- Public Safety Blotter
- Refuse & Recycling Collection
- Tax Forms
- Council Meetings & Hearings
- Website Release Notes
- City Careers
- City Internships""",
"""ANNOUNCEMENTS
Posted on: 06/04/2024
Posted on: 05/20/2024
Posted on: 05/19/2024
Posted on: 05/02/2024""",
"""DEPARTMENT ANNOUNCEMENTS

Posted on: 09/25/2024
Posted on: 08/29/2024
Posted on: 08/29/2024
Posted on: 08/28/2024
Posted on: 08/28/2024
Posted on: 08/09/2024
Posted on: 06/04/2024
Posted on: 06/04/2024
Posted on: 06/04/2024
Posted on: 05/07/2024
Posted on: 05/03/2024
Posted on: 05/01/2024
Posted on: 02/07/2024
Posted on: 02/02/2024
Posted on: 02/01/2024
Posted on: 01/30/2024
Posted on: 01/12/2024
Posted on: 01/08/2024
Posted on: 01/04/2024
Posted on: 01/03/2024
Posted on: 01/02/2024
Posted on: 12/11/2023
Posted on: 12/06/2023
Posted on: 11/21/2023
Posted on: 11/21/2023
Posted on: 11/20/2023
Posted on: 10/18/2023
Posted on: 07/25/2023
Posted on: 07/21/2023
Posted on: 06/23/2023
Posted on: 05/26/2023
Posted on: 05/12/2023
Posted on: 05/09/2023
Posted on: 05/08/2023
Posted on: 03/07/2023
Posted on: 02/01/2023
Posted on: 01/30/2023
Posted on: 01/24/2023
Posted on: 01/23/2023
Posted on: 01/13/2023
Posted on: 12/09/2022
Posted on: 11/22/2022
Posted on: 11/22/2022
Posted on: 11/22/2022
Posted on: 11/22/2022
Posted on: 11/22/2022
Posted on: 11/19/2022
Posted on: 11/14/2022
Posted on: 11/08/2022
Posted on: 11/07/2022
Posted on: 11/06/2022
Posted on: 11/06/2022
Posted on: 11/06/2022
Posted on: 11/04/2022
Posted on: 10/14/2022
Posted on: 10/08/2022
Posted on: 10/03/2022
Posted on: 09/30/2022
Posted on: 09/29/2022
Posted on: 09/09/2022
Posted on: 09/06/2022
Posted on: 09/02/2022
Posted on: 08/31/2022
Posted on: 08/19/2022
Posted on: 08/05/2022
Posted on: 07/25/2022
Posted on: 07/01/2022
Posted on: 06/24/2022
Posted on: 06/14/2022
Posted on: 06/14/2022
Posted on: 06/09/2022
Posted on: 05/27/2022
Posted on: 05/11/2022
Posted on: 05/05/2022
Posted on: 04/25/2022
Posted on: 04/25/2022
Posted on: 04/18/2022
Posted on: 04/01/2022
Posted on: 03/30/2022
Posted on: 03/11/2022
Posted on: 03/11/2022
Posted on: 03/11/2022
Posted on: 01/06/2022
Posted on: 01/04/2022
Posted on: 01/04/2022
Posted on: 12/28/2021
Posted on: 12/21/2021
Posted on: 12/16/2021
Posted on: 12/10/2021
Posted on: 11/30/2021
Posted on: 11/30/2021
Posted on: 11/24/2021
Posted on: 11/23/2021
Posted on: 11/23/2021
Posted on: 11/22/2021
Posted on: 11/19/2021
Posted on: 11/19/2021
Posted on: 11/19/2021
Posted on: 11/18/2021
Posted on: 11/18/2021
Posted on: 11/18/2021
Posted on: 11/16/2021
Posted on: 11/15/2021
Posted on: 11/13/2021
Posted on: 11/12/2021
Posted on: 11/12/2021
Posted on: 11/12/2021
Posted on: 11/12/2021
Posted on: 11/11/2021
Posted on: 11/11/2021
Posted on: 11/10/2021
Posted on: 11/10/2021
Posted on: 11/10/2021
Posted on: 11/09/2021
Posted on: 11/09/2021
Posted on: 11/09/2021
Posted on: 11/09/2021
Posted on: 11/09/2021
Posted on: 11/08/2021
Posted on: 11/08/2021
Posted on: 11/08/2021
Posted on: 11/07/2021
Posted on: 11/05/2021
Posted on: 11/05/2021
Posted on: 11/05/2021
Posted on: 11/05/2021
Posted on: 11/05/2021
Posted on: 11/04/2021
Posted on: 11/04/2021
Posted on: 11/04/2021
Posted on: 11/03/2021
Posted on: 11/02/2021
Posted on: 11/02/2021
Posted on: 11/02/2021
Posted on: 11/01/2021
Posted on: 11/01/2021
Posted on: 11/01/2021
Posted on: 10/29/2021
Posted on: 10/29/2021
Posted on: 10/28/2021
Posted on: 10/26/2021
Posted on: 10/25/2021
Posted on: 10/22/2021
Posted on: 10/22/2021
Posted on: 10/21/2021
Posted on: 10/20/2021
Posted on: 10/20/2021
Posted on: 10/19/2021
Posted on: 10/19/2021
Posted on: 10/19/2021
Posted on: 10/18/2021
Posted on: 10/18/2021
Posted on: 10/15/2021
Posted on: 10/15/2021
Posted on: 10/15/2021
Posted on: 10/14/2021
Posted on: 10/12/2021
Posted on: 10/12/2021
Posted on: 10/12/2021
Posted on: 10/11/2021
Posted on: 10/11/2021
Posted on: 10/08/2021
Posted on: 10/08/2021
Posted on: 10/07/2021
Posted on: 10/07/2021
Posted on: 10/06/2021
Posted on: 10/06/2021
Posted on: 10/05/2021
Posted on: 10/04/2021
Posted on: 10/04/2021
Posted on: 10/04/2021
Posted on: 09/30/2021
Posted on: 09/30/2021
Posted on: 09/29/2021
Posted on: 09/29/2021
Posted on: 09/28/2021
Posted on: 09/27/2021
Posted on: 09/24/2021
Posted on: 09/24/2021
Posted on: 09/23/2021
Posted on: 09/23/2021
Posted on: 09/22/2021
Posted on: 09/21/2021
Posted on: 09/21/2021
Posted on: 09/21/2021
Posted on: 09/20/2021
Posted on: 09/20/2021
Posted on: 09/17/2021
Posted on: 09/17/2021
Posted on: 09/17/2021
Posted on: 09/17/2021
Posted on: 09/17/2021
Posted on: 09/16/2021
Posted on: 09/15/2021
Posted on: 09/14/2021
Posted on: 09/14/2021
Posted on: 09/13/2021
Posted on: 09/13/2021
Posted on: 09/10/2021
Posted on: 09/09/2021
Posted on: 09/09/2021
Posted on: 09/08/2021
Posted on: 09/03/2021
Posted on: 09/03/2021
Posted on: 09/03/2021
Posted on: 09/01/2021
Posted on: 08/31/2021
Posted on: 08/31/2021
Posted on: 08/31/2021
Posted on: 08/27/2021
Posted on: 08/27/2021
Posted on: 08/26/2021
Posted on: 08/26/2021
Posted on: 08/26/2021
Posted on: 08/25/2021
Posted on: 08/25/2021
Posted on: 08/24/2021
Posted on: 08/24/2021
Posted on: 08/24/2021
Posted on: 08/23/2021
Posted on: 08/23/2021
Posted on: 08/23/2021
Posted on: 08/20/2021
Posted on: 08/20/2021
Posted on: 08/20/2021
Posted on: 08/20/2021
Posted on: 08/19/2021
Posted on: 08/16/2021
Posted on: 08/13/2021
Posted on: 08/12/2021
Posted on: 08/12/2021
Posted on: 08/12/2021
Posted on: 08/11/2021
Posted on: 08/11/2021
Posted on: 08/10/2021
Posted on: 08/09/2021
Posted on: 08/06/2021
Posted on: 08/06/2021
Posted on: 08/06/2021
Posted on: 08/06/2021
Posted on: 08/05/2021
Posted on: 07/30/2021
Posted on: 07/30/2021
Posted on: 07/30/2021
Posted on: 07/29/2021
Posted on: 07/29/2021
Posted on: 07/29/2021
Posted on: 07/29/2021
Posted on: 07/29/2021
Posted on: 07/27/2021
Posted on: 07/26/2021
Posted on: 07/23/2021
Posted on: 07/22/2021
Posted on: 07/20/2021
Posted on: 07/16/2021
Posted on: 07/16/2021
Posted on: 07/16/2021
Posted on: 07/16/2021
Posted on: 07/15/2021
Posted on: 07/15/2021
Posted on: 07/14/2021
Posted on: 07/13/2021
Posted on: 07/13/2021
Posted on: 07/12/2021
Posted on: 07/12/2021
Posted on: 07/12/2021
Posted on: 07/10/2021
Posted on: 07/09/2021
Posted on: 07/09/2021
Posted on: 07/09/2021
Posted on: 07/08/2021
Posted on: 07/08/2021
Posted on: 07/07/2021
Posted on: 07/06/2021
Posted on: 07/06/2021
Posted on: 07/02/2021
Posted on: 07/02/2021
Posted on: 07/02/2021
Posted on: 07/01/2021
Posted on: 07/01/2021
Posted on: 06/30/2021
Posted on: 06/30/2021
Posted on: 06/29/2021
Posted on: 06/29/2021
Posted on: 06/28/2021
Posted on: 06/25/2021
Posted on: 06/24/2021
Posted on: 06/23/2021
Posted on: 06/22/2021
Posted on: 06/22/2021
Posted on: 06/17/2021
Posted on: 06/17/2021
Posted on: 06/17/2021
Posted on: 06/16/2021
Posted on: 06/16/2021
Posted on: 06/15/2021
Posted on: 06/15/2021
Posted on: 06/15/2021
Posted on: 06/14/2021
Posted on: 06/11/2021
Posted on: 06/11/2021
Posted on: 06/08/2021
Posted on: 06/04/2021
Posted on: 06/04/2021
Posted on: 06/03/2021
Posted on: 06/03/2021
Posted on: 06/03/2021
Posted on: 06/02/2021
Posted on: 06/01/2021
Posted on: 05/28/2021
Posted on: 05/28/2021
Posted on: 05/27/2021
Posted on: 05/26/2021
Posted on: 05/26/2021
Posted on: 05/25/2021
Posted on: 05/24/2021
Posted on: 05/21/2021
Posted on: 05/20/2021
Posted on: 05/17/2021
Posted on: 05/17/2021
Posted on: 05/14/2021
Posted on: 05/13/2021
Posted on: 05/13/2021
Posted on: 05/10/2021
Posted on: 05/07/2021
Posted on: 05/07/2021
Posted on: 05/06/2021
Posted on: 05/04/2021
Posted on: 05/03/2021
Posted on: 05/01/2021
Posted on: 04/30/2021
Posted on: 04/29/2021
Posted on: 04/26/2021
Posted on: 04/23/2021
Posted on: 04/22/2021
Posted on: 04/21/2021
Posted on: 04/20/2021
Posted on: 04/19/2021
Posted on: 04/19/2021
Posted on: 04/19/2021
Posted on: 04/16/2021
Posted on: 04/15/2021
Posted on: 04/14/2021
Posted on: 04/14/2021
Posted on: 04/13/2021
Posted on: 04/13/2021
Posted on: 04/12/2021
Posted on: 04/09/2021
Posted on: 04/08/2021
Posted on: 04/06/2021
Posted on: 04/05/2021
Posted on: 04/01/2021
Posted on: 03/31/2021
Posted on: 03/31/2021
Posted on: 03/31/2021
Posted on: 03/30/2021
Posted on: 03/27/2021
Posted on: 03/26/2021
Posted on: 03/24/2021
Posted on: 03/24/2021
Posted on: 03/23/2021
Posted on: 03/23/2021
Posted on: 03/22/2021
Posted on: 03/21/2021
Posted on: 03/19/2021
Posted on: 03/18/2021
Posted on: 03/16/2021
Posted on: 03/16/2021
Posted on: 03/15/2021
Posted on: 03/15/2021
Posted on: 03/12/2021
Posted on: 03/12/2021
Posted on: 03/10/2021
Posted on: 03/09/2021
Posted on: 03/08/2021
Posted on: 03/08/2021
Posted on: 03/03/2021
Posted on: 03/02/2021
Posted on: 03/01/2021
Posted on: 02/26/2021
Posted on: 02/24/2021
Posted on: 02/22/2021
Posted on: 02/22/2021
Posted on: 02/18/2021
Posted on: 02/14/2021
Posted on: 02/11/2021
Posted on: 02/11/2021
Posted on: 02/10/2021
Posted on: 02/09/2021
Posted on: 02/09/2021
Posted on: 02/09/2021
Posted on: 02/08/2021
Posted on: 02/08/2021
Posted on: 02/05/2021
Posted on: 02/03/2021
Posted on: 02/02/2021
Posted on: 02/02/2021
Posted on: 02/01/2021
Posted on: 02/01/2021
Posted on: 01/30/2021
Posted on: 01/30/2021
Posted on: 01/27/2021
Posted on: 01/26/2021
Posted on: 01/21/2021
Posted on: 01/21/2021
Posted on: 01/20/2021
Posted on: 01/19/2021
Posted on: 01/19/2021
Posted on: 01/15/2021
Posted on: 01/14/2021
Posted on: 01/13/2021
Posted on: 01/08/2021
Posted on: 01/08/2021
Posted on: 01/07/2021
Posted on: 01/04/2021
Posted on: 01/04/2021
Posted on: 12/29/2020
Posted on: 12/26/2020
Posted on: 12/22/2020
Posted on: 12/22/2020
Posted on: 12/22/2020
Posted on: 12/21/2020
Posted on: 12/21/2020
Posted on: 12/21/2020
Posted on: 12/18/2020
Posted on: 12/18/2020
Posted on: 12/17/2020
Posted on: 12/17/2020
Posted on: 12/16/2020
Posted on: 12/15/2020
Posted on: 12/09/2020
Posted on: 12/07/2020
Posted on: 12/07/2020
Posted on: 12/03/2020
Posted on: 12/02/2020
Posted on: 12/01/2020
Posted on: 11/30/2020
Posted on: 11/25/2020
Posted on: 11/25/2020
Posted on: 11/24/2020
Posted on: 11/23/2020
Posted on: 11/23/2020
Posted on: 11/23/2020
Posted on: 11/23/2020
Posted on: 11/23/2020
Posted on: 11/20/2020
Posted on: 11/19/2020
Posted on: 11/18/2020
Posted on: 11/17/2020
Posted on: 11/16/2020
Posted on: 11/16/2020
Posted on: 11/16/2020
Posted on: 11/12/2020
Posted on: 11/12/2020
Posted on: 11/11/2020
Posted on: 11/10/2020
Posted on: 11/09/2020
Posted on: 11/09/2020
Posted on: 11/09/2020
Posted on: 11/06/2020
Posted on: 11/05/2020
Posted on: 11/05/2020
Posted on: 11/03/2020
Posted on: 11/03/2020
Posted on: 11/02/2020
Posted on: 11/02/2020
Posted on: 10/30/2020
Posted on: 10/30/2020
Posted on: 10/28/2020
Posted on: 10/26/2020
Posted on: 10/26/2020
Posted on: 10/26/2020
Posted on: 10/26/2020
Posted on: 10/26/2020
Posted on: 10/23/2020
Posted on: 10/22/2020
Posted on: 10/21/2020
Posted on: 10/21/2020
Posted on: 10/19/2020
Posted on: 10/19/2020
Posted on: 10/19/2020
Posted on: 10/16/2020
Posted on: 10/15/2020
Posted on: 10/15/2020
Posted on: 10/14/2020
Posted on: 10/13/2020
Posted on: 10/13/2020
Posted on: 10/12/2020
Posted on: 10/11/2020
Posted on: 10/09/2020
Posted on: 10/09/2020
Posted on: 10/09/2020
Posted on: 10/08/2020
Posted on: 10/08/2020
Posted on: 10/06/2020
Posted on: 10/06/2020
Posted on: 10/05/2020
Posted on: 10/05/2020
Posted on: 10/02/2020
Posted on: 09/29/2020
Posted on: 09/28/2020
Posted on: 09/24/2020
Posted on: 09/24/2020
Posted on: 09/23/2020
Posted on: 09/23/2020
Posted on: 09/21/2020
Posted on: 09/18/2020
Posted on: 09/16/2020
Posted on: 09/15/2020
Posted on: 09/15/2020
Posted on: 09/14/2020
Posted on: 09/11/2020
Posted on: 09/10/2020
Posted on: 09/10/2020
Posted on: 09/09/2020
Posted on: 09/08/2020
Posted on: 09/04/2020
Posted on: 09/03/2020
Posted on: 09/02/2020
Posted on: 09/01/2020
Posted on: 09/01/2020
Posted on: 09/01/2020
Posted on: 09/01/2020
Posted on: 09/01/2020
Posted on: 08/31/2020
Posted on: 08/30/2020
Posted on: 08/29/2020
Posted on: 08/28/2020
Posted on: 08/28/2020
Posted on: 08/28/2020
Posted on: 08/26/2020
Posted on: 08/25/2020
Posted on: 08/25/2020
Posted on: 08/25/2020
Posted on: 08/21/2020
Posted on: 08/21/2020
Posted on: 08/19/2020
Posted on: 08/19/2020
Posted on: 08/19/2020
Posted on: 08/17/2020
Posted on: 08/14/2020
Posted on: 08/14/2020
Posted on: 08/13/2020
Posted on: 08/13/2020
Posted on: 08/12/2020
Posted on: 08/12/2020
Posted on: 08/11/2020
Posted on: 08/11/2020
Posted on: 08/11/2020
Posted on: 08/10/2020
Posted on: 08/07/2020
Posted on: 08/06/2020
Posted on: 08/04/2020
Posted on: 08/03/2020
Posted on: 07/31/2020
Posted on: 07/31/2020
Posted on: 07/26/2020
Posted on: 07/26/2020
Posted on: 07/24/2020
Posted on: 07/21/2020
Posted on: 07/20/2020
Posted on: 07/20/2020
Posted on: 07/14/2020
Posted on: 07/14/2020
Posted on: 07/14/2020
Posted on: 07/14/2020
Posted on: 07/14/2020
Posted on: 07/10/2020
Posted on: 07/10/2020
Posted on: 07/10/2020
Posted on: 07/10/2020
Posted on: 07/10/2020
Posted on: 07/09/2020
Posted on: 07/09/2020
Posted on: 07/09/2020
Posted on: 07/06/2020
Posted on: 07/03/2020
Posted on: 07/02/2020
Posted on: 07/02/2020
Posted on: 07/02/2020
Posted on: 07/01/2020
Posted on: 07/01/2020
Posted on: 07/01/2020
Posted on: 06/30/2020
Posted on: 06/30/2020
Posted on: 06/29/2020
Posted on: 06/27/2020
Posted on: 06/26/2020
Posted on: 06/26/2020
Posted on: 06/26/2020
Posted on: 06/26/2020
Posted on: 06/26/2020
Posted on: 06/25/2020
Posted on: 06/25/2020
Posted on: 06/24/2020
Posted on: 06/24/2020
Posted on: 06/23/2020
Posted on: 06/22/2020
Posted on: 06/22/2020
Posted on: 06/22/2020
Posted on: 06/19/2020
Posted on: 06/19/2020
Posted on: 06/19/2020
Posted on: 06/18/2020
Posted on: 06/18/2020
Posted on: 06/18/2020
Posted on: 06/17/2020
Posted on: 06/16/2020
Posted on: 06/15/2020
Posted on: 06/15/2020
Posted on: 06/12/2020
Posted on: 06/12/2020
Posted on: 06/12/2020
Posted on: 06/12/2020
Posted on: 06/11/2020
Posted on: 06/11/2020
Posted on: 06/11/2020
Posted on: 06/10/2020
Posted on: 06/10/2020
Posted on: 06/09/2020
Posted on: 06/09/2020
Posted on: 06/09/2020
Posted on: 06/08/2020
Posted on: 06/04/2020
Posted on: 06/03/2020
Posted on: 06/03/2020
Posted on: 06/01/2020
Posted on: 05/31/2020
Posted on: 05/28/2020
Posted on: 05/22/2020
Posted on: 05/22/2020
Posted on: 05/22/2020
Posted on: 05/21/2020
Posted on: 05/20/2020
Posted on: 05/20/2020
Posted on: 05/19/2020
Posted on: 05/19/2020
Posted on: 05/19/2020
Posted on: 05/19/2020
Posted on: 05/18/2020
Posted on: 05/18/2020
Posted on: 05/18/2020
Posted on: 05/15/2020
Posted on: 05/15/2020
Posted on: 05/15/2020
Posted on: 05/15/2020
Posted on: 05/14/2020
Posted on: 05/13/2020
Posted on: 05/12/2020
Posted on: 05/12/2020
Posted on: 05/11/2020
Posted on: 05/11/2020
Posted on: 05/08/2020
Posted on: 05/08/2020
Posted on: 05/06/2020
Posted on: 05/06/2020
Posted on: 05/05/2020
Posted on: 05/04/2020
Posted on: 05/04/2020
Posted on: 05/01/2020
Posted on: 05/01/2020
Posted on: 04/30/2020
Posted on: 04/29/2020
Posted on: 04/28/2020
Posted on: 04/28/2020
Posted on: 04/27/2020
Posted on: 04/27/2020
Posted on: 04/27/2020
Posted on: 03/16/2020
Posted on: 03/16/2020
Posted on: 03/11/2020
Posted on: 03/11/2020
Posted on: 02/06/2020
Posted on: 01/30/2020
Posted on: 01/30/2020
Posted on: 01/07/2020
Posted on: 12/23/2019
Posted on: 12/23/2019
Posted on: 12/16/2019
Posted on: 12/13/2019
Posted on: 11/22/2019
Posted on: 11/12/2019
Posted on: 11/12/2019
Posted on: 11/06/2019
Posted on: 09/24/2019
Posted on: 09/10/2019
Posted on: 09/09/2019
Posted on: 08/15/2019
Posted on: 06/27/2019
Posted on: 06/27/2019
Posted on: 06/27/2019
Posted on: 06/27/2019
Posted on: 06/27/2019
Posted on: 05/08/2019
Posted on: 05/07/2019
Posted on: 04/18/2019
Posted on: 04/17/2019
Posted on: 04/09/2019
Posted on: 04/08/2019
Posted on: 03/27/2019
Posted on: 03/04/2019
Posted on: 02/27/2019
Posted on: 01/29/2019
Posted on: 01/03/2019
Posted on: 11/26/2018
Posted on: 11/08/2018
Posted on: 03/09/2018
Posted on: 02/07/2018
Posted on: 12/23/2017
Posted on: 09/22/2017
Posted on: 08/15/2017
Posted on: 08/14/2017
Posted on: 08/11/2017
Posted on: 08/11/2017
Posted on: 08/11/2017
Posted on: 08/11/2017
Posted on: 06/02/2017"""
]

remove_strings_from_file(input_file, output_file, strings_to_remove)


Specified strings have been removed, and empty lines cleaned. Output saved to 'data/pit_gov.txt'.


# https://downtownpittsburgh.com/events/

In [6]:
import requests
from bs4 import BeautifulSoup
url = "https://downtownpittsburgh.com/events/"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
read_more_links = soup.find_all('a', text="READ MORE")
urls = [link['href'][8:] for link in read_more_links]
urls = [url+link for link in  urls]


  read_more_links = soup.find_all('a', text="READ MORE")


In [7]:
import os
import requests
from lxml import html

os.makedirs('data', exist_ok=True)
output_file = 'data/downtown_cal.txt'

with open(output_file, 'w', encoding='utf-8') as f:
    for url in urls:
        try:
            response = requests.get(url)
            tree = html.fromstring(response.content)
            title = tree.xpath('/html/body/div[1]/div[4]/div[2]/div[1]/div/div/h1/text()')
            title = title[0].strip() if title else "No title available"            
            date_time = tree.xpath('/html/body/div[1]/div[4]/div[2]/div[1]/div/div/div[2]/text()')
            date_time = ' '.join(date_time).strip() if date_time else "No date/time available"
            location_name = tree.xpath('/html/body/div[1]/div[4]/div[2]/div[1]/div/div/div[3]/strong/text()')
            street_address = tree.xpath('/html/body/div[1]/div[4]/div[2]/div[1]/div/div/div[3]/br[1]/following-sibling::text()')
            city_state_zip = tree.xpath('/html/body/div[1]/div[4]/div[2]/div[1]/div/div/div[3]/br[2]/following-sibling::text()')

            if location_name and street_address and city_state_zip:
                location = f"{location_name[0].strip()}, {street_address[0].strip()} {city_state_zip[0].strip()}"
            else:
                location = "No location available"
            
            description_paragraphs = tree.xpath('/html/body/div[1]/div[4]/div[2]/div[1]/div/div/p/text()')
            description = ' '.join([desc.strip() for desc in description_paragraphs if desc])
            description = description if description else "No description available"
            
            f.write(f"Title: {title}\n")
            f.write(f"Date & Time: {date_time}\n")
            f.write(f"Location: {location}\n")
            f.write(f"Description: {description}\n")
            f.write("\n" + "-"*50 + "\n\n")  
            
        except Exception as e:
            print(f"Failed to scrape {url}: {str(e)}")


# event_cal =     ['https://pittsburgh.events/']


In [None]:
output_file = 'data/event_cal.txt'

In [5]:
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from lxml import html

url = 'https://pittsburgh.events/'
os.makedirs('data', exist_ok=True)
output_file = 'data/event_cal.txt'
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
driver.get(url)

def click_show_more_with_timer(duration_minutes=5):
    start_time = time.time()
    max_duration = duration_minutes * 60  
    while True:
        try:
            if time.time() - start_time > max_duration:
                print(f"Reached the {duration_minutes}-minute mark, stopping the 'Show More' clicks.")
                break

            show_more_button = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//span[contains(@class, "textnode") and contains(text(), "Show More")]'))
            )

            driver.execute_script("window.scrollBy(0, 500);")
            time.sleep(1)
            driver.execute_script("arguments[0].click();", show_more_button)
            print("Clicked 'Show More' button via JavaScript")
            time.sleep(2)
        except (NoSuchElementException, TimeoutException):
            print("No more 'Show More' buttons found.")
            break

click_show_more_with_timer(duration_minutes=8)

tree = html.fromstring(driver.page_source)

try:
    with open(output_file, 'w', encoding='utf-8') as f:
        events = tree.xpath('//li[@class="date-row"]')

        for event in events:
            date = event.xpath('.//div[@class="date"]//text()')
            date = ' '.join([d.strip() for d in date if d.strip()])
            date = date if date else "No date available"
            time = event.xpath('.//div[@class="time"]/text()')
            time = time[0].strip() if time else "No time available"
            date_time = f"{date} {time}" if date and time else "No date/time available"
            arena = event.xpath('.//div[@class="date-desc"]/a/text()')
            arena = arena[0].strip() if arena else "No arena available"
            # title = event.xpath('.//div[@class="venue"]/div/text()')
            
            # title = title[0].strip() if title else "No title available"
            
            title = event.xpath('.//div[@class="venue"]/div[1]/a/text()')  # Case where title is inside an <a> tag
            if not title:  # Fallback if no <a> tag
                title = event.xpath('.//div[@class="venue"]/div[1]/text()')  # Case where title is directly in div
            title = title[0].strip() if title else "No title available"

            location = event.xpath('.//span[@class="location"]/text()')
            location = ' '.join([loc.strip() for loc in location if loc.strip()]) if location else "No location available"
            price = event.xpath('.//div[@class="from-price"]/text()')
            price = price[0].strip() if price else "No price available"
            f.write(f"Title: {title}\n")
            f.write(f"Date & Time: {date_time}\n")
            f.write(f"Location: {location}\n")
            f.write(f"Arena: {arena}\n")
            f.write(f"Price: {price}\n")
            f.write("\n" + "-"*50 + "\n\n") 

    print(f"Scraping complete. Data saved to {output_file}")

except Exception as e:
    print(f"Failed to scrape the website: {str(e)}")

driver.quit()


Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via JavaScript
Clicked 'Show More' button via Jav

# city_paper =    ['https://www.pghcitypaper.com/pittsburgh/EventSearch?v=d']


In [21]:
import os
import requests
from bs4 import BeautifulSoup

def scrape_event_data(event):
    """Function to scrape the details of one event."""
    
    try:
        title = event.select_one('div > div:nth-of-type(2) > div:nth-of-type(1) > p > a').text.strip()
    except AttributeError:
        title = None

    try:
        date_time = event.select_one('div > div:nth-of-type(2) > p:nth-of-type(1)').text.strip()
    except AttributeError:
        date_time = None

    try:
        location_1 = event.select_one('div > div:nth-of-type(2) > div:nth-of-type(2) > p:nth-of-type(1) > a').text.strip()
    except AttributeError:
        location_1 = None

    try:
        location_2 = event.select_one('div > div:nth-of-type(2) > div:nth-of-type(2) > p:nth-of-type(2) > span:nth-of-type(1)').text.strip()
    except AttributeError:
        location_2 = None

    try:
        price = event.select_one('div > div:nth-of-type(2) > div:nth-of-type(3) > span').text.strip()
    except AttributeError:
        price = None

    try:
        event_types = [a.text.strip() for a in event.select('div > div:nth-of-type(2) > p:nth-of-type(2) > a')]
    except AttributeError:
        event_types = []

    try:
        description = event.select_one('div > div:nth-of-type(2) > div:nth-of-type(4)').text.strip()
    except AttributeError:
        description = None

    if title and description:
        return {
            'title': title,
            'date_time': date_time or 'Date and time not available',
            'location_1': location_1 or 'Location 1 not available',
            'location_2': location_2 or 'Location 2 not available',
            'price': price or 'Price not available',
            'event_types': event_types if event_types else ['No event type available'],
            'description': description
        }
    else:
        return None 

def scrape_events(url):
    """Scrapes all events from the given page."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    events = soup.select('div > div > div:nth-of-type(3) > div:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(7) > div > ul > li')

    all_events = []
    for event in events:
        event_data = scrape_event_data(event)
        if event_data: 
            all_events.append(event_data)

    return all_events

def save_to_file(events, filepath):
    """Save event data to a text file, appending new events."""
    os.makedirs(os.path.dirname(filepath), exist_ok=True) 
    
    with open(filepath, 'a', encoding='utf-8') as file:  
        for event in events:
            file.write(f"Title: {event['title']}\n")
            file.write(f"Date and Time: {event['date_time']}\n")
            file.write(f"Location 1: {event['location_1']}\n")
            file.write(f"Location 2: {event['location_2']}\n")
            file.write(f"Price: {event['price']}\n")
            file.write(f"Type of Event: {', '.join(event['event_types'])}\n")
            file.write(f"Description: {event['description']}\n")
            file.write("--------------------------------------------------\n")



city_paper =    ['https://www.pghcitypaper.com/pittsburgh/EventSearch?v=d']

temp = [f'https://www.pghcitypaper.com/pittsburgh/EventSearch?page={i}&v=d' for i in range(1,25)]


for url  in temp:
    events_data = scrape_events(url)
    save_to_file(events_data, 'data/city_paper.txt')




In [2]:
def clean_text(file_path):
    with open(file_path, 'r') as file:
        content = file.read()

    # Replace newlines before and after the separator
    cleaned_content = content.replace('\n--------------------------------------------------\n', '--------------------------------------------------')
    
    with open(file_path, 'w') as file:
        file.write(cleaned_content)

# Example usage
file_path = 'data/downtown_cal.txt'  # Replace with your file path
clean_text(file_path)


# https://events.cmu.edu/all

In [19]:
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.chrome.options import Options
# from webdriver_manager.chrome import ChromeDriverManager
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# import time

# # Set up Chrome options
# options = Options()
# # options.add_argument('--headless')  # Uncomment to run in headless mode
# options.add_argument('--disable-blink-features=AutomationControlled')  # Optional: Bypass bot detection

# # Initialize the Chrome driver using webdriver-manager
# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# try:
#     # Navigate to the website
#     driver.get('https://events.cmu.edu/all')

#     # Wait for the page to fully load
#     wait = WebDriverWait(driver, 20)  # Increase timeout if needed

#     # Set the timeout in seconds for the entire operation
#     overall_timeout = 60  # Adjust the time as needed
#     end_time = time.time() + overall_timeout

#     while time.time() < end_time:
#         try:
#             # Wait until the "Show 50 more" button is present and clickable
#             show_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#lw_cal_body > p > a')))

#             # Scroll the button into view (optional)
#             driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)

#             # Click the button
#             show_more_button.click()

#             # Optional: Wait for new events to load
#             time.sleep(2)
#         except Exception as e:
#             # If the button is not found or any other error occurs, break the loop
#             print("Button not found or an error occurred:", e)
#             break

#     print("Finished clicking 'Show 50 more'.")

# finally:
#     # Close the browser
#     driver.quit()


In [24]:
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import sys

options = Options()

options.add_argument('--disable-blink-features=AutomationControlled')  
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

try:
    driver.get('https://events.cmu.edu/all')
    wait = WebDriverWait(driver, 20)  
    overall_timeout = 180 # 3 min expanding events  
    end_time = time.time() + overall_timeout

    while time.time() < end_time:
        try:
            show_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#lw_cal_body > p > a')))
            driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
            show_more_button.click()
            time.sleep(2)
        except Exception as e:
            print("Button not found or an error occurred:", e)
            break

    print("Finished clicking 'Show 50 more'.")

    if not os.path.exists('data_temp'):
        os.makedirs('data_temp')

    with open('data_temp/events_cmu.txt', 'w', encoding='utf-8') as file:
        date_headers = driver.find_elements(By.XPATH, '/html/body/div[1]/div[3]/div/div[1]/div[2]/div[2]/div/h3')
        num_groups = len(date_headers)
        print(f"Found {num_groups} date groups.")

        for N in range(1, num_groups + 1):
            date_xpath = f'/html/body/div[1]/div[3]/div/div[1]/div[2]/div[2]/div/h3[{N}]'
            try:
                date_element = driver.find_element(By.XPATH, date_xpath)
                date_text = date_element.text.strip()
                print(f"Processing date group {N}: {date_text}")
            except Exception as e:
                print(f"Could not find date for group {N}: {e}")
                continue 

            group_xpath = f'/html/body/div[1]/div[3]/div/div[1]/div[2]/div[2]/div/div[{N}]'
            try:
                group_div = driver.find_element(By.XPATH, group_xpath)
            except Exception as e:
                print(f"Could not find event group {N}: {e}")
                continue  

            event_divs = group_div.find_elements(By.XPATH, 'div/div')
            num_events = len(event_divs)
            print(f"Found {num_events} events in group {N}.")

            for M in range(1, num_events + 1):
                title = ''
                date_time = ''
                location = ''
                description = ''

                event_xpath = f'div/div[{M}]'
                try:
                    event_div = group_div.find_element(By.XPATH, event_xpath)
                except Exception as e:
                    print(f"Could not find event {M} in group {N}: {e}")
                    continue 

                base_event_xpath = f'/html/body/div[1]/div[3]/div/div[1]/div[2]/div[2]/div/div[{N}]/div/div[{M}]'

                try:
                    location_xpath = f'{base_event_xpath}/div/div[2]'
                    location_element = driver.find_element(By.XPATH, location_xpath)
                    location = location_element.text.strip()
                except Exception as e:
                    pass  

                try:
                    start_time_xpath = f'{base_event_xpath}/div/div[3]/span[1]'
                    end_time_xpath = f'{base_event_xpath}/div/div[3]/span[2]'
                    start_time_element = driver.find_element(By.XPATH, start_time_xpath)
                    end_time_element = driver.find_element(By.XPATH, end_time_xpath)
                    start_time = start_time_element.text.strip()
                    end_time = end_time_element.text.strip()
                    date_time = f"{date_text} {start_time} - {end_time}"
                except:
                    try:
                        time_xpath = f'{base_event_xpath}/div/div[3]'
                        time_element = driver.find_element(By.XPATH, time_xpath)
                        time_text = time_element.text.strip()
                        date_time = f"{date_text} {time_text}"
                    except:
                        date_time = date_text  
                try:
                    title_xpath = f'{base_event_xpath}/div/div[4]/a'
                    title_element = driver.find_element(By.XPATH, title_xpath)
                    title = title_element.text.strip()
                except Exception as e:
                    pass  
                try:
                    description_xpath = f'{base_event_xpath}/div/div[5]/p'
                    description_element = driver.find_element(By.XPATH, description_xpath)
                    description = description_element.text.strip()
                except Exception as e:
                    pass 

                file.write(f"Title: {title}\n")
                file.write(f"Date & Time: {date_time}\n")
                file.write(f"Location: {location}\n")
                file.write(f"Description: {description}\n")
                file.write('-' * 50 + '\n')

                print(f"Processed event {M} in group {N}.")

    print("Data scraping completed. Data saved in 'data_temp/events_cmu.txt'.")

finally:
    driver.quit()


Button not found or an error occurred: Message: 

Finished clicking 'Show 50 more'.
Found 39 date groups.
Processing date group 1: Sunday, October 20
Found 6 events in group 1.
Processed event 1 in group 1.
Processed event 2 in group 1.
Processed event 3 in group 1.
Processed event 4 in group 1.
Processed event 5 in group 1.
Processed event 6 in group 1.
Processing date group 2: Monday, October 21
Found 14 events in group 2.
Processed event 1 in group 2.
Processed event 2 in group 2.
Processed event 3 in group 2.
Processed event 4 in group 2.
Processed event 5 in group 2.
Processed event 6 in group 2.
Processed event 7 in group 2.
Processed event 8 in group 2.
Processed event 9 in group 2.
Processed event 10 in group 2.
Processed event 11 in group 2.
Processed event 12 in group 2.
Processed event 13 in group 2.
Processed event 14 in group 2.
Processing date group 3: Tuesday, October 22
Found 14 events in group 3.
Processed event 1 in group 3.
Processed event 2 in group 3.
Processed eve