In [41]:
# Wikipedia pages: plain URL strings (these three are the ones passed to
# scrape_and_save further down in this notebook).
history_wiki_1 = "https://en.wikipedia.org/wiki/Pittsburgh"
history_wiki_2 = "https://en.wikipedia.org/wiki/History_of_Pittsburgh"
more = "https://en.wikipedia.org/wiki/List_of_museums_in_Pittsburgh"

# Every source below is a single-element *list* holding one URL, unlike the
# plain strings above.

# City and general information
pit_gov = ["https://pittsburghpa.gov/index.html"]
brittanics = ["https://www.britannica.com/place/Pittsburgh"]
visit_pit = ["https://www.visitpittsburgh.com/"]
tax_reg = ["https://pittsburghpa.gov/finance/tax-forms"]
op_budget = ["https://apps.pittsburghpa.gov/redtail/images/23255_2024_Operating_Budget.pdf"]
about_cmu = ["https://www.cmu.edu/about/"]

# Event calendars
event_cal = ["https://pittsburgh.events/"]
downtown_cal = ["https://downtownpittsburgh.com/events/"]
city_paper = ["https://www.pghcitypaper.com/pittsburgh/EventSearch?v=d"]
cmu_events = ["https://events.cmu.edu/"]
campus_events = ["https://www.cmu.edu/engage/alumni/events/campus/index.html"]

# Music, culture and museums
symphony = ["https://www.pittsburghsymphony.org/"]
opera = ["https://pittsburghopera.org/"]
cultural_trust = ["https://trustarts.org/"]
carn_museum = ["https://carnegiemuseums.org/"]
heinz_museum = ["https://www.heinzhistorycenter.org/"]
frick_museum = ["https://www.thefrickpittsburgh.org/"]

# Food festivals
food_fest = ["https://www.visitpittsburgh.com/events-festivals/food-festivals/"]
pickle = ["https://www.picklesburgh.com/"]
taco_fest = ["https://www.pghtacofest.com/"]
restaurant_w = ["https://pittsburghrestaurantweek.com/"]
little_italy = ["https://littleitalydays.com/"]
banana_split = ["https://bananasplitfest.com/"]

# Sports
visit_pitsb = ["https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/"]
pirates = ["https://www.mlb.com/pirates"]
steelers = ["https://www.steelers.com/"]
penguins = ["https://www.nhl.com/penguins/"]


In [10]:
pip install requests beautifulsoup4 pandas tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [45]:
import requests
from bs4 import BeautifulSoup
import os

def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace (including newlines) is removed, so an
    all-whitespace or empty input yields the empty string.
    """
    words = text.split()
    return ' '.join(words)
def truncate(text, max_len):
    """Return the prefix of ``text`` selected by the slice ``text[:max_len]``.

    For a non-negative ``max_len`` that is at most the first ``max_len``
    characters; shorter inputs are returned unchanged.
    """
    prefix = text[:max_len]
    return prefix

def format_table_manually(table_html, max_col_width=30):
    """Render an HTML table element as a fixed-width ASCII grid.

    Args:
        table_html: Parsed table element; it must provide ``find_all`` and its
            cells must provide ``get_text`` (as BeautifulSoup tags do).
        max_col_width: Upper bound on a column's width. Longer cell text is
            cut to its column's width.

    Returns:
        The grid as a string (``+---+`` border lines, ``|cell|`` rows), or
        ``""`` when the table has no row with at least two cells, when any
        cell contains one of the maintenance-banner phrases below, or when
        every column is empty.
    """
    banner_phrases = (
        "This section does not cite any",
        "This section needs additional",
        "This section needs expansion",
    )

    # Cleaned text of every header/data cell, row by row; rows with fewer
    # than two cells are ignored.
    grid = []
    for tr in table_html.find_all('tr'):
        cells = [clean_text(cell.get_text()) for cell in tr.find_all(['th', 'td'])]
        if len(cells) > 1:
            grid.append(cells)

    if not grid:
        return ""

    # A single banner phrase anywhere discards the whole table.
    if any(phrase in cell for cells in grid for cell in cells for phrase in banner_phrases):
        return ""

    # Keep only the column positions that hold text in at least one row.
    widest_row = max(len(cells) for cells in grid)
    kept = [
        idx
        for idx in range(widest_row)
        if any(idx < len(cells) and cells[idx].strip() for cells in grid)
    ]
    if not kept:
        return ""

    # Project each row onto the kept columns, padding short rows with ''.
    grid = [[cells[idx] if idx < len(cells) else '' for idx in kept] for cells in grid]

    # Column width = longest cell in that column, capped at max_col_width
    # and never below zero.
    widths = [
        max([0] + [min(len(cells[pos]), max_col_width) for cells in grid])
        for pos in range(len(kept))
    ]

    rule = "+" + "+".join("-" * width for width in widths) + "+\n"
    pieces = [rule]
    for cells in grid:
        line = "|".join(truncate(text, width).ljust(width) for text, width in zip(cells, widths))
        pieces.append(f"|{line}|\n")
        pieces.append(rule)

    return "".join(pieces).strip()

def _format_section(name, body):
    """Wrap one section's text in the =section_start= / =section_end= markers."""
    return f"=section_start=\n=section name=\"{name}\"\n{body}\n=section_end=\n"


def scrape_wikipedia_page(url, timeout=30):
    """Download a Wikipedia article and return its text split into sections.

    Paragraphs, list items and tables found inside the ``bodyContent`` div are
    grouped under the most recent h1/h2/h3 heading. Text before the first
    heading is filed under "Intro". Sections named in ``EXCLUDED_SECTIONS``
    are dropped together with their sub-sections.

    Args:
        url: Address of the article to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        All non-empty sections concatenated, each formatted as::

            =section_start=
            =section name="<title>"
            <content>
            =section_end=

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        ValueError: The page has no ``bodyContent`` container.
    """
    # Without a timeout a stalled connection would hang the notebook forever,
    # and without the status check an error page would be scraped as content.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    EXCLUDED_SECTIONS = {
        "See also",
        "Explanatory notes",
        "References",
        "Further reading",
        "External links",
    }
    EDIT_SUFFIX = "[edit]"

    content = soup.find('div', {'id': 'bodyContent'})
    if content is None:
        raise ValueError(f"No 'bodyContent' container found at {url}")

    # NOTE: find_all searches all descendants, so a list nested inside another
    # list (or inside a table) is visited again as an element of its own.
    elements = content.find_all(['h1', 'h2', 'h3', 'p', 'table', 'ul', 'ol'])

    chunks = []
    current_section = "Intro"
    section_parts = []
    excluded_level = None  # heading level (1-3) of the excluded section we are inside

    for element in elements:
        if element.name in ('h1', 'h2', 'h3'):
            # A heading closes whatever section was being collected.
            body = "".join(section_parts).strip()
            if body:
                chunks.append(_format_section(current_section, body))
            section_parts = []

            title = clean_text(element.get_text())
            # Some renderings append the edit link text to the heading.
            if title.endswith(EDIT_SUFFIX):
                title = title[:-len(EDIT_SUFFIX)].rstrip()
            level = int(element.name[1])

            if excluded_level is not None and level > excluded_level:
                # Sub-heading of an excluded section: keep skipping.
                current_section = None
            elif title in EXCLUDED_SECTIONS:
                excluded_level = level
                current_section = None
            else:
                excluded_level = None
                current_section = title
            continue

        if not current_section:
            continue

        if element.name == 'p':
            section_parts.append(f"{clean_text(element.get_text())}\n")
        elif element.name in ('ul', 'ol'):
            for item in element.find_all('li'):
                section_parts.append(f"- {clean_text(item.get_text())}\n")
        elif element.name == 'table':
            table_text = format_table_manually(element)
            if table_text:
                section_parts.append(f"=== Table ===\n{table_text}\n=== End of Table ===\n")

    # Emit the section that was still open when the document ended.
    body = "".join(section_parts).strip()
    if current_section and body:
        chunks.append(_format_section(current_section, body))

    return "".join(chunks).strip()

def scrape_and_save(url_variable_name, url, output_dir="data"):
    """Scrape a Wikipedia page and save it as '{output_dir}/{url_variable_name}.txt'.

    The page is scraped first, so nothing is created on disk if scraping
    raises.

    Args:
        url_variable_name: Base name (without extension) of the output file.
        url: Address of the page, passed to ``scrape_wikipedia_page``.
        output_dir: Directory for the output file; created if missing.
            Defaults to "data".
    """
    scraped_content = scrape_wikipedia_page(url)

    os.makedirs(output_dir, exist_ok=True)
    file_path = f"{output_dir}/{url_variable_name}.txt"
    # UTF-8 is forced so non-ASCII article text is written the same way on
    # every platform.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(scraped_content)

    print(f"Done scraping, saved to {file_path}.")



# Scrape the three Wikipedia pages in order; each is written to
# data/<label>.txt by scrape_and_save.
for _label, _page_url in (
    ("history_wiki_1", history_wiki_1),
    ("history_wiki_2", history_wiki_2),
    ("more", more),
):
    scrape_and_save(_label, _page_url)


Done scraping, saved to data/history_wiki_1.txt.
Done scraping, saved to data/history_wiki_2.txt.
Done scraping, saved to data/more.txt.
