In [49]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re
from urllib.parse import urljoin
import time
import string

In [50]:
# Specify the Metadata File Path
file_path = './data/moa_metadata.csv'

# Read the CSV file
moa_metadata = pd.read_csv(file_path)

# Extract the MOA_URL links into a list
moa_url_list = moa_metadata['Page_URL'].tolist()

In [51]:
moa_url_list

['https://www.sanjoseca.gov/your-government/departments-offices/office-of-the-city-manager/employee-relations/labor-relations-information/bargaining-units-labor-contract-info/abmei',
 'https://www.sanjoseca.gov/your-government/departments-offices/office-of-the-city-manager/employee-relations/labor-relations-information/bargaining-units-labor-contract-info/aea',
 'https://www.sanjoseca.gov/your-government/departments-offices/office-of-the-city-manager/employee-relations/labor-relations-information/bargaining-units-labor-contract-info/alp',
 'https://www.sanjoseca.gov/your-government/departments-offices/office-of-the-city-manager/employee-relations/labor-relations-information/bargaining-units-labor-contract-info/amsp',
 'https://www.sanjoseca.gov/your-government/departments-offices/office-of-the-city-manager/employee-relations/labor-relations-information/bargaining-units-labor-contract-info/camp',
 'https://www.sanjoseca.gov/your-government/departments-offices/office-of-the-city-manager/

In [52]:
# Function to sanitize filenames
def sanitize_filename(filename):
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    return ''.join(c for c in filename if c in valid_chars)

In [53]:
# Function to download PDF from a given URL
def download_pdf(url, base_url, save_path, session):
    url = urljoin(base_url, url)
    filename = sanitize_filename(save_path) + '.pdf'
    full_save_path = os.path.join(pdf_directory, filename)

    response = session.get(url, stream=True)
    if response.status_code == 200:
        with open(full_save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"PDF downloaded: {full_save_path}")
    else:
        print(f"Failed to download PDF from {url} - Status Code: {response.status_code}")

In [54]:
# Directory to save PDFs
pdf_directory = './data/'
if not os.path.exists(pdf_directory):
    os.makedirs(pdf_directory)

# Base URL
base_url = 'https://www.sanjoseca.gov'

In [55]:
# Start a session for requests
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
})

# Regex pattern for matching 'Union Contract' in a case-insensitive manner
pattern = re.compile('union contract', re.IGNORECASE)

# Iterate over each MOA_URL
for moa_url in moa_url_list:
    try:
        response = session.get(moa_url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all potential headings or paragraphs
            potential_headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'], string=pattern)

            for heading in potential_headings:
                next_siblings = heading.find_next_siblings(['p', 'ul'])
                for sibling in next_siblings:
                    if sibling.name == 'ul':
                        for li in sibling.find_all('li'):
                            a_tag = li.find('a')
                            if a_tag and a_tag.get('href'):
                                pdf_url = a_tag.get('href')
                                pdf_text = a_tag.get_text(strip=True)
                                download_pdf(pdf_url, base_url, pdf_text, session)
                        break  # Break after processing each <ul> associated with the heading

            # Sleep between requests
            time.sleep(1)
        else:
            print(f"Failed to get {moa_url} - Status Code: {response.status_code}")
            time.sleep(5)
    except Exception as e:
        print(f"Error processing {moa_url}: {e}")
        time.sleep(10)

PDF downloaded: ./data/Association of Building Mechanical and Electrical Inspectors (ABMEI) MOA.pdf
PDF downloaded: ./data/Association of Building Mechanical and Electrical Inspectors (ABMEI) MOA.pdf
PDF downloaded: ./data/Association of Engineers and Architects IFTPE Local 21 Units 4142 MOA.pdf
PDF downloaded: ./data/Association of Engineers and Architects IFPTE Local 21 Unit 43 MOA.pdf
PDF downloaded: ./data/Association of Legal Professionals of San Jose (ALP).pdf
PDF downloaded: ./data/Association of Legal Professionals of San Jose (ALP).pdf
PDF downloaded: ./data/Association of Maintenance Supervisory Personnel IFPTE Local 21 (AMSP) MOA.pdf
PDF downloaded: ./data/City Association of Management Personnel IFPTE Local 21 (CAMP) MOA.pdf
PDF downloaded: ./data/San Jos Fire Fighters - International Association of Firefighters (IAFF) Local 230 MOA.pdf
PDF downloaded: ./data/San Jos Fire Fighters - International Association of Firefighters (IAFF) Local 230 MOA.pdf
PDF downloaded: ./data/In