# Setup

In [1]:
import os
import requests
import zipfile
from io import BytesIO
import pandas as pd

In [2]:
def extract_pdf_file(url, prefix, path, name, timeout=60):
    '''
    Downloads a ZIP file from a given URL, extracts the first PDF file whose name starts
    with a specified prefix, and saves it to a specified directory with a new name.

    Args:
        url (str): The URL from which to download the ZIP file.
        prefix (str): The prefix the PDF's name inside the ZIP must start with.
        path (str): The local directory path to save the extracted PDF file.
                    The directory must already exist.
        name (str): The new name for the extracted PDF file, without the '.pdf' extension.
        timeout (float or tuple): Timeout in seconds passed to ``requests.get`` so that a
                    stalled server cannot block the caller forever. Defaults to 60.

    Returns:
        str or None: The path to the saved PDF file if a PDF file with the specified prefix is found
                     and successfully saved; otherwise, None if no PDF file matches the criteria.

    Raises:
        requests.exceptions.RequestException: If the download fails, times out, or the
                     server answers with an HTTP error status.
        zipfile.BadZipFile: If the downloaded content is not a valid ZIP archive.
    '''
    # Download data; the whole body is needed in memory for ZipFile, so no streaming
    response = requests.get(url, timeout=timeout)

    # Surface HTTP errors (404, 500, ...) directly instead of letting an error page
    # reach ZipFile and fail there with a misleading BadZipFile
    response.raise_for_status()

    # Open Zip
    with zipfile.ZipFile(BytesIO(response.content)) as zip_file:

        # Iterate over all files in Zip
        for file_name in zip_file.namelist():

            # Take the first member with the right prefix; the extension check
            # is case-insensitive so that '.PDF' members are not skipped
            if file_name.startswith(prefix) and file_name.lower().endswith('.pdf'):

                # Save pdf to disk
                full_path = f'{path}/{name}.pdf'
                with open(full_path, 'wb') as new_file:
                    new_file.write(zip_file.read(file_name))
                return full_path

    # Return None if no matching file is found
    return None

# Parameters

In [3]:
# Source: Parquet export URL of the dataset on the SG OGD portal (daten.stadt.sg.ch)
SG_OGD_LINK = 'https://daten.stadt.sg.ch/api/explore/v2.1/catalog/datasets/traktandierte-geschaefte-sitzungen-stadtparlament-stgallen/exports/parquet?lang=de&timezone=Europe%2FZurich'

# Local folders: the data root and, below it, the folder for the downloaded PDFs
DATA_DIRECTORY = 'data'
DOWNLOAD_DIRECTORY = DATA_DIRECTORY + '/pdfs'

In [4]:
# Create the folders if they don't exist yet.
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for a folder and creating it.
os.makedirs(DATA_DIRECTORY, exist_ok=True)
os.makedirs(DOWNLOAD_DIRECTORY, exist_ok=True)

# Download

In [10]:
# Read the source table directly from SG_OGD_LINK and add an 'id' column
# built from the date and the issue number, joined with a hyphen
df = pd.read_parquet(SG_OGD_LINK).assign(
    id=lambda table: table['sitzungsdatum'] + '-' + table['traktandennummer']
)

# Optional filter, currently disabled: only keep Interpellationen
# df = df[df['ebene5'] == 'Stadtparlament: Interpellationen']
df

Unnamed: 0,nr,sitzungs_id,link_sitzung,legislatur,jahr,bezeichnung,sitzungsdatum,freigabe,ort,zeit,...,ebene6,dokumentendatum,dokumententitel,traktandentitel,vorberatende_kommission,traktandenstatus,traktandennummer,geschaeft_guid,traktandum_guid,id


In [6]:
# Download pdfs for all entries

file_path = []  # One entry per row: path of the saved pdf, or None if no pdf was found

for _, row in df.iterrows():
    path = extract_pdf_file(row['download_traktandum'], 'Interpellation', DOWNLOAD_DIRECTORY, row['id'])
    if path is None:
        # Outer double quotes: reusing the enclosing quote character inside the
        # replacement field is a SyntaxError on Python versions before 3.12
        print(f"No pdf found for {row['id']}")
    file_path.append(path)

# Keep the result next to the source rows (None marks rows without a pdf)
df['local_file_path'] = file_path

In [7]:
# Write the source table, together with the per-row file lookup result,
# to disk as a tab-separated file
source_table_path = f'{DATA_DIRECTORY}/source_table.csv'
df.to_csv(source_table_path, sep='\t', index=False)