In [None]:
!pip install beautifulsoup4
!pip install lxml

In [None]:
import urllib.request
import json
import time
import random
import re
import subprocess

from bs4 import BeautifulSoup
from datetime import datetime

In [None]:
def getSessions(year, month):
    """Scrape the session calendar of one month from the council information system.

    Downloads the month overview page, walks over all table rows and collects
    one dictionary for every row that contains a session title
    (``div`` with class ``smc-el-h``).

    Args:
        year: Year of the calendar page (inserted into the ``__cjahr`` parameter).
        month: Month of the calendar page (inserted into the ``__cmonat`` parameter).

    Returns:
        A list of dictionaries. Every dictionary has the keys ``session_date``,
        ``session_name``, ``reference`` and ``reference_access_date``. The keys
        ``session_starttime``, ``session_endtime``, ``reference_archive_link``
        and ``link`` are only present when the value could be determined.

    Raises:
        urllib.error.URLError: If the overview page cannot be downloaded.
    """
    # Status message which month is currently processed
    print("Getting sessions of the month " + str(month) + " from year " + str(year) + " ...")

    url_string = 'https://parlamentsinfo.giessen.de/si0040.php?__cjahr=' + str(year) + '&__cmonat=' + str(month) + '&__canz=1&__cselect=0'

    # The context manager closes the HTTP response instead of leaking it
    with urllib.request.urlopen(url_string) as response:
        html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')

    # Tries to archive the url to the internet archive and save the link.
    # Only regular errors are treated as "no archive link"; a keyboard
    # interrupt must still be able to stop the crawl.
    archive_link = None
    try:
        archive_link = archive_url(url_string)
    except Exception:
        print("Could not get an archive link for: " + url_string)

    reference_access_date = datetime.now().strftime("%d.%m.%Y")

    # Text of the most recent "weekday" span; rows without such a span
    # inherit the value of the previous row.
    current_day = None

    # Saves all session elements
    list_of_session_elements = []

    for tr_element in soup.find_all('tr'):
        # The row is searched directly; re-parsing its HTML is not necessary
        span_element = tr_element.find('span', {"class": "weekday"})
        if span_element is not None:
            current_day = span_element.getText()

        # Rows without a session title (e.g. header rows) are not sessions
        div_element = tr_element.find('div', {"class": "smc-el-h"})
        if div_element is None:
            continue

        session_information = {}

        # Rows that appear before the first weekday span have no known day.
        # They get a date without a day instead of aborting the whole month.
        if current_day is not None:
            date_string = current_day + "." + str(month) + "." + str(year)
        else:
            date_string = str(month) + "." + str(year)
        session_information["session_date"] = date_string

        li_element = tr_element.find('li', {"class": "list-inline-item"})
        if li_element is not None:
            times = re.findall('[0-9][0-9]:[0-9][0-9]', li_element.getText())

            # One time is the start time, two times are start and end time
            if len(times) in (1, 2):
                session_information["session_starttime"] = times[0]
            if len(times) == 2:
                session_information["session_endtime"] = times[1]

        session_information["session_name"] = div_element.getText()
        session_information["reference"] = url_string
        session_information["reference_access_date"] = reference_access_date

        if archive_link is not None:
            session_information["reference_archive_link"] = archive_link

        link_element = div_element.find('a', {"class": "smce-a-u smc-link-normal smc_doc smc_datatype_si"})
        if link_element is not None and link_element.get('href'):
            session_information["link"] = "https://parlamentsinfo.giessen.de/" + link_element['href']

        list_of_session_elements.append(session_information)

    # Return a list with all sessions of the month
    return list_of_session_elements

In [None]:
def get_session_details(url_to_session):
    """Scrape the detail page of a single session.

    Args:
        url_to_session: URL of the session detail page.

    Returns:
        A dictionary with the keys ``session_id``, ``session_name``,
        ``session_date``, ``reference``, ``reference_access_date`` and
        ``agenda``. The keys ``reference_archive_link``, ``session_starttime``
        and ``session_endtime`` are only present when the value could be
        determined. ``agenda`` is a list with one dictionary per agenda row
        that links to a proposal; each one has ``url_to_proposal`` and, when
        the badge of the row contains them, ``order`` (int) and
        ``public_status`` (``True`` for "Ö", ``False`` for "N").

    Raises:
        urllib.error.URLError: If the page cannot be downloaded.
        AttributeError: If the cell for the id, the name or the date of the
            session is missing on the page.
    """
    # The context manager closes the HTTP response instead of leaking it
    with urllib.request.urlopen(url_to_session) as response:
        html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')

    session_id = soup.find('div', {"class": "smc-dg-td-2 smc-table-cell siname"}).getText()
    session_name = soup.find('div', {"class": "smc-dg-td-2 smc-table-cell sigrname"}).getText()
    session_date = soup.find('div', {"class": "smc-dg-td-2 smc-table-cell sidat"}).getText()

    # The times are optional in the result, so a missing time cell must not
    # discard the whole session.
    time_cell = soup.find('div', {"class": "smc-dg-td-2 smc-table-cell yytime"})
    session_start_end = time_cell.getText() if time_cell is not None else ""

    session_information = {"session_id": session_id,
                           "session_name": session_name,
                           "session_date": session_date,
                           "reference": url_to_session,
                           "reference_access_date": datetime.now().strftime("%d.%m.%Y")
                          }

    # Tries to archive the url to the internet archive and save the link.
    # Only regular errors are treated as "no archive link"; a keyboard
    # interrupt must still be able to stop the crawl.
    try:
        session_information["reference_archive_link"] = archive_url(url_to_session)
    except Exception:
        print("Could not get an archive link for: " + url_to_session)

    # One time is the start time, two times are start and end time
    times = re.findall('[0-9][0-9]:[0-9][0-9]', session_start_end)
    if len(times) in (1, 2):
        session_information["session_starttime"] = times[0]
    if len(times) == 2:
        session_information["session_endtime"] = times[1]

    agenda = []

    for tr_element in soup.find_all('tr', {"class": "smc-t-r-l"}):
        # The row is searched directly; re-parsing its HTML is not necessary
        span_element = tr_element.find('span', {"class": "badge"})
        if span_element is None:
            continue

        # Only agenda items with a proposal are saved
        link_to_proposal = tr_element.find('a', {"class": "smce-a-u smc-link-procedure smc_doc smc_field_voname smcnowrap smc_datatype_vo"})
        if link_to_proposal is None or not link_to_proposal.get('href'):
            continue

        span_element_text = span_element.getText()
        agenda_item = {}

        result_order = re.search('[0-9]+', span_element_text)
        if result_order is not None:
            agenda_item["order"] = int(result_order.group(0))

        # A badge without "Ö" or "N" leaves the status unset instead of
        # raising an IndexError that would drop the whole session.
        # NOTE(review): presumably "Ö" marks public and "N" non-public items - confirm.
        status = re.search('[ÖN]', span_element_text)
        if status is not None:
            agenda_item["public_status"] = status.group(0) == "Ö"

        agenda_item["url_to_proposal"] = "https://parlamentsinfo.giessen.de/" + link_to_proposal['href']

        agenda.append(agenda_item)

    session_information["agenda"] = agenda

    return session_information

In [None]:
def get_proposal_details(url_to_proposal):
    """Read the details of a proposal from its page in the council information system.

    The subject cell of the page is split at every "-". The last but one piece
    is searched for the proposal type ("Antrag" or "Anfrage") and for a date
    of the form ``dd.mm.yyyy``; everything before that piece is the subject.

    Args:
        url_to_proposal: URL of the proposal page.

    Returns:
        A dictionary with the keys ``session_net_link``, ``reference_url`` and
        ``reference_access_date``. The keys ``proposal_subject``,
        ``proposal_type``, ``proposal_date``, ``proposal_number``,
        ``proposal_filenumber`` and ``reference_archive_link`` are only
        present when the value could be determined.

    Raises:
        urllib.error.URLError: If the page cannot be downloaded.
    """
    print("Fetching information of " + url_to_proposal + " ...")

    # Saves the details of the proposal
    proposal_information = {}

    # The context manager closes the HTTP response instead of leaking it
    with urllib.request.urlopen(url_to_proposal) as response:
        html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')

    proposal_subject = soup.find('div', {"class": "smc-dg-td-2 smc-table-cell vobetr"})
    proposal_number = soup.find('div', {"class": "smc-dg-td-2 smc-table-cell voname"})
    proposal_filenumber = soup.find('div', {"class": "smc-dg-td-2 smc-table-cell voakz"})

    # Catches the parlament information link
    proposal_information["session_net_link"] = url_to_proposal

    # Process the subject, proposal_type and the proposal date
    if proposal_subject is not None:
        proposal_subject_array = str(proposal_subject.getText()).split("-")

        # Checks that all pieces are existing
        if len(proposal_subject_array) >= 3:

            # Catches the proposal subject. The pieces are joined with "-"
            # again so that hyphens inside the subject are kept
            # ("Kita-Ausbau" must not become "KitaAusbau").
            subject = proposal_subject_array[0:-2]
            proposal_information["proposal_subject"] = "-".join(subject).strip()

            authors = proposal_subject_array[-2]

            # Catches the proposal type
            proposal_type = re.search("(Antrag|Anfrage)", authors)
            if proposal_type is not None:
                proposal_information["proposal_type"] = proposal_type.group(1)

            # Catches the proposal date (raw string: "\." is a regex escape)
            proposal_date = re.search(r"[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9]", authors)
            if proposal_date is not None:
                proposal_information["proposal_date"] = proposal_date.group(0)

    # Process the proposal number
    if proposal_number is not None:
        proposal_information["proposal_number"] = proposal_number.getText()

    # Process the proposal filenumber
    if proposal_filenumber is not None:
        proposal_information["proposal_filenumber"] = proposal_filenumber.getText()

    # Tries to archive the url to the internet archive and save the link.
    # Only regular errors are treated as "no archive link"; a keyboard
    # interrupt must still be able to stop the crawl.
    try:
        proposal_information["reference_archive_link"] = archive_url(url_to_proposal)
    except Exception:
        print("Could not get an archive link for: " + url_to_proposal)

    # The date of the access of the page is saved as a reference
    proposal_information["reference_url"] = url_to_proposal
    proposal_information["reference_access_date"] = datetime.now().strftime("%d.%m.%Y")

    return proposal_information

In [None]:
def archive_url(url):
    """Archive a URL in the Internet Archive and return the link to the snapshot.

    Requests ``https://web.archive.org/save/<url>`` with ``wget --spider`` and
    searches the error output of wget for snapshot links of the form
    ``https://web.archive.org/web/<14 digits>/``. Up to five attempts are
    made; the pause before an attempt grows by 30 seconds each time
    (0, 30, 60, 90 and 120 seconds).

    Args:
        url: The URL that should be archived.

    Returns:
        The first snapshot prefix found in the output of wget, followed by ``url``.

    Raises:
        ValueError: If wget cannot be started or if no attempt produced a
            snapshot link.
    """
    # The arguments are passed as a list without a shell, so quotes, "&" or
    # ";" inside the URL cannot break or inject into a command line.
    command = ["wget", "--spider", "https://web.archive.org/save/" + url]

    for attempt in range(5):
        # Increase the waiting time to avoid too many attempts in a short time
        time.sleep(2 * attempt * 15)

        try:
            process = subprocess.run(command, capture_output=True, text=True)
        except OSError as error:
            # Without wget no attempt can succeed, so fail without waiting
            raise ValueError("No archive url was generated: wget could not be started.") from error

        snapshot_links = re.findall(r"https://web\.archive\.org/web/[0-9]{14}/", process.stderr.strip())

        # NOTE(review): at least two matches are required, as in the earlier
        # version of this function - presumably because wget prints the
        # snapshot link once in the redirect line and once when following
        # it. Confirm before relaxing this to a single match.
        if len(snapshot_links) > 1:
            return snapshot_links[0] + url

    # Raise an exception if the webpage can not be archived
    raise ValueError("No archive url was generated.")

## Schritt 1: Ermittelt alle Gremiensitzungen

In [None]:
# Collects the sessions of every month from 2000 up to and including 2022
list_of_all_sessions = []

for year in range(2000,2023):
    for month in range(1,13):
        # Tries to fetch all sessions from the month. Only regular errors are
        # skipped, so interrupting the kernel still stops the long crawl.
        try:
            list_of_all_sessions.extend(getSessions(year, month))
        except Exception:
            print("Could not fetch sessions from month " + str(month) + " of year " + str(year))

        # Waits a random time so that the DDOS protection is not triggered
        waiting_time = random.randint(10,60)
        time.sleep(waiting_time)

# Saves all sessions in a JSON file
with open('sessions_list.json', 'w', encoding='utf-8') as f:
    json.dump(list_of_all_sessions, f, ensure_ascii=False, indent=4)

## Schritt 2: Ermittelt die Details zu den Gremiensitzungen und zugehörige Anträge

In [None]:
# Reads the JSON-file with the sessions. The file was written as UTF-8, so it
# is read as UTF-8 as well, and it is closed again after reading.
with open("sessions_list.json", encoding='utf-8') as file_pointer:
    data = json.load(file_pointer)

list_of_all_sessions = []

# Iterates through the sessions
for session in data:

    print("Get information from " + session["session_name"] + " (" + session["session_date"] + ") ...")

    # Checks, if the session has a link
    if 'link' in session:
        # Only regular errors are skipped, so interrupting the kernel still
        # stops the long crawl.
        try:
            list_of_all_sessions.append(get_session_details(session['link']))
        except Exception:
            print("Could not get the session details of: " + session['link'])

        # Waits a random time so that the DDOS protection is not triggered
        waiting_time = random.randint(0, 60)
        time.sleep(waiting_time)
    else:
        # Sessions without a detail page are kept as they are
        list_of_all_sessions.append(session)


# Unique list of proposal links in the order of their first appearance
list_of_proposal_links = []

# The links that are already in the list. The list itself holds dictionaries,
# so the plain link strings have to be remembered separately to detect
# duplicates.
seen_proposal_urls = set()

for session in list_of_all_sessions:
    for agenda_item in session.get("agenda", []):
        url_to_proposal = agenda_item["url_to_proposal"]
        if url_to_proposal not in seen_proposal_urls:
            seen_proposal_urls.add(url_to_proposal)
            list_of_proposal_links.append({"url_to_proposal": url_to_proposal})


# Saves the sessions with the details in a JSON-file
with open('sessions_details_list.json', 'w', encoding='utf-8') as file_pointer:
    json.dump(list_of_all_sessions, file_pointer, ensure_ascii=False, indent=4)

# Saves the links of the proposal in a JSON-file
with open('proposal_link_list.json', 'w', encoding='utf-8') as file_pointer:
    json.dump(list_of_proposal_links, file_pointer, ensure_ascii=False, indent=4)

## Schritt 3: Ermittelt alle Details zu den einzelnen Einträgen

In [None]:
# Reads the data from the JSON-file with the proposal links. The file was
# written as UTF-8, so it is read as UTF-8 as well, and it is closed again
# after reading.
with open("proposal_link_list.json", encoding='utf-8') as file_pointer:
    data = json.load(file_pointer)

list_of_all_proposals_with_details = []

for proposal in data:
    # Only regular errors are skipped, so interrupting the kernel still stops
    # the long crawl.
    try:
        proposal_with_details = get_proposal_details(proposal["url_to_proposal"])
        list_of_all_proposals_with_details.append(proposal_with_details)
    except Exception:
        print("Error: Could not get details of the proposal: " + proposal["url_to_proposal"])

    # Waits a random time so that the DDOS protection is not triggered
    waiting_time = random.randint(0, 30)
    time.sleep(waiting_time)

# Saves the proposals with the details in a JSON-file
with open('proposals_with_details.json', 'w', encoding='utf-8') as file_pointer:
    json.dump(list_of_all_proposals_with_details, file_pointer, ensure_ascii=False, indent=4)

## Schritt 4: Erstellung einer JSON-Datei für den Antrags-Import

## Schritt 5: Erstellung einer JSON-Datei für den Gremiensitzungs-Import

## Schritt 6: Import der Daten in die Wikibase-Datenbank