In [336]:
import pandas as pd
import numpy as np
import re

# List of buildings
In this notebook we obtain the list of buildings presented here: https://www.wikidata.org/wiki/Wikidata:WikiProject_Philipps-Universit%C3%A4t_Marburg  

We collect only university buildings, excluding those listed under 'Weitere Gebäude ohne Zuordnung'.
  
**Known problems (from Wiki):**
- Consistency of item names (partly German, partly English)
- Numerous missing items
- Scope of items not always clear (institution and/or building, e.g. UB)
- Question about the creation of sub-items (e.g. "Alte Aula" as part of the "Alte Universität")  

Relations between buildings could also be obtained. They are stored in a KDB-like form (see **Properties**). Properties that might be interesting:
- inception: time when an entity begins to exist
- coordinate location
- street address (however obtained with code from wiki)
- picture: not always available

In [337]:
# Obtain page source from this link https://www.wikidata.org/wiki/Wikidata:WikiProject_Philipps-Universit%C3%A4t_Marburg:
import os
import requests
from bs4 import BeautifulSoup

# Get the page source
url = 'https://www.wikidata.org/wiki/Wikidata:WikiProject_Philipps-Universit%C3%A4t_Marburg'
# Without a timeout, requests waits indefinitely on a stalled connection.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page, which would
# silently produce an empty building list further down.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

Structure of the part of the page we are interested in: 
```
# <div class="mw-heading mw-heading1">...</div>
# <div class="mw-heading mw-heading2 ext-discussiontools-init-section">...</div>
# <ul>
#   <li>
#   <li>
#   <li>
#   ...
# </ul>
# <div class="mw-heading mw-heading2 ext-discussiontools-init-section">...</div>
# <ul>
#   <li>
#   <li>
#   <li>
#   ...
# </ul>
# 
# <div class="mw-heading mw-heading1">...</div>

```

In [345]:
def extract_info(building: str) -> tuple:
    """
    Extracts information from a building string.

    The string is consumed step by step: the building number, the Q-code and
    the address code are extracted and removed in that order, and the text
    that remains is split on commas into name and address.

    Args:
        building (str): Input string containing building information.

    Returns:
        tuple: Contains the following extracted information:
            - name (str): The name of the building (all comma-separated parts
              except the last one, re-joined with ', '). Empty when fewer than
              two parts remain.
            - id_link (str): The unique identifier (Q-code). Several matches
              are concatenated.
            - address_code (str): Address code in the format 'A|DD'.
            - address (str): Address text (the last comma-separated part, or
              the only part when there is just one). Empty when nothing
              remains.
            - gebnum (str): Building number (Geb. Nr.). Several matches are
              concatenated.
    """
    gebnum = ''.join(re.findall(r'\(Geb. Nr. (\d+)\)', building))
    # Remove the building number together with everything that follows it on
    # the same line. When the number is already at the end of the string,
    # '.*' matches nothing, so this single substitution covers both cases.
    # Note that a Q-code or address code placed after the building number is
    # therefore discarded before it can be extracted below.
    building = re.sub(r'\(Geb. Nr. \d+\).*', '', building).strip()

    id_link = ''.join(re.findall(r'\((Q\d+)\)', building))
    building = re.sub(r'\(Q\d+\)', '', building).strip()

    address_code = ''.join(re.findall(r'[A-Z]\|\d{2}', building))
    remainder = re.sub(r'[A-Z]\|\d{2}', '', building)

    # Split the remaining text on commas and drop empty fragments.
    parts = [part.strip() for part in remainder.split(',') if part.strip()]

    name, address = '', ''
    if len(parts) > 1:
        address = parts[-1]
        name = ', '.join(parts[:-1])
    elif parts:
        # A single remaining part is treated as the address, not the name.
        address = parts[0]
    # With no parts left (empty input or metadata only), both stay empty
    # instead of raising IndexError.

    return name, id_link, address_code, address, gebnum

In [346]:
# Level-1 headings delimit the areas of the page
area_elements = soup.find_all("div", class_="mw-heading mw-heading1")
buildings_dict = {}

# Keys of one building record, in the order extract_info returns its values
record_keys = ("name", "id_link", "address_code", "address", "gebnum")
# List entries containing any of these substrings are left out
excluded_markers = ('nicht im Besitz', 'Q437790', 'Q130542938')

# Only the first two areas are processed
for area_heading in area_elements[:2]:
    # The last 6 characters of the heading text are cut off
    area_name = area_heading.get_text(strip=True)[:-6]
    campuses = buildings_dict[area_name] = {}

    # Walk the following siblings until the next level-1 heading (or the end)
    node = area_heading.find_next_sibling()
    while node and "mw-heading1" not in node.get("class", []):
        if "mw-heading2" in node.get("class", []):
            campus_name = node.get_text(strip=True)[:-6]

            if campus_name != "Weitere Gebäude ohne Zuordnung":
                records = []
                listing = node.find_next_sibling()

                # Buildings are only collected when the element directly
                # after the campus heading is a <ul>; otherwise the campus
                # is stored with an empty list.
                if listing and listing.name == "ul":
                    for item in listing.find_all("li"):
                        item_text = item.get_text(strip=True)
                        if any(marker in item_text for marker in excluded_markers):
                            continue
                        records.append(dict(zip(record_keys, extract_info(item_text))))

                campuses[campus_name] = records

        node = node.find_next_sibling()


In [349]:
# Write buildings_dict to disk as JSON with 4-space indentation
import json
serialized = json.dumps(buildings_dict, indent=4)
with open('buildings_dict.json', 'w') as out_file:
    out_file.write(serialized)

In [352]:
# Tally the three levels of buildings_dict: area -> campus -> list of buildings
campus_maps = list(buildings_dict.values())

num_area = len(buildings_dict)
num_campus = sum(len(campuses) for campuses in campus_maps)
num_building = sum(
    len(entries)
    for campuses in campus_maps
    for entries in campuses.values()
)

print(f"Number of areas: {num_area}")
print(f"Number of campuses: {num_campus}")
print(f"Number of buildings: {num_building}")

Number of areas: 2
Number of campuses: 14
Number of buildings: 117
