Imports

In [1]:
!pip install folium geopy lxml tqdm



In [2]:
import os
import glob
import time
import folium
print(folium.__version__)
from geopy.geocoders import Nominatim
import pandas as pd
from tqdm import tqdm
from lxml import etree
import re
from collections import Counter

0.20.0


In [3]:
print("Current working directory:", os.getcwd())
print("Files found:", os.listdir("get"))


folder_path = "get"  # <- adjust if needed
file_pattern = "*.xml"  
file_list = glob.glob(os.path.join(folder_path, file_pattern))
geolocator = Nominatim(user_agent="tei-mapper")

Current working directory: c:\Users\michael\OneDrive\documents\DDH25_Praktikum\Ordner_Roman\2025_Otto\HSA
Files found: ['hsa.letter.1.xml', 'hsa.letter.10.xml', 'hsa.letter.100.xml', 'hsa.letter.1000.xml', 'hsa.letter.10000.xml', 'hsa.letter.10001.xml', 'hsa.letter.10002.xml', 'hsa.letter.10003.xml', 'hsa.letter.10004.xml', 'hsa.letter.10005.xml', 'hsa.letter.10006.xml', 'hsa.letter.10007.xml', 'hsa.letter.10008.xml', 'hsa.letter.10009.xml', 'hsa.letter.1001.xml', 'hsa.letter.10010.xml', 'hsa.letter.10011.xml', 'hsa.letter.10012.xml', 'hsa.letter.10013.xml', 'hsa.letter.10014.xml', 'hsa.letter.10015.xml', 'hsa.letter.10016.xml', 'hsa.letter.10017.xml', 'hsa.letter.10018.xml', 'hsa.letter.10019.xml', 'hsa.letter.1002.xml', 'hsa.letter.10020.xml', 'hsa.letter.10021.xml', 'hsa.letter.10022.xml', 'hsa.letter.10023.xml', 'hsa.letter.10024.xml', 'hsa.letter.10025.xml', 'hsa.letter.10026.xml', 'hsa.letter.10027.xml', 'hsa.letter.10028.xml', 'hsa.letter.10029.xml', 'hsa.letter.1003.xml', 'hsa.

In [4]:
TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}

def lids_from_tree_or_filename(tree, filename):
    # 1) Prefer xml:ids on <div type="letter">
    xml_ids = tree.xpath("//tei:text//tei:div[@type='letter']/@xml:id", namespaces=TEI_NS)
    lids = []
    for x in xml_ids:
        m = re.match(r'^(L\.\d+)', str(x).strip())
        if m:
            lids.append(m.group(1))
    if lids:
        return sorted(set(lids))

    # 2) Fallback: strict filename pattern hsa.letter.<n>.xml
    m = re.search(r'hsa\.letter\.(\d+)\.xml$', filename, flags=re.IGNORECASE)
    return [f"L.{int(m.group(1))}"] if m else []

sender_locations = []
sender_info = []
existing_coordinates = {}

print("Processing XML files...\n")
for filepath in tqdm(file_list, desc="Extracting from XML"):
    filename = os.path.basename(filepath)

    try:
        tree = etree.parse(filepath)

        # header fields (same for the file; will be copied to every lid in this file)
        place_node = tree.xpath(
            "/tei:TEI/tei:teiHeader/tei:profileDesc/tei:correspDesc/tei:correspAction[@type='sent']/tei:placeName",
            namespaces=TEI_NS
        )
        place_name = place_node[0].text.strip() if place_node and place_node[0].text else None

        surname = tree.xpath(
            "string(/tei:TEI/tei:teiHeader/tei:profileDesc/tei:correspDesc/tei:correspAction[@type='sent']/tei:persName/tei:surname)",
            namespaces=TEI_NS
        ).strip() or None

        forename = tree.xpath(
            "string(/tei:TEI/tei:teiHeader/tei:profileDesc/tei:correspDesc/tei:correspAction[@type='sent']/tei:persName/tei:forename)",
            namespaces=TEI_NS
        ).strip() or None

        # coords: look up the geo that matches the header place_name (if any)
        coords = None
        if place_name:
            geo_nodes = tree.xpath("//tei:profileDesc//tei:placeName", namespaces=TEI_NS)
            for g in geo_nodes:
                name_node = g.find("tei:name", namespaces=TEI_NS)
                geo_node  = g.find("tei:location/tei:geo", namespaces=TEI_NS)
                if name_node is not None and geo_node is not None:
                    if (name_node.text or "").strip() == place_name:
                        coords = (geo_node.text or "").strip()
                        break

        latlon = None
        if coords:
            try:
                lat, lon = map(float, coords.split(","))
                latlon = (lat, lon)
            except Exception:
                latlon = None

        # get lids from xml (or fallback to filename)
        lids = lids_from_tree_or_filename(tree, filename)
        if not lids:
            # last-resort: make one synthetic entry so we can detect it later
            lids = [f"FILE:{filename}"]
            # you can also log this if useful:
            # print(f"WARNING: no LID in xml and filename didn't match: {filename}")

        # write one row per LID found in this file
        for lid in lids:
            sender_locations.append((lid, place_name or "No sender location"))
            sender_info.append((
                lid,
                surname or "No sender surname",
                forename or "No sender forename",
                place_name or "No sender location"
            ))
            existing_coordinates[lid] = latlon  # may be None if no coords

    except Exception as e:
        # if parsing failed, tie the error to the filename so you can investigate
        sender_locations.append((f"FILE:{filename}", f"Error: {str(e)}"))
        existing_coordinates[f"FILE:{filename}"] = None

print("Done.")


Processing XML files...



Extracting from XML: 100%|██████████| 11576/11576 [06:05<00:00, 31.63it/s]

Done.





In [5]:
sender_info_df = pd.DataFrame(sender_info, columns=["lid", "sender_surname", "sender_forename", "place_name"])

sender_info_df.head()

Unnamed: 0,lid,sender_surname,sender_forename,place_name
0,L.1,Baissac,Charles,Port Louis
1,L.10,Machado y Álvarez,Antonio,Sevilla
2,L.100,Wagner,Max Leopold,Berlin
3,L.1000,Schuchardt,Hugo,Graz
4,L.10000,Menéndez Pidal,Ramón,Madrid


In [6]:
valid_places = [place for _, place in sender_locations if place and not place.startswith("No") and not place.startswith("Error")]
place_counts = Counter(valid_places)

place_count_df = pd.DataFrame(place_counts.items(), columns=["place", "count"])

place_count_df.head()

Unnamed: 0,place,count
0,Port Louis,28
1,Sevilla,30
2,Berlin,237
3,Graz,2370
4,Madrid,95


In [7]:
print("\nGeocoding missing places...\n")

unique_places = {
    place
    for _, place in sender_locations
    if place and not place.startswith("No") and not place.startswith("Error")
}

places_to_geocode = unique_places - set(existing_coordinates.keys())

new_coordinates = {}

for place in tqdm(places_to_geocode, desc="Geocoding"):
    try:
        location = geolocator.geocode(place)
        if location:
            new_coordinates[place] = (location.latitude, location.longitude)
        else:
            new_coordinates[place] = (None, None)
    except Exception:
        new_coordinates[place] = (None, None)

    time.sleep(1)  # Respect Nominatim rate limit

existing_coordinates.update(new_coordinates)

mapped_points = []
for lid, place in sender_locations:
    lat, lon = existing_coordinates.get(place, (None, None))
    mapped_points.append((lid, place, lat, lon))



Geocoding missing places...



Geocoding: 100%|██████████| 778/778 [14:49<00:00,  1.14s/it]


In [8]:
print(unique_places)

{'Eibiswald', 'Anglet', 'Bad Homburg vor der Höhe', 'Hornbæk', 'Vejen', 'Braunschweig', 'Praia', 'Asolo', 'Wrschowitz, Prag', 'Rosenburg-Mold', 'Wangerooge', 'Savognin', 'Obersdorf', 'Kandy', 'Media', 'Porto', 'Fălticeni', 'Kopperpahl', 'Marseille', 'Santander', 'Friedenau', 'Klein Flottbek', 'Lichterfelde', 'Altenburg', 'Bad Gleichenberg', 'Avenay', 'Mödling', 'Leysin', 'Cambo-les-Bains', 'Lauterberg im Harz', 'Neuenheim', 'Reichenau an der Rax', 'Carcavelos', 'Kitzbühel', 'Llandrindod Wells', 'Heilbronn', 'Bad Hofgastein', 'Askov', 'Fürstenau, Graubünden', 'Boskovice', 'Ungarisch Hradisch', 'Königliche Weinberge bei Prag', 'Montauban', 'Porto Alegre', 'Mérida', 'Kufstein', 'Klausenburg', 'Frascati', 'Folkestone', 'Schilfa', 'New Haven', 'Lasdehnen', 'Dili', 'Plombières-les-Bains', 'Brühl bei Köln', 'Windsor', 'Lille', 'Hohe Tatra', 'Fernando Póo', 'Zara', 'Baltimore', 'Tschochau', 'Rigi Kaltbad', 'Utrecht', 'Pordoijoch', 'Vichy', 'Philadelphia', 'Emden', 'Lausanne', 'Padua', 'Apatin'

In [9]:
unique_coords_df = (
    pd.Series(existing_coordinates, name="coords")
      .rename_axis("place")
      .reset_index()
)


unique_coords_df[["latitude", "longitude"]] = pd.DataFrame(
    unique_coords_df["coords"].tolist(), index=unique_coords_df.index
)
unique_coords_df = unique_coords_df.drop(columns=["coords"])

unique_coords_df = unique_coords_df[unique_coords_df["place"].isin(place_count_df["place"])]


merged_place_df = place_count_df.merge(unique_coords_df, on="place", how="left")
merged_place_df = merged_place_df.sort_values(by="count", ascending=False).reset_index(drop=True)

merged_place_df.head()

Unnamed: 0,place,count,latitude,longitude
0,Graz,2370,47.070868,15.438279
1,Wien,855,48.208354,16.372504
2,Paris,781,48.85889,2.320041
3,Unbekannt,576,49.984949,8.708246
4,Leipzig,247,51.340632,12.374733


In [10]:
print("\nFinal mapped results:\n")
for lid, place, lat, lon in mapped_points:
    print(f"{lid}: {place} – {lat}, {lon}")


Final mapped results:

L.1: Port Louis – -20.1624522, 57.5028044
L.10: Sevilla – 37.3886303, -5.9953403
L.100: Berlin – 52.510885, 13.3989367
L.1000: Graz – 47.0708678, 15.4382786
L.10000: Madrid – 40.4167047, -3.7035825
L.10001: Madrid – 40.4167047, -3.7035825
L.10002: Madrid – 40.4167047, -3.7035825
L.10003: Antibes – 43.5812868, 7.1262071
L.10004: Madrid – 40.4167047, -3.7035825
L.10005: Madrid – 40.4167047, -3.7035825
L.10006: Madrid – 40.4167047, -3.7035825
L.10007: Madrid – 40.4167047, -3.7035825
L.10008: Madrid – 40.4167047, -3.7035825
L.10009: Graz – 47.0708678, 15.4382786
L.1001: Graz – 47.0708678, 15.4382786
L.10010: Madrid – 40.4167047, -3.7035825
L.10011: Madrid – 40.4167047, -3.7035825
L.10012: Madrid – 40.4167047, -3.7035825
L.10013: Graz – 47.0708678, 15.4382786
L.10014: Madrid – 40.4167047, -3.7035825
L.10015: Madrid – 40.4167047, -3.7035825
L.10016: Madrid – 40.4167047, -3.7035825
L.10017: Madrid – 40.4167047, -3.7035825
L.10018: Graz – 47.0708678, 15.4382786
L.10019:

In [11]:
# unique_places = {}
# for _, place, lat, lon in mapped_points:
#     if place and lat is not None and lon is not None and place not in unique_places:
#         unique_places[place] = (float(lat), float(lon))

# default_center = (0, 0)
# map_center = next(iter(unique_places.values()), default_center)

# m = folium.Map(location=map_center, zoom_start=4)

# for place, (lat, lon) in unique_places.items():
#     folium.Marker(
#         location=[lat, lon],
#         popup=place,
#         tooltip=place
#     ).add_to(m)

# # Display (auto in Jupyter)
# #m
 

In [12]:
df_locations = pd.DataFrame(mapped_points, columns=["filename", "place", "latitude", "longitude"])
df_locations_clean = df_locations.dropna(subset=["latitude", "longitude"])

df_locations_clean.head()

Unnamed: 0,filename,place,latitude,longitude
0,L.1,Port Louis,-20.162452,57.502804
1,L.10,Sevilla,37.38863,-5.99534
2,L.100,Berlin,52.510885,13.398937
3,L.1000,Graz,47.070868,15.438279
4,L.10000,Madrid,40.416705,-3.703582


In [13]:
scores_df = pd.read_csv("scores.csv")
scores_df.columns = ["name", "sender_score"]

sender_info_df["name_lower"] = sender_info_df["sender_surname"].str.lower()
scores_df["name_lower"] = scores_df["name"].str.lower()
names_df = pd.merge(sender_info_df, scores_df[["name_lower", "sender_score"]], on="name_lower", how="left")
names_df = names_df.drop(columns=["name_lower"])
names_df["sender_score"] = names_df["sender_score"].fillna(0)
names_df = names_df.sort_values(by="sender_score", ascending=False)
names_df.head()

Unnamed: 0,lid,sender_surname,sender_forename,place_name,sender_score
3,L.1000,Schuchardt,Hugo,Graz,0.260678
11571,L.9995,Schuchardt,Hugo,Graz,0.260678
11552,L.9978,Schuchardt,Hugo,Graz,0.260678
11549,L.9975,Schuchardt,Hugo,Graz,0.260678
11548,L.9974,Schuchardt,Hugo,Graz,0.260678


In [14]:
scores_df = pd.read_csv("scores.csv")
scores_df.columns = ["place", "score"]

final_df = merged_place_df.copy()
final_df["place_lower"] = final_df["place"].str.lower()
scores_df["place_lower"] = scores_df["place"].str.lower()

final_df = pd.merge(final_df, scores_df[["place_lower", "score"]], on="place_lower", how="left")
final_df["score"] = final_df["score"].fillna(0)
final_df.head()


Unnamed: 0,place,count,latitude,longitude,place_lower,score
0,Graz,2370,47.070868,15.438279,graz,0.059202
1,Wien,855,48.208354,16.372504,wien,0.020744
2,Paris,781,48.85889,2.320041,paris,0.132565
3,Unbekannt,576,49.984949,8.708246,unbekannt,0.0
4,Leipzig,247,51.340632,12.374733,leipzig,0.011794


In [15]:
final_df = final_df.sort_values(by="score", ascending=False)
final_df.head()

Unnamed: 0,place,count,latitude,longitude,place_lower,score
24,London,90,51.489334,-0.144055,london,0.490675
251,Manila,3,14.590449,120.980362,manila,0.325567
64,Tiflis,25,41.693459,44.801449,tiflis,0.174195
47,Oxford,33,51.752013,-1.25785,oxford,0.140106
2,Paris,781,48.85889,2.320041,paris,0.132565


In [16]:
names_df["place_lower"] = names_df["place_name"].str.lower()
final_names_df = pd.merge(final_df, names_df, on="place_lower", how="left")

In [17]:
final_names_df = final_names_df.sort_values(by="score", ascending=False)
final_names_df.head()

Unnamed: 0,place,count,latitude,longitude,place_lower,score,lid,sender_surname,sender_forename,place_name,sender_score
0,London,90,51.489334,-0.144055,london,0.490675,L.5018,Bonaparte,Louis Lucien,London,0.045893
1,London,90,51.489334,-0.144055,london,0.490675,L.5017,Bonaparte,Louis Lucien,London,0.045893
2,London,90,51.489334,-0.144055,london,0.490675,L.5016,Bonaparte,Louis Lucien,London,0.045893
3,London,90,51.489334,-0.144055,london,0.490675,L.5009,Bonaparte,Louis Lucien,London,0.045893
4,London,90,51.489334,-0.144055,london,0.490675,L.5015,Bonaparte,Louis Lucien,London,0.045893


In [18]:
final_names_df.to_csv('final_lc.csv')

In [19]:
# import pandas as pd
# import branca.colormap as cm
# from branca.element import Figure, Element

# max_score = final_df["score"].max()
# min_score = final_df["score"].min()

# def scale_radius(score, min_radius=4, max_radius=18):
#     if max_score == 0:
#         return min_radius
#     return min_radius + (score / max_score) * (max_radius - min_radius)

# colormap = cm.linear.RdYlGn_09.scale(min_score, max_score)
# colormap.caption = 'Place Score'

# if not final_df.empty:
#     center_lat, center_lon = final_df.iloc[0]["latitude"], final_df.iloc[0]["longitude"]
# else:
#     center_lat, center_lon = 0, 0


# fig = Figure(width="100%", height="100%")


# title_html = """
# <h2 align="center" style="font-family:Arial; color:#333333; margin-top:20px;">
#     Hugo Suchardt Archive Interactive Map
# </h2>
# """
# fig.html.add_child(Element(title_html))

# m = folium.Map(location=[center_lat, center_lon], zoom_start=3)
# fig.add_child(m)

# for _, row in final_df.iterrows():
#     if pd.notna(row["latitude"]) and pd.notna(row["longitude"]):
#         score = row["score"]
#         count = row["count"]
#         place = row["place"]

#         folium.CircleMarker(
#             location=(row["latitude"], row["longitude"]),
#             radius=scale_radius(score),
#             color=colormap(score),
#             fill=True,
#             fill_opacity=0.7,
#             fill_color=colormap(score),
#             popup=folium.Popup(
#                 f"<b>{place}</b><br>Score: {score:.2f}<br>Count: {count}",
#                 max_width=300
#             ),
#             tooltip=f"{place} ({score:.2f})"
#         ).add_to(m)


# colormap.add_to(m)

# # Save the final map as HTML
# fig.save("places.html")

In [20]:
import folium
import math
import branca.colormap as cm
from branca.element import Figure, Element

# Copy and prepare sender data
tmp = final_names_df.copy()
tmp["place_lower"] = tmp["place_name"].str.lower()
tmp["sender_score"] = tmp["sender_score"].fillna(0)
tmp["sender_full"] = (
    tmp["sender_forename"].fillna("").str.strip() + " " +
    tmp["sender_surname"].fillna("").str.strip()
).str.strip()           

tmp = (
    tmp.groupby(["place_lower", "sender_full"], as_index=False)["sender_score"]
       .max()
       .sort_values(["place_lower", "sender_score"], ascending=[True, False])
)

senders_by_place = (
    tmp.groupby("place_lower")
       .apply(lambda g: list(zip(g["sender_full"], g["sender_score"])))
       .to_dict()
)

if not final_df.empty:
    center_lat, center_lon = final_df.iloc[0]["latitude"], final_df.iloc[0]["longitude"]
else:
    center_lat, center_lon = 0, 0

agg_df = (
    final_df.groupby(["place", "latitude", "longitude"], as_index=False)
            .agg({"score": "mean", "count": "sum"})
)
agg_df["place_lower"] = agg_df["place"].str.lower()

max_score = agg_df["score"].max()
min_score = agg_df["score"].min()

max_count = agg_df["count"].max()
min_count = agg_df["count"].min()

max_count = final_df["count"].max()

def scale_radius(count, min_radius=2.5, max_radius=15):
    if max_count <= 1:
        return min_radius
    return min_radius + (math.log1p(count) / math.log1p(max_count)) * (max_radius - min_radius)

colormap = cm.linear.RdYlGn_09.scale(min_score, max_score)
colormap.caption = 'Place Score'

# Map figure
fig = Figure(width="100%", height="100%")
title_html = """
<h2 align="center" style="font-family:Arial; color:#333333; margin-top:20px;">
    Hugo Suchardt Archive Interactive Map
</h2>
""" 
fig.html.add_child(Element(title_html))

m = folium.Map(location=[center_lat, center_lon], zoom_start=3)
fig.add_child(m)

for _, row in agg_df.iterrows():
    if pd.notna(row["latitude"]) and pd.notna(row["longitude"]):
        place   = row["place"]
        score   = row["score"]
        count   = row["count"]
        senders = senders_by_place.get(row["place_lower"], [])

        if senders:
            senders_html = "<br>".join(
                f"{i+1}. {name} — {sc:.4f}" for i, (name, sc) in enumerate(senders)
            )
        else:
            senders_html = "<i>No senders found for this place</i>"

        popup_html = f"""
            <b>{place}</b><br>
            Place score: {score:.4f}<br>
            Letters from here: {int(count)}<br>
            <hr style="margin:6px 0;">
            <b>Senders:</b><br>
            {senders_html}
        """

        folium.CircleMarker(
            location=(row["latitude"], row["longitude"]),
            radius=scale_radius(count),   
            color=colormap(score),
            fill=True,
            fill_opacity=0.7,
            fill_color=colormap(score),
            popup=folium.Popup(popup_html, max_width=400),
            tooltip=f"{place} — {count} letters"
        ).add_to(m)


colormap.add_to(m)


fig.save("places.html")


  .apply(lambda g: list(zip(g["sender_full"], g["sender_score"])))
