In [1]:
import json
import re
import numpy as np
import pandas as pd
import folium
from folium import MacroElement
from jinja2 import Template
from pathlib import Path

In [None]:
# Read the location table and the per-paragraph topic table, in that order.
df_loc, df_par = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ("final_location.csv", "final_topics.csv")
)

In [3]:
# 'lid' holds strings, so this is a lexical ordering ('L.10' sorts before 'L.2').
df_loc_sorted = df_loc.sort_values(by='lid')
df_loc_sorted.head(5)

Unnamed: 0.1,Unnamed: 0,place,count,latitude,longitude,place_lower,score,lid,sender_surname,sender_forename,place_name,sender_score
7487,7487,Port Louis,28,-20.162452,57.502804,port louis,0.0,L.1,Baissac,Charles,Port Louis,0.0
5426,5426,Sevilla,30,37.38863,-5.99534,sevilla,0.007598,L.10,Machado y Álvarez,Antonio,Sevilla,0.0
10535,10535,Berlin,237,52.510885,13.398937,berlin,0.0,L.100,Wagner,Max Leopold,Berlin,0.0
1418,1418,Graz,2370,47.070868,15.438279,graz,0.059202,L.1000,Schuchardt,Hugo,Graz,0.260678
5353,5353,Madrid,95,40.416705,-3.703582,madrid,0.008863,L.10000,Menéndez Pidal,Ramón,Madrid,0.0


In [4]:
def add_lid_from_pid(df_par):
    """
    Add a 'lid' column holding the leading 'L.<n>' letter id of each 'pid'.

    Works for 'L.<n>' and 'L.<n>-<pid>'. The pattern is anchored at the start
    of the string, so an 'L.<n>' sequence that only appears later in the value
    is not treated as a letter id; such rows (and malformed ids like
    'L..5211-1') get a missing 'lid'.

    Parameters
    ----------
    df_par : pandas.DataFrame
        Paragraph table with a 'pid' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying 'lid'.
    """
    # astype(str) turns missing pids into the text 'nan'/'None', which cannot match.
    df_par['lid'] = df_par['pid'].astype(str).str.extract(r'^(L\.\d+)', expand=False)
    return df_par

In [5]:
# Derive the letter id column through the helper defined above (it sets
# df_par['lid'] in place and hands the same frame back).
df_par = add_lid_from_pid(df_par)

In [6]:

# Preview the first rows, now including the derived 'lid' column.
df_par.head(5)

Unnamed: 0.1,Unnamed: 0,pid,sender_id,sender,receiver_id,receiver,date,text,language,keywords,word_count,pred_topic,pred_prob,topic_top_words,_parent_id,_section_idx,topic_label,date_dt,year,lid
0,0,L.1-1,https://gams.uni-graz.at/o:hsa.persons#P.1069,Baissac,https://gams.uni-graz.at/o:hsa.persons#P.109,Schuchardt,1885-01-20,Ma Doudou vous envoie une petite brochure jaun...,fr,,28,-1,0.0,Linguistic correspondence,,,-1_Linguistic correspondence,1885-01-20,1885.0,L.1
1,1,L.1-2,https://gams.uni-graz.at/o:hsa.persons#P.1069,Baissac,https://gams.uni-graz.at/o:hsa.persons#P.109,Schuchardt,1885-01-20,Nous sommes anxieux l’un et l’autre d’avoir de...,fr,,31,44,0.6021955991888062,Health correspondence concerns,,,44_Health correspondence concerns,1885-01-20,1885.0,L.1
2,2,L.10-1,https://gams.uni-graz.at/o:hsa.persons#P.2100,Machado y Álvarez,https://gams.uni-graz.at/o:hsa.persons#P.109,Schuchardt,1882,contesto a todas sus anteriores en forma teleg...,es,Giornale di Filologia Romanza,8,-1,0.0,Linguistic correspondence,,,-1_Linguistic correspondence,1882-01-01,1882.0,L.10
3,3,L.10-2,https://gams.uni-graz.at/o:hsa.persons#P.2100,Machado y Álvarez,https://gams.uni-graz.at/o:hsa.persons#P.109,Schuchardt,1882,1º Sanjurjo ha sido nombrado catedrático de Ma...,es,Giornale di Filologia Romanza,78,-1,0.0,Linguistic correspondence,,,-1_Linguistic correspondence,1882-01-01,1882.0,L.10
4,4,L.10-3,https://gams.uni-graz.at/o:hsa.persons#P.2100,Machado y Álvarez,https://gams.uni-graz.at/o:hsa.persons#P.109,Schuchardt,1882,2º No he podido averiguar el nombre del gobern...,es,Giornale di Filologia Romanza,35,-1,0.0,Linguistic correspondence,,,-1_Linguistic correspondence,1882-01-01,1882.0,L.10


In [7]:
# List location rows whose 'place' is missing or blank after trimming.
# NOTE(review): the printed header says "place_name" although the column
# inspected here is 'place' - confirm which one was intended.
d = df_loc.assign(place=df_loc['place'].astype('string').str.strip())
unknown_basic = d.loc[
    d['place'].isna() | d['place'].eq(''),
    ['lid', 'place', 'latitude', 'longitude'],
].drop_duplicates()

print("Missing/blank place_name rows:")
print(unknown_basic.to_string(index=False))

Missing/blank place_name rows:
Empty DataFrame
Columns: [lid, place, latitude, longitude]
Index: []


In [8]:
def join_add_data(df_par, df_loc, cols=None, suffix="_loc", prefer_score=True):
    """
    Left-join one location row per letter id ('lid') onto the paragraph table.

    A 'place_display' column is derived on the location side: it is
    'place_name', falling back to 'place' wherever the name is missing or
    blank. When 'place_name' is requested (or implied by ``cols=None``) it is
    delivered as 'place_display'.

    Parameters
    ----------
    df_par : pandas.DataFrame
        Paragraph table; must contain 'lid'. Not modified.
    df_loc : pandas.DataFrame
        Location table; must contain 'lid'. Not modified.
    cols : list of str, optional
        Location columns to bring over. Unknown names and 'lid' are ignored.
        Defaults to every location column except 'lid'.
    suffix : str
        Appended to a location column whose name already exists in ``df_par``.
    prefer_score : bool
        If True and the location table has a 'score' column, the row with the
        highest score is kept for each 'lid'; otherwise the first row is kept.

    Returns
    -------
    pandas.DataFrame
        ``df_par`` with the selected location columns appended; paragraphs
        without a matching 'lid' get missing values.
    """
    def _clean_text(series):
        # Trim whitespace and turn empty strings into missing values.
        return series.astype('string').str.strip().replace({'': np.nan})

    # pandas matches null join keys against each other, so a location row
    # without a 'lid' would otherwise be attached to every paragraph whose
    # letter id could not be parsed.
    loc = df_loc.copy().dropna(subset=['lid'])

    # build a display column that falls back from place_name -> place;
    # both sides are cleaned first so a blank name also triggers the fallback
    if 'place_name' in loc.columns or 'place' in loc.columns:
        if 'place_name' in loc.columns:
            display = _clean_text(loc['place_name'])
            if 'place' in loc.columns:
                display = display.fillna(_clean_text(loc['place']))
        else:
            display = _clean_text(loc['place'])
        loc['place_display'] = display

    # default columns to take
    if cols is None:
        cols = [c for c in loc.columns if c != 'lid']
    else:
        cols = [c for c in cols if c in loc.columns and c != 'lid']

    # a requested name is served by the display column
    if 'place_name' in cols and 'place_display' in loc.columns:
        cols = ['place_display' if c == 'place_name' else c for c in cols]
    # the substitution above (or a caller-supplied list) may repeat a name;
    # keep the first occurrence so no duplicate columns are selected
    cols = list(dict.fromkeys(cols))

    # prefer highest score per lid if present
    base = loc
    if prefer_score and 'score' in base.columns:
        base = base.sort_values(['lid', 'score'], ascending=[True, False])

    loc_unique = base[['lid'] + cols].drop_duplicates('lid', keep='first').copy()

    # avoid collisions with columns the paragraph table already has
    rename_map = {c: f"{c}{suffix}" for c in cols if c in df_par.columns}
    if rename_map:
        loc_unique = loc_unique.rename(columns=rename_map)

    return df_par.merge(loc_unique, on='lid', how='left')

# --- re-run the merge (note we ask for place_name but get the fallback) ---
# Columns left behind by an earlier execution of this cell are dropped first.
# Without that, running the cell again would add '*_loc' copies of the
# coordinates and a second 'place_name' column instead of refreshing them.
loc_cols = ['place_name', 'latitude', 'longitude']
df_par = join_add_data(
    df_par.drop(columns=loc_cols + ['place_display'], errors='ignore'),
    df_loc,
    cols=loc_cols,
).rename(columns={'place_display': 'place_name'})  # the function returns the name under 'place_display'

# sanity: show rows still missing a place
unknown = df_par[df_par['place_name'].isna()][['lid','pid','place_name','latitude','longitude']].drop_duplicates()
print(f"Rows still missing place_name after fallback: {len(unknown)}")
print(unknown.head(20).to_string(index=False))


Rows still missing place_name after fallback: 360
   lid        pid place_name  latitude  longitude
   NaN L.l.7533-1       <NA>       NaN        NaN
L.4226   L.4226-6       <NA>       NaN        NaN
L.4226   L.4226-4       <NA>       NaN        NaN
L.4226   L.4226-3       <NA>       NaN        NaN
L.4226   L.4226-1       <NA>       NaN        NaN
L.4226   L.4226-5       <NA>       NaN        NaN
L.4226   L.4226-2       <NA>       NaN        NaN
   NaN  L..5211-1       <NA>       NaN        NaN
   NaN  L.L5403-1       <NA>       NaN        NaN
   NaN  L..6030-4       <NA>       NaN        NaN
   NaN  L..6030-5       <NA>       NaN        NaN
   NaN  L..6030-3       <NA>       NaN        NaN
   NaN  L..6030-1       <NA>       NaN        NaN
   NaN  L..6030-2       <NA>       NaN        NaN
L.8481   L.8481-1       <NA>       NaN        NaN
L.9221   L.9221-3       <NA>       NaN        NaN
L.9221   L.9221-4       <NA>       NaN        NaN
L.9221   L.9221-1       <NA>       NaN        NaN


In [9]:
def norm_lid(s):
    """Return the leading 'L.<n>' of ``s`` (whitespace-trimmed), or None if absent or missing."""
    if pd.isna(s):
        return None
    found = re.search(r'^(L\.\d+)', str(s).strip())
    if found is None:
        return None
    return found.group(1)

# Give BOTH frames a normalised 'L.<n>' key; assign() returns new frames, so
# the objects bound to these names before this cell are left as they were.
df_par = df_par.assign(lid_norm=df_par['lid'].apply(norm_lid))
df_loc = df_loc.assign(lid_norm=df_loc['lid'].apply(norm_lid))

par_letters = set(df_par['lid_norm'].dropna())
loc_letters = set(df_loc['lid_norm'].dropna())

# letters that have paragraphs but no location row
missing_letters = sorted(par_letters.difference(loc_letters))
print(missing_letters)
print(f"Letters in df_par but NOT in df_loc: {len(missing_letters)}")
print("Sample:", missing_letters[:20])

['L.4226', 'L.8481', 'L.9221']
Letters in df_par but NOT in df_loc: 3
Sample: ['L.4226', 'L.8481', 'L.9221']


In [10]:
# List paragraphs whose merged 'place_name' is missing or blank after trimming.
d = df_par.assign(place_name=df_par['place_name'].astype('string').str.strip())
unknown_basic = d.loc[
    d['place_name'].isna() | d['place_name'].eq(''),
    ['lid', 'pid', 'place_name', 'latitude', 'longitude'],
].drop_duplicates()

print("Missing/blank place_name rows:")
print(unknown_basic.to_string(index=False))

Missing/blank place_name rows:
   lid        pid place_name  latitude  longitude
   NaN L.l.7533-1       <NA>       NaN        NaN
L.4226   L.4226-6       <NA>       NaN        NaN
L.4226   L.4226-4       <NA>       NaN        NaN
L.4226   L.4226-3       <NA>       NaN        NaN
L.4226   L.4226-1       <NA>       NaN        NaN
L.4226   L.4226-5       <NA>       NaN        NaN
L.4226   L.4226-2       <NA>       NaN        NaN
   NaN  L..5211-1       <NA>       NaN        NaN
   NaN  L.L5403-1       <NA>       NaN        NaN
   NaN  L..6030-4       <NA>       NaN        NaN
   NaN  L..6030-5       <NA>       NaN        NaN
   NaN  L..6030-3       <NA>       NaN        NaN
   NaN  L..6030-1       <NA>       NaN        NaN
   NaN  L..6030-2       <NA>       NaN        NaN
L.8481   L.8481-1       <NA>       NaN        NaN
L.9221   L.9221-3       <NA>       NaN        NaN
L.9221   L.9221-4       <NA>       NaN        NaN
L.9221   L.9221-1       <NA>       NaN        NaN
L.9221   L.9221-2  

In [11]:
# Remove modelling/bookkeeping columns plus the saved index column; labels
# that are not present are skipped rather than raising.
# NOTE(review): 'Unnamed: 0.1' is left in place here - confirm whether it should go too.
cols_to_drop = ['keywords', 'pred_topic', 'pred_prob', '_parent_id', '_section_idx']
df_par = df_par.drop(columns=cols_to_drop + ['Unnamed: 0'], errors='ignore')


In [12]:
def extract_letter_id(series):
    """Map 'L.<n>' or 'L.<n>-<pid>' values to 'L.<n>'; anything else becomes missing."""
    as_text = series.astype(str)
    return as_text.str.extract(r'^(L\.\d+)', expand=False)

In [13]:
def _prep_counts(df, lid_col='lid', lat_col='latitude', lon_col='longitude', place_col='place_name'):
    df = df.dropna(subset=[lat_col, lon_col]).copy()
    # ensure year as int
    if 'year' not in df.columns:
        df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year
    df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')

    # pure letter id so paragraphs don’t overcount
    df['letter_id'] = df[lid_col].astype(str).str.extract(r'^(L\.\d+)', expand=False)
    df = df.dropna(subset=['letter_id', 'year'])

    base = [lat_col, lon_col] + ([place_col] if place_col in df.columns else [])

    per_year = (
        df.drop_duplicates(base + ['year', 'letter_id'])
          .groupby(base + ['year'], as_index=False)['letter_id']
          .nunique().rename(columns={'letter_id': 'letters_count'})
    )
    all_years = (
        df.drop_duplicates(base + ['letter_id'])
          .groupby(base, as_index=False)['letter_id']
          .nunique().rename(columns={'letter_id': 'letters_count'})
    )
    return per_year, all_years

In [14]:

def make_senders_page(
    df_par,
    outfile="senders.html",
    lid_col="lid",
    lat_col="latitude",
    lon_col="longitude",
    place_col="place_name",
    sender_col="sender",
    year_col="year",
    date_col="date",
    round_coords=6,
):
    """
    Write a self-contained HTML page listing the senders at a location.

    The page embeds three lookup tables and picks the list to show from its
    own URL query (``?lat=..&lon=..&year=<n>|all``). Each sender links to
    ``letters.html`` with the same coordinates and year.

    Parameters
    ----------
    df_par : pandas.DataFrame
        Paragraph table with letter id, coordinates, place and sender columns
        and either a year column or a date column. Not modified.
    outfile : str
        Path of the HTML file to write (UTF-8).
    lid_col, lat_col, lon_col, place_col, sender_col, year_col, date_col : str
        Column names. If ``sender_col`` is absent, 'sender_name', 'sender' or a
        name built from 'sender_forename'/'sender_surname' is used instead.
    round_coords : int
        Decimals used to round coordinates and to format the lookup keys; the
        page formats the query coordinates with the same number of decimals.

    Returns
    -------
    str
        ``outfile``.

    Raises
    ------
    KeyError
        If no sender column can be found.
    """
    df = df_par.copy()

    if sender_col not in df.columns:
        if 'sender_name' in df.columns:
            sender_col = 'sender_name'
        elif 'sender' in df.columns:
            sender_col = 'sender'
        elif {'sender_forename', 'sender_surname'}.issubset(df.columns):
            df['sender_name'] = (df['sender_forename'].fillna('') + ' ' +
                                 df['sender_surname'].fillna('')).str.strip().replace('', 'Unknown sender')
            sender_col = 'sender_name'
        else:
            raise KeyError("No sender column found. Provide sender_col='sender' (or the correct name).")

    # ensure year exists (nullable integer in both branches)
    if year_col in df.columns:
        df["year"] = pd.to_numeric(df[year_col], errors="coerce").astype("Int64")
    else:
        df["year"] = pd.to_datetime(df[date_col], errors="coerce").dt.year.astype("Int64")

    # normalize coords + ids
    df[lat_col] = pd.to_numeric(df[lat_col], errors="coerce").round(round_coords)
    df[lon_col] = pd.to_numeric(df[lon_col], errors="coerce").round(round_coords)
    df["letter_id"] = df[lid_col].astype(str).str.extract(r"^(L\.\d+)", expand=False)

    df = df.dropna(subset=[lat_col, lon_col, "year", "letter_id"])

    def key(lat, lon, y=None):
        # "lat,lon" or "lat,lon|year"; must match key() in the page script
        k = f"{lat:.{round_coords}f},{lon:.{round_coords}f}"
        return f"{k}|{int(y)}" if y is not None else k

    def sender_lists(with_year):
        # unique letters per sender at each location (optionally per year),
        # packed as {key: [{"sender":..., "letters":...}, ...]} sorted by
        # letter count (descending) and then sender name
        group_cols = [lat_col, lon_col] + (["year"] if with_year else []) + [sender_col]
        counts = df.groupby(group_cols)["letter_id"].nunique().reset_index(name="letters")
        lists = {}
        for _, r in counts.iterrows():
            k = key(r[lat_col], r[lon_col], r["year"] if with_year else None)
            lists.setdefault(k, []).append(
                {"sender": r[sender_col] or "Unknown sender", "letters": int(r["letters"])}
            )
        for entries in lists.values():
            entries.sort(key=lambda x: (-x["letters"], x["sender"]))
        return lists

    def embed(obj):
        # "</" inside a string literal would let "</script>" in the data close
        # the script element early; "<\/" is the same string to JavaScript
        return json.dumps(obj, separators=(',', ':')).replace("</", "<\\/")

    by_year = sender_lists(with_year=True)
    all_time = sender_lists(with_year=False)

    # place lookup: first name seen for each coordinate pair
    place_lookup = (
        df.dropna(subset=[place_col])
          .drop_duplicates([lat_col, lon_col])[[lat_col, lon_col, place_col]]
    )
    places = {
        key(r[lat_col], r[lon_col]): r[place_col]
        for _, r in place_lookup.iterrows()
    }

    # plain string with placeholders (no f-string, so JS/CSS braces stay literal)
    page = """
<!doctype html>
<meta charset="utf-8">
<title>Senders</title>
<style>
 body{font:14px/1.4 system-ui, Arial, sans-serif; margin:20px;}
 .pill{display:inline-block; padding:2px 8px; border-radius:999px; background:#eef; margin-left:6px;}
 .muted{color:#666}
</style>
<h2 id="title">Senders</h2>
<div class="muted">Tip: change the URL query, e.g. <code>?lat=48.208&lon=16.373&year=1885</code> or <code>year=all</code></div>
<hr>
<ol id="list"></ol>

<script>
  // embedded data
  const BY_YEAR = __BY_YEAR__;
  const ALL_TIME = __ALL_TIME__;
  const PLACES = __PLACES__;
  const ROUND = __ROUND__;

  function q(name){ return new URLSearchParams(location.search).get(name); }
  function key(lat, lon, y){
    function r(n){ return Number(n).toFixed(ROUND); }
    const base = r(lat)+","+r(lon);
    return (y===null||y==="all") ? base : (base+"|"+parseInt(y,10));
  }

  function render(){
    const lat = parseFloat(q("lat"));
    const lon = parseFloat(q("lon"));
    const year = q("year"); // "all" or number

    const baseKey = key(lat, lon, null);
    const place = PLACES[baseKey] || "Unknown place";

    let data, title;
    if(year && year.toLowerCase() === "all"){
      data = ALL_TIME[baseKey] || [];
      title = place + " — all years";
    } else {
      const y = parseInt(year, 10);
      data = BY_YEAR[key(lat, lon, y)] || [];
      title = place + " — " + y;
    }

    document.getElementById("title").textContent = title;

    const ol = document.getElementById("list");
    ol.innerHTML = "";
    if(!data.length){
      ol.innerHTML = "<li class='muted'>No senders found.</li>";
      return;
    }

    // what to pass for the year param in the link
    const yearParam = (year && year.toLowerCase() === "all")
      ? "all"
      : (isNaN(parseInt(year, 10)) ? "all" : String(parseInt(year, 10)));

    data.forEach(row => {
      const li = document.createElement("li");

      // clickable sender → letters.html
      const link = document.createElement("a");
      link.href = "letters.html?lat=" + lat.toFixed(ROUND) +
                  "&lon=" + lon.toFixed(ROUND) +
                  "&year=" + yearParam +
                  "&sender=" + encodeURIComponent(row.sender || "");
      link.target = "_blank";
      link.textContent = row.sender || "Unknown sender";

      // pill with count
      const pill = document.createElement("span");
      pill.className = "pill";
      pill.textContent = row.letters + " letter" + (row.letters === 1 ? "" : "s");

      li.appendChild(link);
      li.appendChild(pill);
      ol.appendChild(li);
    });
  }
  render();
</script>
"""
    page = page.replace("__BY_YEAR__", embed(by_year))
    page = page.replace("__ALL_TIME__", embed(all_time))
    page = page.replace("__PLACES__", embed(places))
    page = page.replace("__ROUND__", str(round_coords))

    Path(outfile).write_text(page, encoding="utf-8")
    return outfile

In [15]:
def make_letters_page(
    df_par,
    outfile="letters.html",
    lid_col="lid",
    lat_col="latitude",
    lon_col="longitude",
    place_col="place_name",
    sender_col="sender",
    topic_label_col="topic_label",
    pred_topic_col="pred_topic",
    date_col="date",
    year_col="year",
    round_coords=6,
):
    """
    Write a self-contained HTML page listing one sender's letters at a location.

    The page embeds the letters keyed by "lat,lon|sender|year" (and
    "lat,lon|sender|all") and picks the list to show from its own URL query
    (``?lat=..&lon=..&year=<n>|all&sender=..``). Each letter links to
    ``letter.html?lid=<letter id>``.

    Parameters
    ----------
    df_par : pandas.DataFrame
        Paragraph table with letter id, coordinates, place, sender, date and
        'receiver' columns and either a year column or a date column. A 'pid'
        column, when present, is counted to give the paragraphs per letter.
        Not modified.
    outfile : str
        Path of the HTML file to write (UTF-8).
    lid_col, lat_col, lon_col, place_col, sender_col, date_col, year_col : str
        Column names. If ``sender_col`` is absent, 'sender_name' or 'sender'
        is used instead.
    topic_label_col, pred_topic_col : str
        Topic source: the label column if present, otherwise the prediction
        column, otherwise no topics. Topics are always embedded as text.
    round_coords : int
        Decimals used to round coordinates and to format the lookup keys.

    Returns
    -------
    str
        ``outfile``.

    Raises
    ------
    KeyError
        If no sender column can be found.
    """
    df = df_par.copy()

    # --- robust columns ---
    if sender_col not in df.columns:
        if "sender_name" in df.columns:
            sender_col = "sender_name"
        elif "sender" in df.columns:
            sender_col = "sender"
        else:
            raise KeyError("No sender column found (sender or sender_name).")

    # ensure year + coords numeric
    if year_col in df.columns:
        df["year"] = pd.to_numeric(df[year_col], errors="coerce").astype("Int64")
    else:
        df["year"] = pd.to_datetime(df[date_col], errors="coerce").dt.year

    df[lat_col] = pd.to_numeric(df[lat_col], errors="coerce").round(round_coords)
    df[lon_col] = pd.to_numeric(df[lon_col], errors="coerce").round(round_coords)

    # normalize to pure letter id: 'L.<n>'
    df["letter_id"] = df[lid_col].astype(str).str.extract(r"^(L\.\d+)", expand=False)

    def topic_text(value):
        # The page script escapes topics with String.replace, so every topic
        # must reach it as text; numeric predictions are rendered without a
        # trailing ".0". Missing values are passed through untouched.
        if pd.isna(value) or isinstance(value, str):
            return value
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    # compact topic
    if topic_label_col in df.columns:
        df["topic"] = df[topic_label_col].map(topic_text)
    elif pred_topic_col in df.columns:
        df["topic"] = df[pred_topic_col].map(topic_text)
    else:
        df["topic"] = pd.NA

    df = df.dropna(subset=[lat_col, lon_col, "year", "letter_id", sender_col])

    # helper to build dict keys; must match k() in the page script
    def key(lat, lon, sender, year_or_all):
        base = f"{lat:.{round_coords}f},{lon:.{round_coords}f}|{sender}"
        return f"{base}|{year_or_all}"

    def json_scalar(value):
        # missing -> null (falsy in the page script); anything else as text
        if pd.isna(value):
            return None
        return value if isinstance(value, str) else str(value)

    paragraph_agg = ("pid", "count") if "pid" in df.columns else (sender_col, "size")

    def pack(with_year):
        # one record per letter, grouped under the sender/place(/year) key
        group_cols = [lat_col, lon_col, sender_col] + (["year"] if with_year else []) + ["letter_id"]
        letters = (
            df.groupby(group_cols, dropna=True)
              .agg(
                  date_min=(date_col, "min"),
                  receiver=("receiver", "first"),
                  topics=("topic", lambda s: sorted({t for t in s.dropna()})),
                  paragraphs=paragraph_agg,
              )
              .reset_index()
        )
        packed = {}
        for _, r in letters.iterrows():
            label = int(r["year"]) if with_year else "all"
            packed.setdefault(key(r[lat_col], r[lon_col], r[sender_col], label), []).append({
                "lid":       r["letter_id"],
                "date":      json_scalar(r["date_min"]),
                "receiver":  json_scalar(r["receiver"]),
                "topics":    r["topics"],
                "paragraphs": int(r["paragraphs"]),
            })
        return packed

    def embed(obj):
        # "</" inside a string literal would let "</script>" in the data close
        # the script element early; "<\/" is the same string to JavaScript
        return json.dumps(obj, separators=(",", ":")).replace("</", "<\\/")

    by_key = pack(with_year=True)
    all_key = pack(with_year=False)

    # place names (nice title), keyed "lat,lon" like placeKey in the page script
    place_rows = df.dropna(subset=[place_col]).drop_duplicates([lat_col, lon_col])
    places_str_keys = {
        f"{float(lat):.{round_coords}f},{float(lon):.{round_coords}f}": name
        for lat, lon, name in zip(place_rows[lat_col], place_rows[lon_col], place_rows[place_col])
    }

    # ---- HTML (self-contained, plain string with placeholders) ----
    html = """
<!doctype html>
<meta charset="utf-8">
<title>Letters</title>
<style>
 body{font:14px/1.45 system-ui, Arial, sans-serif; margin:20px; max-width:900px}
 h2{margin:0 0 4px}
 .muted{color:#666}
 .tag{display:inline-block; margin:2px 4px 0 0; padding:2px 6px; border-radius:999px; background:#eee; font-size:12px}
 .card{border:1px solid #ddd; border-radius:8px; padding:10px 12px; margin:10px 0}
 .lid{font-weight:600}
 .pill{display:inline-block; padding:2px 8px; border-radius:999px; background:#eef; margin-left:6px; font-size:12px}
 a{color:#0b69c7; text-decoration:none}
 a:hover{text-decoration:underline}
</style>

<h2 id="title">Letters</h2>
<div class="muted">Change the URL like: <code>?lat=48.208&lon=16.373&year=1885&sender=Machado%20y%20Álvarez</code> or <code>year=all</code>.</div>
<hr>
<div id="list"></div>

<script>
  const DATA = __BY_KEY__;
  const DATA_ALL = __ALL_KEY__;
  const PLACES = __PLACES__;
  const ROUND = __ROUND__;

  function q(name){ return new URLSearchParams(location.search).get(name); }
  function k(lat, lon, sender, yr){
    function r(n){ return Number(n).toFixed(ROUND); }
    return r(lat)+","+r(lon)+"|"+(sender||"")+"|"+yr;
  }
  function esc(s){ return (s||"").replace(/[&<>]/g, t=>({'&':'&amp;','<':'&lt;','>':'&gt;'}[t])); }

  function render(){
    const lat = parseFloat(q("lat"));
    const lon = parseFloat(q("lon"));
    const sender = q("sender") || "";
    const year = (q("year")||"").toLowerCase()==="all" ? "all" : parseInt(q("year"),10);

    const placeKey = Number(lat).toFixed(ROUND)+","+Number(lon).toFixed(ROUND);
    const place = PLACES[placeKey] || "Unknown place";

    const key = k(lat, lon, sender, year);
    const rows = (year==="all" ? DATA_ALL[key] : DATA[key]) || [];

    const title = place + " — " + (sender||"Unknown sender") + " — " + (year==="all" ? "all years" : year);
    document.getElementById("title").textContent = title;

    const root = document.getElementById("list");
    root.innerHTML = "";
    if(!rows.length){
      root.innerHTML = "<div class='muted'>No letters found for this selection.</div>";
      return;
    }

    rows.sort((a,b)=> (a.date||"") < (b.date||"") ? -1 : 1);

    rows.forEach(r=>{
    const div = document.createElement("div");
    div.className = "card";
    const topics = (r.topics||[]).map(t=>"<span class='tag'>"+esc(t)+"</span>").join(" ");
    div.innerHTML =
        "<div><a class='lid' target='_blank' href='letter.html?lid="+encodeURIComponent(r.lid)+"'>"+esc(r.lid)+"</a>" +
        (r.date ? " — "+esc(r.date) : "") +
        (r.receiver ? " — to "+esc(r.receiver) : "") +
        "<span class='pill'>"+r.paragraphs+" paragraph"+(r.paragraphs===1?"":"s")+"</span></div>" +
        (topics ? "<div style='margin-top:6px'>"+topics+"</div>" : "");
    document.getElementById("list").appendChild(div);
    });
  }
  render();
</script>
"""
    html = html.replace("__BY_KEY__",   embed(by_key))
    html = html.replace("__ALL_KEY__",  embed(all_key))
    html = html.replace("__PLACES__",   embed(places_str_keys))
    html = html.replace("__ROUND__",    str(round_coords))

    Path(outfile).write_text(html, encoding="utf-8")
    return outfile

In [16]:
class YearSliderJS(MacroElement):
    """
    Embed aggregated data + slider + JS rendering logic into the folium map.

    The generated script draws one circle marker per record, adds a control
    with a year slider and a "Show all years" checkbox in the top-left corner,
    and redraws the markers whenever either input changes. Every marker popup
    links to ``senders.html`` with the marker's coordinates (six decimals) and
    the selected year, or ``year=all``.

    Parameters
    ----------
    data_by_year : dict
        Maps a year to a list of records. The script looks records up by the
        slider's integer year and reads each record's latitude, longitude,
        place and 'letters_count' entries. Must be JSON-serialisable.
    all_data : list
        Records of the same shape for the all-years view. Must be
        JSON-serialisable.
    lat_key, lon_key, place_key : str
        Names of the latitude, longitude and place entries in each record.

    Notes
    -----
    The JSON and the key names are concatenated into the template source
    before it is compiled, so they become part of the Jinja template text.
    NOTE(review): data containing Jinja delimiters such as '{{' or '{%' would
    presumably be interpreted by the template engine - confirm the inputs
    cannot contain them.
    """
    def __init__(self, data_by_year, all_data, lat_key='latitude', lon_key='longitude', place_key='place_name'):
        super().__init__()
        # compact JSON, pasted verbatim into the script below
        data_json = json.dumps(data_by_year, separators=(',', ':'))
        all_json  = json.dumps(all_data,  separators=(',', ':'))
        # Template source: a Jinja macro named `script`; it refers to the
        # parent element (the map) through this._parent.get_name().
        tpl = """
        {% macro script(this, kwargs) %}
        // ----- embedded data -----
        var DATA_BY_YEAR = """ + data_json + """;
        var ALL_DATA = """ + all_json + """;
        var LAT = '""" + lat_key + """', LON = '""" + lon_key + """', PLACE = '""" + place_key + """', COUNT = 'letters_count';

        // ----- layers -----
        var mapObj   = {{this._parent.get_name()}};
        var yearLayer = L.layerGroup().addTo(mapObj);
        var allLayer  = L.layerGroup(); // not shown by default

        // ----- utils -----
        function extentCounts(){
          var mn = Infinity, mx = -Infinity;
          Object.values(DATA_BY_YEAR).forEach(arr => arr.forEach(r => {
            mn = Math.min(mn, r[COUNT]); mx = Math.max(mx, r[COUNT]);
          }));
          if (!isFinite(mn)) { mn = 1; mx = 1; }
          return [mn, mx];
        }
        function colorFor(v, mn, mx){
          var t = (mx===mn) ? 1 : (v - mn)/(mx - mn);
          var r = Math.round(255 * t);
          var g = Math.round(200 * (1 - t) + 55 * t);
          return "rgb(" + r + "," + g + ",0)";
        }
        var MIN_COUNT_FLOOR = 100;

        function radiusFor(v){
          // your base scaling (keep as-is)
          var r = 0.6 + 0.5 * Math.sqrt(Math.max(0, v));
          // compute the floor once from the same scaling
          var floorR = 0.6 + 0.5 * Math.sqrt(MIN_COUNT_FLOOR);
          return Math.max(r, floorR);
        }

        function drawYear(y){
          // hide the "all years" layer if it was on
          if (mapObj.hasLayer(allLayer)) mapObj.removeLayer(allLayer);

          yearLayer.clearLayers();
          var arr = DATA_BY_YEAR[y] || [];
          var ext = extentCounts(), mn = ext[0], mx = ext[1];

          arr.forEach(function(r){
            var lat = +r[LAT], lon = +r[LON], cnt = +r[COUNT];
            var place = (r[PLACE] || "Unknown place");
            var link = "senders.html?lat=" + lat.toFixed(6) + "&lon=" + lon.toFixed(6) + "&year=" + y;

            L.circleMarker([lat, lon], {
              radius: radiusFor(cnt),
              color: colorFor(cnt, mn, mx),
              fillColor: colorFor(cnt, mn, mx),
              fillOpacity: 0.7,
              weight: 0.8
            })
            .bindPopup(
              "<b>" + place + "</b><br>" +
              "Letters: " + cnt + " (" + y + ")<br>" +
              "<a href='" + link + "' target='_blank'>Open senders</a>"
            )
            .bindTooltip(place + " — " + cnt + " (" + y + ")")
            .addTo(yearLayer);   // <-- IMPORTANT: add to yearLayer
          });
        }
        
        function drawAll(){
          allLayer.clearLayers();
          if (!mapObj.hasLayer(allLayer)) mapObj.addLayer(allLayer);
          ALL_DATA.forEach(function(r){
            var lat = +r[LAT], lon = +r[LON], cnt = +r[COUNT];
            var place = (r[PLACE]||"Unknown place");
            var link = "senders.html?lat=" + lat.toFixed(6) + "&lon=" + lon.toFixed(6) + "&year=all";

            L.circleMarker([lat, lon], {
              radius: radiusFor(cnt),
              color: "steelblue", fillColor: "steelblue",
              fillOpacity: 0.7, weight: 0.8
            })
            .bindPopup(
              "<b>" + place + "</b><br>" +
              "Letters: " + cnt + " (all years)<br>" +
              "<a href='" + link + "' target='_blank'>Open senders</a>"
            )
            .bindTooltip(place + " — " + cnt + " (all years)")
            .addTo(allLayer);
          });
        }
        function hideAll(){ allLayer.clearLayers(); if (mapObj.hasLayer(allLayer)) mapObj.removeLayer(allLayer); }

        // ----- UI (slider + checkbox) -----
        var years = Object.keys(DATA_BY_YEAR).map(Number).sort(function(a,b){return a-b;});
        var startYear = years.length ? years[0] : 0;
        var endYear   = years.length ? years[years.length-1] : 0;

        var ctl = L.control({position:'topleft'});
        ctl.onAdd = function(){
          var div = L.DomUtil.create('div','leaflet-bar');
          div.style.background='white'; div.style.padding='6px 8px'; div.style.lineHeight='1.2';
          div.innerHTML =
            '<div style="margin-bottom:4px;font:12px/1.2 Arial, sans-serif;">' +
            '<label style="user-select:none;"><input id="chk_allyears" type="checkbox"> Show all years</label>' +
            '</div>' +
            '<div style="font:12px/1.2 Arial, sans-serif;">' +
            '<label style="user-select:none;">Year: <span id="lbl_year">'+startYear+'</span></label>' +
            '<input id="rng_year" type="range" step="1" style="width:180px;">' +
            '</div>';
          L.DomEvent.disableClickPropagation(div);
          return div;
        };
        ctl.addTo(mapObj);

        var rng = document.getElementById('rng_year');
        var lbl = document.getElementById('lbl_year');
        var chk = document.getElementById('chk_allyears');

        rng.min = startYear; rng.max = endYear; rng.value = startYear; lbl.textContent = startYear;
        if (years.length){ drawYear(startYear); }

        rng.addEventListener('input', function(e){
          var y = parseInt(e.target.value, 10);
          lbl.textContent = y;
          hideAll();
          drawYear(y);
        });

        chk.addEventListener('change', function(e){
          if (e.target.checked){
            yearLayer.clearLayers();
            drawAll();
          } else {
            hideAll();
            drawYear(parseInt(rng.value, 10));
          }
        });
        {% endmacro %}
        """
        # compiled once here and stored on the element as `_template`
        self._template = Template(tpl)


In [17]:

def make_letter_viewer_page(
    df_par,
    outfile="letter.html",
    lid_col="lid",                 # 'L.<n>' (pure letter id)
    pid_col="pid",                 # 'L.<n>-<p>' (paragraph id)
    text_col="text",
    topic_label_col="topic_label",
    pred_topic_col="pred_topic",
    date_col="date",
    sender_col="sender",           # or 'sender_name'
    receiver_col="receiver",
    place_col="place_name",
    lat_col="latitude",
    lon_col="longitude",
    lang_col="language",
):
    """
    Write a self-contained HTML page that shows one letter, paragraph by paragraph.

    All letters are embedded in the page as one JSON object keyed by letter id;
    the page picks the letter from the ``?lid=`` query parameter and optionally
    scrolls to the paragraph named by ``?pid=``.

    Parameters
    ----------
    df_par : pandas.DataFrame
        One row per paragraph. Must contain ``lid_col`` or ``pid_col``; every
        other column is optional and is simply left out of the payload when absent.
    outfile : str or path-like
        Destination of the generated HTML file (written as UTF-8).
    lid_col, pid_col : str
        Letter id ('L.<n>') and paragraph id ('L.<n>-<p>') columns.
    text_col, topic_label_col, pred_topic_col : str
        Paragraph text and topic columns; the label column wins over the
        predicted-topic column when both exist.
    date_col, sender_col, receiver_col, place_col, lat_col, lon_col, lang_col : str
        Per-letter metadata columns; the first non-null value per letter is used.

    Returns
    -------
    The ``outfile`` argument, unchanged.

    Raises
    ------
    KeyError
        If neither ``lid_col`` nor ``pid_col`` is present in ``df_par``.

    Notes
    -----
    Rows whose id does not start with 'L.<digits>' get no letter id and are
    left out of the page.
    """
    df = df_par.copy()

    def first_nonnull(s):
        """Return the first non-missing value of a Series, or None if there is none."""
        s = s.dropna()
        return s.iloc[0] if len(s) else None

    def json_safe(value):
        """Turn pandas/NumPy missing values into None and NumPy scalars into Python scalars."""
        if value is None:
            return None
        if isinstance(value, np.generic):
            value = value.item()
        try:
            if pd.isna(value):
                return None
        except (TypeError, ValueError):
            # non-scalar values have no single truth value; pass them through
            pass
        return value

    # -------- normalize keys --------
    # Prefer the explicit letter id column; otherwise derive it from pid ('L.<n>-<p>').
    if lid_col in df.columns:
        id_source = df[lid_col]
    elif pid_col in df.columns:
        id_source = df[pid_col]
    else:
        raise KeyError("Need either a 'lid' column or a 'pid' column to derive the letter id.")
    df["letter_id"] = id_source.astype(str).str.extract(r"^(L\.\d+)", expand=False)

    # paragraph index for sorting (from 'L.<n>-<p>'); to_numeric keeps this
    # independent of whether the installed pandas can cast strings to Int64
    if pid_col in df.columns:
        raw_idx = df[pid_col].astype(str).str.extract(r"L\.\d+-(\d+)$", expand=False)
        df["para_idx"] = pd.to_numeric(raw_idx, errors="coerce").astype("Int64")
    else:
        df["para_idx"] = pd.NA

    # choose topic column (label first, fallback to model prediction)
    if topic_label_col in df.columns:
        df["topic"] = df[topic_label_col]
    elif pred_topic_col in df.columns:
        df["topic"] = df[pred_topic_col]
    else:
        df["topic"] = pd.NA

    # numeric coords if available
    for c in (lat_col, lon_col):
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # -------- aggregate into a compact JSON payload --------
    # payload key -> source column; every entry (including the date) is optional
    meta_cols = {
        "date": date_col,
        "sender": sender_col,
        "receiver": receiver_col,
        "place": place_col,
        "latitude": lat_col,
        "longitude": lon_col,
        "language": lang_col,
    }
    keep_meta = {k: v for k, v in meta_cols.items() if v in df.columns}

    if keep_meta:
        meta = (
            df.groupby("letter_id")
              .agg({v: first_nonnull for v in keep_meta.values()})
              .rename(columns={v: k for k, v in keep_meta.items()})
        )
        meta_idx = meta.to_dict(orient="index")
    else:
        # no metadata columns at all: still register every known letter
        meta_idx = {lid: {} for lid in df["letter_id"].dropna().unique()}
    meta_idx = {
        lid: {k: json_safe(v) for k, v in m.items()}
        for lid, m in meta_idx.items()
    }

    # paragraphs per letter
    para_cols = [c for c in [pid_col, "para_idx", "topic", text_col] if c in df.columns]
    paras = (
        df.dropna(subset=["letter_id"])[["letter_id"] + para_cols]
          .sort_values(["letter_id", "para_idx"])
    )

    # build dict: { 'L.n': {meta..., 'paragraphs':[ {pid, idx, topic, text}, ... ] } }
    letters = {}
    for lid, rows in paras.groupby("letter_id"):
        paragraphs = []
        for rec in rows.to_dict("records"):
            idx = json_safe(rec.get("para_idx"))
            paragraphs.append({
                "pid": json_safe(rec.get(pid_col)),
                "idx": None if idx is None else int(idx),
                "topic": json_safe(rec.get("topic")),
                "text": json_safe(rec.get(text_col)),
            })
        letters[lid] = {
            **meta_idx.get(lid, {}),
            "paragraphs": paragraphs,
        }

    # also include any letters with metadata but no paragraphs (rare)
    for lid, m in meta_idx.items():
        letters.setdefault(lid, {**m, "paragraphs": []})

    # -------- HTML (no f-strings → safe braces) --------
    html = """
<!doctype html>
<meta charset="utf-8">
<title>Letter</title>
<style>
 body{font:15px/1.55 system-ui, -apple-system, Segoe UI, Arial, sans-serif; margin:20px; max-width:900px}
 header{margin-bottom:10px}
 h1{font-size:22px; margin:0 0 4px}
 .muted{color:#666}
 .meta{margin:6px 0 14px; color:#333}
 .meta span{display:inline-block; margin-right:12px}
 .controls{display:flex; gap:8px; align-items:center; margin:8px 0 12px}
 .controls input[type="text"]{flex:1; padding:6px 8px; border:1px solid #ccc; border-radius:6px}
 .card{border:1px solid #e2e2e2; border-radius:10px; padding:10px 12px; margin:10px 0}
 .pid{font-weight:600; color:#0b69c7}
 .topic{display:inline-block; padding:2px 8px; border-radius:999px; background:#f1f1f1; font-size:12px; margin-left:6px}
 .hl{background: #ffe38a}
</style>

<header>
  <h1 id="title">Letter</h1>
  <div class="meta" id="meta"></div>
  <div class="controls">
    <input id="search" type="text" placeholder="Search in this letter…">
    <label class="muted"><input id="toggleTopics" type="checkbox" checked> show topics</label>
  </div>
</header>

<div id="paras"></div>

<script>
  // embedded payload
  const LETTERS = __LETTERS__;

  function q(name){ return new URLSearchParams(location.search).get(name); }
  // escapes quotes too, because the result is also used inside id="..." attributes
  function esc(s){ return (s??'').toString().replace(/[&<>"']/g, t=>({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[t])); }

  // highlight helper: match against the RAW text and escape each piece,
  // so a search term can never hit the inside of an HTML entity
  function hi(text, needle){
    const raw = (text??'').toString();
    if(!needle){ return esc(raw); }
    try{
      const re = new RegExp(needle.replace(/[.*+?^${}()|[\\]\\\\]/g, '\\\\$&'), 'gi');
      let out = '', last = 0, m;
      while((m = re.exec(raw)) !== null){
        if(!m[0].length){ re.lastIndex++; continue; }
        out += esc(raw.slice(last, m.index)) + '<span class="hl">' + esc(m[0]) + '</span>';
        last = m.index + m[0].length;
      }
      return out + esc(raw.slice(last));
    }catch(e){
      return esc(raw);
    }
  }

  function render(){
    const lid = q('lid');     // expect 'L.12'
    const pid = q('pid');     // optional: jump to a paragraph
    const data = LETTERS[lid];

    const title = document.getElementById('title');
    const meta  = document.getElementById('meta');
    const box   = document.getElementById('paras');

    if(!data){
      title.textContent = 'Letter ' + (lid||'');
      box.innerHTML = "<div class='muted'>No data found for this letter.</div>";
      return;
    }

    // header
    title.textContent = 'Letter ' + lid;
    const bits = [];
    if(data.date)     bits.push('<span>📅 '+esc(data.date)+'</span>');
    if(data.sender && data.receiver) bits.push('<span>✉️ '+esc(data.sender)+' → '+esc(data.receiver)+'</span>');
    if(data.place)    bits.push('<span>📍 '+esc(data.place)+'</span>');
    if(data.language) bits.push('<span>🗣 '+esc(data.language)+'</span>');
    meta.innerHTML = bits.join('');

    // paragraphs
    const showTopics = ()=> document.getElementById('toggleTopics').checked;
    const needle = ()=> document.getElementById('search').value.trim();

    function draw(){
      const n = needle();
      box.innerHTML = '';
      (data.paragraphs||[]).forEach((p,i)=>{
        const id = p.pid || (lid+'-'+(p.idx??(i+1)));
        const topic = p.topic ? ('<span class="topic">'+esc(p.topic)+'</span>') : '';
        const head = '<div class="pid" id="'+esc(id)+'">'+esc(id) + (showTopics()? topic : '') + '</div>';
        const body = '<div>'+ hi(p.text||'', n) +'</div>';
        const card = document.createElement('div');
        card.className = 'card';
        card.innerHTML = head + body;
        box.appendChild(card);
      });

      // jump to a pid if provided (the DOM id is the raw, unescaped value)
      if(pid){
        const el = document.getElementById(pid);
        if(el){ el.scrollIntoView({behavior:'smooth', block:'start'}); }
      }
    }

    document.getElementById('toggleTopics').addEventListener('change', draw);
    document.getElementById('search').addEventListener('input', draw);
    draw();
  }
  render();
</script>
"""
    # default=str lets otherwise unserializable scalars (e.g. timestamps) through as text
    payload = json.dumps(letters, separators=(",", ":"), default=str)
    # json.dumps emits ASCII only and '<' can occur only inside string values;
    # writing it as \\u003c keeps text such as '</script>' or '<!--' from
    # terminating or corrupting the surrounding <script> element
    payload = payload.replace("<", "\\u003c")
    html = html.replace("__LETTERS__", payload)
    Path(outfile).write_text(html, encoding="utf-8")
    return outfile

In [18]:
def make_year_slider_map(df_par, outfile='letters_timeline_map.html',
                         lid_col='lid', lat_col='latitude', lon_col='longitude', place_col='place_name'):
    """
    Build a folium map with the year-slider control attached and save it as HTML.

    The two frames returned by ``_prep_counts`` (one carrying a 'year' column,
    one without a per-year split) are converted into plain record lists and
    handed to ``YearSliderJS``. Both frames are expected to expose the
    latitude/longitude/place columns plus 'letters_count'.

    Returns the ``outfile`` argument after the map has been saved there.
    """
    yearly_counts, overall_counts = _prep_counts(df_par, lid_col, lat_col, lon_col, place_col)

    # columns forwarded to the browser for every marker
    marker_fields = [lat_col, lon_col, place_col, 'letters_count']

    # JSON payloads for the JS side: one record list per year, plus the overall list
    distinct_years = sorted(yearly_counts['year'].dropna().astype(int).unique().tolist())
    records_by_year = {}
    for yr in distinct_years:
        in_year = yearly_counts['year'] == yr
        records_by_year[int(yr)] = yearly_counts.loc[in_year, marker_fields].to_dict('records')
    records_all = overall_counts[marker_fields].to_dict('records')

    # centre the map on the median coordinate, preferring the per-year frame
    centre_source = overall_counts if yearly_counts.empty else yearly_counts
    centre = [
        float(centre_source[lat_col].median()),
        float(centre_source[lon_col].median()),
    ]
    fmap = folium.Map(location=centre, zoom_start=4, control_scale=True)

    # attach the slider logic
    slider = YearSliderJS(records_by_year, records_all,
                          lat_key=lat_col, lon_key=lon_col, place_key=place_col)
    fmap.add_child(slider)
    fmap.save(outfile)
    return outfile

In [19]:
# Generate the HTML outputs from the paragraph-level frame. Each helper is
# given the file name it should write; make_senders_page and make_letters_page
# are defined elsewhere in the notebook.
make_senders_page(df_par, outfile="senders.html")
make_letters_page(df_par, outfile="letters.html", sender_col="sender") 
make_letter_viewer_page(df_par, outfile="letter.html", sender_col="sender")


# Build the year-slider map and report where it was saved.
path = make_year_slider_map(df_par, outfile="letters_timeline_map.html")
print("Saved:", path)

Saved: letters_timeline_map.html


In [20]:
# Diagnostic: list the paragraphs that have no usable place name.
# Work on a copy so df_par itself is not modified.
d = df_par.copy()
# Cast to pandas' nullable string dtype and trim surrounding whitespace so
# whitespace-only names compare equal to ''.
d['place_name'] = d['place_name'].astype('string').str.strip()
# Keep rows whose place_name is missing or empty, reduced to the id and
# location columns, with exact duplicate rows removed.
unknown_basic = d[d['place_name'].isna() | (d['place_name'] == '')][
    ['lid','pid','place_name','latitude','longitude']
].drop_duplicates()

print("Missing/blank place_name rows:")
# to_string(index=False) prints every row without the DataFrame index.
print(unknown_basic.to_string(index=False))

Missing/blank place_name rows:
   lid        pid place_name  latitude  longitude
   NaN L.l.7533-1       <NA>       NaN        NaN
L.4226   L.4226-6       <NA>       NaN        NaN
L.4226   L.4226-4       <NA>       NaN        NaN
L.4226   L.4226-3       <NA>       NaN        NaN
L.4226   L.4226-1       <NA>       NaN        NaN
L.4226   L.4226-5       <NA>       NaN        NaN
L.4226   L.4226-2       <NA>       NaN        NaN
   NaN  L..5211-1       <NA>       NaN        NaN
   NaN  L.L5403-1       <NA>       NaN        NaN
   NaN  L..6030-4       <NA>       NaN        NaN
   NaN  L..6030-5       <NA>       NaN        NaN
   NaN  L..6030-3       <NA>       NaN        NaN
   NaN  L..6030-1       <NA>       NaN        NaN
   NaN  L..6030-2       <NA>       NaN        NaN
L.8481   L.8481-1       <NA>       NaN        NaN
L.9221   L.9221-3       <NA>       NaN        NaN
L.9221   L.9221-4       <NA>       NaN        NaN
L.9221   L.9221-1       <NA>       NaN        NaN
L.9221   L.9221-2  