## Setup

In [1]:
import ast
import json
import math
from collections import defaultdict
from datetime import datetime

In [2]:
import re
import pandas as pd
from tqdm.auto import tqdm
pd.set_option("display.max_colwidth", 0)

import networkx as nx
from pyvis.network import Network

# Load the table of ads and their extracted positions; the path is relative
# to the directory the notebook is run from.
positions_csv_path = "../data/positions_df.csv"
df = pd.read_csv(positions_csv_path)
df.columns

Index(['pub', 'pub_date', 'page_num', 'page_id', 'image_uri', 'uuid', 'label',
       'ocr', 'ocr_without_lb', 'text', 'transkribus_pagenum',
       'text_without_lb', 'position', 'position_cleaned'],
      dtype='object')

## Data cleanup

In [3]:
# Extracted "position" tokens that are dropped outright before any cleaning.
# NOTE(review): these look like OCR noise / non-position words - confirm.
POSITIONS_EXCLUDE = ['DFTTFF—',
 'DPDwSf—', 'EEE', 'NRCCCCCOCT',
 'FFur',
 'BUEO', 'Ein', 'ININ',
 'AVISO', 'Für','32⁰0⁰'
]
# Characters removed from every position token: digits, '*', punctuation and dashes.
replace_list =  r'[\d*,:.;—＋-]'


def cleaned_up_position(position_str):
    """Parse one raw ``position`` cell and return its cleaned position names.

    Parameters
    ----------
    position_str : str | list | tuple | Any
        Normally the string form of a Python list, e.g. ``"['Reisender']"``.
        An already-parsed list/tuple is accepted as-is; any other value
        (for example a NaN from an empty CSV cell) is treated as "no positions".

    Returns
    -------
    list[str]
        The cleaned position strings, or ``["unknown"]`` when nothing usable
        remains.

    Raises
    ------
    ValueError, SyntaxError
        If ``position_str`` is a string that is not a valid Python literal.
    """
    if isinstance(position_str, str):
        # literal_eval only accepts Python literals, so text coming from the
        # CSV can never execute code (the previous eval() could).
        pos_list = ast.literal_eval(position_str)
    elif isinstance(position_str, (list, tuple)):
        pos_list = position_str
    else:
        pos_list = []

    pos_list_cleaned = []
    for p in pos_list:
        if p in POSITIONS_EXCLUDE:
            continue
        p_cleaned = re.sub(replace_list, '', p).strip()
        # The length filter is applied to the raw token, as before. Tokens
        # that consist only of stripped characters (e.g. '12—') clean down to
        # '' and must not be emitted as an empty position name.
        if len(p) > 2 and p_cleaned and not p_cleaned.isnumeric():
            pos_list_cleaned.append(p_cleaned)

    if not pos_list_cleaned:
        return ["unknown"]

    return pos_list_cleaned

In [4]:
# Recompute the cleaned position lists from the raw `position` column, then
# write the whole table back to the positions CSV (without the index).
df = df.assign(position_cleaned=df["position"].map(cleaned_up_position))
df.to_csv("../data/positions_df.csv", index=False)

## Create graph with pyvis/networkx (too slow)

Create a node for every position and link each ad that advertises that position to the position node.

In [5]:
# One node per ad (keyed by its uuid) and one node per cleaned position
# string; every ad is connected to each of the positions it mentions.
G = nx.Graph()
position_node_size = defaultdict(int)
for _, ad in tqdm(df.iterrows(), total=len(df)):
    G.add_node(ad.uuid, label=ad.page_id, group=ad.label)
    if not ad.position_cleaned:
        continue
    for position in ad.position_cleaned:
        # add_node on an existing position simply refreshes its attributes.
        G.add_node(position, label=position, group="position")
        G.add_edge(ad.uuid, position)
print(len(G.nodes))

  0%|          | 0/14337 [00:00<?, ?it/s]

19359


In [6]:
# Wrap the networkx graph in a pyvis Network configured for notebook use
# (dark background, white labels).
net = Network(notebook=True, height="90vh", width="100%", bgcolor="#222", font_color="white")
net.from_nx(G)
# Physics is switched off and the pyvis option buttons are enabled.
net.toggle_physics(False)
net.show_buttons()
# Writing the HTML preview is left disabled; the section heading marks this
# approach as too slow.
# net.show("preview.html")

preview.html


## Create graph data in JSON

```
{
  "nodes": [
    {"id": "1", "label": "Node 1", "group": "A"},
    {"id": "2", "label": "Node 2", "group": "A"},
    {"id": "3", "label": "Node 3", "group": "B"}
  ],
  "links": [
    {"source": "1", "target": "2"},
    {"source": "2", "target": "3"},
    {"source": "1", "target": "3"}
  ]
}
```

In [5]:
# Build the node and link lists for the JSON graph format: one node per
# distinct position, one node per ad, and one link per (position, ad) pair.
positions_added = []
nodes_added = []
nodes = []
links = []
# Sets mirror the two *_added lists so each "already seen?" check is O(1);
# testing membership against the lists made the loop quadratic in the number
# of nodes. The lists are still filled, in first-seen order, as before.
seen_positions = set()
seen_ads = set()
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    for p in row.position_cleaned:
        if p not in seen_positions:
            nodes.append({"id": p, "label": p, "group": "position"})
            seen_positions.add(p)
            positions_added.append(p)

        links.append({"source": p, "target": row.uuid})
        # The ad node is added once, the first time one of its positions is processed.
        if row.uuid not in seen_ads:
            nodes.append({"id": row.uuid, "label": row.page_id, "group": row.label})
            seen_ads.add(row.uuid)
            nodes_added.append(row.uuid)


  0%|          | 0/14337 [00:00<?, ?it/s]

### Save JSON data to file

In [6]:
# Serialise the graph as one {"nodes": [...], "links": [...]} JSON document.
graph_payload = {"nodes": nodes, "links": links}
with open("../data/data.json", "w") as out_file:
    json.dump(graph_payload, out_file)

## Create source and target pair for Cosmograph

Graph data
File that contains a list of graph edges records. It must have at least two columns representing source and target nodes correspondingly.

If the records have time associated with them, you can provide that information in the time column. In that case an interactive timeline will be displayed below the graph.

Extra columns can contain values for specifying link color and size. For example, if you want to specify a specific color for a specific link, you can create a column named color and provide hex (or any common color format) values there. Columns will be validated and the ones with incorrect content will be ignored.

Supported formats: .csv, .tsv, .ssv

Graph metadata
Optional file that contains a list of nodes and corresponding values that can be used to set custom color and size of nodes. It must contain a column called id, matching entries from the main data file.

Supported formats: .csv, .tsv, .ssv

In [20]:
# Colour assigned to each node group in the Cosmograph metadata.
node_colours = {"job_search": "#ffcc66",
               "job_offer": "#7fff7f",
               "position": "#66b3ff",
               "service_offer":"#ff6699",
               "vermittlung":"#fdbf6f",
               "heading": "#b3b3b3"}
nodes_added = []

data = []           # edge records: one per (position, ad) pair
metadata = []       # node records: one per ad and one per distinct position
link_count = {}     # lower-cased position -> number of ads linked to it
position_meta = {}  # lower-cased position -> its record inside `metadata`
# Keep only ads that have text and are not headings.
df = df.fillna("")
df = df[df.text_without_lb!=""]
df = df[df.label!="heading"]
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    # The parsed date is not used below, but parsing still raises on a
    # malformed pub_date.
    # NOTE(review): the edge "time" field uses the raw row.pub_date value -
    # confirm whether the parsed date was meant to be written instead.
    pub_date = datetime.strptime(f"{row.pub_date}", "%Y%m%d").date()
    metadata.append({"id": row.uuid, "label":"", "group": row.label, "color": node_colours[row.label], "pub":row.pub, "text": row.text_without_lb, "size":2} )
    for p in row.position_cleaned:
        # Positions are merged case-insensitively; the first spelling seen
        # becomes the node label.
        position_id = p.lower()
        data.append({"source": position_id, "target": row.uuid, "time": row.pub_date})
        if position_id not in link_count:
            position_entry = {"id": position_id, "label":p, "group": "position", "color":node_colours["position"], "pub": "", "text": "", "size":1 }
            metadata.append(position_entry)
            position_meta[position_id] = position_entry
            nodes_added.append(position_id)
            link_count[position_id] = 1
        else:
            # Grow the node with the log of its link count. The record is
            # looked up directly instead of rescanning all of `metadata`,
            # which made this loop quadratic.
            link_count[position_id] += 1
            position_meta[position_id]['size'] = 5 + math.log(link_count[position_id])
        
    


  0%|          | 0/9681 [00:00<?, ?it/s]

In [21]:
# Write the edge list and the node metadata as the two CSV files for Cosmograph.
cosmograph_exports = (
    (data, "../data/cosmo_data.csv"),
    (metadata, "../data/cosmo_metadata.csv"),
)
for records, csv_path in cosmograph_exports:
    pd.DataFrame(records).to_csv(csv_path, index=False)

In [22]:
def _to_builtin(value):
    """Fallback for json.dumps: unwrap objects exposing .item(), else stringify."""
    return value.item() if hasattr(value, "item") else str(value)


# Emit the node and link arrays as a TypeScript module. The arrays are
# serialised with json.dumps rather than interpolating the Python repr: a repr
# is not guaranteed to be a valid TS literal (None/True, numpy scalar reprs,
# \U escapes), whereas JSON output is. The encoding is pinned so the file does
# not depend on the platform's locale.
with open("../dataviz/cosmograph/data/nodes_links.ts", "w", encoding="utf-8") as f:
    f.write(
        f"export const nodes = {json.dumps(metadata, default=_to_builtin)};\n"
        f"export const links = {json.dumps(data, default=_to_builtin)};\n"
    )

In [31]:
# Peek at the first edge record to check its source / target / time fields.
data[0]

{'source': 'reisender',
 'target': 'c28b8ff3-5581-495b-a449-149489a5398f',
 'time': '19250906'}

In [65]:
# List the ad uuids linked to any position whose name contains "familien".
familien_targets = [
    edge["target"] for edge in data if "familien" in edge["source"].lower()
]
for target_uuid in familien_targets:
    print(target_uuid)

9c05424a-5357-43ac-8218-4e75a1e98333
72b7d6a3-438f-4db0-8a5c-4edd0e28aa50
18351bde-dfb5-45e3-8f77-55479a9e6405
ab0d5d58-df5f-4dda-92bc-85a59c4ea6ce
e670b2ca-1292-4e86-83dc-071ad84f9648
38552a82-a91c-401e-852f-95a58e1a8c85


In [66]:
# Show the full record of one ad returned by the "familien" search.
df.loc[df["uuid"] == "9c05424a-5357-43ac-8218-4e75a1e98333"]

Unnamed: 0,pub,pub_date,page_num,page_id,image_uri,uuid,label,ocr,ocr_without_lb,text,transkribus_pagenum,text_without_lb,position,position_cleaned
1232,nwj,19080112,37,nwj_19080112_037,https://anno.onb.ac.at/cgi-content/annoshow?call=nwj|19080112|037|33.0|0,9c05424a-5357-43ac-8218-4e75a1e98333,job_search,"Familienvater\nbittet um irgend welche Schreibarbeit.\nWar 3 Jahre ſchwer krank und befindet\nſich mit ſeiner Frau und 7 kleinen\nHindern in bitterſter Not. J. Leitner,\n17. Bez., Kapitelgaſſe 4, 1. St., Tür 7.\nö 4921—6.\n","Familienvater bittet um irgend welche Schreibarbeit. War 3 Jahre ſchwer krank und befindet ſich mit ſeiner Frau und 7 kleinen Hindern in bitterſter Not. J. Leitner, 17. Bez., Kapitelgaſſe 4, 1. St., Tür 7. ö 4921—6.","Familienvater\nbittet um irgend welche Schreibarbeit.\nWar 3 Jahre ſchwer krank und befindet\nſich mit ſeiner Frau und 7 kleinen\nKindern in bitterſter Not. J. Leitner,\n17. Bez., Kapitelgaſſe 4, 1. St., Tür 7.\n4924—6\n",,"Familienvater bittet um irgend welche Schreibarbeit. War 3 Jahre ſchwer krank und befindet ſich mit ſeiner Frau und 7 kleinen Kindern in bitterſter Not. J. Leitner, 17. Bez., Kapitelgaſſe 4, 1. St., Tür 7. 4924—6",['Familienvater'],[Familienvater]


In [67]:
# Show the full record of a second ad returned by the "familien" search.
df.loc[df["uuid"] == "72b7d6a3-438f-4db0-8a5c-4edd0e28aa50"]

Unnamed: 0,pub,pub_date,page_num,page_id,image_uri,uuid,label,ocr,ocr_without_lb,text,transkribus_pagenum,text_without_lb,position,position_cleaned
1488,nwj,19000515,15,nwj_19000515_015,https://anno.onb.ac.at/cgi-content/annoshow?call=nwj|19000515|015|33.0|0,72b7d6a3-438f-4db0-8a5c-4edd0e28aa50,job_search,"Familienvater\nbranchekundig, ſchöne Schrift, pünktlich,\nintelligent und ehrlich, bittet um irgend\nwelchen Verdienſt, geht auch auswärts,\neventuell übernimmt er Geſchäft auf\nRechnung, wobei ihm ſeine Frau, die\nauch vom Handelsſach iſt, tüchtig zur\nSeite ſtehr. Jede Beſchäftigung wird\nangenommen. Gefl. Anträge unter\nttung 7586“ an die Admin. 7586 —6\n","Familienvater branchekundig, ſchöne Schrift, pünktlich, intelligent und ehrlich, bittet um irgend welchen Verdienſt, geht auch auswärts, eventuell übernimmt er Geſchäft auf Rechnung, wobei ihm ſeine Frau, die auch vom Handelsſach iſt, tüchtig zur Seite ſtehr. Jede Beſchäftigung wird angenommen. Gefl. Anträge unter ttung 7586“ an die Admin. 7586 —6","Familienvater\nbranchekundig, ſchöne Schrift, pünktlich,\nintelligent und ehrlich, bittet um irgend\nwelchen Verdienſt, geht auch auswärts,\neventuell übernimmt er Geſchäft auf\nRechnung, wobei ihm ſeine Frau, die\nauch vom Handelsſach iſt, tüchtig zur\nSeite ſtehr. Jede Beſchäftigung wird\nangenommen. Gefl. Anträge unter\n„Rettung 7586“ an die Admin. 7586—6\n",,"Familienvater branchekundig, ſchöne Schrift, pünktlich, intelligent und ehrlich, bittet um irgend welchen Verdienſt, geht auch auswärts, eventuell übernimmt er Geſchäft auf Rechnung, wobei ihm ſeine Frau, die auch vom Handelsſach iſt, tüchtig zur Seite ſtehr. Jede Beſchäftigung wird angenommen. Gefl. Anträge unter „Rettung 7586“ an die Admin. 7586—6",['Familienvater'],[Familienvater]


In [14]:
# Distinct raw values of the `position` column (each is a list rendered as a string).
df["position"].unique()

array(["['Reisender']", '[]', "['Vertreter']", ..., "['Brottrüger']",
       "['Müller']", "['Hausmagd']"], dtype=object)

In [15]:
# List the columns currently present in the working frame.
df.columns

Index(['pub', 'pub_date', 'page_num', 'page_id', 'image_uri', 'uuid', 'label',
       'ocr', 'ocr_without_lb', 'text', 'transkribus_pagenum',
       'text_without_lb', 'position', 'position_cleaned'],
      dtype='object')

In [24]:
# Distinct ad labels present in the working frame.
df["label"].unique()

array(['job_search', 'job_offer', 'service_offer', 'vermittlung'],
      dtype=object)

In [23]:
# Number of rows in the working frame that are labelled "heading".
len(df[df["label"] == "heading"])

0