## Extra: example NER on a map

Import de benodigde packages

In [15]:
import pandas as pd
import spacy
from spacy import displacy
from collections import Counter
import pickle
import plotly.express as px
import matplotlib.pyplot as plt
import folium

Laad de dataset

In [16]:
fiets_nlp = pd.read_pickle("data/fiets_nlp.pkl")

In [4]:
fiets_nlp.head(2)

Unnamed: 0.1,Unnamed: 0,identifier,type,title,date,content,subcategory,category,Year,DL score,spatial,length,doc
21061,84265,http://resolver.kb.nl/resolve?urn=MMKB23:00140...,artikel,BUITENLAND. POLITIEK OVERZICHT.,1880/05/26 00:00:00,"’s Gravenhage, 25 Mei.’t Is te Berlijn een lie...",fiets,fiets,1880,71.212121,Landelijk,792,"(’s, Gravenhage, ,, 25, Mei.’t, Is, te, Berlij..."
21072,84295,http://resolver.kb.nl/resolve?urn=ddd:01011723...,artikel,Gemengde Berichten.,1869/09/20 00:00:00,"Naar men verneemt, bloeit thans in den Hortus ...",rijwiel,fiets,1869,77.823129,Landelijk,735,"(Naar, men, verneemt, ,, bloeit, thans, in, de..."


In [5]:
fiets_nlp.Year.value_counts()

1869    70
1883    27
1875    20
1884    16
1871    16
1876    14
1879    14
1870    12
1874    12
1878    12
1880    11
1872     9
1882     9
1881     7
1873     5
1877     1
Name: Year, dtype: int64

In [6]:
def get_ner(doc, entity):
    return [ent.text for ent in doc.ents if ent.label_ == entity]

In [7]:
fiets_nlp['GPE'] = fiets_nlp['doc'].apply(lambda x: get_ner(x, 'GPE'))

In [8]:
# Load coordinates data from CSV
coordinates_df = pd.read_csv('data/nl.csv')
coordinates_df.head(2)

Unnamed: 0,city,lat,lng,country,iso2,admin_name,capital,population,population_proper
0,Tilburg,51.55,5.0833,Netherlands,NL,Noord-Brabant,minor,1944588,1944588
1,Amsterdam,52.3728,4.8936,Netherlands,NL,Noord-Holland,primary,1459402,917923


In [9]:
# Load coordinates data from CSV
coordinates_df = pd.read_csv('data/worldcities.csv')
coordinates_df.head(2)

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.687,139.7495,Japan,JP,JPN,Tōkyō,primary,37785000.0,1392685764
1,Jakarta,Jakarta,-6.175,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077


In [10]:
# Create a Python dictionary mapping city names to coordinates
city_to_coords = {row['city']: (row['lat'], row['lng']) for idx, row in coordinates_df.iterrows()}

In [11]:
# Function to get coordinates for GPEs
def get_gpe_coords(gpe_list):
    return [city_to_coords[gpe] for gpe in gpe_list if gpe in city_to_coords]

# Apply the function to create a new column with coordinates
fiets_nlp['GPE_coords'] = fiets_nlp['GPE'].apply(get_gpe_coords)

# Display the DataFrame
pd.set_option('display.max_colwidth', None)
display(fiets_nlp[['identifier','GPE','GPE_coords']].head())


Unnamed: 0,identifier,GPE,GPE_coords
21061,http://resolver.kb.nl/resolve?urn=MMKB23:001409122:mpeg21:a00001:ocr,"[Berlijn, Rome, Rome, nagaat, Granlle, Berlijn, Gladgesticht, G, Europa, Granville, Parijs]","[(43.226, -75.4909), (43.226, -75.4909), (48.8374, -1.5939)]"
21072,http://resolver.kb.nl/resolve?urn=ddd:010117239:mpeg21:a0029:ocr,"[Zwolle, Zondag, Gorinchem, Californië]","[(52.5167, 6.1), (51.8306, 4.9742)]"
21076,http://resolver.kb.nl/resolve?urn=ddd:010263245:mpeg21:a0008:ocr,"[Frankrijk, Parijs, Parijs, Commun, Parijs, Rothschild, Berlijn]",[]
21078,http://resolver.kb.nl/resolve?urn=ddd:010117225:mpeg21:a0014:ocr,"[gemeente Zype, Edinburg, Amerika, Parijs, Bardin, Engeland]","[(26.3196, -98.1597)]"
21079,http://resolver.kb.nl/resolve?urn=ddd:010097660:mpeg21:a0006:ocr,"[Pari, afgeloopon, Neder_nd, Do Gazelle van Antwerpen, België, Nederland, Brussel]","[(29.9707, -94.0015)]"


In [12]:
# Plot the coordinates on a map
map = folium.Map(location=[52.0, 5.0], zoom_start=7)

for coords_list in fiets_nlp['GPE_coords']:
    for coords in coords_list:
        folium.Marker(location=coords).add_to(map)

# Save the map to an HTML file
map.save('map.html')

# Display the map
map

In [13]:
fiets_nlp.Year.unique()

array([1880, 1869, 1870, 1874, 1883, 1875, 1876, 1879, 1884, 1873, 1878,
       1871, 1872, 1881, 1882, 1877], dtype=int64)

In [14]:
# Define a color mapping for each unique krantnaam
color_mapping = {
    1869: 'blue',
    1870: 'green',
    1871: 'red',
    1872: 'yellow',
    1873: 'orange',
    1874: 'brown',
    1875: 'purple',
    1876: 'lightblue',
    1877: 'pink',
    1878: 'magenta',
    1879: 'darkgreen',
    1880: 'darkblue',
    1881: 'black',  
    1882: 'black', 
    1883: 'black' ,
    1884: 'black'
    
}

# Plot the coordinates on a map
map = folium.Map(location=[52.0, 5.0], zoom_start=7)

for index, row in fiets_nlp.iterrows():
    krantnaam = row['Year']
    coords_list = row['GPE_coords']
    color = color_mapping.get(krantnaam, 'black')  # Default to black if krantnaam not in mapping
    
    for coords in coords_list:
        folium.Marker(location=coords, icon=folium.Icon(color=color)).add_to(map)

# Save the map to an HTML file
map.save('map.html')

# Display the map
map


  folium.Marker(location=coords, icon=folium.Icon(color=color)).add_to(map)
