# process data


In [44]:
import pandas as pd
import json

## process gazetteers

In [39]:
canmore_df = pd.read_csv("./HES/canmore_text_extract.csv", low_memory=False)

parishes = canmore_df['PARISH'].str.title().unique().tolist()
counties = canmore_df['COUNTY_NAME'].str.title().unique().tolist()

canmore_text = parishes + counties

print(len(parishes), len(counties), len(canmore_text))

parishes[0:10], counties[0:10]

183 11 194


(['Rerrick',
  'Dumfries',
  'Kirkcudbright',
  'Innerleithen',
  'Drumelzier',
  'Yarrow',
  'Ettrick',
  'Chirnside',
  'Kirkhope',
  'Selkirk'],
 ['Kirkcudbrightshire',
  'Dumfries-Shire',
  'Peebles-Shire',
  'Selkirkshire',
  'Berwickshire',
  'Not Applicable',
  'Wigtownshire',
  'Roxburghshire',
  'Midlothian',
  'East Lothian'])

In [40]:
geonames_df = pd.read_csv("./BL/Geonames_GBNI_places.csv", header=None, names=['place_name'])
geonames_text = geonames_df['place_name'].unique().tolist()

iams_df = pd.read_csv("./BL/IAMS_GBNI_places.csv", header=None)
iams_text = iams_df[0].unique().tolist()

print(len(geonames_text), len(iams_text))

geonames_text[0:10], iams_text[0:10]

7168 15873


(['North and Middle Littleton',
  'Barton-on-the-Heath',
  'Zennor',
  'Zeals',
  'Zeal Monachorum',
  'Yoxford',
  'Yoxall',
  'Youlgreave',
  'Yewbarrow',
  'Yetminster'],
 ['Middle Littleton',
  'North Littleton',
  'Barton-on-the-Heath',
  'Zennor',
  'Zeals',
  'Zeal Monachorum',
  'Yoxford',
  'Yoxall',
  'All Saints, Youlgreave',
  'Yellegrave'])

In [41]:
bl_places_df = pd.read_csv("./BL/BL_Med_places.csv", header=None)
bl_places_text = bl_places[0].apply(lambda i: i.split(";")[0].split(",")[0].strip()).unique().tolist()

print(len(bl_places_text))

bl_places_text[0:20]

925


['Northern England',
 'Durham',
 'Beverlaco',
 'Dukmanton',
 'Ettewelle',
 'Leicester',
 'Salopesburi',
 'Shrewsbury',
 'Stanlegh',
 'Jezemue',
 'Novo Castello',
 'Berewico super Tweda',
 'Weardale Forest',
 'Berewyke',
 'Northumberland',
 'Jesemuth',
 'Novo Castro',
 'Cavereswall',
 'Hetyleia',
 'Derbyshire']

In [43]:
gazetteer = list(set(parishes + counties + iams_text + bl_places_text))

len(gazetteer)

16032

In [46]:
gazetteer_dict = [{"label": "GPE", "pattern": place} for place in gazetteer]

with open("gazetteer.jsonl", "w") as f:
    for item in gazetteer_dict:
        json.dump(item, f)
        f.write("\n")