In [53]:
import pandas as pd

#### Read Files

In [55]:
# Reads the NER gotten from BERT (missing book 1)
def read_fromBERT():
    nerbert = []
    for i in range(2,9):
        path = "BERT/NER_BERT_Book" + str(i) + ".csv"
        f = open(path, encoding='utf-8', errors='ignore')
        lines = f.read()
        split = lines.split("\n")
        nerlist = []
        for item in split:
            line = item.split(',')
            if len(line)>2:
                nerlist.append([''.join(line[:len(line)-2]), line[-2], line[-1]])
        nerbert.append(nerlist)
    return nerbert

# Reads the NER gotten from FLAIR
def read_fromFLAIR():
    nerflair = []
    for i in range(1, 9): 
        path = "NER_FLAIR/ner" + str(i) + '.txt'
        f = open(path, encoding='utf-8')
        lines = f.readlines()
        nerlist = []
        for line in lines:
            split = line.split(" ")
            nerlist.append([' '.join(split[:len(split)-2]), split[-2], split[-1][:-2]])
        nerflair.append(nerlist)
    return nerflair

# reads the location information with longitude and latitides from csv
def read_locs():
    return pd.read_csv('locs_lonlat.csv')

#### Select only people and locations from ner lists

In [51]:
# selects only the people and the locations from the input NERS
def select_people_and_locs_only(nerfrombert):
    peopleandlocs = []
    for book in nerfrombert:
        for ner in book:
            if ner[1] == 'B-PER' or ner[1] == 'PERSON':
                peopleandlocs.append(ner)
            elif ner[1] == 'B-LOC' or ner[1] == 'GPE' or ner[1] == 'LOC':
                peopleandlocs.append(ner)
    return peopleandlocs

### Localization Methods

In [59]:
def proximity_measure(inputners, proximity, person='Claire'):
    locslist = []
    for i in range(len(inputners)):
        if inputners[i][0] == person:
            index = int(inputners[i][2])
            curloc = []
            
            # look further
            for j in range(1,len(inputners)-i):
                if inputners[i+j-1][2] <= inputners[i+j][2] and int(inputners[i+j][2])-index <= proximity:
                    if inputners[i+j][1] == 'B-LOC' or inputners[i+j][1] == 'LOC' or inputners[i+j][1] == 'GPE':
                        curloc.append(inputners[i+j][0])
                else:
                    break
                    
            # look back
            for j in range(1, len(inputners)):
                if inputners[i-j-1][2] >= inputners[i-j][2] and index-int(inputners[i-j][2]) <= proximity:
                    if inputners[i-j][1] == 'B-LOC'or inputners[i-j][1] == 'LOC' or inputners[i-j][1] == 'GPE':
                        curloc.append(inputners[i-j][0])
                else:
                    break
            locslist.append([index, curloc])   
    return locslist

def get_coordinates(locations):
    totallonglats = []
    for entry in locations:
        longlats = []
        for location in entry[1]:
            found = locinformation[locinformation['Name'] == location]
            if len(found) > 0:
                long = found['Longitude'].item()
                lat = found['Latitude'].item()
                if long is not None and lat is not None and long != 'None' and lat != 'None':
                    longlats.append([location, long, lat])
        if len(longlats) > 0:
            totallonglats.append(longlats)
    return totallonglats

def get_innercluster_loc(longlats):
    finalentities = []
    for entity in longlats:
        smallesttotal = float('inf')
        bestindex = 0
        for i in range(len(entity)):
            totaldist = 0
            for j in range(1, len(entity)-i):
                totaldist = totaldist + (float(entity[i][1]) - float(entity[j][1]))**2 + (float(entity[i][2]) - float(entity[j][2]))**2

            if totaldist < smallesttotal:
                smallesttotal = totaldist
                bestindex = i
        finalentities.append(entity[i][0])
    return finalentities

### Deploy Methods

In [57]:
# read locations
locinformation = read_locs()

# read ners flair and select only people and locs
nerflair = read_fromFLAIR()
peoplelocsflair = select_people_and_locs_only(nerflair)

# read ners bert and select only people and locs
nerbert = read_fromBERT()
peoplelocsbert = select_people_and_locs_only(nerbert)

In [64]:
locationsflair = proximity_measure(peoplelocsflair[10000:12000], 10000, 'Claire')
longlatsflair = get_coordinates(locationsflair)
finallocsflair = get_innercluster_loc(longlatsflair)

locationsbert = proximity_measure(peoplelocsbert[:2000], 30, 'Claire')
longlatsbert = get_coordinates(locationsbert)
finallocsbert = get_innercluster_loc(longlatsbert)

In [66]:
finallocsbert

['Highlands',
 'Highlands',
 'Highlands',
 'Cumberland',
 'Culloden',
 'Culloden',
 'Culloden',
 'Culloden',
 'Culloden',
 'Culloden',
 'Culloden',
 'Culloden',
 'Culloden',
 'Culloden',
 'London',
 'London',
 'Culloden',
 'Culloden',
 'Culloden',
 'Culloden',
 'London',
 'Fort William',
 'Fort William',
 'London',
 'Highlands',
 'London',
 'London',
 'London',
 'London',
 'London',
 'London',
 'London',
 'London',
 'Cumberland',
 'London',
 'Culloden',
 'Culloden',
 'London',
 'London',
 'London',
 'London',
 'London',
 'London',
 'London',
 'London',
 'London',
 'Culloden',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Scotland',
 'Loch Ness',
 'Loch Ness',
 'Loch Ness',
 'Loch Ness',
 'Culloden',
 'Culloden',
 'Culloden