In [1]:
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import sys

In [2]:
# Load the pickled tagging results into ``bert_tags``. The next cell shows them
# as a list of (entity_text, tag, integer) tuples; the integer is presumably the
# position of the sentence/passage the entity came from - TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code, so only open trusted files.
# NOTE(review): the path is absolute and machine-specific; it will not resolve elsewhere.
with open("/Volumes/Drive/GitHub/DaVinciCodeTheTrackOfRobertLangdon/models/roberta-results", "rb") as fp:   # Unpickling
# Alternative location of the same file on a Windows machine:
# with open("C:/Users/Admin/Documents/University/CurrentClass/IRTM/DaVinciCodeTheTrackOfRobertLangdon/models/roberta-results", "rb") as fp:
    bert_tags = pickle.load(fp)

In [3]:
bert_tags

[(' The Da Vinci Code', 'MISC', 1),
 (' Dan Brown', 'PER', 1),
 (' Louvre Museum', 'LOC', 1),
 (' Paris', 'LOC', 1),
 (' Jacques Sauniere', 'PER', 1),
 (' Grand Gallery', 'LOC', 1),
 (' Caravaggio', 'PER', 2),
 (' Sauniere', 'PER', 3),
 (' albino', 'MISC', 16),
 (' Sauniere', 'PER', 31),
 (' Sauniere', 'PER', 33),
 (' Sauniere', 'PER', 44),
 (' Sauniere', 'PER', 51),
 (' senechaux', 'PER', 51),
 (' Sauniere', 'PER', 64),
 (' Sauniere', 'PER', 65),
 (' Sauniere', 'PER', 66),
 (' Sauniere', 'PER', 70),
 (" Guerre d'Algerie", 'MISC', 76),
 ('.', 'MISC', 76),
 (' Jacques Sauniere', 'PER', 80),
 (' Jacques Sauniere', 'PER', 97),
 (' Grand Gallery', 'LOC', 103),
 (' Sauniere', 'PER', 104),
 (' Robert Langdon', 'PER', 108),
 (' Renaissance', 'MISC', 111),
 (' Louis XVI', 'MISC', 111),
 (' HOTEL RITZ PARIS', 'LOC', 113),
 (' Langdon', 'PER', 114),
 (' Langdon', 'PER', 116),
 (' Langdon', 'PER', 119),
 (' Langdon', 'PER', 124),
 (' AMERICAN UNIVERSITY OF PARIS', 'ORG', 127),
 (' ROBERT LANGDON'

In [4]:
def find_closest_loc(tag, index):
    """Return the text of the location entity whose index is nearest to ``index``.

    Parameters
    ----------
    tag : sequence of entries
        Tagged entities; each entry holds the entity text at position 0, its
        label at position 1 and its integer index at position 2. Only entries
        labelled "LOC" are considered.
    index : int
        Position the distance is measured from.

    Returns
    -------
    str
        Text of the closest "LOC" entry (the earliest one wins ties), or ''
        when ``tag`` contains no location at all.
    """
    curr_loc = ''
    dist = sys.maxsize
    # Search the sequence that was passed in rather than a notebook-level global,
    # so the function gives the right answer for any list of tags.
    for entry in tag:
        if entry[1] != "LOC":
            continue
        gap = abs(entry[2] - index)
        # Strict comparison: on equal distance the first location found is kept.
        if gap < dist:
            dist = gap
            curr_loc = entry[0]
    return curr_loc

In [5]:
# go through bert_tags and map each person to the location with the closest index
pers_loc_temp = []
for entity in tqdm(bert_tags):
    if entity[1] == "PER":
        loc = find_closest_loc(bert_tags, entity[2])
        pers_loc_temp.append([entity[0], loc])

# Collapse runs of identical consecutive [person, location] pairs into one entry.
# Each pair is compared with the last pair kept, so the final pair of the list
# is retained as well instead of being silently dropped.
pers_loc = []
for pair in pers_loc_temp:
    if not pers_loc or pers_loc[-1] != pair:
        pers_loc.append(pair)
pers_loc[:5]

100%|██████████| 9823/9823 [00:03<00:00, 2637.41it/s]


[[' Dan Brown', ' Louvre Museum'],
 [' Jacques Sauniere', ' Louvre Museum'],
 [' Caravaggio', ' Louvre Museum'],
 [' Sauniere', ' Louvre Museum'],
 [' senechaux', ' Louvre Museum']]

In [6]:
# Shared vectorizer instance; compute_similarity re-fits it on every call.
vectorizer = TfidfVectorizer()


def compute_similarity(a, b):
    """Score how alike two strings are from their TF-IDF weights.

    The module-level ``vectorizer`` is fitted on just the two inputs, the
    resulting matrix is multiplied by its own transpose, and the off-diagonal
    (0, 1) entry is returned. With scikit-learn's default row normalisation
    this should correspond to the cosine similarity - verify if the
    vectorizer is configured differently.
    """
    weights = vectorizer.fit_transform([a, b])
    pairwise = (weights * weights.T).toarray()
    return pairwise[0, 1]

from functools import lru_cache

def lev_dist(a, b):
    """Return the Levenshtein (edit) distance between ``a`` and ``b``.

    The distance is the minimum number of single-element insertions, deletions
    and replacements needed to turn one sequence into the other.

    The table is filled iteratively, one row at a time, so the function works
    for inputs of any length without hitting Python's recursion limit and
    needs no per-call memoisation cache.

    Parameters
    ----------
    a, b : str (or any sequences of comparable items)

    Returns
    -------
    int
        Number of edits; 0 when the inputs are equal.
    """
    # The distance is symmetric: keep the shorter input on the inner axis so
    # the rolling rows stay as small as possible.
    if len(a) < len(b):
        a, b = b, a

    # previous[j] = distance between the processed prefix of ``a`` and b[:j].
    previous = list(range(len(b) + 1))
    for i, item_a in enumerate(a, start=1):
        current = [i]  # turning a[:i] into the empty prefix takes i deletions
        for j, item_b in enumerate(b, start=1):
            if item_a == item_b:
                # no change required
                current.append(previous[j - 1])
            else:
                current.append(1 + min(
                    current[j - 1],   # insert character
                    previous[j],      # delete character
                    previous[j - 1],  # replace character
                ))
        previous = current
    return previous[-1]

In [7]:
# Canonical character names that detected person entities are matched against.
characters = [
    "Jacques Saunière",
    "Manuel Aringarosa",
    "Sister Sandrine Bieil",
    "Marie Chauvel",
    "Jerome Collet",
    "Simon Edwards",
    "Bezu Fache",
    "Jonas Faukman",
    "Pamela Gettum",
    "Claude Grouard",
    "Robert Langdon",
    "Rémy Legaludec",
    "Sophie Neveu",
]

In [8]:
compute_similarity("Langdon", "robert langdon")

0.5797386715376657

In [9]:
lev_dist("robert langdon", "robert langdon")

0

Using Levenshtein distance

In [10]:
unique_chars = set()
threshold = 5  # a name matches when fewer than this many edits separate it from a character
printed = set()  # raw names already reported, so each pairing is printed only once
# Map every detected person onto the first known character within the edit-distance threshold
for entry in pers_loc:
    char1 = entry[0].lower()
    for name in characters:
        char2 = name.lower()
        distance = lev_dist(char1, char2)
        if distance < threshold:
            unique_chars.add(char2)
            # Overwrite the detected name in place with the canonical, lower-cased one.
            entry[0] = char2
            if char1 not in printed:
                # The value is an edit distance (lower is closer), so label it as such.
                print(char1, '---', char2, " with distance of {}".format(distance))
                printed.add(char1)
            break
unique_chars

 jacques sauniere --- jacques saunière  with similarity of 2.00
 robert langdon --- robert langdon  with similarity of 1.00
 jerome collet --- jerome collet  with similarity of 1.00
 bezu fache --- bezu fache  with similarity of 1.00
robert langdon --- robert langdon  with similarity of 0.00
 fache --- bezu fache  with similarity of 4.00
 manuel aringarosa --- manuel aringarosa  with similarity of 1.00
jacques sauniere --- jacques saunière  with similarity of 1.00
 sophie neveu --- sophie neveu  with similarity of 1.00
bezu fache --- bezu fache  with similarity of 0.00
sophie neveu --- sophie neveu  with similarity of 0.00
 claude grouard --- claude grouard  with similarity of 1.00
 jonas faukman --- jonas faukman  with similarity of 1.00
 remy legaludec --- rémy legaludec  with similarity of 2.00
 simon edwards --- simon edwards  with similarity of 1.00
pamela gettum --- pamela gettum  with similarity of 0.00
 pamela gettum --- pamela gettum  with similarity of 1.00
 marie chauvel ---

{'bezu fache',
 'claude grouard',
 'jacques saunière',
 'jerome collet',
 'jonas faukman',
 'manuel aringarosa',
 'marie chauvel',
 'pamela gettum',
 'robert langdon',
 'rémy legaludec',
 'simon edwards',
 'sophie neveu'}

Using TF-IDF similarity

In [11]:
unique_chars = set()
threshold = 0.35  # a name matches when its TF-IDF similarity to a character exceeds this
printed = set()  # raw names already reported, so each pairing is printed only once
# Map every detected person onto the first known character that is similar enough
for entry in pers_loc:
    char1 = entry[0].lower()
    for name in characters:
        char2 = name.lower()
        similarity = compute_similarity(char1, char2)
        if similarity > threshold:
            unique_chars.add(char2)
            # Overwrite the detected name in place with the canonical, lower-cased one.
            entry[0] = char2
            if char1 not in printed:
                print(char1, '---', char2, " with similarity of {:.2f}".format(similarity))
                printed.add(char1)
            break
unique_chars

jacques saunière --- jacques saunière  with similarity of 1.00
robert langdon --- robert langdon  with similarity of 1.00
 langdon --- robert langdon  with similarity of 0.58
 robert --- robert langdon  with similarity of 0.58
jerome collet --- jerome collet  with similarity of 1.00
bezu fache --- bezu fache  with similarity of 1.00
manuel aringarosa --- manuel aringarosa  with similarity of 1.00
 aringarosa --- manuel aringarosa  with similarity of 0.58
 collet --- jerome collet  with similarity of 0.58
 sandrine bieil --- sister sandrine bieil  with similarity of 0.71
 sandrine --- sister sandrine bieil  with similarity of 0.45
 neveu --- sophie neveu  with similarity of 0.58
sophie neveu --- sophie neveu  with similarity of 1.00
 sophie --- sophie neveu  with similarity of 0.58
fache --- bezu fache  with similarity of 0.58
sophie --- sophie neveu  with similarity of 0.58
langdon --- robert langdon  with similarity of 0.58
robert --- robert langdon  with similarity of 0.58
claude gro

{'bezu fache',
 'claude grouard',
 'jacques saunière',
 'jerome collet',
 'jonas faukman',
 'manuel aringarosa',
 'marie chauvel',
 'pamela gettum',
 'robert langdon',
 'rémy legaludec',
 'simon edwards',
 'sister sandrine bieil',
 'sophie neveu'}

In [12]:
# pers_loc_Tmp = pers_loc[:15]
# pers_loc_Tmp.append([' Sauniere', ' Grand Gallery'])
pers_loc

[[' Dan Brown', ' Louvre Museum'],
 ['jacques saunière', ' Louvre Museum'],
 [' Caravaggio', ' Louvre Museum'],
 [' Sauniere', ' Louvre Museum'],
 [' senechaux', ' Louvre Museum'],
 [' Sauniere', ' Grand Gallery'],
 ['jacques saunière', ' Grand Gallery'],
 [' Sauniere', ' Grand Gallery'],
 ['robert langdon', ' Grand Gallery'],
 ['robert langdon', ' HOTEL RITZ PARIS'],
 ['robert langdon', ' Chartres Cathedral'],
 ['robert langdon', ' Chartres Cathedral'],
 ['robert langdon', ' Chartres Cathedral'],
 ['robert langdon', ' Vatican'],
 ['robert langdon', ' Paris'],
 ['robert langdon', ' CITY OF LIGHTS'],
 ['robert langdon', ' CITY OF LIGHTS'],
 ['robert langdon', ' CITY OF LIGHTS'],
 ['robert langdon', ' Pavilion Dauphine'],
 ['robert langdon', ' Vatican'],
 ['Harrison Ford', ' Vatican'],
 [' Monique', ' Vatican'],
 ['robert langdon', ' Vatican'],
 ['robert langdon', ' Louvre'],
 ['jerome collet', ' Louvre'],
 ['robert langdon', ' Louvre'],
 ['jacques saunière', ' Louvre'],
 ['robert langdo

In [13]:
def duplicatedDeletion(pers_loc_Tmp):
    """Drop repeated [person, location] entries in place and return the list.

    An entry is removed when the same person's previous entry in the list
    (ignoring other people's entries in between) has the same location, so
    each person's trail only records changes of location.

    The list is filtered in a single pass that remembers each person's most
    recent location, instead of deleting from the middle of the list inside
    nested loops.

    Parameters
    ----------
    pers_loc_Tmp : list
        Entries whose item 0 is the person (must be hashable, e.g. a string)
        and whose item 1 is the location.

    Returns
    -------
    list
        The same list object that was passed in, now filtered.
    """
    last_loc = {}  # person -> location of that person's latest kept entry
    kept = []
    for entry in pers_loc_Tmp:
        person, loc = entry[0], entry[1]
        if person in last_loc and last_loc[person] == loc:
            continue  # same person, still at the same place: duplicate
        last_loc[person] = loc
        kept.append(entry)
    # Slice assignment keeps the caller's list object, preserving in-place semantics.
    pers_loc_Tmp[:] = kept
    return pers_loc_Tmp

duplicatedDeletion(pers_loc);

In [14]:
pers_loc[:10]

[[' Dan Brown', ' Louvre Museum'],
 ['jacques saunière', ' Louvre Museum'],
 [' Caravaggio', ' Louvre Museum'],
 [' Sauniere', ' Louvre Museum'],
 [' senechaux', ' Louvre Museum'],
 [' Sauniere', ' Grand Gallery'],
 ['jacques saunière', ' Grand Gallery'],
 ['robert langdon', ' Grand Gallery'],
 ['robert langdon', ' HOTEL RITZ PARIS'],
 ['robert langdon', ' Chartres Cathedral']]

In [15]:
# keep only the entries of pers_loc whose person (subindex 0) is robert langdon
loc_robert = list(filter(lambda entry: entry[0] == 'robert langdon', pers_loc))
loc_robert[:10]

[['robert langdon', ' Grand Gallery'],
 ['robert langdon', ' HOTEL RITZ PARIS'],
 ['robert langdon', ' Chartres Cathedral'],
 ['robert langdon', ' Vatican'],
 ['robert langdon', ' Paris'],
 ['robert langdon', ' CITY OF LIGHTS'],
 ['robert langdon', ' Pavilion Dauphine'],
 ['robert langdon', ' Vatican'],
 ['robert langdon', ' Louvre'],
 ['robert langdon', ' Vatican City']]

In [16]:
len(loc_robert)

561

In [19]:
# print every location in loc_robert on its own line, followed by a comma
for _person, place in loc_robert:
    print(place + ',')

 Grand Gallery,
 HOTEL RITZ PARIS,
 Chartres Cathedral,
 Vatican,
 Paris,
 CITY OF LIGHTS,
 Pavilion Dauphine,
 Vatican,
 Louvre,
 Vatican City,
 Rue La Bruyere,
 Opera House,
 Paris,
 Europe,
 Ritz,
 Eiffel Tower,
 Rome,
 Eiffel Tower,
 France,
 Tuileries,
 Tuileries Gardens,
 Seine,
 Ramses,
 Musee du Louvre,
 Louvre,
Louvre,
 Louvre,
 Paris,
 F,
 Louvre,
 Paris,
 Louvre,
 DENON,
 Paris,
 Denon Wing,
 Louvre,
 Denon Wing,
 France,
 Louvre,
 Grand Gallery,
 Louvre,
 Murray Hill Place,
 Grand Gallery,
 Washington Monuments,
 Vatican Secret Archives,
 Rome,
 Louvre,
 United States,
 Venus,
 Hollywood,
 Church,
 Vatican,
 United States,
 Venus,
 Langdon,
 Louvre,
 Saint-Sulpice,
 Grand Gallery,
 B.C.,
 Paris,
 Vatican,
 National Gallery,
 Paris,
Capitaine,
 Britain,
 U.S.,
 States,
 U.S.,
 Paris,
 Mount Vesuvius,
 Church of Saint-Sulpice,
 U.S.,
 Grand Gallery,
 Louvre,
 Paris,
 France,
 Paris,
 France,
 Place Saint-Sulpice,
 England,
 Louvre,
 Paris,
 Denon Wing,
 U.S.,
 Louvre,
 Grand 

In [17]:
loc_robert

[['robert langdon', ' Grand Gallery'],
 ['robert langdon', ' HOTEL RITZ PARIS'],
 ['robert langdon', ' Chartres Cathedral'],
 ['robert langdon', ' Vatican'],
 ['robert langdon', ' Paris'],
 ['robert langdon', ' CITY OF LIGHTS'],
 ['robert langdon', ' Pavilion Dauphine'],
 ['robert langdon', ' Vatican'],
 ['robert langdon', ' Louvre'],
 ['robert langdon', ' Vatican City'],
 ['robert langdon', ' Rue La Bruyere'],
 ['robert langdon', ' Opera House'],
 ['robert langdon', ' Paris'],
 ['robert langdon', ' Europe'],
 ['robert langdon', ' Ritz'],
 ['robert langdon', ' Eiffel Tower'],
 ['robert langdon', ' Rome'],
 ['robert langdon', ' Eiffel Tower'],
 ['robert langdon', ' France'],
 ['robert langdon', ' Tuileries'],
 ['robert langdon', ' Tuileries Gardens'],
 ['robert langdon', ' Seine'],
 ['robert langdon', ' Ramses'],
 ['robert langdon', ' Musee du Louvre'],
 ['robert langdon', ' Louvre'],
 ['robert langdon', 'Louvre'],
 ['robert langdon', ' Louvre'],
 ['robert langdon', ' Paris'],
 ['robert