# Named Entity Recognition on PILOT files using classic SpaCy pipeline

MiMoText pilot files are: 

* Senac_Emigre
* Maistre_Voyage
* Sade_Aline
* Sade_Justine
* Bernardin_Paul
* Laclos_Liaisons
* Retif_Paysanne
* Retif_Paysan
* Mercier_An
* Retif_AntiJustine
* Rousseau_Julie
* Voltaire_Candide

For full list of metadata and MiMoText IDs see https://docs.google.com/spreadsheets/d/10HrWlxkAuOiMxgyDa4K8cA7syvbFJGAW2kgbonyyDvQ/edit#gid=0 

The pretrained statistical model for French is a multi-task CNN trained on UD French Sequoia and WikiNER. It assigns context-specific token vectors, POS tags, dependency parses and named entities.

When you call `nlp` on a text, spaCy first tokenizes the text to produce a `Doc` object. The `Doc` is then processed in several different steps – this is also referred to as the processing pipeline. The pipeline used by the default models consists of a tagger, a parser and an entity recognizer. Each pipeline component returns the processed `Doc`, which is then passed on to the next component.

In [4]:
import spacy
import re
import glob
import nltk
import sklearn
from spacy import pipeline
from spacy import morphology
from spacy import displacy 
from collections import Counter
import fr_core_news_lg
import requests 
# Import the class explicitly. The previous bare attribute access
# `sklearn.feature_extraction.text.CountVectorizer` bound no name and may raise
# AttributeError when the submodule has not been imported yet.
from sklearn.feature_extraction.text import CountVectorizer

# Load the large pretrained French spaCy model; `nlp` is the pipeline object
# that is applied to the novel texts.
nlp = fr_core_news_lg.load()


In [5]:
# Ten most common LOC entities in Voltaire_Candide.
response = requests.get('https://raw.githubusercontent.com/MiMoText/roman-dixhuit/master/plain/files/Voltaire_Candide.txt')
# requests does not raise on HTTP error codes by itself; fail here instead of
# silently running NER on an error page.
response.raise_for_status()
voltaire_candide = nlp(response.text)
# This list holds LOC (not PER) entities, so it is named accordingly.
listOfLOC_voltaire_candide = [ent for ent in voltaire_candide.ents if ent.label_ == 'LOC']
# Legacy alias: the list used to be (mis)named listOfPER_...; kept so that any
# code still referring to the old name keeps working.
listOfPER_voltaire_candide = listOfLOC_voltaire_candide
Counter([ent.text.strip() for ent in listOfLOC_voltaire_candide]).most_common(10)

[('Venise', 27),
 ('Cacambo', 24),
 ('Bulgares', 21),
 ('Buenos', 13),
 ('Constantinople', 12),
 ('Paris', 12),
 ('Portugal', 9),
 ('Europe', 9),
 ('Westphalie', 8),
 ('Surinam', 8)]

In [6]:
# Ten most common LOC entities in Senac_Emigre.
response = requests.get('https://raw.githubusercontent.com/MiMoText/roman-dixhuit/master/plain/files/Senac_Emigre.txt')
# requests does not raise on HTTP error codes by itself; fail here instead of
# silently running NER on an error page.
response.raise_for_status()
senac_emigre = nlp(response.text)
# A single pass over the entities replaces the former nested comprehension.
Counter(ent.text.strip() for ent in senac_emigre.ents if ent.label_ == 'LOC').most_common(10)

[('Français', 53),
 ('Paris', 37),
 ('France', 34),
 ('Francfort', 33),
 ('la France', 26),
 ('Mayence', 24),
 ('de France', 17),
 ('Chevalier', 13),
 ('Française', 13),
 ('Lœwenstein', 13)]

In [8]:
# Ten most common LOC entities in Maistre_Voyage.
response = requests.get('https://raw.githubusercontent.com/MiMoText/roman-dixhuit/master/plain/files/Maistre_Voyage.txt')
# requests does not raise on HTTP error codes by itself; fail here instead of
# silently running NER on an error page.
response.raise_for_status()
maistre_voyage = nlp(response.text)
listOfLOC_maistre_voyage = [ent for ent in maistre_voyage.ents if ent.label_ == 'LOC']
Counter([ent.text.strip() for ent in listOfLOC_maistre_voyage]).most_common(10)

[('»--Il', 3),
 ('Alpes', 3),
 ('Turin', 3),
 ('Rome', 2),
 ('Champs', 2),
 ("d'Italie", 2),
 ('Paris', 2),
 ('Paris;--aucun', 1),
 ('présente!', 1),
 ("fleurs;--c'", 1)]

In [9]:
# Ten most common LOC entities in Laclos_Liaisons.
response = requests.get('https://raw.githubusercontent.com/MiMoText/roman-dixhuit/master/plain/files/Laclos_Liaisons.txt')
# requests does not raise on HTTP error codes by itself; fail here instead of
# silently running NER on an error page.
response.raise_for_status()
laclos_liaisons = nlp(response.text)
listOfLOC_laclos_liaisons = [ent for ent in laclos_liaisons.ents if ent.label_ == 'LOC']
Counter([ent.text.strip() for ent in listOfLOC_laclos_liaisons]).most_common(10)

[('Paris', 105),
 ('Valmont', 36),
 ('Opéra', 14),
 ('petite Volanges', 12),
 ('Belleroche', 11),
 ('Gercourt', 8),
 ('Suisse', 6),
 ('Volanges', 5),
 ('Providence', 5),
 ('Est', 5)]

In [10]:
# Raise the pipeline's `max_length` limit so that the longer novels can be processed.
nlp.max_length = 1_700_000

In [11]:
# Ten most common LOC entities in Rousseau_Julie.
response = requests.get('https://raw.githubusercontent.com/MiMoText/roman-dixhuit/master/plain/files/Rousseau_Julie.txt')
# requests does not raise on HTTP error codes by itself; fail here instead of
# silently running NER on an error page.
response.raise_for_status()
rousseau_julie = nlp(response.text)
listOfLOC_rousseau_julie = [ent for ent in rousseau_julie.ents if ent.label_ == 'LOC']
Counter([ent.text.strip() for ent in listOfLOC_rousseau_julie]).most_common(10)

[('Wolmar', 71),
 ('Paris', 65),
 ('Clarens', 30),
 ('Saint-Preux', 26),
 ('Orbe', 20),
 ('Londres', 20),
 ('Rome', 18),
 ('Angleterre', 15),
 ('Opéra', 15),
 ('Geneve', 14)]

In [13]:
# Ten most common LOC entities in Retif_Paysanne.
response = requests.get('https://raw.githubusercontent.com/MiMoText/roman-dixhuit/master/plain/files/Retif_Paysanne.txt')
# requests does not raise on HTTP error codes by itself; fail here instead of
# silently running NER on an error page.
response.raise_for_status()
retif_paysanne = nlp(response.text)
listOfLOC_retif_paysanne = [ent for ent in retif_paysanne.ents if ent.label_ == 'LOC']
Counter([ent.text.strip() for ent in listOfLOC_retif_paysanne]).most_common(10)

[('Paris', 71),
 ('Lagouache', 38),
 ('Italien', 21),
 ('Zéphire', 14),
 ('S', 12),
 ('Opéra', 12),
 ('Trémoussée', 12),
 ('M', 10),
 ('Est', 8),
 ('P.-S.--Je', 8)]

In [None]:
# Check: why are there unusual LOC entities in retif_paysanne? displaCy renders
# the whole text with its named entities highlighted
# (grey = PER, orange = LOC, blue = ORG).
displacy.render(retif_paysanne, jupyter=True, style='ent')

In [14]:
# Ten most common LOC entities in Retif_AntiJustine.
response = requests.get('https://raw.githubusercontent.com/MiMoText/roman18/master/plain/files/Retif_AntiJustine.txt')
# requests does not raise on HTTP error codes by itself; fail here instead of
# silently running NER on an error page.
response.raise_for_status()
retif_antijustine = nlp(response.text)
listOfLOC_retif_antijustine = [ent for ent in retif_antijustine.ents if ent.label_ == 'LOC']
Counter([ent.text.strip() for ent in listOfLOC_retif_antijustine]).most_common(10)

[('Conquette', 68),
 ('Minonne', 46),
 ('Traitdamour', 43),
 ('Rosemauve', 32),
 ('Guaé', 29),
 ('Tendrelys', 21),
 ('Connette', 20),
 ('Brideconin', 15),
 ('Montencon', 9),
 ('Brisemotte', 9)]

In [15]:
# Ten most common LOC entities in Sade_Justine.
response = requests.get('https://raw.githubusercontent.com/MiMoText/roman-dixhuit/master/plain/files/Sade_Justine.txt')
# requests does not raise on HTTP error codes by itself; fail here instead of
# silently running NER on an error page.
response.raise_for_status()
sade_justine = nlp(response.text)
listOfLOC_sade_justine = [ent for ent in sade_justine.ents if ent.label_ == 'LOC']
Counter([ent.text.strip() for ent in listOfLOC_sade_justine]).most_common(10)


[('Cardoville', 34),
 ('Sévérino', 31),
 ('Paris', 26),
 ('Lyon', 26),
 ('Grenoble', 14),
 ('France', 10),
 ('Corville', 9),
 ('Gernande', 8),
 ('Dauphiné', 7),
 ('Saint-Florent', 6)]

In [16]:
# Ten most common LOC entities in Sade_Aline.
response = requests.get('https://raw.githubusercontent.com/MiMoText/roman-dixhuit/master/plain/files/Sade_Aline.txt')
# requests does not raise on HTTP error codes by itself; fail here instead of
# silently running NER on an error page.
response.raise_for_status()
sade_aline = nlp(response.text)
listOfLOC_sade_aline = [ent for ent in sade_aline.ents if ent.label_ == 'LOC']
Counter([ent.text.strip() for ent in listOfLOC_sade_aline]).most_common(10)

[('Valcour', 144),
 ('Paris', 96),
 ('Déterville', 64),
 ('Blamont', 58),
 ('État', 52),
 ('Sainville', 51),
 ('Europe', 51),
 ('Portugais', 49),
 ('Vertfeuille', 36),
 ('Madrid', 34)]

In [17]:
# Ten most common LOC entities in Bernardin_Paul. The variable names keep the
# existing `bernadin_paul` spelling so that other cells referring to them still work.
response = requests.get('https://raw.githubusercontent.com/MiMoText/roman18/master/plain/files/Bernardin_Paul.txt')
# requests does not raise on HTTP error codes by itself; fail here instead of
# silently running NER on an error page.
response.raise_for_status()
bernadin_paul = nlp(response.text)
listOfLOC_bernadin_paul = [ent for ent in bernadin_paul.ents if ent.label_ == 'LOC']
Counter([ent.text.strip() for ent in listOfLOC_bernadin_paul]).most_common(10)

[('Virginie', 218),
 ('Domingue', 34),
 ("l'Europe", 21),
 ('France', 21),
 ('Europe', 14),
 ('Indes', 13),
 ('Port-Louis', 13),
 ('Pamplemousses', 12),
 ('Paris', 10),
 ('Providence', 9)]

In [18]:
# Ten most common LOC entities in Mercier_An.
response = requests.get('https://raw.githubusercontent.com/MiMoText/roman18/master/plain/files/Mercier_An.txt')
# requests does not raise on HTTP error codes by itself; fail here instead of
# silently running NER on an error page.
response.raise_for_status()
mercier_an = nlp(response.text)
listOfLOC_mercier_an = [ent for ent in mercier_an.ents if ent.label_ == 'LOC']
Counter([ent.text.strip() for ent in listOfLOC_mercier_an]).most_common(10)


[('État', 26),
 ('Paris', 14),
 ('Anglois', 7),
 ('Angleterre', 6),
 ("l'Europe", 6),
 ('États', 6),
 ('la Chine', 5),
 ('Rome', 5),
 ('Romains', 4),
 ('Être Suprême', 4)]

# PER entities

Printing out a sorted list of the ten most common PER entities within the French novels (MiMoText pilot corpus)

In [26]:
# Ten most common PER entities in voltaire_candide (surrounding whitespace stripped).
Counter(
    ent.text.strip()
    for ent in voltaire_candide.ents
    if ent.label_ == 'PER'
).most_common(10)

[('Candide', 245),
 ('Cunégonde', 110),
 ('Martin', 108),
 ('Pangloss', 86),
 ('Cacambo', 53),
 ('Paquette', 16),
 ('Eldorado', 13),
 ('Pococurante', 11),
 ('Giroflée', 8),
 ('M.', 7)]

In [27]:
# Ten most common PER entities in senac_emigre (surrounding whitespace stripped).
Counter(
    ent.text.strip()
    for ent in senac_emigre.ents
    if ent.label_ == 'PER'
).most_common(10)

[('Marquis', 331),
 ('Adieu', 125),
 ('Commandeur', 85),
 ('Marquis de St. Alban', 67),
 ('Madame', 61),
 ('Mademoiselle', 60),
 ('Melle Émilie de Wergentheim', 52),
 ('Roi', 52),
 ('Bertrand', 50),
 ('Président', 48)]

In [28]:
# Ten most common PER entities in maistre_voyage (surrounding whitespace stripped).
Counter(
    ent.text.strip()
    for ent in maistre_voyage.ents
    if ent.label_ == 'PER'
).most_common(10)

[('Joannetti', 19),
 ('Rosine', 11),
 ('Aspasie', 8),
 ('Raphaël', 7),
 ('Hippocrate', 6),
 ('Platon', 5),
 ('madame de Hautcastel', 5),
 ('Périclès', 4),
 ('Est-il', 2),
 ('Mille', 2)]

In [29]:
# Ten most common PER entities in laclos_liaisons (surrounding whitespace stripped).
Counter(
    ent.text.strip()
    for ent in laclos_liaisons.ents
    if ent.label_ == 'PER'
).most_common(10)

[('Adieu', 128),
 ('Monsieur', 87),
 ('Madame', 86),
 ('Vicomte de Valmont', 86),
 ('M. de Valmont', 64),
 ('Marquise de Merteuil', 63),
 ('Prévan', 58),
 ('Cécile', 52),
 ('Vicomte', 50),
 ('Cécile Volanges', 40)]

In [30]:
# Ten most common PER entities in rousseau_julie (surrounding whitespace stripped).
Counter(
    ent.text.strip()
    for ent in rousseau_julie.ents
    if ent.label_ == 'PER'
).most_common(10)

[('Julie', 291),
 ('Ciel', 93),
 ('Milord Edouard', 75),
 ('M. de Wolmar', 70),
 ('Claire', 67),
 ('Milord', 61),
 ('hui’hui', 52),
 ('Adieu', 37),
 ('Fanchon', 37),
 ('Madame', 29)]

In [21]:
# Ten most common PER entities in retif_paysanne (surrounding whitespace stripped).
Counter(
    ent.text.strip()
    for ent in retif_paysanne.ents
    if ent.label_ == 'PER'
).most_common(10)

[('Edmond', 592),
 ('Mme Parangon', 221),
 ('Laure', 133),
 ('Gaudet', 94),
 ('Fanchon', 76),
 ('M. Gaudet', 74),
 ('Mme Canon', 67),
 ('Fanchette', 58),
 ('Mlle Fanchette', 56),
 ('Edmée', 53)]

In [36]:
# Ten most common PER entities in retif_antijustine (surrounding whitespace stripped).
Counter(
    ent.text.strip()
    for ent in retif_antijustine.ents
    if ent.label_ == 'PER'
).most_common(10)

[('Vitnègre', 93),
 ('Traitdamour', 83),
 ('Conquette', 53),
 ('Timori', 41),
 ('Guaé', 41),
 ('Fysitère', 38),
 ('Rosemauve', 36),
 ('Connette', 30),
 ('Conquette Ingénue', 23),
 ('Non', 14)]

In [33]:
# Ten most common PER entities in sade_justine (surrounding whitespace stripped).
Counter(
    ent.text.strip()
    for ent in sade_justine.ents
    if ent.label_ == 'PER'
).most_common(10)

[('Thérèse', 229),
 ('Rodin', 78),
 ('Roland', 74),
 ('Dubois', 68),
 ('Saint-Florent', 47),
 ('Clément', 38),
 ('Rosalie', 33),
 ('Omphale', 31),
 ('Mme de Lorsange', 26),
 ('Sévérino', 26)]

In [59]:
# Ten most common PER entities in sade_aline (surrounding whitespace stripped).
Counter(
    ent.text.strip()
    for ent in sade_aline.ents
    if ent.label_ == 'PER'
).most_common(10)

[('Aline', 299),
 ('Léonore', 165),
 ('Sophie', 164),
 ('Clémentine', 156),
 ('madame de Blamont', 144),
 ('Valcour', 104),
 ('Zamé', 93),
 ('Sainville', 72),
 ('Madame de Blamont', 71),
 ('Isabeau', 45)]

In [30]:
# Ten most common PER entities in bernadin_paul (surrounding whitespace stripped).
Counter(
    ent.text.strip()
    for ent in bernadin_paul.ents
    if ent.label_ == 'PER'
).most_common(10)

[('Paul', 177),
 ('madame de la Tour', 56),
 ('Marguerite', 37),
 ('Domingue', 36),
 ('Madame de la Tour', 25),
 ('Marie', 21),
 ('M. de la Bourdonnais', 15),
 ('Fidèle', 8),
 ('Homère', 7),
 ('M. Roger', 6)]

In [39]:
# Ten most common PER entities in mercier_an (surrounding whitespace stripped).
Counter(
    ent.text.strip()
    for ent in mercier_an.ents
    if ent.label_ == 'PER'
).most_common(10)

[('François', 13),
 ('Monsieur', 8),
 ('Roi', 8),
 ('Messieurs', 8),
 ('Oui', 8),
 ('Voltaire', 7),
 ('Louis XV', 6),
 ('Corneille', 6),
 ('Henri IV', 5),
 ('Louis XIV', 5)]

In [35]:
# Computing Similarity with word vectors (SpaCy)

In [60]:
# Print the similarity of voltaire_candide to four other parsed novels, as
# returned by `similarity`. The label strings are kept exactly as they were,
# so the printed lines are unchanged.
comparisons = (
    ('voltaire_candide et laclos_liaisons ', laclos_liaisons),
    ('voltaire_candide et senac_emigre', senac_emigre),
    ('voltaire_candide et sade aline', sade_aline),
    ('voltaire_candide et maistre_voyage', maistre_voyage),
)
for label, other_doc in comparisons:
    print(label, voltaire_candide.similarity(other_doc))

voltaire_candide et laclos_liaisons  0.9450388522973917
voltaire_candide et senac_emigre 0.9870599846699816
voltaire_candide et sade aline 0.9669153988417551
voltaire_candide et maistre_voyage 0.9868767207249752
