In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df_label = pd.read_csv("Data/voies_paris_labelisé.csv", sep=';', dtype={'Dates2':str}, index_col=0)

In [4]:
df_label.head()

Unnamed: 0,DATE ARRET,CDGI,QUARTIER,TYPE DE VOIE,PREFIXE de VOIE,NOM de VOIE,Libellé VOIE,CVOIE,HISTORIQUE,Personne,...,profession,profession2,profession3,LARGEUR,LONG,DETAIL,ARRONDISSEMENT,STATUT PRESUME,DEBUT,FIN
0,1864-02-03T00:53:28+00:53,4166,Roquette.,rue,,gerbier,rue Gerbier,4112,"Pierre Jean Baptiste Gerbier (1725-1788), avoc...",Pierre Jean Baptiste Gerbier,...,avocat,,,13.0,174.0,Ouverture et alignements (non retenus au POS e...,XIe,voie publique,"rue de la Folie-Regnault, 15.","rue de la Roquette, 168 bis."
1,,4994,Saint-Ambroise.,cité,,joly,cité Joly,4886,"M. Joly, propriétaire.",M. Joly,...,propriétaire,,,12.0,135.0,"Classement, alignements (non retenus au POS et...",XIe,voie publique,"rue du Chemin Vert, 121.",en impasse
2,,5441,Saint-Ambroise.,rue,,léchevin,rue Léchevin,5376,"M. Léchevin, propriétaire.\nHistorique. ~ Avan...",M. Léchevin,...,propriétaire,,,15.0,73.0,"Classement, alignements (non retenus au POS et...",XIe,voie publique,"avenue Parmentier, 64.","passage Saint-Ambroise, 9 bis."
3,,8208,Folie-Méricourt. Saint-Ambroise. Roquette.,boulevard,,richard lenoir,boulevard Richard Lenoir,8197,"François Richard (1765-1839), manufacturier d'...",François Richard,...,manufacturier,,,60.0,1500.0,Alignements (non retenus au POS et non repris ...,XIe,voie publique,"boulevard Beaumarchais, 2 et place de la Basti...","avenue de la République, 22, rue Rampon et pla..."
4,1885-10-11T00:53:28+00:53,9057,Saint-Ambroise.,rue,,spinoza,rue Spinoza,8671,"Baruch Spinoza (1632-1677), philosophe holland...",Baruch Spinoza,...,philosophe,,,12.0,77.0,"Ouverture, alignements (non retenus au POS et ...",XIe,voie publique,"avenue de la République, 103.","boulevard de Ménilmontant, 81."


# 1. Preparing training dataset

In [5]:
training = df_label[:117]

In [6]:
labels_dictionary = pd.DataFrame({'Colonnes' : ['Personne', 'Dates', 'Dates2', 'profession', 'profession2'],
                                 'Labels' : ['PERSON', 'DATES', 'DATES', 'JOB', 'JOB']})

In [7]:
def find_position(row, entity):
    if not pd.isnull(training.loc[training.index==row, entity][row]):
        label = labels_dictionary[labels_dictionary['Colonnes']==entity].reset_index()['Labels'][0]
        for word in re.finditer(training.loc[(training.index==row), entity][row], training.loc[training.index==row, 'HISTORIQUE'][row]) :
            position = word.span()
            position_label = position + (label,)
            return position_label

In [8]:
TRAIN_DATA = []
for i in range(len(training)) : 
    if not pd.isnull(training.loc[training.index==i, 'HISTORIQUE'][i]):
        entities = []
        for entity in labels_dictionary['Colonnes'].tolist():
            position_label = find_position(i, entity)
            if position_label is not None:
                entities.append(position_label)
        entities_dict = {'entities' : entities}
        training_row = (training.loc[(training.index==i), 'HISTORIQUE'][i], entities_dict)
        TRAIN_DATA.append(training_row)

In [9]:
TRAIN_DATA

[('Pierre Jean Baptiste Gerbier (1725-1788), avocat ; quartier où ont été groupés des noms de juristes.',
  {'entities': [(0, 28, 'PERSON'), (30, 39, 'DATES'), (42, 48, 'JOB')]}),
 ('M. Joly, propriétaire.', {'entities': [(0, 7, 'PERSON'), (9, 21, 'JOB')]}),
 ("M. Léchevin, propriétaire.\nHistorique. ~ Avant 1942 : passage Léchevin. La partie de ce passage qui débouchait rue Saint-Ambroise a été absorbée par l'avenue Parmentier.",
  {'entities': [(0, 11, 'PERSON'), (13, 25, 'JOB')]}),
 ("François Richard (1765-1839), manufacturier d'étoffes. A la mort de son associé, Joseph Lenoir-Dufresne (1768-1806), il adopta partiellement son nom en souvenir de la firme qu'ils avaient menée en commun et s'appela désormais, François Richard-Lenoir ; quartier de fabriques.\nHistorique. ~ Cette voie qui recouvre le canal Saint-Martin a englobé une partie des quais de Valmy (précédemment quai Louis XVIII) et de Jemmapes (précédemment quai Charles X). La couverture du canal et la suppression des quais, 

# 2. NER Model

In [10]:
import random
import spacy
from spacy.util import minibatch, compounding
from pathlib import Path

output_dir="model"

In [11]:
def ner_model(model=None, output_dir=r'model', n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)
    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [12]:
ner_model()

Created blank 'en' model
Losses {'ner': 1529.2217318146159}
Losses {'ner': 422.0696238849141}
Losses {'ner': 416.62937896468316}
Losses {'ner': 418.87076676723376}
Losses {'ner': 379.17217754074187}
Losses {'ner': 273.3441686204864}
Losses {'ner': 251.63609408800846}
Losses {'ner': 256.56112315061796}
Losses {'ner': 203.17437320545648}
Losses {'ner': 159.76939570676882}
Losses {'ner': 136.42119682432448}
Losses {'ner': 144.27304638136312}
Losses {'ner': 192.22314246272904}
Losses {'ner': 138.93397383304384}
Losses {'ner': 122.65722837058857}
Losses {'ner': 83.36264144753292}
Losses {'ner': 77.75662960884253}
Losses {'ner': 88.95093345588664}
Losses {'ner': 74.15301663827577}
Losses {'ner': 69.10317460023607}
Losses {'ner': 73.21425860954972}
Losses {'ner': 78.68610469957453}
Losses {'ner': 52.26755070910405}
Losses {'ner': 64.23233272258119}
Losses {'ner': 71.99661411069759}
Losses {'ner': 106.45692872318655}
Losses {'ner': 42.34251663844662}
Losses {'ner': 90.54701220200161}
Losses {'

Entities [('Jean-Baptiste Colbert', 'PERSON'), ('1665-1746', 'DATES'), ('diplomate', 'JOB'), ("homme d'Etat", 'JOB')]
Tokens [('Jean', 'PERSON', 3), ('-', 'PERSON', 1), ('Baptiste', 'PERSON', 1), ('Colbert', 'PERSON', 1), (',', '', 2), ('marquis', '', 2), ('de', '', 2), ('Torcy', '', 2), ('(', '', 2), ('1665', 'DATES', 3), ('-', 'DATES', 1), ('1746', 'DATES', 1), (')', '', 2), (',', '', 2), ('neveu', '', 2), ('de', '', 2), ('Colbert', '', 2), (',', '', 2), ('diplomate', 'JOB', 3), ('et', '', 2), ('homme', 'JOB', 3), ("d'Etat", 'JOB', 1), ('français', '', 2), ('.', '', 2), ('\n', '', 2), ('Historique', '', 2), ('.', '', 2), ('~', '', 2), ('Précédemment', '', 2), ('rue', '', 2), ('du', '', 2), ('Bon', '', 2), ('Puits', '', 2), ('.', '', 2), ('Cette', '', 2), ('voie', '', 2), ('est', '', 2), ('tracée', '', 2), ('sur', '', 2), ('le', '', 2), ('plan', '', 2), ('de', '', 2), ('Roussel', '', 2), ('(', '', 2), ('1730', '', 2), (')', '', 2), ('.', '', 2), ('Plus', '', 2), ('anciennement', '', 2

Entities [('Etienne Louis Malus', 'PERSON'), ('1775-1812', 'DATES'), ('physicien', 'JOB')]
Tokens [('Etienne', 'PERSON', 3), ('Louis', 'PERSON', 1), ('Malus', 'PERSON', 1), ('(', '', 2), ('1775', 'DATES', 3), ('-', 'DATES', 1), ('1812', 'DATES', 1), (')', '', 2), (',', '', 2), ('physicien', 'JOB', 3), ('français', '', 2), (';', '', 2), ('quartier', '', 2), ('où', '', 2), ('ont', '', 2), ('été', '', 2), ('groupés', '', 2), ('des', '', 2), ('noms', '', 2), ('de', '', 2), ('savants', '', 2), ('.', '', 2), ('\n', '', 2), ('Historique', '', 2), ('.', '', 2), ('~', '', 2), ('Ouverte', '', 2), ('par', '', 2), ('la', '', 2), ('Ville', '', 2), ('de', '', 2), ('Paris', '', 2), ('.', '', 2)]
Entities [('Paul Delaroche', 'PERSON'), ('1797-1856', 'DATES'), ('peintre', 'JOB')]
Tokens [('Hippolyte', '', 2), (',', '', 2), ('dit', '', 2), ('Paul', 'PERSON', 3), ('Delaroche', 'PERSON', 1), ('(', '', 2), ('1797', 'DATES', 3), ('-', 'DATES', 1), ('1856', 'DATES', 1), (')', '', 2), (',', '', 2), ('peintre'

In [13]:
nlp = spacy.load(output_dir)
def apply_ner_model(text) :
    doc = nlp(text)
    dict_entities = {}
    #print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    
    for label in ['PERSON', 'DATES', 'JOB'] :
        label_results=[]
        for ent in doc.ents:
            if ent.label_== label:
                label_results.append(ent.text)
        if label_results!= []:
            dict_entities[label] = label_results
    return dict_entities

In [14]:
df_label['Results_ner_model'] = df_label.loc[~df_label['HISTORIQUE'].isna(), 'HISTORIQUE'].apply(lambda x: apply_ner_model(x))

# 3. Cleaning Results

## Variable "is person"

In [15]:
df_label['bool_personne'] = 0
df_label.loc[(df_label['Results_ner_model']!={}) & (~df_label['Results_ner_model'].isna()), 'bool_personne'] = 1

In [16]:
df_label['bool_personne'].value_counts()

1    3751
0    2724
Name: bool_personne, dtype: int64

## Cleaning jobs variables

In [17]:
def clean_ner_results(text, entity):
    output_preds = []
    output_pred = text.get(entity)
    output_preds.append(output_pred)
    return output_preds

In [18]:
entities = ['PERSON', 'DATES', 'JOB']
for entity in entities:
    df_label[entity+str('_pred')] = df_label.loc[~df_label['Results_ner_model'].isna(), 'Results_ner_model'].apply(lambda x: clean_ner_results(x, entity))
    df_label.loc[(df_label[entity+str('_pred')] is None) | (df_label[entity+str('_pred')].isna()), entity+str('_pred')]=[None]
    df_label.loc[~df_label[entity+str('_pred')].isna(), entity+str('_pred')] = df_label.loc[~df_label[entity+str('_pred')].isna(), entity+str('_pred')].apply(lambda x: x[0])

In [127]:
# Remove numbers if in jobs (error)
def remove_numbers(mylist,myregex = re.compile(r'\d')):
    return [s for s in mylist if not myregex.search(s)]
df_label['JOB_pred'] = df_label.loc[~df_label['JOB_pred'].isna(), 'JOB_pred'].apply(lambda x: remove_numbers(x))

In [142]:
df_label['JOB_pred'] = df_label.loc[~df_label['JOB_pred'].isna(), 'JOB_pred'].apply(lambda x: [i.lower() for i in x])

In [145]:
list_jobs = df_label.loc[~df_label['JOB_pred'].isna(), 'JOB_pred'].tolist()
list_jobs = set([item for sublist in list_jobs for item in sublist])
list_jobs

{'a',
 'abbesse',
 'abbé',
 'accordant',
 'accordeur',
 'acteur',
 'actrice',
 'adjudant',
 'administrateur',
 'affichiste',
 'agronome',
 'alchimiste',
 'alexandre chatrian',
 'amateur',
 'ambassadeur',
 'ami',
 'amie',
 'amiral',
 'analyste',
 'anatomiste',
 'ancien',
 'ancienne',
 'andré mesmin',
 'anglais',
 'animateur',
 'animatrice',
 'anthropologue',
 'antiquaire',
 'antoine charles horace',
 'antérieurement',
 'apothicaire',
 'apôtre',
 'aquafortiste',
 'aquarelliste',
 'archevêque',
 'architecte',
 'architectes',
 'archiviste',
 'archéologue',
 'argentier',
 'artiste',
 'artur london',
 'assistante',
 'astronome',
 'auguste charles',
 'aumônier',
 'auparavant',
 'auprès',
 'auteur',
 'auteure',
 'avant',
 'aviateur',
 'aviateurs',
 'aviatrice',
 'avocat',
 'avocate',
 'ayant',
 'aéronaute',
 'bactériologiste',
 'bactériologistes',
 'banquier',
 'baron',
 'baronne',
 'bibliophile',
 'bibliothécaire',
 'bienfaiteur',
 'biologiste',
 'botaniste',
 'bourgeois',
 'brasseur',
 'brûl

In [150]:
df_label['JOB1'] = df_label['JOB_pred'].apply(pd.Series)[0]

In [151]:
df_label['JOBS_list'] = df_label['JOB_pred']

In [153]:
df_label.to_csv('Data/df_clean.csv', sep=';')