In [6]:
import spacy
from spacy.kb import KnowledgeBase
from spacy.util import minibatch, compounding
import os
import csv
from pathlib import Path
from collections import Counter
import random
import pickle


output_dir = Path.cwd() / "T-AIA-901-MPL_7"

def load_entities():
    output_dir = Path.cwd()
    entities_loc = output_dir / "entities.csv"

    names = dict()
    descriptions = dict()
    with entities_loc.open("r", encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        for row in csvreader:
            qid = row[0]
            name = row[1]
            desc = row[2]
            names[qid] = name
            descriptions[qid] = desc
    return names, descriptions

def create_kb():
    nlp =spacy.load("fr_core_news_sm")
    text = "Je veux aller de Paris à Londres."
    doc = nlp(text)
    name_dict, desc_dict = load_entities()

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=96)

    for qid, desc in desc_dict.items():
        desc_doc = nlp(desc)
        desc_enc = desc_doc.vector
        kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)
    
    for qid, name in name_dict.items():
        kb.add_alias(alias=name, entities=[qid], probabilities=[1.0])
    
    qids = name_dict.keys()
    probs = [0.5 for qid in qids]
    kb.add_alias(alias="Paris", entities=qids, probabilities=probs)

    kb.dump(output_dir / "T-AIA-901-MPL_7/spacy/entity-linking" / "my_kb")
    nlp.to_disk(output_dir / "T-AIA-901-MPL_7/spacy/entity-linking" / "my_nlp")


def train_el():
    nlp = spacy.load("fr_core_news_sm")
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
    kb.load_bulk(output_dir / "my_kb")

    dataset = []
    csv_loc = output_dir / "asset/SpeechDestination.csv"
    with csv_loc.open("r", encoding="utf8") as csvfile:
        next(csvfile)
        i = 0
        for line in csvfile:
            list = line.split(",")
            list[3] = list[3][:-1]
            if list[3] == "false":
                text = list[0]
                QIDDepart = "DEPART"
                QIDDestination = "DESTINATION"
                offsetDepart = (text.find(list[1]), text.find(list[1]) + len(list[1]))
                offsetDestination = (text.find(list[2]), text.find(list[2]) + len(list[2]))
                links_dict_depart = {QIDDepart: 1.0}
                links_dict_destination = {QIDDestination: 1.0}
                dataset.append((text, {"links": {offsetDepart: links_dict_depart, offsetDestination: links_dict_destination}}))
                i += 1
                if i == 100:
                    break
    
    gold_ids = []
    for text, annot in dataset:
        for span, links_dict in annot["links"].items():
            for link, value in links_dict.items():
                if value:
                    gold_ids.append(link)
    
    train_dataset = []
    test_dataset = []
    random.shuffle(dataset)

    for i, (text, annot) in enumerate(dataset):
        if i < len(dataset) * 0.8:
            train_dataset.extend(dataset[i])
        else:
            test_dataset.extend(dataset[i])

    TRAIN_DOCS = []
    for text, annot in train_dataset:
        doc = nlp(text)
        TRAIN_DOCS.append(doc)
    
    entity_linker = nlp.create_pipe("entity_linker", config={"incl_prior": False})
    entity_linker.set_kb(kb)
    nlp.add_pipe(entity_linker, last=True)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(500):
            random.shuffle(TRAIN_DOCS)
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            if itn % 50 == 0:
                print(itn, "Losses", losses)
    print(itn, "Losses", losses)

    nlp.to_disk(output_dir / "T-AIA-901-MPL_7/spacy/entity-linking" / "my_nlp_el")

    with open(output_dir / "T-AIA-901-MPL_7/spacy/entity-linking" / "test_set.plk" , "wb") as f:
        pickle.dump(test_dataset, f)


if __name__ == "__main__":
    create_kb()
    # train_el()

    # nlp = spacy.load(output_dir / "T-AIA-901-MPL_7/spacy/entity-linking" / "my_nlp_el")
    # text = 'Je vais de Paris à Marseille.'
    # doc = nlp(text)
    # for ent in doc.ents:
    #     print(ent.text, ent.label_, ent.kb_id_)


AttributeError: 'spacy.kb.KnowledgeBase' object has no attribute 'dump'