In [2]:
import spacy
import pickle
import random
from spacy.training import Example

In [3]:
# Load the pickled training data.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open("CV-Parsing-using-Spacy-3-master/data/training/train_data.pkl", "rb") as train_file:
    train_data = pickle.load(train_file)

# Create a blank English spaCy pipeline
nlp = spacy.blank('en')

def clean_annotations(text, annotations):
    """Normalise entity spans so they do not carry whitespace or overlap.

    Each ``(start, end, label)`` span is clamped to the bounds of ``text``,
    stripped of leading and trailing whitespace, and dropped when it ends up
    empty or overlaps a span that was already accepted. Spans are processed
    in order of their original start offset, so the earliest-starting span
    wins when two of them overlap.

    Args:
        text: The string that the entity offsets index into.
        annotations: Mapping with an ``"entities"`` key holding an iterable of
            ``(start, end, label)`` triples.

    Returns:
        A new dict ``{"entities": [(start, end, label), ...]}``. The input
        mapping is left untouched.
    """
    entities = []
    text_length = len(text)

    for start, end, label in sorted(annotations["entities"], key=lambda x: x[0]):
        # Clamp to the text so out-of-range offsets cannot raise IndexError
        # (or silently wrap around through negative indexing).
        start = max(0, start)
        end = min(end, text_length)

        # Trim whitespace from both edges of the span.
        while start < end and text[start].isspace():
            start += 1
        while end > start and text[end - 1].isspace():
            end -= 1

        # A span that was only whitespace (or reversed/out of range) is now
        # empty; a zero-length entity is meaningless, so skip it.
        if start >= end:
            continue

        # Keep the span only if it does not intersect an accepted one.
        overlap = any(s < end and start < e for s, e, _ in entities)
        if not overlap:
            entities.append((start, end, label))

    return {"entities": entities}

def train_model(train_data, n_iter=10, drop=0.2, batch_size=8):
    """Train the NER component of the module-level ``nlp`` pipeline.

    The annotations are cleaned once with ``clean_annotations``, their labels
    are registered on the NER component, and the pipeline is then updated for
    ``n_iter`` epochs in shuffled minibatches. Records that fail to clean or
    to convert into an ``Example`` are reported and skipped; a failing batch
    update is reported and training continues with the next batch.

    Args:
        train_data: Iterable of ``(text, annotations)`` pairs where
            ``annotations`` has an ``"entities"`` key. It is not modified.
        n_iter: Number of passes over the training data.
        drop: Dropout rate passed to ``nlp.update``.
        batch_size: Number of examples per ``nlp.update`` call.

    Returns:
        None. The module-level ``nlp`` object is updated in place.
    """
    # Fetch the NER component, creating it only if it is missing, so that
    # calling this function a second time does not leave `ner` unbound.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")

    # Clean every record once and register its entity labels on the NER.
    # Building a fresh list also means shuffling below never reorders the
    # caller's `train_data`.
    prepared = []
    for text, annotations in train_data:
        try:
            cleaned_annotations = clean_annotations(text, annotations)
        except Exception as e:
            print(f"Erreur pour le texte: {text[:50]}... -> {e}")
            continue
        prepared.append((text, cleaned_annotations))
        for ent in cleaned_annotations["entities"]:
            ner.add_label(ent[2])

    # Disable every other pipe while training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.select_pipes(disable=other_pipes):
        # initialize() returns the optimizer; pass it explicitly to update().
        optimizer = nlp.initialize()

        for itn in range(n_iter):
            print(f"Starting iteration {itn + 1}")
            random.shuffle(prepared)
            losses = {}
            examples = []

            for text, cleaned_annotations in prepared:
                try:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, cleaned_annotations)
                    examples.append(example)
                except Exception as e:
                    print(f"Erreur pour le texte: {text[:50]}... -> {e}")

            # Update in minibatches: feeding the whole corpus as one batch
            # gives a single gradient step per epoch and needs far more memory.
            for batch in spacy.util.minibatch(examples, size=batch_size):
                try:
                    nlp.update(
                        batch,
                        drop=drop,
                        sgd=optimizer,
                        losses=losses
                    )
                except Exception as e:
                    print(f"Erreur lors de la mise à jour du modèle : {e}")

            print(f"Losses à l'itération {itn + 1}: {losses}")

# Train the model, then write the pipeline to disk and report completion.
model_dir = "mon_modele_ner"
train_model(train_data)
nlp.to_disk(model_dir)
print("Modèle entraîné et sauvegardé avec succès.")

Starting iteration 1




: 

In [15]:
# Load the spaCy pipeline stored at the path "mon_modele_ner".
nlp_model = spacy.load("mon_modele_ner")

In [16]:
# Display the first (text, annotations) record of the training data.
train_data[0]

("Ashish Indoriya Sr. Systems Engineer at Infosys Limited  Hyderabad, Telangana - Email me on Indeed: indeed.com/r/Ashish- Indoriya/84f99c99ebe940be  • Master of Computer Application (MCA) from Bhilai Institute of Technology, Durg, 2014. • Having 3.3 years of Experience on Software Development at Infosys limited. • Extensive working experience on Java, Spring, Hibernate and SQL • Knowledge of design patterns such as Singleton, Factory, Façade, Observer and MVC. • Knowledge of Front-end web development using JavaScript, JQuery, CSS &amp; HTML. • Having knowledge of Oracle SQL Database. • Reliable as a fully contributing, responsible and accountable member of task/ project teams with highly honed creative, logical and analytical approach. • Automated some of HRMS processes like Hiring, transfer, termination to help speed up the QA process. • Hands on knowledge of C, C++ including advanced concepts such as pointers and Dynamic Memory Management. • Learning Hadoop and Big data analysis usi

In [17]:
# Run the loaded pipeline on the text of the first training record.
sample_text = train_data[0][0]
doc = nlp_model(sample_text)


In [19]:
# List each entity the pipeline found in the document, with its label.
found_entities = doc.ents
for ent in found_entities:
    print(f"Texte : {ent.text}, Entité : {ent.label_}")

