In [1]:
import re
import os
from nltk import PunktSentenceTokenizer
from spacy.tokens import Doc, Span, Token
import multiprocessing
import itertools
import numpy as np
from spacy.language import Language

from recap_am.controller.extract_features import set_features
from recap_am.controller.nlp import parse
from recap_am.model.config import Config

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuration object obtained through Config.get_instance(); the cells below
# read their settings (e.g. config["debug"], config["adu"]["path"]) from it.
config = Config.get_instance()
# Language setting taken from the "nlp" section of the configuration.
lang = config["nlp"]["language"]

In [None]:

def clean_text(text):
    """Normalise raw document text before it is parsed.

    The steps run in this order: remove ``&nbsp;`` entities (together with one
    directly following alphanumeric character, if present), expand a fixed set
    of German abbreviations, delete apostrophes and semicolons, turn hyphens
    into spaces, delete every character outside the allowed set that is
    directly followed by a newline (the newline is deleted with it), delete
    ``...``, collapse runs of spaces and strip leading/trailing spaces.

    Args:
        text: The raw text.

    Returns:
        The cleaned text.
    """
    # Applied strictly in this order, one after the other.
    substitutions = (
        ("Art.", "Artikel"),
        ("Abs.", "Absatz"),
        ("u.a.", "unter anderem"),
        ("U.a.", "Unter anderem"),
        ("u.E.", "unseres Erachtens"),
        ("U.E.", "Unseres Erachtens"),
        ("vgl.", "vergleiche"),
        ("Vgl.", "Vergleiche"),
        ("bzw.", "beziehungsweise"),
        ("i.V.m.", "im Vergleich mit"),
        ("Buchst.", "Buchstabe"),
        ("d.h.", "das heißt"),
        ("'", ""),
        ("-", " "),
        (";", ""),
    )

    cleaned = re.sub(r"&nbsp;[a-zA-Z0-9]?", "", text)
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    # Only matches a disallowed character that sits immediately before "\n".
    cleaned = re.sub(r"[^a-zA-Z0-9.,?!äÄöÖüÜ:;&ß%$'\"()[\]{} -]\n", "", cleaned)
    cleaned = cleaned.replace("...", "")
    cleaned = re.sub(r" +", " ", cleaned)
    return cleaned.strip(" ")

def add_labels(doc, labels):
    """Attach ADU and claim/premise labels parsed from a list of strings to ``doc``.

    Each entry of ``labels`` is stripped of surrounding newlines and then of
    surrounding spaces. Recognised values are ``Claim``, ``Premise``,
    ``MajorClaim``, ``None``, ``ADU``, ``1`` and ``0``; any other value (for
    example a blank line) is skipped.

    The ADU label list is truncated or zero-padded to ``len(doc._.Features)``.
    The claim/premise list is truncated or zero-padded to the number of ADU
    labels equal to 1.

    Args:
        doc: Object exposing ``doc._.Features`` (a sized collection); its
            ``doc._.Labels`` and ``doc._.CLPR_Labels`` attributes are set.
        labels: Iterable of label strings.

    Returns:
        The same ``doc`` object, with both label attributes assigned.
    """
    # label -> (ADU flag, claim/premise flag); None means the label carries no
    # claim/premise information and contributes nothing to the CLPR list.
    label_codes = {
        "Claim": (1, 1),
        "Premise": (1, 0),
        "MajorClaim": (1, 1),
        "None": (0, None),
        "ADU": (1, None),
        "1": (1, None),
        "0": (0, None),
    }

    adu_labels_list = []
    clpr_label_list = []
    for raw_label in labels:
        codes = label_codes.get(raw_label.strip("\n").strip(" "))
        if codes is None:
            continue
        adu_flag, clpr_flag = codes
        adu_labels_list.append(adu_flag)
        if clpr_flag is not None:
            clpr_label_list.append(clpr_flag)

    # Missing entries are filled with 0. This is spelled out explicitly: the
    # former np.random.randint(low=0, high=1, ...) call has an exclusive upper
    # bound and therefore could only ever produce zeros as well.
    nr_features = len(doc._.Features)
    adu_labels_list = adu_labels_list[:nr_features]
    adu_labels_list += [0] * (nr_features - len(adu_labels_list))

    nr_adus = adu_labels_list.count(1)
    clpr_label_list = clpr_label_list[:nr_adus]
    clpr_label_list += [0] * (nr_adus - len(clpr_label_list))

    doc._.Labels = adu_labels_list
    doc._.CLPR_Labels = clpr_label_list
    return doc

def prep_training(filename, input_text, labels_list):
    """Turn one text and its labels into a labelled, feature-annotated document.

    Args:
        filename: Value stored on the document as ``doc._.key``.
        input_text: Text handed to ``parse``.
        labels_list: Label strings forwarded to ``add_labels``.

    Returns:
        The document returned by ``add_labels`` after ``set_features`` ran on it.
    """
    parsed = parse(input_text)
    parsed._.key = filename
    return add_labels(set_features(parsed), labels_list)

In [None]:

def read_in(file_name1, file_name2, texts, label_list):
    """Read one text file and its label file and append both to the given lists.

    If ``file_name1`` is an existing file, both names are used as given.
    Otherwise ``file_name1`` is looked up below ``config["adu"]["path"]["input"]``
    and ``file_name2`` below ``config["adu"]["path"]["label"]``.

    Both files are decoded as UTF-8 with undecodable bytes dropped. The text is
    passed through ``clean_text``; the label file is split on newlines.

    Args:
        file_name1: Path or file name of the text file.
        file_name2: Path or file name of the label file.
        texts: List-like object; the cleaned text is appended to it.
        label_list: List-like object; the list of label lines is appended to it.

    Raises:
        OSError: If either file cannot be opened.
    """
    if os.path.isfile(file_name1):
        text_path, label_path = file_name1, file_name2
    else:
        text_path = os.path.join(config["adu"]["path"]["input"], file_name1)
        label_path = os.path.join(config["adu"]["path"]["label"], file_name2)

    # The files are only read, so open them read-only ("r+" would also demand
    # write permission and fail on read-only data sets).
    with open(text_path, "r", encoding="utf8", errors="ignore") as f:
        text = f.read()
    with open(label_path, "r", encoding="utf8", errors="ignore") as f:
        labels = f.read().split("\n")

    texts.append(clean_text(text))
    label_list.append(labels)

def merge_docs(doc_list):
    """Concatenate the per-document extension data into one placeholder document.

    The ``Features``, ``Labels``, ``CLPR_Labels`` and ``embeddings`` extension
    values of every document in ``doc_list`` are flattened, in list order, and
    stored on a new document created by parsing the text ``"FinalDocument"``.

    Args:
        doc_list: Documents whose ``doc._`` attributes listed above are iterable.

    Returns:
        The new document carrying the four combined lists.
    """
    all_features = [feat for doc in doc_list for feat in doc._.Features]
    all_labels = [lab for doc in doc_list for lab in doc._.Labels]
    all_clpr_labels = [lab for doc in doc_list for lab in doc._.CLPR_Labels]
    all_embeddings = [emb for doc in doc_list for emb in doc._.embeddings]

    merged = parse("FinalDocument")
    merged._.Features = all_features
    merged._.Labels = all_labels
    merged._.CLPR_Labels = all_clpr_labels
    merged._.embeddings = all_embeddings
    print("Merged Lists")
    return merged


In [7]:
# Initialise the shared lists locally (only when config["debug"] is set)
if config["debug"]:
    # Plain in-process lists for debug runs.
    texts, label_list = [], []
    # NOTE(review): "h" looks like a leftover debugging marker; confirm it is still wanted.
    print("h")

       

In [8]:

# Raw strings keep every backslash literal, so no part of the Windows path
# (e.g. the "\t" at the start of "\train") is interpreted as an escape sequence.
input_list = r"C:\Users\DELL\argument-graph-mining\data\ADU\in\PE\en\train\micro_c004.text"
label = r"C:\Users\DELL\argument-graph-mining\data\ADU\in\PE\en\train\micro_c004.label"
# Manager-backed list proxies that can be handed to other processes.
manager = multiprocessing.Manager()
texts, label_list = manager.list(), manager.list()


In [None]:
# Build `final_doc` either from a collection of (text file, label file) pairs
# or from a single pair given as two plain strings.
if isinstance(input_list, (list, tuple)):
    # Read the files one after another into fresh lists so that texts[i] and
    # label_list[i] always belong to input_list[i]. Appending from concurrently
    # running worker processes gives no ordering guarantee, and re-running the
    # cell must not accumulate duplicates.
    texts = []
    label_list = []
    for idx, infile in enumerate(input_list):
        print("Reading Document\t%s" % infile)
        read_in(infile, label[idx], texts, label_list)

    doc_list = []
    parsed_docs = parse.pipe(
        texts,
        disable=["ner"],
        batch_size=80,
        n_process=multiprocessing.cpu_count(),
    )
    for idx, doc in enumerate(parsed_docs):
        print("Processing Document\t%i" % idx)
        doc._.key = input_list[idx]
        doc = set_features(doc)
        doc = add_labels(doc, label_list[idx])
        doc_list.append(doc)
    final_doc = merge_docs(doc_list)
else:
    # Single document: both files are only read, so open them read-only.
    with open(input_list, "r", encoding="utf8") as f:
        text = f.read()
    text = clean_text(text)
    with open(label, "r", encoding="utf8") as f:
        labels = f.read().split("\n")
    final_doc = prep_training(input_list, text, labels)


In [None]:
# Inputs for a manual check of read_in on a single text/label pair.
texts = []
label_list = []
# Raw strings keep every backslash literal, so "\train" is not read as TAB + "rain".
file1 = r"C:\Users\DELL\argument-graph-mining\data\ADU\in\PE\en\train\micro_c004.text"
file2 = r"C:\Users\DELL\argument-graph-mining\data\ADU\in\PE\en\train\micro_c004.label"
# Read the pair into the two lists and show what was collected.
read_in(file1, file2, texts, label_list)
print(texts, label_list)