In [None]:
!pip install "spacy[transformers]"
!python -m spacy download en_core_web_trf

In [None]:
import os
import random
import copy
import numpy as np
from recap_am.adu import run_task, classify, utilities
from recap_am.controller.preprocess import read_files, prep_production
from recap_am.model.config import Config
import recap_am.adu.utilities as utils
from recap_am.adu.feature_select import filter_feats, add_embeddings
import recap_am.relation.controller.pairwise_comparison as pc
from sklearn.utils import shuffle
import argparse
import multiprocessing

In [3]:
def single_run(input_files, label_files, test_files, test_labels, adu_mode="classified"):
    """Train the major-claim (MC) classifier once and print its score on the test files.

    The training documents are loaded with ``read_files`` and passed to
    ``run_task.run_mc_train``; the test documents are loaded the same way.

    Args:
        input_files: Training text files, forwarded to ``read_files``.
        label_files: Label files matching ``input_files``.
        test_files: Test text files, forwarded to ``read_files``.
        test_labels: Label files matching ``test_files``.
        adu_mode: ``"true"`` evaluates with ``run_task.run_test_mc`` on the
            labels that come with the test document. ``"classified"`` first
            re-labels the test document with ``classify.predict`` and only
            scores the entries that were labelled ``1``.

    Raises:
        ValueError: If ``adu_mode`` is neither ``"true"`` nor ``"classified"``.
    """
    # Fail before the (expensive) training step instead of silently skipping
    # the evaluation, which is what an unrecognised mode used to do.
    if adu_mode not in ("true", "classified"):
        raise ValueError(
            "adu_mode must be 'true' or 'classified', got %r" % (adu_mode,)
        )

    doc = read_files(input_files, label_files)
    print("Read Train Files")
    cl_result = run_task.run_mc_train(doc)
    print("Finished Training")
    t_doc = read_files(test_files, test_labels)
    print("Read Test Files")

    if adu_mode == "true":
        acc, prec, rec, f1 = run_task.run_test_mc(t_doc, cl_result)
        print("Precision:\tCLPR\t%8.8f" % prec)
    else:
        # Snapshot the labels that came with the test document so that the
        # re-labelling below cannot alias them if it happens to work in place.
        orig_adus_labels = list(t_doc._.Labels)
        t_doc = filter_feats(t_doc, load=False)
        print("Filtered feats")
        t_doc = add_embeddings(t_doc)
        print("Added Embdes")
        t_doc = classify.predict(t_doc)

        # Keep only the feature rows whose (re-)assigned label is 1.
        mc_feats = [
            t_doc._.Features[idx]
            for idx, l in enumerate(t_doc._.Labels)
            if l == 1
        ]
        t_doc._.MC_Features = mc_feats
        label = t_doc._.MC_Labels

        # Do not hand an empty array to the classifier.
        if mc_feats:
            predictions = cl_result.predict(np.asarray(mc_feats)).tolist()
        else:
            predictions = []

        # NOTE(review): cl_iter advances once per original label == 1, while
        # `predictions` holds one entry per *re-assigned* label == 1. The two
        # sequences presumably drift apart whenever classify.predict disagrees
        # with the original labels -- confirm the intended alignment.
        limit = min(len(predictions), len(label))
        cl_iter = 0
        correct_count = 0
        for idx, gold in enumerate(orig_adus_labels):
            if cl_iter >= limit:
                break
            if gold == 1:
                if gold == t_doc._.Labels[idx]:
                    if label[cl_iter] == predictions[cl_iter]:
                        correct_count += 1
                cl_iter += 1

        # The score is the share of correct hits over *all* original labels;
        # report 0.0 instead of crashing on an empty test document.
        total = len(orig_adus_labels)
        prec = correct_count / total if total else 0.0
        print("Precision:\tCLPR\t%8.8f" % prec)

# Not adapted for the MC task: this still calls the CLPR train/test routines.
def LOOCV(input_files, label_files, test_files, test_labels, n_runs=5, adu_mode="true"):
    """Repeat train/evaluate ``n_runs`` times and print the averaged scores.

    Every run shuffles the training files, trains with
    ``run_task.run_clpr_train`` and evaluates on the last entry of
    ``test_files`` / ``test_labels``.

    Args:
        input_files: Training text files; reshuffled on every run.
        label_files: Label files matching ``input_files``.
        test_files: Candidate test files; only the last entry is used.
        test_labels: Label files matching ``test_files``.
        n_runs: Number of repetitions to average over (must be >= 1).
        adu_mode: ``"true"`` scores with ``run_task.run_clpr_test`` and
            averages accuracy, precision, recall and F1. ``"classified"``
            re-labels the test document with ``classify.predict`` first and
            averages an accuracy value only.

    Raises:
        ValueError: If ``adu_mode`` is unknown or ``n_runs`` is below 1.
    """
    if adu_mode not in ("true", "classified"):
        raise ValueError(
            "adu_mode must be 'true' or 'classified', got %r" % (adu_mode,)
        )
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1, got %r" % (n_runs,))

    avg_cl_acc = 0.0
    avg_cl_prec = 0.0
    avg_cl_rec = 0.0
    avg_cl_f1 = 0.0

    # Pick the evaluation document once, outside the loop. Re-assigning
    # `test_files = test_files[-1]` on every run turned the list into a string
    # on run 0 and into that string's last character from run 1 onwards.
    # NOTE(review): a single path (not a list) is handed to read_files, exactly
    # as the first run always did -- confirm read_files accepts that.
    held_out_file = test_files[-1]
    held_out_label = test_labels[-1]

    for i in range(n_runs):
        print("Run", i)
        input_files, label_files = shuffle(input_files, label_files)
        doc = read_files(input_files, label_files)
        print("Read Train Files")
        cl_result = run_task.run_clpr_train(doc)
        print("Finished Training")
        t_doc = read_files(held_out_file, held_out_label)
        print("Read Test Files")

        if adu_mode == "true":
            acc, prec, rec, f1 = run_task.run_clpr_test(t_doc, cl_result)
            avg_cl_acc += acc
            avg_cl_prec += prec
            avg_cl_rec += rec
            avg_cl_f1 += f1
        else:
            # Snapshot the labels that came with the test document so that the
            # re-labelling below cannot alias them if it works in place.
            orig_adus_labels = list(t_doc._.Labels)
            orig_adus = t_doc._.ADU_Sents
            t_doc = filter_feats(t_doc, load=False)
            print("Filtered feats")
            t_doc = add_embeddings(t_doc)
            print("Added Embdes")
            t_doc = classify.predict(t_doc)

            # Keep only the feature rows whose (re-)assigned label is 1.
            clpr_feats = [
                t_doc._.Features[idx]
                for idx, l in enumerate(t_doc._.Labels)
                if l == 1
            ]
            t_doc._.CLPR_Features = clpr_feats
            label = t_doc._.CLPR_Labels

            # Do not hand an empty array to the classifier.
            if clpr_feats:
                predictions = cl_result.predict(np.asarray(clpr_feats)).tolist()
            else:
                predictions = []

            # Same bounds guard as single_run: stop once either sequence is
            # exhausted instead of raising IndexError.
            # NOTE(review): cl_iter counts original labels == 1 while
            # `predictions` has one entry per re-assigned label == 1; the
            # alignment presumably drifts when the two disagree -- confirm.
            limit = min(len(predictions), len(label))
            cl_iter = 0
            correct_count = 0
            for idx, gold in enumerate(orig_adus_labels):
                if cl_iter >= limit:
                    break
                if gold == 1:
                    if gold == t_doc._.Labels[idx]:
                        if label[cl_iter] == predictions[cl_iter]:
                            correct_count += 1
                    cl_iter += 1

            n_adus = len(orig_adus)
            acc = correct_count / n_adus if n_adus else 0.0
            avg_cl_acc += acc

    avg_cl_acc /= n_runs
    print("Avg Accuracy:\tCLPR\t%8.8f" % avg_cl_acc)
    if adu_mode == "true":
        avg_cl_prec /= n_runs
        avg_cl_rec /= n_runs
        avg_cl_f1 /= n_runs
        print("Avg Precision:\tCLPR\t%8.8f" % avg_cl_prec)
        print("Avg Recall:\tCLPR\t%8.8f" % avg_cl_rec)
        print("Avg F1-Score:\tCLPR\t%8.8f" % avg_cl_f1)

def read(in_files, l_files, input_file):
    """Recursively collect ``.text`` files and the paths of their ``.label`` twins.

    Args:
        in_files: List that receives every path ending in ``.text``.
        l_files: List that receives, for each collected path, the same path
            with the trailing ``.text`` swapped for ``.label``.
        input_file: A file or directory path. Directories are walked
            recursively; files with any other extension are ignored.

    Returns:
        The same two lists ``(in_files, l_files)``, extended in place.
    """
    text_ext = ".text"
    label_ext = ".label"
    if os.path.isdir(input_file):
        # os.listdir() returns entries in arbitrary order; sort them so the
        # collected lists are identical from run to run and across machines.
        for entry in sorted(os.listdir(input_file)):
            # Keep the "/" separator so produced paths look the same everywhere.
            in_files, l_files = read(in_files, l_files, input_file + "/" + entry)
    elif input_file.endswith(text_ext):
        in_files.append(input_file)
        # Swap only the trailing extension: str.replace() would also rewrite
        # any ".text" that appears earlier in the path (e.g. a directory name).
        l_files.append(input_file[: -len(text_ext)] + label_ext)
    return in_files, l_files



In [4]:
if __name__ == "__main__":
    # Needed when a multiprocessing program is frozen into a Windows
    # executable; the call has no effect otherwise.
    multiprocessing.freeze_support()

    # Run options. Both have defaults, so no argument has to be supplied.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--mode",
        type=str,
        default="single",
        choices=["LOOCV", "single"],
        help="Single or LOOCV run",
    )
    parser.add_argument(
        "-a", "--adu",
        type=str,
        default="true",
        choices=["classified", "true"],
        help="Classified or True ADUs",
    )
    # parse_known_args() collects unrecognised arguments in `unknown`
    # instead of aborting on them.
    args, unknown = parser.parse_known_args()
    mode, adu = args.mode, args.adu

    # Accumulators for the train (in_files / l_files) and test
    # (t_files / t_labels) file paths; they are filled by a later cell.
    in_files, l_files = [], []
    t_files, t_labels = [], []

    # The corpus location depends on the language set in the project config.
    config = Config.get_instance()
    datapath = "data/ADU/in/PE/" + config["nlp"]["language"]
    # Point these two at other directories to experiment on a smaller subset.
    train_path = datapath + "/train"
    test_path = datapath + "/test"

In [5]:


# Start from empty lists so that re-running this cell does not append the
# same files a second time.
in_files, l_files = [], []
t_files, t_labels = [], []

# Collect every ".text" file (and the path of its ".label" twin) below the
# train and test directories. The entries are sorted because os.listdir()
# returns them in arbitrary order, which would make runs non-reproducible.
for in_file in sorted(os.listdir(train_path)):
    in_files, l_files = read(in_files, l_files, train_path + "/" + in_file)

for in_file in sorted(os.listdir(test_path)):
    t_files, t_labels = read(t_files, t_labels, test_path + "/" + in_file)






In [6]:
# Force a single train/test run; this overrides whatever value `mode` was
# given earlier in the notebook.
mode = "single"

# Dispatch to the selected evaluation routine; any other value does nothing.
if mode == "single":
    single_run(in_files, l_files, t_files, t_labels, adu_mode=adu)
elif mode == "LOOCV":
    LOOCV(in_files, l_files, t_files, t_labels, adu_mode=adu)

Reading Document	data/ADU/in/PE/en/train/essay101.text
Reading Document	data/ADU/in/PE/en/train/essay112.text
Reading Document	data/ADU/in/PE/en/train/essay117.text
Reading Document	data/ADU/in/PE/en/train/essay126.text
Reading Document	data/ADU/in/PE/en/train/essay141.text
Reading Document	data/ADU/in/PE/en/train/essay148.text
Reading Document	data/ADU/in/PE/en/train/essay150.text
Reading Document	data/ADU/in/PE/en/train/essay152.text
Reading Document	data/ADU/in/PE/en/train/essay174.text
Reading Document	data/ADU/in/PE/en/train/essay177.text
Reading Document	data/ADU/in/PE/en/train/essay186.text
Reading Document	data/ADU/in/PE/en/train/essay192.text
Reading Document	data/ADU/in/PE/en/train/essay196.text
Reading Document	data/ADU/in/PE/en/train/essay197.text
Reading Document	data/ADU/in/PE/en/train/essay198.text
Reading Document	data/ADU/in/PE/en/train/essay201.text
Reading Document	data/ADU/in/PE/en/train/essay211.text
Reading Document	data/ADU/in/PE/en/train/essay218.text
Reading Do

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
