### Libraries

In [None]:
import sys
sys.path.append(r"C:\Pro\Stages\A4 - DVRC\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\myLibraries")

### Optimized Generalized Scorer

In [None]:
from custom_score.utils import *
from custom_score.score import score 
from rouge_score import rouge_scorer
import bert_score
import pandas as pd
from scipy.stats import pearsonr
from colorama import Fore, Style
from datetime import datetime
import numpy as np

import sys
sys.path.append(get_git_root())

from BARTScore.bart_score import BARTScorer

class Refiner:
    """
    Extractive refiner: shortens every document of a corpus by keeping a subset of its sentences.

    Sentences are scored with `metric` (see scorer), picked with the project helper
    sentenceSelection, and the resulting extracts can be compared with gold summaries
    through assess().
    """

    def __init__(self, corpus, gold, model=None, metric=score, ratio=2, threshold=0.70, maxSpacing=10, printRange=range(0, 1)):
        """
        Constructor of the Refiner class. Aims at reducing the size and noise of a given independent list of documents.

        :param1 self (Refiner): Object to initialize.
        :param2 corpus (List): List of documents to simplify.
        :param3 gold (List): List of gold summaries to compare to the extractive summary created with the refiner.
        :param4 model (Any): Model used to compute scores and create sentence's ranking. When None, it is loaded lazily.
        :param5 metric (callable): Scoring function. scorer() supports custom_score.score.score and bert_score.score.
        :param6 ratio (float, int or array-like): Number determining how much the reference text will be shortened.
                                                  An array-like triggers the threshold-driven search over several ratios.
        :param7 threshold (float): Number between 0 and 1 indicating the lowest acceptable quality when tuning the length of the summary.
        :param8 maxSpacing (int): Maximal number of adjacent spaces to be found and suppressed in the corpus.
        :param9 printRange (range): Range of corpus that should be displayed when the Refiner object is printed.
        """
        self.corpus = corpus
        self.gold = gold
        self.processedCorpus = None
        self.model = model
        self.metric = metric
        self.ratio = ratio
        self.threshold = threshold
        self.ms = maxSpacing
        self.refined = None
        self.printRange = printRange
        self.selectedIndexes = None

    def refine(self, checkpoints=False, saveRate=50):
        """
        Builds an extractive summary of every document of the corpus.

        Nothing is returned: results are stored in self.refined (summaries),
        self.selectedIndexes (kept sentence indexes) and self.processedCorpus (split sentences).

        :param1 self (Refiner): Refiner Object (see __init__ function for more details).
        :param2 checkpoints (bool): Indicates whether the refining should save partial outputs along computation to prevent from losing data in the context of a crash.
        :param3 saveRate (int): Only applicable if checkpoints equals True. Specifies the number of consecutive iterations after which a checkpoint should be created.
        """
        self.refined = []
        self.selectedIndexes = []
        self.processedCorpus = []
        if checkpoints:
            iteration = 0
            start = datetime.now()
            createFolder = True

        for indiv in self.corpus:
            # NOTE: cleaning through cleanString(indiv, self.ms) is currently disabled,
            # documents are split as they are.
            respaced_sentences = self._split_sentences(indiv)
            self.processedCorpus.append(respaced_sentences)

            # Ranking: each sentence is scored against the document deprived of that sentence.
            formated_refs = [indiv.replace(sentence + ".", "") for sentence in respaced_sentences]
            scores = self.scorer(formated_refs, list(respaced_sentences))

            distances = self._pairwise_distances(respaced_sentences)

            # Selection of the best sentences.
            if isinstance(self.ratio, (int, float)):
                indices = sentenceSelection(respaced_sentences, scores, distances, self.ratio)
            else:
                indices = self._select_with_threshold(indiv, respaced_sentences, scores, distances)
            indices.sort()  # restore document order
            curRefined = ". \n".join(respaced_sentences[index] for index in indices) + "."
            self.selectedIndexes.append(indices)
            self.refined.append(curRefined)

            # Checkpoint verification
            if checkpoints:
                if iteration % saveRate == 0 and iteration != 0:
                    partial_runtime = datetime.now() - start
                    self.save(runtime=partial_runtime, new=createFolder)
                    createFolder = False  # later checkpoints reuse the same folder
                iteration += 1
        if checkpoints:
            runtime = datetime.now() - start
            self.save(runtime=runtime, new=createFolder)

    @staticmethod
    def _split_sentences(text):
        """Splits text on '.', drops what follows the last '.', skips empty pieces and strips one leading space."""
        pieces = text.split(".")
        pieces.pop()  # trailing fragment after the last period (often an empty string)
        return [piece[1:] if piece[0] == " " else piece for piece in pieces if piece]

    def _pairwise_distances(self, sentences):
        """Scores every sentence against all sentences of the document and parses the matrix with parseDistances."""
        count = len(sentences)
        distances = []
        for sentence in sentences:
            try:
                # The reference must be a LIST holding the sentence `count` times;
                # multiplying the string itself would only concatenate it.
                distance = self.scorer([sentence] * count, sentences)
            except Exception:
                # Best effort: a failing row is flagged with -1 instead of aborting the run.
                distance = [-1] * count
            distances.append(distance)
        return parseDistances(distances)

    def _select_with_threshold(self, document, sentences, scores, distances):
        """Tries the ratios in increasing order and returns the last selection scoring at least self.threshold."""
        best = None      # last selection that met the threshold, reset for every document
        current = None
        for curRatio in sorted(self.ratio):
            current = sentenceSelection(sentences, scores, distances, curRatio)
            summary = " ".join(sentences[i] for i in current)
            # NOTE(review): flagged as "potentially wrong" by the original author. The joined
            # summary carries no periods, so this replace rarely removes anything - confirm intent.
            reference = document.replace(summary + ".", "")
            # scorer works on lists and returns a list: wrap the pair and unwrap the single score.
            quality = self.scorer([reference], [summary])[0]
            if quality < self.threshold:
                return best if best is not None else current
            best = current
        if current is None:
            raise ValueError("ratio must be a number or a non-empty iterable of numbers")
        return current

    def scorer(self, refs, cands, param="F"):
        """
        Scores candidates against references with self.metric.

        :param1 self (Refiner): Refiner Object (see __init__ function for more details).
        :param2 refs (List): Reference texts.
        :param3 cands (List): Candidate texts, aligned with refs.
        :param4 param (string): Which score to return: "F", "R", "P" or "ALL" (case-insensitive).

        :output output (List or tuple): List of the requested scores, or the tuple (R, P, F) when param is "ALL".

        :raises ValueError: If param is unknown or if self.metric is not one of the supported metrics.
        """
        param = param.upper()
        if param not in ("F", "R", "P", "ALL"):
            raise ValueError(f"Unknown param '{param}', expected 'F', 'R', 'P' or 'ALL'")

        metric_module = getattr(self.metric, "__module__", None)
        if metric_module == "custom_score.score":
            if self.model is None:
                self.model = model_load("Word2Vec", True)
            scores = self.metric(self.model, cands, refs)[0]
            R, P, F = [], [], []
            for triple in scores:
                R.append(triple[0])
                P.append(triple[1])
                F.append(triple[2])
        elif metric_module == "bert_score.score":
            with nostd():
                scores = self.metric(cands, refs, lang="en", verbose=0)
            P = scores[0].tolist()
            R = scores[1].tolist()
            F = scores[2].tolist()
        else:
            raise ValueError(f"Unsupported metric module: {metric_module}")

        if param == "ALL":
            return (R, P, F)
        return {"F": F, "R": R, "P": P}[param]

    def assess(self, start=0, stop=None, verbose=True):
        """
        Assesses quality of the refined corpus by scoring the refined texts against the gold summaries
        with Static BERTScore, BERTScore, BARTScore and Rouge-Score.

        :param1 self (Refiner): Refiner Object (see __init__ function for more details).
        :param2 start (int): Starting index to assess.
        :param3 stop (int): Ending index to assess (excluded). Defaults to the end of the refined corpus.
        :param4 verbose (Boolean): When put to True, assess results will be printed.

        :output (dict): Dictionary with keys "scores" (DataFrame of per-document scores) and
                        "correlations" (DataFrame of Pearson scores and p-values between each metric and Rouge).
        """
        assert self.refined is not None, "refined corpus doesn't exists"

        if stop is None:
            stop = len(self.refined)
        subset_refined = self.refined[start:stop]
        subset_gold = self.gold[start:stop]

        # Static BERTScore computation (model loaded lazily, as in scorer)
        if self.model is None:
            self.model = model_load("Word2Vec", True)
        scoreOut = score(self.model, subset_refined, subset_gold)
        customScore = [parseScore(curScore) for curScore in scoreOut]

        # Rouge-Score computation
        rougeScorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        rougeScore = [rougeScorer.score(target, prediction) for target, prediction in zip(subset_gold, subset_refined)]

        # BERTScore computation
        with nostd():
            bertscore = bert_score.score(subset_refined, subset_gold, lang="en", verbose=0)

        # BARTScore computation
        # NOTE(review): the device is hard-coded to 'cuda:0', so a GPU is required here.
        bart_scorer = BARTScorer(device='cuda:0', checkpoint='facebook/bart-large-cnn')
        bartscore = bart_scorer.score(subset_refined, subset_gold, batch_size=4)

        # Data formatting
        custom_R = [round(t, 2) for t in customScore]
        bertscore_R = [round(t.item(), 2) for t in bertscore[1]]
        bartscore = [round(t, 2) for t in bartscore]
        # NOTE(review): index 0 of a rouge_score Score is presumably the precision, although the
        # variables are named *_R. Kept unchanged - confirm which value is wanted.
        rouge1_R = [round(t['rouge1'][0], 2) for t in rougeScore]
        rouge2_R = [round(t['rouge2'][0], 2) for t in rougeScore]
        rougeL_R = [round(t['rougeL'][0], 2) for t in rougeScore]

        dfCustom = pd.DataFrame({'CBERT' : custom_R,
                                 'BERTScore' : bertscore_R,
                                 'BARTScore' : bartscore,
                                 'R-1' : rouge1_R,
                                 'R-2' : rouge2_R,
                                 'R-L' : rougeL_R
                                })

        # Correlation estimation: each entry holds (Pearson score, p-value) rounded to 2 decimals
        def correlation(metric_values, rouge_values):
            return np.round(pearsonr(metric_values, rouge_values), 2)

        dfCor = pd.DataFrame({'pearson_CBERT_R-1' : correlation(custom_R, rouge1_R),
                              'pearson_CBERT_R-2' : correlation(custom_R, rouge2_R),
                              'pearson_CBERT_R-L' : correlation(custom_R, rougeL_R),
                              'pearson_BERT_R-1' : correlation(bertscore_R, rouge1_R),
                              'pearson_BERT_R-2' : correlation(bertscore_R, rouge2_R),
                              'pearson_BERT_R-l' : correlation(bertscore_R, rougeL_R),
                              'pearson_BART_R-1' : correlation(bartscore, rouge1_R),
                              'pearson_BART_R-2' : correlation(bartscore, rouge2_R),
                              'pearson_BART_R-l' : correlation(bartscore, rougeL_R)}, index=["Pearson score", "p-value"])
        if verbose:
            # DataFrame.to_string() already returns a str: there is nothing to decode in Python 3.
            printout = "Scores: \n"
            printout += dfCustom.to_string() + "\n\n"
            printout += "Correlations: \n"
            printout += dfCor.to_string()
            print(printout)

        return {"scores": dfCustom, "correlations": dfCor}

    def to_dataframe(self):
        """
        Transforms a Refiner object to a dataframe.

        :param1 self (Refiner): Refiner Object (see __init__ function for more details).

        :output output (DataFrame): DataFrame with columns "text" (corpus), "summary" (refined texts)
                                    and "processedText" (split sentences joined back with ". ").
        """
        output = pd.DataFrame({"text": self.corpus,
                               "summary": self.refined,
                               "processedText": [". ".join(c) for c in self.processedCorpus]})
        return output

    def save(self, runtime=None, new=True):
        """
        Assesses the refined corpus and saves the results to a local folder
        (scores.csv, correlations.csv and runtimes.txt).

        :param1 self (Refiner): Refiner Object (see __init__ function for more details).
        :param2 runtime (Any): Runtime to record, written with str() to runtimes.txt.
        :param3 new (bool): Indicates if a new folder should be created. If false, output is written to the most recent output folder.
        """
        # Local import: `os` is not imported explicitly at the top of this file.
        import os

        # Evaluation of everything refined so far (stop defaults to the end of the refined corpus)
        assessement = self.assess(start=0)
        scoreDf = assessement["scores"]
        corDf = assessement["correlations"]

        # Write output. Path components are joined separately so the path is also valid outside Windows.
        main_folder_path = os.path.join(get_git_root(), "myLibraries", "refining_output")
        countfile_path = os.path.join(main_folder_path, "count.txt")
        if new:
            count = updateFileCount(countfile_path)
        else:
            count = readFileCount(countfile_path)

        current_path = os.path.join(main_folder_path, f"experimentation_{count}")
        os.makedirs(current_path, exist_ok=True)

        scoreDf.to_csv(os.path.join(current_path, "scores.csv"))
        corDf.to_csv(os.path.join(current_path, "correlations.csv"))
        with open(os.path.join(current_path, "runtimes.txt"), "w") as f:
            f.write(str(runtime))

    def __str__(self) -> str:
        """
        Summarizes Refiner object to a string.

        Works before refine() has been called too: the refined size is then reported as "N/A"
        and no document is listed.

        :param1 self (Refiner): Refiner Object (see __init__ function for more details).

        :output printout (string): Summarized information about the refiner object.
        """

        def average_size(texts):
            # Average length, truncated then incremented by one; "N/A" when nothing is available.
            if not texts:
                return "N/A"
            return str(int(np.average([len(x) for x in texts])) + 1)

        processed = self.processedCorpus or []
        selected = self.selectedIndexes or []

        printout = "--------REFINER OBJECT--------\n\n"
        printout += "Number of Documents : " + str(len(self.corpus)) + "\n"
        printout += "Corpus Avg Size     : " + average_size(self.corpus) + "\n"
        printout += "Refined Avg Size    : " + average_size(self.refined) + "\n"
        printout += "Ratio(s)            : " + str(self.ratio) + "\n"
        printout += "Threshold           : " + str(self.threshold) + "\n"
        printout += "Maximum Spacing     : " + str(self.ms) + "\n"

        # Use the requested range when it fits the corpus (a stop equal to the corpus length is
        # valid), otherwise show everything. The attribute itself is left untouched.
        if self.printRange.start >= 0 and self.printRange.stop <= len(processed):
            shown = self.printRange
        else:
            shown = range(0, len(processed))

        for index in shown:
            kept = set(selected[index])
            # Kept sentences are printed in green, discarded ones in red.
            colored = [f"{Fore.LIGHTGREEN_EX}{sentence}{Style.RESET_ALL}"
                       if i in kept
                       else f"{Fore.RED}{sentence}{Style.RESET_ALL}"
                       for i, sentence in enumerate(processed[index])]
            printout += f"\nCorpus no.{index+1} : \n" + ".\n".join(colored) + "." + "\n"
        printout += "\n------------------------------"
        return printout

In [None]:
# Load the Word2Vec model through the custom_score.utils helper model_load
# (the meaning of the second argument is defined in that helper, not visible here).
w2v = model_load("Word2Vec", True)

In [None]:
# Toy example: two short documents and their gold summaries; bert_score.score replaces the default metric.
r = Refiner(["I am Marius. I like trains.", "Engineering is good. It is fun."], ["My name is Marius. I think trains are cool.", "I enjoy datascience. It is my studies."], w2v, metric=bert_score.score)

In [None]:
# Run the extraction; results are stored on r (refined, selectedIndexes, processedCorpus).
r.refine()

In [None]:
# assess() prints its report (verbose defaults to True); binding the returned dict to _
# keeps the notebook from echoing it as the cell result.
_=r.assess()

### Refiner .py Test

See file `refine_tests_2.py`.

In [1]:
import sys
sys.path.append(r"C:\Pro\Stages\A4 - DVRC\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\myLibraries")

In [2]:
from refine_tests_2 import Refiner
from custom_score.utils import model_load
from custom_score.score import score
import bert_score

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the Word2Vec model through model_load (imported from custom_score.utils).
w2v = model_load("Word2Vec", True)

In [4]:
# Same toy example, using the Refiner imported from refine_tests_2 and a list of ratios ([2, 3]).
r = Refiner(["I am Marius. I like trains.", "Engineering is good. It is fun."], ["My name is Marius. I think trains are cool.", "I enjoy datascience. It is my studies."], w2v, metric=bert_score.score, ratio=[2, 3])
r.refine()

In [None]:
# Keep the assessment result in `out`, then display the refiner through its string representation.
out = r.assess()
print(r)

### Generalized Refiner - Test on Billsum

In [None]:
import sys
sys.path.append(r"C:\Pro\Stages\A4 - DVRC\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\myLibraries")

from refine_tests_2 import Refiner
from custom_score.utils import model_load
from custom_score.score import score
import bert_score
import pandas as pd

In [None]:
# Turn the Google Drive share link into a direct-download URL (the file id is the
# second-to-last path segment), read the JSON-lines file and keep the two useful columns.
share_link = "https://drive.google.com/file/d/1Wd0M3qepNF6B4YwFYrpo7CaSERpudAG_/view?usp=share_link"
file_id = share_link.split('/')[-2]
dataset_url = 'https://drive.google.com/uc?export=download&id=' + file_id
dataset = pd.read_json(dataset_url, lines=True).loc[:, ["text", "summary"]]

In [None]:
# Keep only the first two rows (all columns) for a quick run.
subset = dataset.iloc[:2, :]

In [None]:
# Load the Word2Vec model through model_load (imported from custom_score.utils).
w2v = model_load("Word2Vec", True)

In [None]:
# Refine the subset: the "text" column is the corpus, the "summary" column the gold summaries.
r = Refiner(subset["text"].to_list(), subset["summary"].to_list(), w2v, metric=bert_score.score)
r.refine()

In [None]:
# Assess the refined subset; as the last expression of the cell, the return value is echoed by the notebook.
r.assess()

In [None]:
# Display the refiner through its string representation.
print(r)

### Package test

In [1]:
import sys
sys.path.append(r"C:\Pro\Stages\A4 - DVRC\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\myLibraries")
from custom_score.refine import Refiner
from custom_score.utils import model_load
from custom_score.score import score
import bert_score
from datasets_loaders.loaders import load_billsum

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset through the project loader load_billsum (imported from datasets_loaders.loaders).
billsum = load_billsum()

In [3]:
# Keep the two rows at positions 30 and 31 (all columns).
subset = billsum.iloc[30:32, :]

In [4]:
# Load the Word2Vec model through model_load (imported from custom_score.utils).
w2v = model_load("Word2Vec", True)

In [5]:
# Refiner here is the packaged class from custom_score.refine. The dist_metric keyword
# is not a parameter of the Refiner class defined earlier in this notebook.
r = Refiner(subset["text"].to_list(), subset["summary"].to_list(), w2v, metric=bert_score.score, dist_metric=score)
r.refine()

In [6]:
# Bind the result to _ so that only the printed report appears, not the returned value.
_=r.assess()

Scores: 
   CBERT-R  CBERT-P  CBERT-F  BERTScore-R  BERTScore-P  BERTScore-F  BARTScore   R-1   R-2   R-L
0     0.83     0.55     0.67         0.87         0.79         0.83       0.37  0.15  0.07  0.11
1     0.86     0.60     0.70         0.87         0.79         0.83       0.44  0.09  0.05  0.07

Correlations: 
               pearson_CBERT_R-1  pearson_CBERT_R-2  pearson_CBERT_R-L  pearson_BERT_R-1  pearson_BERT_R-2  pearson_BERT_R-l  pearson_BART_R-1  pearson_BART_R-2  pearson_BART_R-l
Pearson score               -1.0               -1.0               -1.0               NaN               NaN               NaN              -1.0              -1.0              -1.0
p-value                      1.0                1.0                1.0               NaN               NaN               NaN               1.0               1.0               1.0




In [7]:
# Display the refiner through its string representation.
print(r)

--------REFINER OBJECT--------

Number of Documents : 2
Corpus Avg Size     : 6339
Refined Avg Size    : 3442
Ratio(s)            : 2
Threshold           : 0.7
Maximum Spacing     : 10

------------------------------

Corpus no.1 : 83.0%
[95m(76%)[0m [92mSECTION 1[0m.
[95m(75%)[0m [92mSHORT TITLE[0m.
[95m(80%)[0m [92mThis Act may be referred to as the Sweetgrass Hills Protection Act of 1995[0m.
[95m(77%)[0m [92mSEC[0m.
[95m(76%)[0m [31m2[0m.
[95m(76%)[0m [92mSPECIAL MANAGEMENT AREA[0m.
[95m(77%)[0m [92m(a) In General[0m.
[95m(82%)[0m [31mFor the purpose of conserving, protecting, and enhancing the exceptional scenic, wildlife, water quality, and cultural characteristics of lands along the Sweetgrass Hills in north central Montana, there is hereby established the Sweetgrass Hills Natural Area within the Bureau of Land Management's Sweetgrass Hills Area of Critical Environmental Concern (ACEC) as identified in the West HiLine Resource Management Plan in the

In [9]:
# showDistributions belongs to the packaged custom_score.refine.Refiner (the class defined earlier
# in this notebook has no such method); presumably the list holds the corpus indexes to show.
r.showDistributions([0, 1])

Corpus n.0 : 83.0% 



Corpus n.1 : 83.0% 

