### Download packages

In [None]:
!pip install einops

### Import modules

In [1]:
import os
import csv
import json
import re
import torch
import time
import datetime
import evaluate
import re

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoConfig
)


  from .autonotebook import tqdm as notebook_tqdm


### Define Working conditions

In [2]:
# Dataset
name_dataset = "LanguageTestDataSet" # Folder under "datasets/"; its JSON files must provide the fields read below ("text_en", "text_fr", "summaries", "language")

# Models used
used_models_name = {"mosaicml/mpt-7b-instruct"} # Models run by the inference cell; each name must be registered in the model dictionaries

# Input limits
token_limit = 3500 # Prompts longer than this (in tokens) are skipped; choose it from the context length of the models used

# Description of the inference wanted
desc = "This inference batch is aimed at testing MPT7B on summarization. There is no French summarization at all."

# Characteristics of the inference wanted (sampling parameters passed to model.generate)
max_new_tokens=700
top_k=20
temperature=0.3

### Useful functions and constants

In [9]:
# Prompting elements
instruction_templates = {"text_en":"Summarize the following text:\n\n{text}", "text_fr":"Résume le texte suivant:\n\n{text}"}
text_keys = {'text_en'} # Keys of the input texts to summarize (French and/or English). 'text_fr' is removed for English-only models
key_to_language = {'text_fr':'0', 'text_en':'1'} # Text key -> language number
language_to_key = {'0':'text_fr', '1':'text_en'} # Language number -> text key
number_to_code = {'0':'fr', '1':'en', '2':'en'} # Language number (as returned by classify) -> language suffix; '2' (undetermined) falls back to English

# Miscellaneous useful functions and constants
def count_words(s):
    """Return the number of word-like sequences (runs of word characters) found in `s`."""
    return sum(1 for _ in re.finditer(r'\b\w+\b', s))

def mkdir(folder_path):
    """Create the directory `folder_path`; do nothing when that path already exists."""
    try:
        os.mkdir(folder_path)
    except FileExistsError:
        # The path is already there: keep it as is.
        return

def classify(text):
    """Basic deterministic language classifier.

    Counts occurrences of a few common English and French words (each
    surrounded by spaces) and compares the two totals.

    Returns:
        2 when at most 3 clues were found in total (language undetermined),
        0 when the French score is strictly higher than the English one,
        1 otherwise (English).
    """
    english_markers = (" and ", " of ", " the ", " in ", " is ", " for ", " how ", " with ")
    french_markers = (" le ", " la ", " de ", " un ", " une ", " et ", " à ", " avec ", " il ", " pour ")

    score_en = sum(len(find_all_occurrences_regex(text, marker)) for marker in english_markers)
    score_fr = sum(len(find_all_occurrences_regex(text, marker)) for marker in french_markers)

    if score_fr + score_en <= 3:
        return 2
    return 0 if score_fr > score_en else 1

def find_all_occurrences_regex(text, pattern):
    """Return the start index of every non-overlapping match of the regex `pattern` in `text`."""
    positions = []
    for found in re.finditer(pattern, text):
        positions.append(found.start())
    return positions

def maxRouge(summaries_data, language_code, generated_summary, golds):
    """Score a generated summary against every reference summary and keep the best ROUGE values.

    Args:
        summaries_data: sequence of reference-summary dicts; each must hold
            "text_<language_code>" and "nb_words_<language_code>".
        language_code: language suffix used to pick the fields above.
        generated_summary: the predicted summary.
        golds: list that receives every reference text, in order (filled in place;
            the caller uses it for BERTScore with multiple references).

    Returns:
        (best ROUGE-2, best ROUGE-L, word count of the reference with the best ROUGE-2).
        With no reference the result is (-1, -1, 0).

    Relies on the notebook-level `rouge` metric object.
    """
    text_field = "text_" + language_code
    best_rouge2, best_rougel = -1, -1
    closest_gold_nb_words = 0

    for reference in summaries_data:
        reference_text = reference[text_field]
        scores = rouge.compute(predictions=[generated_summary], references=[reference_text], use_aggregator=False)
        golds.append(reference_text)
        if scores['rouge2'][0] > best_rouge2:
            best_rouge2 = scores['rouge2'][0]
            # The "closest" reference is the one with the highest ROUGE-2.
            closest_gold_nb_words = reference['nb_words_' + language_code]
        if scores['rougeL'][0] > best_rougel:
            best_rougel = scores['rougeL'][0]

    return best_rouge2, best_rougel, closest_gold_nb_words

def load_json_into_dict(path):
    """Read the UTF-8 JSON file at `path` and return its parsed content.

    The file is closed even when parsing fails.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


In [4]:
# Each of these dictionaries is keyed by model name; the value is the model-specific prompt template, context length, or the function that loads the tokenizer, loads the model, runs the inference or cleans the output.

prompt_templates = {} # Prompt template of each model; the instruction (which contains the text to summarize) is inserted into it
context_lengths = {} # Context length of each model, as an integer
tokenizers = {} # Function returning the tokenizer of each model
models = {} # Function returning each model
treat_output = {} # Function that cleans the decoded output and keeps only the text generated by the model
infer = {} # Function running the inference of each model; takes the tokenized input and the model as parameters

### Define model specific functions

In [5]:
# Models available
models_name = {
    "legendhasit/xgen-7b-8k-inst-8bit/4bit", # 1 GPU 
    "legendhasit/xgen-7b-8k-inst-8bit", # 1 GPU
    "Salesforce/xgen-7b-8k-inst", # Preferably 2 GPUs (3 shards of 10 GB, the last one possibly smaller), but works with 1... except for long texts, such as the smallest ones of Fredsum + memory is probably not freed properly automatically -> OOM errors at 38 GB
    "mosaicml/mpt-7b-instruct", # 1 GPU (2 shards of 10 GB) - To quantize
    "Trelis/mpt-7b-instruct-hosted-inference-8bit", #  - Potentially unreliable
    "mosaicml/mpt-7b-8k-instruct", # To quantize
    "mosaicml/mpt-30b-instruct" # To quantize
}

##### XGen

In [6]:
# Prompt templates (the same chat-style template is registered for the three XGen variants)
prompt_template_XGen = "A chat between a curious human and an artificial intelligence assistant.\nThe assistant gives helpful, detailed, and polite answers to the human's questions.\n\n### Human: {instruction}\n\n### Assistant: "

prompt_templates["legendhasit/xgen-7b-8k-inst-8bit"] = prompt_template_XGen
prompt_templates["legendhasit/xgen-7b-8k-inst-8bit/4bit"] = prompt_template_XGen
prompt_templates["Salesforce/xgen-7b-8k-inst"] = prompt_template_XGen

# Context lengths: used to detect whether the model generated beyond its context window
context_length_XGen = 8000 

context_lengths["legendhasit/xgen-7b-8k-inst-8bit"] = context_length_XGen
context_lengths["legendhasit/xgen-7b-8k-inst-8bit/4bit"] = context_length_XGen
context_lengths["Salesforce/xgen-7b-8k-inst"] = context_length_XGen

# Tokenizers
def get_tokenizer_XGen8bit():
    """Load the tokenizer of the 8-bit XGen checkpoint (also registered for the 4-bit variant)."""
    checkpoint = 'legendhasit/xgen-7b-8k-inst-8bit'
    return AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

def get_tokenizer_XGen():
    """Load the tokenizer of the original Salesforce XGen checkpoint."""
    checkpoint = 'Salesforce/xgen-7b-8k-inst'
    return AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

# Register the loader of each XGen variant.
tokenizers.update({
    "legendhasit/xgen-7b-8k-inst-8bit": get_tokenizer_XGen8bit,
    "legendhasit/xgen-7b-8k-inst-8bit/4bit": get_tokenizer_XGen8bit,
    "Salesforce/xgen-7b-8k-inst": get_tokenizer_XGen,
})

# Models

def get_model_XGen8bit():
    """Load the 8-bit XGen checkpoint, letting `device_map="auto"` place it on the available devices."""
    return AutoModelForCausalLM.from_pretrained(
        'legendhasit/xgen-7b-8k-inst-8bit',
        trust_remote_code=True,
        device_map="auto",
    )

def get_model_XGen4bit():
    """Load XGen with a 4-bit NF4 bitsandbytes configuration (double quantization, bfloat16 compute).

    NOTE(review): the weights come from the '...-8bit' repository while a 4-bit
    quantization config is applied — confirm this combination is intended.
    """
    four_bit_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    return AutoModelForCausalLM.from_pretrained(
        'legendhasit/xgen-7b-8k-inst-8bit',
        quantization_config=four_bit_config,
        trust_remote_code=True,
        device_map="auto",
    )

def get_model_XGen():
    """Load the original Salesforce XGen checkpoint, letting `device_map="auto"` place it on the available devices."""
    return AutoModelForCausalLM.from_pretrained(
        'Salesforce/xgen-7b-8k-inst',
        trust_remote_code=True,
        device_map="auto",
    )

# Register the loader of each XGen variant.
models.update({
    "legendhasit/xgen-7b-8k-inst-8bit": get_model_XGen8bit,
    "legendhasit/xgen-7b-8k-inst-8bit/4bit": get_model_XGen4bit,
    "Salesforce/xgen-7b-8k-inst": get_model_XGen,
})

# Tokenize (if specificities in the way models call the tokenizer)

# Inference (if specificities in the way models call the generate function)

def infer_XGen(tokenized_input, model):
    """Sample a continuation of `tokenized_input` with `model`.

    Uses the notebook-level sampling settings (`max_new_tokens`, `top_k`,
    `temperature`) and returns whatever `model.generate` returns.
    """
    generation_settings = dict(
        do_sample=True,
        max_new_tokens=max_new_tokens,
        top_k=top_k,
        eos_token_id=50256,  # id of XGen's end-of-answer token in its vocabulary
        temperature=temperature,
    )
    return model.generate(**tokenized_input, **generation_settings)

# The three XGen variants share the same inference function.
infer.update({
    'legendhasit/xgen-7b-8k-inst-8bit': infer_XGen,
    'legendhasit/xgen-7b-8k-inst-8bit/4bit': infer_XGen,
    'Salesforce/xgen-7b-8k-inst': infer_XGen,
})

# Treating the output (to remove the input if present in the output as well as the end of text token for example)

def treat_output_XGen(output): # Differs if it is N-shot. Here, it is 0-shot
    """Keep only the text generated by XGen from a decoded `prompt + generation` string.

    Everything up to and including the first "### Assistant: " marker is dropped,
    and the text is cut at the first '<|endoftext|>' token, if any.

    Args:
        output: decoded model output (the prompt followed by the generation).

    Returns:
        The generated text. An output that stops right after the (whitespace-stripped)
        marker yields "". An output without any marker is returned unchanged, apart
        from the end-of-text cut.
    """
    assistant_marker = "### Assistant: "
    end_of_text = '<|endoftext|>'

    start = output.find(assistant_marker)
    if start != -1:
        output = output[start + len(assistant_marker):]
    elif output.endswith(assistant_marker.rstrip()):
        # The caller strips the decoded text, so an empty generation ends on "### Assistant:".
        return ""

    # Cut exactly at the token instead of removing a fixed number of trailing characters.
    end = output.find(end_of_text)
    if end != -1:
        output = output[:end]
    return output

# The three XGen variants share the same output cleaning function.
treat_output.update({
    'legendhasit/xgen-7b-8k-inst-8bit': treat_output_XGen,
    'legendhasit/xgen-7b-8k-inst-8bit/4bit': treat_output_XGen,
    'Salesforce/xgen-7b-8k-inst': treat_output_XGen,
})

# /workspace/.miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py:1411: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use a generation configuration file (see https://huggingface.co/docs/transformers/main_classes/text_generation )
# Avec XGen de salesforce

##### MPT

In [7]:
# Prompt templates
prompt_template_MPT7B = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:
"""

prompt_template_MPT30B = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n###Instruction\n{instruction}\n\n### Response\n" # Assumed to be the same template as for MPT 7B 8k, because the template used by the latter is not specified

prompt_templates["mosaicml/mpt-7b-instruct"] = prompt_template_MPT7B
prompt_templates["Trelis/mpt-7b-instruct-hosted-inference-8bit"] = prompt_template_MPT7B
prompt_templates["mosaicml/mpt-7b-8k-instruct"] = prompt_template_MPT30B
prompt_templates["mosaicml/mpt-30b-instruct"] = prompt_template_MPT30B

# Context lengths (only registered for mosaicml/mpt-7b-instruct)

context_lengths["mosaicml/mpt-7b-instruct"] = 2048 # 4096 according to MosaicML. NOTE(review): get_model_MPT7B sets max_seq_len to 4096 — confirm which value the over_context check should use

# Models
def get_model_MPT7B():
    """Load mosaicml/mpt-7b-instruct with its maximum sequence length set to 4096 tokens."""
    checkpoint = "mosaicml/mpt-7b-instruct"
    mpt_config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)
    # (input + output) tokens can now be up to 4096
    mpt_config.max_seq_len = 4096
    return AutoModelForCausalLM.from_pretrained(
        checkpoint,
        config=mpt_config,
        trust_remote_code=True,
        device_map="auto",
    )

models.update({"mosaicml/mpt-7b-instruct": get_model_MPT7B})

# Tokenize
def get_tokenizer_MPT7B():
    """Load the tokenizer of mosaicml/mpt-7b-instruct."""
    checkpoint = "mosaicml/mpt-7b-instruct"
    return AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

tokenizers.update({"mosaicml/mpt-7b-instruct": get_tokenizer_MPT7B})
# Inference ######################################

def infer_MPT7B(tokenized_input, model):
    """Sample a continuation of `tokenized_input` with `model`.

    Uses the notebook-level sampling settings (`max_new_tokens`, `top_k`,
    `temperature`); no explicit end-of-sequence token id is passed.
    """
    generation_settings = dict(
        do_sample=True,
        max_new_tokens=max_new_tokens,
        top_k=top_k,
        temperature=temperature,
    )
    return model.generate(**tokenized_input, **generation_settings)

infer.update({"mosaicml/mpt-7b-instruct": infer_MPT7B})

# Treating the output ######################################

def treat_output_MPT(output): # Differs if it is N-shot. Here, it is 0-shot
    """Keep only the text generated by an MPT model from a decoded `prompt + generation` string.

    Everything up to and including the first response marker is dropped, and the
    text is cut at the first '<|endoftext|>' token, if any. Both marker spellings
    used by the prompt templates of this notebook are recognised:
    "### Response:\\n" (MPT 7B template) and "### Response\\n" (MPT 30B / 8k template).

    Args:
        output: decoded model output (the prompt followed by the generation).

    Returns:
        The generated text. An output that stops right after the (whitespace-stripped)
        marker yields "". An output without any marker is returned unchanged, apart
        from the end-of-text cut.
    """
    end_of_text = '<|endoftext|>'

    for response_marker in ("### Response:\n", "### Response\n"):
        start = output.find(response_marker)
        if start != -1:
            output = output[start + len(response_marker):]
            break
        if output.endswith(response_marker.rstrip()):
            # The caller strips the decoded text, so an empty generation ends on the bare marker.
            return ""

    # Cut exactly at the token instead of removing a fixed number of trailing characters.
    end = output.find(end_of_text)
    if end != -1:
        output = output[:end]
    return output

# All MPT variants share the same output cleaning function.
treat_output.update({
    "mosaicml/mpt-7b-instruct": treat_output_MPT,
    "Trelis/mpt-7b-instruct-hosted-inference-8bit": treat_output_MPT,
    "mosaicml/mpt-7b-8k-instruct": treat_output_MPT,
    "mosaicml/mpt-30b-instruct": treat_output_MPT,
})


### Pre-processing the input dataset
Optional

In [8]:
# Fill in the number of words and the number of characters of every input file
# (for the text itself and for each of its reference summaries), in both languages.
dataset_root = "datasets/" + name_dataset
for corpus in os.listdir(dataset_root):
    corpus_path = dataset_root + "/" + corpus
    if not os.path.isdir(corpus_path):
        # Stray files at corpus level are not corpora: skip them.
        continue
    for file_name in os.listdir(corpus_path):
        file_path = corpus_path + "/" + file_name
        if file_name==".ipynb_checkpoints" or os.path.isdir(file_path):
            continue
        # `with` closes the file even when the JSON cannot be parsed.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        text_en = data["text_en"]
        text_fr = data["text_fr"]
        data["nb_words_en"] = str(count_words(text_en))
        data["nb_words_fr"] = str(count_words(text_fr))
        data["nb_characters_en"] = str(len(text_en))
        data["nb_characters_fr"] = str(len(text_fr))

        for summary in data["summaries"]:
            text_en = summary["text_en"]
            text_fr = summary["text_fr"]
            summary["nb_words_en"] = str(count_words(text_en))
            summary["nb_words_fr"] = str(count_words(text_fr))
            summary["nb_characters_en"] = str(len(text_en))
            summary["nb_characters_fr"] = str(len(text_fr))
        # The input file is rewritten in place with the added statistics.
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, indent=4, ensure_ascii=False)


In [11]:
# Every time this script is run, it must save its results in the "results" folder without overwriting the existing results.
# Each run is therefore stored in a folder named after the run number, together with a short .txt note describing the experiment.
# In that folder, every generation is a JSON file with additional information such as the prompt used, the number of samples and the path of the input text.

# What is specific to each model? The prompt (potentially different headers), the tokenizer, the model itself, and the way the output is displayed.

In [None]:
mkdir("results")

archives = os.listdir("results")
output_folder_path = "results/"+str(len(archives))
mkdir(output_folder_path) # Create a folder for the last results.
desc_file = open(output_folder_path + "/desc.txt", 'w', encoding='utf-8')
desc_file.write(desc)
desc_file.close()

initial_time = time.time()

model_total = len(used_models_name)
model_index = 0
for model_name in used_models_name: # For each model
    model_index += 1
    print("---- Model : " + model_name + " (" + str(model_index) + "/" + str(model_total) +")----              (loading tokenizer and model...)")
    load_model_time = time.time()
    prompt_template = prompt_templates[model_name]
    tokenizer = tokenizers[model_name]()
    model = models[model_name]()
    load_model_time = time.time() - load_model_time
    !nvidia-smi
    print("Tokenizer and model loaded in", datetime.timedelta(seconds=int(load_model_time)), 'seconds')
    corpora = os.listdir("datasets/" + name_dataset)
    for corpus in corpora:
        !nvidia-smi
        file_index = 0
        files_name = os.listdir("datasets/" + name_dataset + "/" + corpus)
        file_total = len(files_name)
        for file_name in files_name:
            file_path = "datasets/" + name_dataset + "/" + corpus + "/" + file_name
            if file_name=="ipynb_checkpoints" or os.path.isdir(file_path):
                continue
            file_time = time.time()
            file_index+=1
            print("Starting inference for text " + str(file_index) + "/" + str(file_total) + " in the " + corpus + " corpus.") # Possible errors in the display: file_total also counts the .ipynb checkpoints potentially in the directory.
            file = open(file_path, 'r', encoding='utf-8')
            input_data = json.load(file)
            file.close()
            for text_key in text_keys:
                input_text = input_data[text_key]
                instruction_template = instruction_templates[text_key]
                
                full_instruction = instruction_template.format(text=input_text)
                prompt = prompt_template.format(instruction=full_instruction)
                # Probably specific to the model
                input = tokenizer(prompt, return_tensors="pt").to('cuda') # Le renvoie sur le GPU car au départ, c'est généré en CPU le tensuer des tokens
                #
                input_length = len(input['input_ids'][0])
                output_name = corpus + "_" + file_name + "_" + text_key + ".json"
                output_path = output_folder_path + "/" + output_name
                output_data = {"input_path":file_path, "model":model_name, "instruction":instruction_template.format(text=""), "success":"0", "over_context":"", "input_length":str(input_length), "text":"", "output_length":"", "nb_words":"", "nb_characters":"", "input_language":input_data["language"], "output_language":""}
                if input_length > token_limit: # In this case, a file is created but no inference is made
                    print("/!\ With model " + model_name + ", the input length is above the token limit for " + text_key + " input in " + file_path + " (" + str(input_length) + " > " + str(token_limit) + ")")

                else: # In this case, the output is inferred, treated to keep only the generation and the length information
                    # PROBABLY SPECIFIC TO THE MODEL
                    sample = infer[model_name](input, model)
                    sample_length = len(sample[0])
                    ### SPECIFIC TO THE MODEL ----------------------------------------
                    if sample_length > context_lengths[model_name]: # There was a context window overflow
                        output_data["over_context"] = "1"
                    else:
                        output_data["over_context"] = "0"
                    output_length = sample_length - input_length
                    output_data["output_length"] = output_length
                    full_output = tokenizer.decode(sample[0]).strip() # prompt + output generation
                    # Ideally, treat_output is a dictionnary that associates model_name to a function taking full_output as an argument
                    output = treat_output[model_name](full_output)
                    output_data["text"] = output
                    output_data["nb_characters"] = len(output)
                    output_data["nb_words"] = count_words(output)
                    output_data["success"] = "1"
                    output_data["language_output"] = str(classify(output))
                # SAVING THE OUTPUT
                output_file = open(output_path, 'w', encoding='utf-8')
                json.dump(output_data, output_file, indent=4, ensure_ascii=False)
                output_file.close()
            file_time = time.time() - file_time
            print("Inference done for French and English in", datetime.timedelta(seconds=int(file_time)), "seconds.")
    # HERE, clear memory... remove model, tokenizer from memory
                
delta = time.time() - initial_time
print('Done! Took', datetime.timedelta(seconds=int(delta)), 'seconds')
# Write treatOutput function / dictionnary


# RETIRER le fichier template dans samsum

### Post-processing

In [10]:

# TODO: handle multilingual BERT
# English to French: expansion rate of 20%

rouge = evaluate.load('rouge') # Warning ! When comparing many references to a prediction, an average is performed, not a max...
bertscore = evaluate.load('bertscore') # The BERTScore calculation will take a lot of time... (if not performed in only one call)

mkdir('results')

nb_batch = "0" # Number of the inference batch to score

# Define the place of storage : "results/x/scores.csv"
output_names = os.listdir("results/" + nb_batch)
# Lists used for the BERT score computation, keyed by output language ("0": French, "1": English)
references = {"0":[], "1":[]}
predictions = {"0":[], "1":[]}
# Columns of the csv file
output_ids, rouges, input_nb_words_list, nb_words_generated_summaries, nb_words_gold_summaries, outputs_success, outputs_over_context, languages_output, languages_input = [], [], [], [], [], [], [], [], []

for output_name in output_names:
    # Makes sure the file is an output file (every generation is saved as .json), and opens it.
    # This also skips desc.txt and any scores*.csv file already written in the folder.
    output_path = "results/" + nb_batch + "/" + output_name
    if not output_name.endswith(".json") or os.path.isdir(output_path):
        continue
    output_data = load_json_into_dict(output_path)
    # Batches written before the key was unified stored the language under "language_output".
    output_language = str(output_data.get("output_language") or output_data.get("language_output", ""))
    # Language suffix used to pick a reference summary: 'fr' for a French output,
    # 'en' for an English output or any other language.
    language_code = "en"
    if output_language in number_to_code:
        language_code = number_to_code[output_language]
    generated_summary = output_data["text"]
    # Prefer the input path recorded at inference time; splitting the output name on '_'
    # breaks as soon as the corpus or file name contains an underscore.
    input_path = output_data.get("input_path")
    if not input_path:
        find_path = output_name.split('_') # 0: corpus, 1: name json input file, 2: text (quite useless) 3: fr.json or en.json
        input_path = "datasets/" + name_dataset + "/" + find_path[0] + "/" + find_path[1]
    input_data = load_json_into_dict(input_path)
    summaries_data = input_data["summaries"]
    # Computation of Rouge
    golds = [] # Only used for the BERTScore computation with several references
    max_rouge2, max_rougel, nb_words_closest_gold = maxRouge(summaries_data, language_code, generated_summary, golds)
    # For BertScore
    if output_language in {"0", "1"}:
        predictions[output_language].append(generated_summary)
        references[output_language].append(golds)
    # Fill the columns
    rouges.append([max_rouge2, max_rougel])
    output_ids.append(output_name)
    nb_words_gold_summaries.append(nb_words_closest_gold)
    nb_words_generated_summaries.append(output_data['nb_words'])
    outputs_success.append(output_data['success'])
    outputs_over_context.append(output_data['over_context'])
    languages_input.append(output_data['input_language'])
    input_nb_words_list.append(int(input_data['nb_words_' + language_code]))
    languages_output.append(output_language)

# BERTScore is only computed for the languages that actually have predictions.
max_bertscores_fr = []
if predictions["0"]:
    max_bertscores_fr = bertscore.compute(predictions=predictions["0"], references=references["0"], lang='fr', rescale_with_baseline=True, verbose=True)['f1']
max_bertscores_en = []
if predictions["1"]:
    max_bertscores_en = bertscore.compute(predictions=predictions["1"], references=references["1"], lang='en', rescale_with_baseline=True, verbose=True)['f1']

index_fr = 0
index_en = 0
rows = []
for i in range(len(output_ids)):
    max_bertscore = 0 # Outputs whose language is neither French nor English keep a BERTScore of 0
    if languages_output[i]=="0":
        max_bertscore = max_bertscores_fr[index_fr]
        index_fr += 1
    elif languages_output[i]=="1":
        max_bertscore = max_bertscores_en[index_en]
        index_en += 1
    rows.append([output_ids[i], rouges[i][0], rouges[i][1], max_bertscore, input_nb_words_list[i], nb_words_gold_summaries[i], nb_words_generated_summaries[i], outputs_success[i], outputs_over_context[i], languages_output[i], languages_input[i]])

header = [["input_path", "rouge2", "rougel", "bertscore", "nb_words_input", "nb_words_gold", "nb_words_generated", "success", "over_context", "output_language", "input_language"]]

with open('results/' + nb_batch + '/scores.csv', mode='w', newline='', encoding='utf-8') as storage_file:
    csv_writer = csv.writer(storage_file)
    csv_writer.writerows(header + rows)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

### Hardware use

In [3]:
# Show the current GPU state (the command must be available on the machine running the kernel).
!nvidia-smi
# Uncomment to display the device map of a loaded model:
# model.hf_device_map

'nvidia-smi' n'est pas reconnu en tant que commande interne
ou externe, un programme ex�cutable ou un fichier de commandes.


### TESTS

In [None]:
import transformers

name = 'mosaicml/mpt-7b-instruct'
name2 = "Trelis/mpt-7b-instruct-hosted-inference-8bit"

config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096

tokenizer = AutoTokenizer.from_pretrained('legendhasit/xgen-7b-8k-inst-8bit', trust_remote_code=True)

!nvidia-smi

model = transformers.AutoModelForCausalLM.from_pretrained(
  name,
  config=config,
  trust_remote_code=True
)

In [None]:
!nvidia-smi

# Quick manual generation test with the tokenizer and model loaded in the previous cell.
# NOTE(review): `prompt` is not defined in this cell; it presumably comes from an earlier run of the inference cell — confirm before running on a fresh kernel.
input_test = tokenizer(prompt, return_tensors="pt").to('cuda')
sample = model.generate(**input_test, do_sample=True, max_new_tokens=100, top_k=20, temperature=0.3)
# Prints the decoded sequence (prompt followed by the generation).
print(tokenizer.decode(sample[0]).strip())

In [11]:
# TODO: handle multilingual BERT
# Keep this second program, which is adapted to the older generations
# English to French: expansion rate of 20%

rouge = evaluate.load('rouge') # Warning ! When comparing many references to a prediction, an average is performed, not a max...
bertscore = evaluate.load('bertscore') # The BERTScore calculation will take a lot of time... (if not performed in only one call)

mkdir('results')

nb_batch = "2" # Number of the inference batch to score

# Define the place of storage : "results/x/scores4.csv"
output_names = os.listdir("results/" + nb_batch)
# The texts for bertscore
references, predictions = [], []
# The columns of the csv file
output_ids, rouges, input_nb_words_list, nb_words_generated_summaries, nb_words_gold_summaries, outputs_success, outputs_over_context, languages_output, languages_input = [], [], [], [], [], [], [], [], []

for output_name in output_names:
    # To ensure the read file is an output (every generation is saved as .json).
    # This also skips desc.txt and the scores*.csv files written by the scoring cells.
    output_path = "results/" + nb_batch + "/" + output_name
    if not output_name.endswith(".json") or os.path.isdir(output_path):
        continue
    # Load the data dictionary of the output
    output_data = load_json_into_dict(output_path)
    # Get the generated text, the language of the output and the reference summaries
    generated_summary = output_data["text"]
    language_output = str(classify(generated_summary))
    language_code = number_to_code[language_output]
    find_path = output_name.split('_') # 0: corpus, 1: name json input file, 2: text (quite useless) 3: fr.json or en.json
    input_path = "datasets/" + name_dataset + "/" + find_path[0] + "/" + find_path[1]
    input_data = load_json_into_dict(input_path)
    summaries_data = input_data["summaries"]
    # Preparation for the max Rouge
    golds = [] # Only used for the BERTScore computation with several references
    max_rouge2, max_rougel, nb_words_closest_gold = maxRouge(summaries_data, language_code, generated_summary, golds)
    # Fill in the row
    output_ids.append(output_name)
    languages_output.append(language_output)
    rouges.append([max_rouge2, max_rougel])
    predictions.append(generated_summary)
    references.append(golds)
    nb_words_gold_summaries.append(nb_words_closest_gold)
    nb_words_generated_summaries.append(output_data['nb_words'])
    outputs_success.append(output_data['success'])
    outputs_over_context.append(output_data['over_context'])
    input_nb_words_list.append(int(input_data['nb_words_' + language_code]))
    languages_input.append(key_to_language[output_data['input_language']])

# BERTScore is skipped when there is nothing to score.
max_bertscores = []
if predictions:
    max_bertscores = bertscore.compute(predictions=predictions, references=references, lang='en', rescale_with_baseline=True, verbose=True)['f1'] # PROBLEM WITH LANGUAGE USED

rows = [[output_ids[i], rouges[i][0], rouges[i][1], max_bertscores[i], input_nb_words_list[i], nb_words_gold_summaries[i], nb_words_generated_summaries[i], outputs_success[i], outputs_over_context[i], languages_output[i], languages_input[i]] for i in range(len(output_ids))]

header = [["input_path", "rouge2", "rougel", "bertscore", "nb_words_input", "nb_words_gold", "nb_words_generated", "success", "over_context", "output_language", "input_language"]]

with open('results/' + nb_batch + '/scores4.csv', mode='w', newline='', encoding='utf-8') as storage_file:
    csv_writer = csv.writer(storage_file)
    csv_writer.writerows(header + rows)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 2/2 [04:06<00:00, 123.09s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00,  3.60it/s]

done in 1107629.12 seconds, 0.00 sentences/sec



