### Download packages

In [None]:
# einops is not imported directly in this notebook; presumably it is required by the
# model code fetched with trust_remote_code=True -- TODO confirm.
!pip install einops

### Import modules

In [5]:
import os
import csv
import json
import re
import torch
import time
import datetime
import evaluate

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoConfig
)


  from .autonotebook import tqdm as notebook_tqdm


### Define Working conditions

In [9]:
# Dataset
name_dataset = "LanguageTestDataSet" # Folder name under "datasets/"; the loops below read datasets/<name_dataset>/<corpus>/<file> as JSON

# Models used
used_models_name = {'legendhasit/xgen-7b-8k-inst-8bit'} # Models to run; each name is used as a key into the per-model registries

# Input limits
token_limit = 3000 # Prompts with more tokens than this are skipped; choose it according to the context length of the models used

# Description of the inference wanted (written verbatim to desc.txt in the results folder)
desc = "This inferenee batch is aimed at testing XGen 8bit on summarization. We shall compare his performance on French and on English"

# Characteristics of the generation (read by the inference function at call time)
max_new_tokens=700
top_k=20
temperature=0.3

### Useful functions and constants

In [7]:
# Prompting elements
# Instruction wrapped around the input text, keyed by the dataset field holding that text.
instruction_templates = {"text_en":"Summarize the following text:\n\n{text}", "text_fr":"Résume le texte suivant:\n\n{text}"}
text_keys = {'text_fr', 'text_en'} # Keys of the French and English texts in each dataset entry.
# NOTE(review): key_to_language is not referenced anywhere else in the visible notebook.
key_to_language = {'text_fr':'0', 'text_en':'1'}

# Miscellaneous useful functions and constants
def count_words(s):
    """Return the number of word-like tokens (maximal runs of word characters) found in ``s``."""
    word_pattern = r'\b\w+\b'
    # Count the matches lazily instead of materialising the list of words.
    return sum(1 for _ in re.finditer(word_pattern, s))

def mkdir(folder_path):
    """Create ``folder_path`` if it does not exist yet.

    Calling it on an existing directory is a no-op. Missing parent folders are
    created as well, so nested paths such as "results/3" work even when
    "results" is absent.

    Raises:
        FileExistsError: if ``folder_path`` exists but is not a directory
            (previously this case was silently ignored and only failed later,
            when something was written inside the "folder").
    """
    os.makedirs(folder_path, exist_ok=True)

In [None]:
# Per-model registries, all keyed by the model name.
# Each model section below fills them in; the inference loop looks everything up by name.

prompt_templates = {}  # model name -> prompt template string with an {instruction} placeholder
context_lengths = {}   # model name -> context window size, in tokens
tokenizers = {}        # model name -> zero-argument function returning the tokenizer
models = {}            # model name -> zero-argument function returning the model
infer = {}             # model name -> function running generation on a tokenized prompt
treat_output = {}      # model name -> function extracting the generated answer from the decoded output

### Define model specific functions

In [None]:
# Models available
# The ".../4bit" entry is an alias used as a registry key: its loader reads the 8bit checkpoint with a 4-bit config.
models_name = {
    "legendhasit/xgen-7b-8k-inst-8bit/4bit", # 1 GPU
    "legendhasit/xgen-7b-8k-inst-8bit", # 1 GPU
    "Salesforce/xgen-7b-8k-inst", # Preferably 2 GPUs (3 shards of ~10 GB, the last one maybe smaller); works with 1, except on long texts (e.g. even the shortest FredSum ones), probably also because memory is not freed properly -> OOM at 38 GB
    "mosaicml/mpt-7b-instruct", # 1 GPU (2 shards of 10 GB) - To quantize
    "Trelis/mpt-7b-instruct-hosted-inference-8bit", #  - Potentially unreliable
    "mosaicml/mpt-7b-8k-instruct", # To quantize
    "mosaicml/mpt-30b-instruct" # To quantize
}

##### XGen

In [None]:
# Prompt templates
# The {instruction} placeholder is filled with the per-language instruction; the trailing
# "### Assistant: " marker is what treat_output_XGen searches for to isolate the answer.
prompt_template_XGen = "A chat between a curious human and an artificial intelligence assistant.\nThe assistant gives helpful, detailed, and polite answers to the human's questions.\n\n### Human: {instruction}\n\n### Assistant: "

prompt_templates["legendhasit/xgen-7b-8k-inst-8bit"] = prompt_template_XGen
prompt_templates["legendhasit/xgen-7b-8k-inst-8bit/4bit"] = prompt_template_XGen
prompt_templates["Salesforce/xgen-7b-8k-inst"] = prompt_template_XGen

# Context lengths: used to flag generations whose total length (prompt + output) exceeds the model's context window
context_length_XGen = 8000  # in tokens

context_lengths["legendhasit/xgen-7b-8k-inst-8bit"] = context_length_XGen
context_lengths["legendhasit/xgen-7b-8k-inst-8bit/4bit"] = context_length_XGen
context_lengths["Salesforce/xgen-7b-8k-inst"] = context_length_XGen

# Tokenizers
def get_tokenizer_XGen8bit(): # Also used for the 4bit variant
    """Return the tokenizer of the 'legendhasit/xgen-7b-8k-inst-8bit' checkpoint."""
    checkpoint = 'legendhasit/xgen-7b-8k-inst-8bit'
    tokenizer_XGen8bit = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    return tokenizer_XGen8bit

def get_tokenizer_XGen():
    """Return the tokenizer of the 'Salesforce/xgen-7b-8k-inst' checkpoint."""
    checkpoint = 'Salesforce/xgen-7b-8k-inst'
    tokenizer_XGen = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    return tokenizer_XGen

# Register the loader functions (not their results): a tokenizer is only loaded when the inference loop calls the entry.
tokenizers["legendhasit/xgen-7b-8k-inst-8bit"] = get_tokenizer_XGen8bit
tokenizers["legendhasit/xgen-7b-8k-inst-8bit/4bit"] = get_tokenizer_XGen8bit
tokenizers["Salesforce/xgen-7b-8k-inst"] = get_tokenizer_XGen

# Models

def get_model_XGen8bit():
    """Load the 'legendhasit/xgen-7b-8k-inst-8bit' checkpoint with automatic device placement."""
    load_kwargs = {"device_map": "auto", "trust_remote_code": True}
    return AutoModelForCausalLM.from_pretrained('legendhasit/xgen-7b-8k-inst-8bit', **load_kwargs)

def get_model_XGen4bit():
    """Load the XGen instruct checkpoint with a 4-bit NF4 bitsandbytes configuration.

    NOTE(review): the weights come from the '...-8bit' repository while a 4-bit
    quantization_config is requested -- confirm this combination behaves as intended.
    """
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    return AutoModelForCausalLM.from_pretrained(
        'legendhasit/xgen-7b-8k-inst-8bit',
        device_map="auto",
        trust_remote_code=True,
        quantization_config=quantization,
    )

def get_model_XGen():
    """Load the 'Salesforce/xgen-7b-8k-inst' checkpoint with automatic device placement."""
    load_kwargs = {"device_map": "auto", "trust_remote_code": True}
    return AutoModelForCausalLM.from_pretrained('Salesforce/xgen-7b-8k-inst', **load_kwargs)

# Register the loader functions (not the loaded models): weights are only loaded when the inference loop calls the entry.
models["legendhasit/xgen-7b-8k-inst-8bit"] = get_model_XGen8bit
models["legendhasit/xgen-7b-8k-inst-8bit/4bit"] = get_model_XGen4bit
models["Salesforce/xgen-7b-8k-inst"] = get_model_XGen

# Tokenize (if specificities in the way models call the tokenizer)

# Inference (if specificities in the way models call the generate function)

def infer_XGen(tokenized_input):
    """Sample a completion for ``tokenized_input``.

    Reads the module-level ``model``, ``max_new_tokens``, ``top_k`` and
    ``temperature`` at call time, so they must be defined before the call.
    Returns whatever ``model.generate`` returns.
    """
    generation_kwargs = dict(
        do_sample=True,
        max_new_tokens=max_new_tokens,
        top_k=top_k,
        eos_token_id=50256,  # id of XGen's end-of-answer token in its vocabulary
        temperature=temperature,
    )
    return model.generate(**tokenized_input, **generation_kwargs)

# All XGen variants share the same generation call.
# NOTE(review): the `infer` dictionary must already exist when this cell runs.
infer['legendhasit/xgen-7b-8k-inst-8bit'] = infer_XGen
infer['legendhasit/xgen-7b-8k-inst-8bit/4bit'] = infer_XGen
infer['Salesforce/xgen-7b-8k-inst'] = infer_XGen

# Treating the output (to remove the input if present in the output as well as the end of text token for example)

def treat_output_XGen(output): # Differs if it is N-shot. Here, it is 0-shot
    """Extract the generated answer from a decoded XGen output (prompt + generation).

    Everything up to and including the first "### Assistant: " marker is dropped,
    then the text is cut at the first '<|endoftext|>' token.

    Args:
        output: decoded text returned by the tokenizer.

    Returns:
        The answer text. If the marker is absent the text is kept from its start
        (instead of chopping an arbitrary prefix); if the end-of-text token is
        absent nothing is removed at the end.
    """
    marker = "### Assistant: "
    marker_start = output.find(marker)
    if marker_start != -1:
        output = output[marker_start + len(marker):]
    # Cut exactly where the token starts: a fixed-size slice from the end removes a
    # wrong amount of text when the token is not last (and one character too many
    # when it is, since the token is 13 characters long).
    eos_start = output.find('<|endoftext|>')
    if eos_start != -1:
        output = output[:eos_start]
    return output

# All XGen variants use the same prompt template, hence the same output post-processing.
treat_output['legendhasit/xgen-7b-8k-inst-8bit'] = treat_output_XGen
treat_output['legendhasit/xgen-7b-8k-inst-8bit/4bit'] = treat_output_XGen
treat_output['Salesforce/xgen-7b-8k-inst'] = treat_output_XGen

# /workspace/.miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py:1411: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use a generation configuration file (see https://huggingface.co/docs/transformers/main_classes/text_generation )
# (Warning above observed with Salesforce's XGen)

##### MPT

In [2]:
# Prompt templates
# MPT-7B style: the answer follows the "### Response:\n" marker (with a colon).
prompt_template_MPT7B = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:
"""

# MPT-30B style: note the markers are "###Instruction" and "### Response" (no colon).
# The same template is assumed for MPT 7B 8k, because the template used by that model is not specified.
prompt_template_MPT30B = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n###Instruction\n{instruction}\n\n### Response\n"

prompt_templates["mosaicml/mpt-7b-instruct"] = prompt_template_MPT7B
prompt_templates["Trelis/mpt-7b-instruct-hosted-inference-8bit"] = prompt_template_MPT7B
prompt_templates["mosaicml/mpt-7b-8k-instruct"] = prompt_template_MPT30B
prompt_templates["mosaicml/mpt-30b-instruct"] = prompt_template_MPT30B

# NOTE(review): this import sits in the middle of the notebook; the other imports are in the top import cell.
import transformers

# NOTE(review): the statements below load MPT-7B-instruct as soon as the cell runs and bind it to the
# global `model`, which the inference loop rebinds for each model it loads. No tokenizers / models /
# context_lengths / infer entries are registered for the MPT names in the visible notebook, so this
# looks like an experiment rather than part of the pipeline -- confirm before relying on it.
name = 'mosaicml/mpt-7b-instruct'

config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096

model = transformers.AutoModelForCausalLM.from_pretrained(
  name,
  config=config,
  trust_remote_code=True
)


# Treating the output

def treat_output_MPT(output): # Differs if it is N-shot. Here, it is 0-shot
    """Extract the generated answer from a decoded MPT output (prompt + generation).

    The text up to and including the response marker is dropped, then the text
    is cut at the first '<|endoftext|>' token. Both marker spellings used by the
    MPT prompt templates are recognised: "### Response:\\n" (7B template) and
    "### Response\\n" (30B / 7B-8k template).

    Args:
        output: decoded text returned by the tokenizer.

    Returns:
        The answer text. If no marker is found the text is kept from its start
        (instead of chopping an arbitrary prefix); if the end-of-text token is
        absent nothing is removed at the end.
    """
    for marker in ("### Response:\n", "### Response\n"):
        marker_start = output.find(marker)
        if marker_start != -1:
            output = output[marker_start + len(marker):]
            break
    # Cut exactly where the token starts rather than slicing a fixed number of
    # characters from the end, which is wrong whenever the token is not last.
    eos_start = output.find('<|endoftext|>')
    if eos_start != -1:
        output = output[:eos_start]
    return output

# All MPT variants share the same output post-processing function.
treat_output["mosaicml/mpt-7b-instruct"] = treat_output_MPT
treat_output["Trelis/mpt-7b-instruct-hosted-inference-8bit"] = treat_output_MPT
treat_output["mosaicml/mpt-7b-8k-instruct"] = treat_output_MPT
treat_output["mosaicml/mpt-30b-instruct"] = treat_output_MPT


### Pre-processing the input dataset
Optional

In [12]:
# Fill in, in place, the number of words and characters of every input file and of each of its
# reference summaries. Only the English text ("text_en") is measured. The counts are stored as strings.
dataset_dir = "datasets/" + name_dataset
for corpus in os.listdir(dataset_dir):
    corpus_dir = dataset_dir + "/" + corpus
    if not os.path.isdir(corpus_dir):
        continue  # stray file next to the corpus folders: nothing to list inside it
    for file_name in os.listdir(corpus_dir):
        if file_name==".ipynb_checkpoints":
            continue
        file_path = corpus_dir + "/" + file_name
        # Context managers guarantee the handle is closed even if the JSON is malformed.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        text = data["text_en"]
        data["nb_characters"] = str(len(text))
        data["nb_words"] = str(count_words(text))
        for summary in data["summaries"]:
            summary_text = summary["text_en"]
            summary["nb_characters"] = str(len(summary_text))
            summary["nb_words"] = str(count_words(summary_text))
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, indent=4, ensure_ascii=False)


In [11]:
# Every time the script is run, it must store its results in the "results" folder without overwriting the existing results.
# Each run is therefore stored in its own folder, named after the run number, which contains a short .txt note describing the experiment.
# In such a folder, all the generations are JSON files with additional information like the prompt used, the number of samples, the path of the input text.

# Now, what is specific to the model? The prompt (potentially different headers), the tokenizer, the model itself, and the way the output is post-processed.

In [None]:
mkdir("results")

# Each run gets its own numbered folder. Start from the number of existing entries and move on
# to the first unused number, so an earlier run is never overwritten when "results" contains
# gaps or non-numbered entries.
archives = os.listdir("results")
batch_index = len(archives)
while os.path.exists("results/" + str(batch_index)):
    batch_index += 1
output_folder_path = "results/" + str(batch_index)
mkdir(output_folder_path) # Create a folder for the last results.
with open(output_folder_path + "/desc.txt", 'w', encoding='utf-8') as desc_file:
    desc_file.write(desc)

initial_time = time.time()

model_total = len(used_models_name)
for model_index, model_name in enumerate(used_models_name, start=1): # For each model
    print("---- Model : " + model_name + " (" + str(model_index) + "/" + str(model_total) +")----              (loading tokenizer and model...)")
    load_model_time = time.time()
    prompt_template = prompt_templates[model_name]
    tokenizer = tokenizers[model_name]()
    # Must stay bound to the global name `model`: the inference functions read it at call time.
    model = models[model_name]()
    load_model_time = time.time() - load_model_time
    print("Tokenizer and model loaded in", datetime.timedelta(seconds=int(load_model_time)), 'seconds')
    corpora = os.listdir("datasets/" + name_dataset)
    for corpus in corpora:
        corpus_dir = "datasets/" + name_dataset + "/" + corpus
        if not os.path.isdir(corpus_dir):
            continue  # stray file next to the corpus folders
        # Skip Jupyter's checkpoint directory (note the leading dot): it is not an input file.
        files_name = [entry for entry in os.listdir(corpus_dir) if entry != ".ipynb_checkpoints"]
        file_total = len(files_name)
        for file_index, file_name in enumerate(files_name, start=1):
            file_time = time.time()
            print("Starting inference for text " + str(file_index) + "/" + str(file_total) + " in the " + corpus + "corpus.")
            file_path = corpus_dir + "/" + file_name
            with open(file_path, 'r', encoding='utf-8') as file:
                input_data = json.load(file)
            for text_key in text_keys:
                input_text = input_data[text_key]
                instruction_template = instruction_templates[text_key]
                full_instruction = instruction_template.format(text=input_text)
                prompt = prompt_template.format(instruction=full_instruction)
                # Probably specific to the model
                # The tokenizer builds the token tensors on the CPU; move them to the GPU.
                tokenized_input = tokenizer(prompt, return_tensors="pt").to('cuda')
                #
                input_length = len(tokenized_input['input_ids'][0])
                output_name = corpus + "_" + file_name + "_" + text_key + ".json"
                output_path = output_folder_path + "/" + output_name
                output_data = {"input_path":file_path, "model":model_name, "instruction":instruction_template.format(text=""), "input_language":text_key, "success":"0", "over_context":"", "input_length":str(input_length), "text":"", "output_length":"", "nb_words":"", "nb_characters":""}
                if input_length > token_limit: # In this case, a file is created but no inference is made
                    print("/!\\ With model " + model_name + ", the input length is above the token limit for " + text_key + " input in " + file_path + " (" + str(input_length) + " > " + str(token_limit) + ")")

                else: # In this case, the output is inferred, treated to keep only the generation and the length information
                    # PROBABLY SPECIFIC TO THE MODEL
                    sample = infer[model_name](tokenized_input)
                    sample_length = len(sample[0])
                    ### SPECIFIC TO THE MODEL ----------------------------------------
                    if sample_length > context_lengths[model_name]: # There was a context window overflow
                        output_data["over_context"] = "1"
                    else:
                        output_data["over_context"] = "0"
                    output_length = sample_length - input_length
                    output_data["output_length"] = output_length
                    full_output = tokenizer.decode(sample[0]).strip() # prompt + output generation
                    # treat_output maps model_name to a function taking full_output as an argument
                    output = treat_output[model_name](full_output)
                    output_data["text"] = output
                    output_data["nb_characters"] = len(output)
                    output_data["nb_words"] = count_words(output)
                    output_data["success"] = "1"
                # SAVING THE OUTPUT
                with open(output_path, 'w', encoding='utf-8') as output_file:
                    json.dump(output_data, output_file, indent=4, ensure_ascii=False)
            file_time = time.time() - file_time
            print("Inference done for French and English in", datetime.timedelta(seconds=int(file_time)), "seconds.")
    # Release the model and tokenizer before loading the next ones, then hand the cached
    # GPU blocks back to the driver, so that several models do not pile up in memory.
    del model, tokenizer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

delta = time.time() - initial_time
print('Done! Took', datetime.timedelta(seconds=int(delta)), 'seconds')
# TODO: write the treat_output function / dictionary


# TODO: remove the template file from the samsum corpus

### Post-processing

In [13]:
rouge = evaluate.load('rouge') # Warning ! When comparing many references to a prediction, an average is performed, not a max...
bertscore = evaluate.load('bertscore') # The BERTScore calculation will take a lot of time... (if not performed in only one call)

mkdir('results')

nb_batch = "2" # Number of the inference batch to score

# The scores are stored in "results/<nb_batch>/scores.csv"
batch_dir = "results/" + nb_batch
output_names = os.listdir(batch_dir)
references = []
predictions = []
rouges = []
output_ids = []
input_nb_words_list = []
nb_words_generated_summaries = []
nb_words_gold_summaries = []
outputs_success = []
outputs_over_context = []
langs = [] # "en" or "fr" for each prediction, needed to pick the right BERTScore model

for output_name in output_names:
    output_path = batch_dir + "/" + output_name
    if output_name=="scores.csv" or output_name=="desc.txt" or os.path.isdir(output_path):
        continue
    output_ids.append(output_name)
    with open(output_path, 'r', encoding='utf-8') as output_file:
        output_data = json.load(output_file)
    generated_summary = output_data["text"]
    nb_words_generated_summaries.append(output_data['nb_words'])
    outputs_success.append(output_data['success'])
    outputs_over_context.append(output_data['over_context'])
    # Recover the corpus, the input file name and the text key from the fields saved at inference
    # time rather than by splitting the output file name on "_", which breaks as soon as a corpus
    # or file name contains an underscore.
    text_key = output_data['input_language'] # "text_fr" or "text_en"
    corpus, input_file_name = output_data['input_path'].split('/')[-2:]
    input_path = "datasets/" + name_dataset + "/" + corpus + "/" + input_file_name
    with open(input_path, 'r', encoding='utf-8') as input_file:
        input_data = json.load(input_file)
    # 'nb_words' is written by the pre-processing cell, which must have been run on the dataset.
    input_nb_words_list.append(int(input_data['nb_words']))
    summaries_data = input_data["summaries"]
    langs.append(text_key[-2:]) # en or fr, for bertscore lang parameter
    max_rouge2 = -1
    max_rougel = -1
    golds = []
    nb_words_closest_gold = 0
    for summary_data in summaries_data:
        summary_text = summary_data[text_key]
        result_rouge = rouge.compute(predictions=[generated_summary], references=[summary_text], use_aggregator=False)
        golds.append(summary_text)
        if result_rouge['rouge2'][0] > max_rouge2:
            max_rouge2 = result_rouge['rouge2'][0]
            nb_words_closest_gold = summary_data['nb_words']
        if result_rouge['rougeL'][0] > max_rougel:
            max_rougel = result_rouge['rougeL'][0]
    rouges.append([max_rouge2, max_rougel])
    predictions.append(generated_summary)
    references.append(golds)
    nb_words_gold_summaries.append(nb_words_closest_gold)

# BERTScore takes a single `lang` per call, so score each language separately (one call per
# language) and put the scores back in the original order. Scoring the whole mixed batch with
# the language of the last file would evaluate the other language with the wrong setting.
max_bertscores = [None] * len(predictions)
for lang in sorted(set(langs)):
    lang_indices = [i for i, prediction_lang in enumerate(langs) if prediction_lang == lang]
    lang_f1 = bertscore.compute(
        predictions=[predictions[i] for i in lang_indices],
        references=[references[i] for i in lang_indices],
        lang=lang,
        rescale_with_baseline=True,
        verbose=True,
    )['f1']
    for i, f1 in zip(lang_indices, lang_f1):
        max_bertscores[i] = f1

rows = [[output_ids[i], rouges[i][0], rouges[i][1], max_bertscores[i], input_nb_words_list[i], nb_words_gold_summaries[i], nb_words_generated_summaries[i], outputs_success[i], outputs_over_context[i]] for i in range(len(output_ids))]

header = [["input_path", "rouge2", "rougel", "bertscore", "nb_words_input", "nb_words_gold", "nb_words_generated", "success", "over_context"]]

with open(batch_dir + '/scores.csv', mode='w', newline='', encoding='utf-8') as storage_file:
    csv_writer = csv.writer(storage_file)
    csv_writer.writerows(header + rows)

calculating scores...
computing bert embedding.


100%|██████████| 2/2 [01:10<00:00, 35.16s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00,  6.39it/s]


done in 869995.94 seconds, 0.00 sentences/sec


### Hardware use

In [3]:
# Show the GPU status; the nvidia-smi command must be available on the PATH of the machine running the kernel.
!nvidia-smi

'nvidia-smi' n'est pas reconnu en tant que commande interne
ou externe, un programme exécutable ou un fichier de commandes.
