### Import modules

In [2]:
import os
import csv
import json
import re
import torch
import time
import datetime
import sys
import traceback
import gc
import evaluate

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoConfig
)


### Define Working conditions

In [None]:
# Dataset
name_dataset = "LanguageTestDataSet" # The dataset must be in a certain format

# Models used
models_name = {'legendhasit/xgen-7b-8k-inst-8bit', 'legendhasit/xgen-7b-dolly-15k-4bit', 'mosaicml/mpt-7b-instruct', 'Trelis/mpt-7b-instruct-hosted-inference-8bit'} # Model names list

# Input limits
token_limit = 1800 # To be determined with the context length of the used models

# Description of the inference wanted
desc = "This inference batch is aimed at testing a lot of models on a simple, preliminary, text for summarization. The 4bit 8bit 16bit and 32 bit models will be compared."

### Useful functions and constants

In [None]:
# Prompting elements
instruction_templates = {"Summarize the following text:\n\n{text}", "Résume le texte suivant:\n\n{text}"}
text_keys = {'text_fr', 'text_en'} # Cet ensemble donne les clés pour le texte français et anglais.
key_to_language = {'text_fr':'0', 'text_en':'1'}

# Miscellaneous useful functions and constants
def count_words(s):
    words = re.findall(r'\b\w+\b', s)  # Find all word-like sequences using regular expression
    return len(words)  # Return the number of words

def mkdir(folder_path):
    try:
        os.mkdir(folder_path)
    except FileExistsError:
        pass

### Define model specific functions

In [None]:
# Prompt templates
prompt_template_XGen = "A chat between a curious human and an artificial intelligence assistant.\nThe assistant gives helpful, detailed, and polite answers to the human's questions.\n\n### Human: {instruction}\n\n### Assistant: "

prompt_templates = {
    "legendhasit/xgen-7b-8k-inst-8bit":prompt_template_XGen
                    }

# Tokenizers

def get_tokenizer_XGen8bit():
    return AutoTokenizer.from_pretrained('legendhasit/xgen-7b-8k-inst-8bit', trust_remote_code=True)

def get_tokenizer_XGen():
    return AutoTokenizer.from_pretrained("Salesforce/xgen-7b-8k-inst", trust_remote_code=True)

def get_tokenizer_MPT7B():
    return AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

def get_tokenizer_MPT7Bbit():
    return AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

tokenizers = {'legendhasit/xgen-7b-8k-inst-8bit':get_tokenizer_XGen8bit}

# Models

def get_model_MPT37B8k():
    config = AutoConfig.from_pretrained('mosaicml/mpt-7b-instruct-8k', trust_remote_code=True)
    config.attn_config['attn_impl'] = 'triton'  # change this to use triton-based FlashAttention
    config.init_device = 'cuda:0' # For fast initialization directly on GPU!

    return AutoModelForCausalLM.from_pretrained(
    'mosaicml/mpt-7b-instruct-8k',
    config=config,
    torch_dtype=torch.bfloat16, # Load model weights in bfloat16
    trust_remote_code=True
    )

def get_model_MPT7B():
    return AutoModelForCausalLM.from_pretrained('mosaicml/mpt-7b-instruct', trust_remote_code=True)

def get_model_MPT7B8bit():
    config = AutoConfig.from_pretrained('Trelis/mpt-7b-instruct-hosted-inference-8bit', trust_remote_code=True)
    config.init_device = 'cuda:0' # Unclear whether this really helps a lot or interacts with device_map.
    config.max_seq_len = 512
    model = AutoModelForCausalLM.from_pretrained('Trelis/mpt-7b-instruct-hosted-inference-8bit', load_in_8bit=True, config=config)

def get_model_XGen():
    return AutoModelForCausalLM.from_pretrained("Salesforce/xgen-7b-8k-inst", torch_dtype=torch.bfloat16)

def get_model_XGen8bit():
    bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
    'legendhasit/xgen-7b-8k-inst-8bit',
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
    )
    return model

models = {'legendhasit/xgen-7b-8k-inst-8bit':get_model_XGen8bit}

# Tokenize (if specificities in the way models call the tokenizer)

# Inference (if specificities in the way models call the generate function)

# Treating the output (to remove the input if present in the output as well as the end of text token for example)

def treat_output_XGen(output): # Differs if it is N-shot. Here, it is 0-shot
    occ_1 = output.find("### Assistant: ")
    output = output[occ_1+15:]
    if output.find('<|endoftext|>')!=-1:
        output = output[:-14]
    return output

treat_output = {'legendhasit/xgen-7b-8k-inst-8bit':treat_output_XGen}

### ATTENTION A L'UTILISATION DE LA MEMOIRE : APRES CHARGEMENT ET INFERENCE D'UN MODELE, IL DOIT DISPARAITRE DE LA MEMOIRE



### Pre-processing the input dataset
Optional

In [12]:
for corpus in os.listdir("datasets/" + name_dataset): # This script fills in the number of words and number of characters of the input files
    for file_name in os.listdir("datasets/" + name_dataset + "/" + corpus):
        file_path = "datasets/" + name_dataset + "/" + corpus + "/" + file_name
        file = open(file_path, 'r', encoding='utf-8')
        data = json.load(file)
        file.close()
        text = data["text_en"]
        nb_words = count_words(text)
        nb_characters = len(text)
        data["nb_characters"] = str(nb_characters)
        data["nb_words"] = str(nb_words)
        for i in range(len(data["summaries"])):
            summary = data["summaries"][i]
            text = summary["text_en"]
            nb_words = count_words(text)
            nb_characters = len(text)
            summary["nb_characters"] = str(nb_characters)
            summary["nb_words"] = str(nb_words)
        file = open(file_path, 'w', encoding='utf-8')
        json.dump(data, file, indent=4, ensure_ascii=False)
        file.close()


In [11]:
# Every time the script is casted, it must register the results in the "results" folder, without smashing the existing results
# Then, it is stored in a file whose name is the number of the result, and this file contains a little .txt note describing what was the experiment.
# In such a folder, all the generations are json files with additional information like the prompt used, the number of samples, the path of the input text

# Now, what is specific to the model ? The prompt (different headers potentially), the tokenizer, the model itself, the way the output is displayed

In [None]:
archives = os.listdir("results")
output_folder_path = "results/"+str(len(archives))
mkdir(output_folder_path) # Create a folder for the last results.
desc_file = open(output_folder_path + "/desc.txt", 'w', encoding='utf-8')
desc_file.write(desc)
desc_file.close()

initial_time = time.time()

model_total = len(models_name)
model_index = 0
for model_name in models_name: # For each model
    model_index += 1
    print("---- Model : " + model_name + " (" + str(model_index) + "/" + str(model_total) +")----              (loading tokenizer and model...)")
    load_model_time = time.time()
    prompt_template = prompt_templates[model_name]
    tokenizer = tokenizers[model_name]()
    model = models[model_name]()
    load_model_time = time.time() - load_model_time
    print("Tokenizer and model loaded in", datetime.timedelta(seconds=int(load_model_time)), 'seconds')
    corpora = os.listdir("datasets/" + name_dataset)
    for corpus in corpora:
        file_index = 0
        files_name = os.listdir("datasets/" + name_dataset + "/" + corpus)
        file_total = len(files_name)
        for file_name in files_name:
            file_time = time.time()
            file_index+=1
            print("Starting inference for text " + str(file_index) + "/" + str(file_total) + " in the " + corpus + "corpus.")
            file_path = "datasets/" + name_dataset + "/" + corpus + "/" + file_name
            file = open(file_path, 'r', encoding='utf-8')
            input_data = json.load(file)
            file.close()
            for text_key in text_keys:
                input_text = input_data[text_key]
                instruction_template = instruction_templates[text_key]
                full_instruction = instruction_template.format(text=input_text)
                prompt = prompt_template.format(instruction=full_instruction)
                # Probably specific to the model
                input = tokenizer(prompt, return_tensors="pt").to('cuda')
                #
                input_length = len(input['input_ids'][0])
                output_name = corpus + "_" + file_name + "_" + text_key + ".json"
                output_path = output_folder_path + "/" + output_name
                output_data = {"input_path":file_path, "model":model_name, "instruction":instruction_template.format(text=""), "input_language":text_key, "success":"0", "over_context":"", "input_length":str(input_length), "text":"", "output_length":"", "nb_words":"", "nb_characters":""}
                if input_length > token_limit: # In this case, a file is created but no inference is made
                    print("/!\ With model " + model_name + ", the input length is above the token limit for " + text_key + " input in " + file_path + " (" + input_length + " > " + token_limit + ")")

                else: # In this case, the output is inferred, treated to keep only the generation and the length information
                    # PROBABLY SPECIFIC TO THE MODEL
                    sample = model.generate(**input, do_sample=True, max_new_tokens=700, top_k=20, eos_token_id=50256, temperature=0.3) # Top-k, température, max new tokens
                    sample_length = len(sample[0])
                    ### SPECIFIC TO THE MODEL ----------------------------------------
                    if sample_length > context_length: # There was a context window overflow
                        output_data["over_context"] = "1"
                    else:
                        output_data["over_context"] = "0"
                    output_length = sample_length - input_length
                    output_data["output_length"] = output_length
                    full_output = tokenizer.decode(sample[0]).strip() # prompt + output generation
                    # Ideally, treat_output is a dictionnary that associates model_name to a function taking full_output as an argument
                    output = treat_output[model_name](full_output)
                    output_data["text"] = output
                    output_data["nb_characters"] = len(output)
                    output_data["nb_words"] = count_words(output)
                    output_data["success"] = "1"
                # SAVING THE OUTPUT
                output_file = open(output_path, 'w', encoding='utf-8')
                json.dump(output_data, output_file, indent=4, ensure_ascii=False)
                output_file.close()
                file_time = time.time() - file_time
            print("Inference done for French and English in", datetime.timedelta(seconds=int(file_time)), "seconds.")
    # HERE, clear memory... remove model, tokenizer from memory
                
delta = time.time() - initial_time
print('Done! Took', datetime.timedelta(seconds=int(delta)), 'seconds')
# Write treatOutput function / dictionnary
# Improve the display of the code

### Post-processing

In [3]:
rouge = evaluate.load('rouge') # Warning ! When comparing many references to a prediction, an average is performed, not a max...
bertscore = evaluate.load('bertscore') # The BERTScore calculation will take a lot of time... (if not performed in only one call)

mkdir('results')

# Define the place of storage : "results/scores.csv"
output_names = os.listdir("results")
references = []
predictions = []
rouges = []
output_ids = []
input_nb_words_list = []
nb_words_generated_summaries = []
nb_words_gold_summaries = []

for output_name in output_names and output_name!="scores.csv" and output_name!="desc.txt":
    output_ids.append(output_name)
    output_file = open("results/" + output_name, 'r', encoding='utf-8')
    output_data = json.load(output_file)
    output_file.close()
    generated_summary = output_data["text"]
    nb_words_generated_summaries.append(output_data['nb_words'])
    find_path = output_name.split('_') # 0: corpus, 1: name json input file, 2: text_fr or text_en
    input_path = "datasets/" + name_dataset + "/" + find_path[0] + "/" + find_path[1]
    input_file = open(input_file, 'r', encoding='utf-8')
    input_data = json.load(input_file)
    input_nb_words_list.append(int(input_data['nb_words']))
    input_file.close()
    summaries_data = input_data["summaries"]
    lang = output_data['input_language'][-2:] # en or fr, for bertscore lang parameter
    max_rouge2 = -1
    max_rougel = -1
    golds = []
    nb_words_closest_gold = 0
    for i in range(len(summaries_data)):
        summary_data = summaries_data[i]
        summary_text = summary_data[find_path[2]]
        result_rouge = rouge.compute(predictions=[generated_summary], references=[summary_text], use_aggregator=False)
        golds.append(summary_text)
        if result_rouge['rouge2'][0] > max_rouge2:
            max_rouge2 = result_rouge['rouge2'][0]
            nb_words_closest_gold = summary_data['nb_words']
        if result_rouge['rougel'][0] > max_rougel:
            max_rougel = result_rouge['rougel'][0]
    rouges.append([max_rouge2, max_rougel])
    predictions.append(generated_summary)
    references.append(golds)
    nb_words_gold_summaries.append(nb_words_closest_gold)
max_bertscores = bertscore.compute(predictions=predictions, references=references, lang=lang, rescale_with_baseline=True, verbose=True)['f1']

storage_file = open('results/scores.csv', mode='w', newline='', encoding='utf-8')
csv_writer = csv.writer(storage_file)

rows = [[output_ids[i], rouges[i][max_rouge2], rouges[i][max_rougel], max_bertscores[i], input_nb_words_list[i], nb_words_gold_summaries[i], nb_words_generated_summaries[i]] for i in range(len(output_ids))]

header = [["input_path", "rouge2", "rougel", "bertscore", "nb_words_input", "nb_words_gold", "nb_words_generated"]]

csv_writer.writerows(header + rows)

storage_file.close()

IndentationError: unexpected indent (1186573947.py, line 29)