### Download packages

In [None]:
# !pip install einops
# !GITHUB_ACTIONS=true pip install auto-gptq
# !pip install tokenizers --upgrade


### Hugging Face login

In [None]:
# !huggingface-cli login  # Do not forget to log in (run this from a terminal)

### Import modules

In [42]:
import os
import csv
import json
import re
import torch
import time
import datetime
import evaluate

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoConfig
)


### Define Working conditions

In [81]:
# Dataset
name_dataset = "LanguageTestDataSet" # The dataset must be in a certain format

# Models used
# NOTE: this is a set literal, so the order in which the models are iterated is not guaranteed.
used_models_name = {
    "Salesforce/xgen-7b-8k-inst",
    "legendhasit/xgen-7b-8k-inst-8bit",
    "mosaicml/mpt-7b-instruct", # 1 GPU (2 shards of 10 GB) - To quantize
    "meta-llama/Llama-2-7b-chat-hf"} # Models selected for this run

# Input limits
token_limit = 3500 # To be determined from the context length of the model used

# Maximum length, in tokens, of a chunk produced by chunkize()
MAX_TOKEN_CHUNK_SIZE = 750

# Free-text description of this inference run
# NOTE(review): the text only mentions MPT7B while several models are listed above - confirm it is up to date.
desc = "This inference batch is aimed at testing MPT7B on summarization. There is no French summarization at all."

# Generation settings, read as globals by the infer_* functions
max_new_tokens=700
top_k=20
temperature=0.3

### Useful functions and constants

In [61]:
# Prompting elements
# Instruction wording for each text key; "{text}" is the placeholder for the text to process.
instruction_templates = {"text_en":"Provide a list of key points for the following text:\n\n{text}", "text_fr":"Résume le texte suivant:\n\n{text}"} # The prompt could perhaps be improved by asking for a list of independent key points
text_keys = {'text_en'} # Keys of the texts to process (French and/or English). 'text_fr' is left out for English-only models
# Conversions between text keys and language codes ('0' = French, '1' = English).
key_to_language = {'text_fr':'0', 'text_en':'1'}
language_to_key = {'0':'text_fr', '1':'text_en'}
# Maps the value returned by classify() (as a string) to a text key; the undecided code '2' falls back to English.
number_to_code = {'0':'text_fr', '1':'text_en', '2':'text_en'}

# Miscellaneous useful functions and constants
def count_words(s):
    """Return the number of word-like tokens in ``s``.

    A word is a maximal run of ``\\w`` characters delimited by word
    boundaries, so punctuation is ignored and "it's" counts as two words.
    """
    word_pattern = r'\b\w+\b'
    return sum(1 for _ in re.finditer(word_pattern, s))

def mkdir(folder_path):
    """Create the directory ``folder_path``, doing nothing if it already exists.

    Only the last path component is created (``os.mkdir`` semantics); any
    error other than ``FileExistsError`` propagates to the caller.
    """
    try:
        os.mkdir(folder_path)
    except FileExistsError:
        # Already there: creating it again is a no-op by design.
        return None
    return None

def classify(text):
    """Guess the language of ``text`` with a basic deterministic classifier.

    Counts occurrences of a few very common English and French function
    words (each surrounded by single spaces, case-sensitive) and compares
    the two totals.

    Returns:
        2 when at most 3 markers were found in total (undecided),
        0 when French markers strictly outnumber English ones,
        1 otherwise (English).
    """
    english_markers = (" and ", " of ", " the ", " in ", " is ", " for ", " how ", " with ")
    french_markers = (" le ", " la ", " de ", " un ", " une ", " et ", " à ", " avec ", " il ", " pour ")

    score_en = sum(len(find_all_occurrences_regex(text, marker)) for marker in english_markers)
    score_fr = sum(len(find_all_occurrences_regex(text, marker)) for marker in french_markers)

    # Too few clues to decide either way.
    if score_fr + score_en <= 3:
        return 2
    return 0 if score_fr > score_en else 1

def find_all_occurrences_regex(text, pattern):
    """Return the start offsets of every non-overlapping match of the regular expression ``pattern`` in ``text``."""
    starts = []
    for found in re.finditer(pattern, text):
        starts.append(found.start())
    return starts

def maxRouge(summaries_data, language_code, generated_summary, golds):
    """Score ``generated_summary`` against every reference summary and keep the best ROUGE values.

    Args:
        summaries_data: indexable collection of reference-summary dicts; each is read
            at ``language_code`` and, when it becomes the best ROUGE-2 match, at
            ``'nb_words_' + language_code[-2:]``.
            (assumes every reference carries that word-count key - TODO confirm against the producer)
        language_code: key of the reference text to compare against (e.g. 'text_en').
        generated_summary: the predicted summary.
        golds: list that receives every reference text visited (mutated in place).

    Returns:
        (best ROUGE-2, best ROUGE-L, word count of the reference with the best ROUGE-2).
        With no references this is (-1, -1, 0).
    """
    best_rouge2 = -1
    best_rougel = -1
    nb_words_closest_gold = 0

    for idx in range(len(summaries_data)):
        reference = summaries_data[idx]
        reference_text = reference[language_code]
        # `rouge` is a module-level metric object defined elsewhere in the notebook.
        scores = rouge.compute(predictions=[generated_summary], references=[reference_text], use_aggregator=False)
        golds.append(reference_text)

        rouge2 = scores['rouge2'][0]
        if rouge2 > best_rouge2:
            best_rouge2 = rouge2
            nb_words_closest_gold = reference['nb_words_' + language_code[-2:]]

        rougel = scores['rougeL'][0]
        if rougel > best_rougel:
            best_rougel = rougel

    return best_rouge2, best_rougel, nb_words_closest_gold

def load_json_into_dict(path):
    """Read the UTF-8 JSON file at ``path`` and return the decoded object.

    The file is opened in a ``with`` block so the handle is released even
    when the content is not valid JSON.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_dict_into_json(dict, path):
    """Serialize ``dict`` as pretty-printed UTF-8 JSON into the file at ``path``.

    The file is created or overwritten. It used to be opened in read mode,
    which made every call fail; it is now opened for writing, inside a
    ``with`` block so the handle is always released.

    Args:
        dict: JSON-serializable object to store (parameter name kept for compatibility).
        path: destination file path.
    """
    with open(path, 'w', encoding='utf-8') as file:
        # ensure_ascii=False keeps accented characters readable in the file.
        json.dump(dict, file, indent=4, ensure_ascii=False)

def token_len(text, tokenizer):
    """Return the number of token ids ``tokenizer`` produces for ``text``.

    The tokenizer is called with ``return_tensors="pt"`` and the length of
    the first sequence under ``'input_ids'`` is returned.
    """
    encoded = tokenizer(text, return_tensors="pt")
    first_sequence_ids = encoded['input_ids'][0]
    return len(first_sequence_ids)

def append_to_chunk(current_chunk, utterance):
    """Return ``current_chunk`` extended with ``utterance``.

    A newline separates the two, except when the chunk is still empty.
    """
    separator = '\n' if current_chunk else ''
    return current_chunk + separator + utterance

def append_to_split(current_split, sentence):
    """Return ``current_split`` extended with ``sentence``, joined by ". ".

    The separator is always ". " whatever punctuation originally ended the
    previous sentence; nothing is inserted when the split is still empty.
    """
    if not current_split:
        return current_split + sentence
    return current_split + '. ' + sentence

def chunkize(text, tokenizer):
    """
    Greedily cut a dialogue transcript into a list of chunks.

    - Utterances (lines of ``text``) are packed into the current chunk for as long as the
      chunk stays within MAX_TOKEN_CHUNK_SIZE tokens, so cuts fall on utterance ends (\\n).
    - An utterance that is longer than the limit on its own is handed to split_utterance();
      all its pieces but the last become chunks, and the last piece starts the next chunk.
    - Sizes are measured in MODEL TOKENS (via token_len), not in characters.

    Args:
        text: the transcript, one utterance per line.
        tokenizer: tokenizer used to measure lengths.

    Returns:
        The list of chunk strings, in transcript order.
    """
    chunks = []
    current_chunk = ''

    for utterance in text.split('\n'):
        if token_len(utterance, tokenizer) > MAX_TOKEN_CHUNK_SIZE:
            # Oversized utterance: flush what we have, then cut the utterance itself.
            if current_chunk != '':
                chunks.append(current_chunk)
            splits = split_utterance(utterance, tokenizer)
            chunks.extend(splits[:-1])
            current_chunk = splits[-1]  # the tail may still absorb the following utterances
            continue

        candidate_chunk = append_to_chunk(current_chunk, utterance)
        if token_len(candidate_chunk, tokenizer) <= MAX_TOKEN_CHUNK_SIZE:
            # Still fits: keep growing the current chunk.
            current_chunk = candidate_chunk
        else:
            # Full: close the current chunk and start a new one with this utterance.
            chunks.append(current_chunk)
            current_chunk = utterance

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def split_utterance(text, tokenizer):
    """Cut one over-long utterance into pieces of at most MAX_TOKEN_CHUNK_SIZE tokens.

    The utterance is split into sentences on whitespace that follows '.', '?' or '!'
    (with guards against abbreviations), and the sentences are greedily packed
    into pieces. Each sentence keeps its own terminal punctuation, so pieces are
    re-joined with a single space; joining with ". " would produce "?. " and "..".

    A single sentence longer than the limit cannot be cut further: a warning is
    printed and the sentence is kept as its own, oversized piece.

    Args:
        text: the utterance to split.
        tokenizer: tokenizer used to measure lengths (via token_len).

    Returns:
        The list of non-empty pieces, in order (empty when ``text`` is empty).
    """
    sentence_pattern = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
    splits = []
    current_split = ''

    for sentence in re.split(sentence_pattern, text):
        candidate_split = current_split + ' ' + sentence if current_split else sentence
        if token_len(candidate_split, tokenizer) > MAX_TOKEN_CHUNK_SIZE:
            if current_split == '':
                # The sentence alone is already too long; nothing to flush yet.
                print(" /!\\ Split issue : too long sentence.")
            else:
                splits.append(current_split)
            current_split = sentence
        else:
            # Still fits: keep growing the current piece.
            current_split = candidate_split

    if len(current_split) > 0:
        splits.append(current_split)

    return splits

### Dataset preparation

In [62]:
# Source code of the one-off dataset preparation step, kept as a string so that it is not
# executed when this cell runs. The script reads the flat "<dataset>_<name>_txt/sum[_n]" files,
# guesses their language with classify(), and writes one JSON file per document into a
# per-dataset sub-folder.
# NOTE(review): where (or whether) this string is executed is not visible here - confirm.
# NOTE(review): inside the script, os.path.isdir(file_name) receives the bare file name rather than
# a path under dataset_folder_url, dataset_dict mixes string and integer values, and paths are
# built as dataset_folder_url (which already ends with "/") + "/" + file_name.
dataset_preparation_code = """dataset_folder_url = "datasets/KeyPointsExtractionTest/"
dataset_dict = {"fredsum":"0", "ami":"6", "mediasum":"3", "summre":5, "icsi":7}

virgin_files = os.listdir(dataset_folder_url)
data = {}
for file_name in virgin_files:
    if os.path.isdir(file_name): # Shouldn't happen normally
        continue
    file_code = file_name.split('.')[0].split("_")
    data_key = file_code[0] + "_" + file_code[1]
    dataset = file_code[0]
    if data_key not in data:
        data[data_key] = [dataset, {}]

    if file_code[2][0:3] == "txt":
        dataset = file_code[0]
        original_file_name = file_code[1]
        data[data_key][1]["file_name"] = original_file_name
        data[data_key][1]["source_dataset"] = dataset_dict[dataset]

        file = open(dataset_folder_url + "/" + file_name, 'r', encoding='utf-8')
        text = file.read()
        file.close()
        language_number = str(classify(text))
        data[data_key][1]["language"] = language_number
        language_code = number_to_code[language_number]
        data[data_key][1][language_code] = text

    elif file_code[2][0:3] == "sum":
        if "summaries" not in data[data_key][1]:
            data[data_key][1]["summaries"] = []

        if len(file_code)==4: # Plusieurs résumés
            summary_data = {}
            summary_data["number"] = file_code[3]
            file = open(dataset_folder_url + "/" + file_name, 'r', encoding='utf-8')
            text = file.read()
            file.close()
            summary_data[number_to_code[str(classify(text))]] = text

            data[data_key][1]["summaries"].append(summary_data)

        else: # Un seul résumé
            summary_data = {}
            summary_data["number"] = "1"
            file = open(dataset_folder_url + "/" + file_name, 'r', encoding='utf-8')
            text = file.read()
            file.close()
            summary_data[number_to_code[str(classify(text))]] = text

            data[data_key][1]["summaries"].append(summary_data)
            


for key in data:
    dataset = data[key][0]
    file_data = data[key][1]
    mkdir(dataset_folder_url + dataset)
    file_name = str(len(os.listdir(dataset_folder_url + dataset)))
    file = open(dataset_folder_url + dataset + "/" + file_name, 'w', encoding='utf-8')
    json.dump(file_data, file, indent=4, ensure_ascii=False)
    file.close()"""



In [63]:
# Per-model registries. Each dictionary is keyed by the model name; the value is either a constant
# (prompt template, context length) or a function (load the tokenizer, load the model, run the
# inference, clean the output). They are filled in by the model-specific cells below.

prompt_templates = {} # Prompt template of each model; the instruction (containing the text to summarize) is inserted into it.
context_lengths = {} # Context length of each model, as an integer number of tokens
tokenizers = {} # Function returning the tokenizer of each model
models = {} # Function returning each model
treat_output = {} # Function that post-processes the decoded output and keeps only the text generated by the model
infer = {} # Function that runs the inference for each model. Takes the tokenized input and the model as parameters

### Define model specific functions

In [64]:
# Short, filesystem-friendly alias of every model this notebook knows about.
simplified_names = {
    # 1 GPU
    "legendhasit/xgen-7b-8k-inst-8bit/4bit": "XGen7b8k4bit",
    # 1 GPU
    "legendhasit/xgen-7b-8k-inst-8bit": "XGen7b8k8bit",
    # Preferably 2 GPUs (3 shards of ~10 GB, the last one maybe smaller). Works on 1 GPU, except for
    # long texts (even the smallest Fredsum ones), probably because memory is not freed
    # automatically as it should -> OOM at 38 GB.
    "Salesforce/xgen-7b-8k-inst": "XGen7b8k",
    # 1 GPU (2 shards of 10 GB) - To quantize
    "mosaicml/mpt-7b-instruct": "MPT7b",
    # "Trelis/mpt-7b-instruct-hosted-inference-8bit" - Potentially unreliable
    # To quantize
    "mosaicml/mpt-7b-8k-instruct": "MPT7b8k",
    # To quantize
    "mosaicml/mpt-30b-instruct": "MPT30b",
    "meta-llama/Llama-2-7b-chat-hf": "llama27b",
    "tiiuae/falcon-7b": "falcon7b",
}

# Models available: exactly the models that have an alias above.
models_name = set(simplified_names)

# Alias -> full model name.
reversed_names = {alias: model_id for model_id, alias in simplified_names.items()}
# NOTE: rather than dictionaries of functions, a dictionary of function parameters would be enough,
# since the same functions are always used with different parameters: one default dictionary,
# overridden for each model.

##### XGen

In [65]:
# Prompt template shared by every XGen variant; "{instruction}" receives the full instruction.
prompt_template_XGen = "A chat between a curious human and an artificial intelligence assistant.\nThe assistant gives helpful, detailed, and polite answers to the human's questions.\n\n### Human: {instruction}\n\n### Assistant: "

prompt_templates.update({
    "legendhasit/xgen-7b-8k-inst-8bit": prompt_template_XGen,
    "legendhasit/xgen-7b-8k-inst-8bit/4bit": prompt_template_XGen,
    "Salesforce/xgen-7b-8k-inst": prompt_template_XGen,
})

# Context length: lets us know whether the model generated outside of its context window.
context_length_XGen = 8000

context_lengths.update({
    "legendhasit/xgen-7b-8k-inst-8bit": context_length_XGen,
    "legendhasit/xgen-7b-8k-inst-8bit/4bit": context_length_XGen,
    "Salesforce/xgen-7b-8k-inst": context_length_XGen,
})

# Tokenizers
def get_tokenizer_XGen8bit(): # Also used for the 4-bit variant
    """Load the tokenizer of the 'legendhasit/xgen-7b-8k-inst-8bit' checkpoint (remote code is trusted)."""
    checkpoint = 'legendhasit/xgen-7b-8k-inst-8bit'
    return AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

def get_tokenizer_XGen():
    """Load the tokenizer of the 'Salesforce/xgen-7b-8k-inst' checkpoint (remote code is trusted)."""
    checkpoint = 'Salesforce/xgen-7b-8k-inst'
    return AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

# Register the tokenizer loaders; the 4-bit variant reuses the 8-bit checkpoint's tokenizer.
tokenizers.update({
    "legendhasit/xgen-7b-8k-inst-8bit": get_tokenizer_XGen8bit,
    "legendhasit/xgen-7b-8k-inst-8bit/4bit": get_tokenizer_XGen8bit,
    "Salesforce/xgen-7b-8k-inst": get_tokenizer_XGen,
})

# Models

def get_model_XGen8bit():
    """Load the 'legendhasit/xgen-7b-8k-inst-8bit' checkpoint with automatic device placement (remote code is trusted)."""
    load_options = {"device_map": "auto", "trust_remote_code": True}
    return AutoModelForCausalLM.from_pretrained('legendhasit/xgen-7b-8k-inst-8bit', **load_options)

def get_model_XGen4bit():
    """Load the 'legendhasit/xgen-7b-8k-inst-8bit' checkpoint with a 4-bit bitsandbytes configuration.

    The configuration requests NF4 quantization with double quantization and
    bfloat16 as the compute dtype. Devices are placed automatically and
    remote code is trusted.
    NOTE(review): the checkpoint name says "8bit" while a 4-bit configuration is applied - confirm this combination is intended.
    """
    quantization_options = {
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    bnb_config = BitsAndBytesConfig(**quantization_options)
    return AutoModelForCausalLM.from_pretrained(
        'legendhasit/xgen-7b-8k-inst-8bit',
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )

def get_model_XGen():
    """Load the 'Salesforce/xgen-7b-8k-inst' checkpoint with automatic device placement (remote code is trusted)."""
    load_options = {"device_map": "auto", "trust_remote_code": True}
    return AutoModelForCausalLM.from_pretrained('Salesforce/xgen-7b-8k-inst', **load_options)

# Register the model loaders of the three XGen variants.
models.update({
    "legendhasit/xgen-7b-8k-inst-8bit": get_model_XGen8bit,
    "legendhasit/xgen-7b-8k-inst-8bit/4bit": get_model_XGen4bit,
    "Salesforce/xgen-7b-8k-inst": get_model_XGen,
})

# Tokenize (if specificities in the way models call the tokenizer)

# Inference (if specificities in the way models call the generate function)

def infer_XGen(tokenized_input, model):
    """Run sampled generation for an XGen model and return what ``model.generate`` returns.

    Args:
        tokenized_input: mapping of tokenizer outputs, unpacked as keyword arguments.
        model: the loaded model.

    The sampling settings come from the module-level ``max_new_tokens``,
    ``top_k`` and ``temperature``.
    """
    generation_options = {
        "do_sample": True,
        "max_new_tokens": max_new_tokens,
        "top_k": top_k,
        "eos_token_id": 50256,  # id of XGen's end-of-answer token in its vocabulary
        "temperature": temperature,
    }
    return model.generate(**tokenized_input, **generation_options)

# Every XGen variant shares the same inference function.
infer.update({
    'legendhasit/xgen-7b-8k-inst-8bit': infer_XGen,
    'legendhasit/xgen-7b-8k-inst-8bit/4bit': infer_XGen,
    'Salesforce/xgen-7b-8k-inst': infer_XGen,
})

# Treating the output (to remove the input if present in the output as well as the end of text token for example)

def treat_output_XGen(output): # Differs if it is N-shot. Here, it is 0-shot
    """Keep only the text generated by XGen from a decoded output.

    Everything up to and including the first "### Assistant: " marker is
    dropped, then the text is cut at the first '<|endoftext|>' token.

    Both steps are applied only when the marker / token is actually present
    and are based on its real position: a missing marker no longer removes
    the first characters of the output, and the end token is removed where it
    stands instead of blindly slicing a fixed number of characters off the end.

    Args:
        output: decoded model output (prompt followed by the generation).

    Returns:
        The generated answer as a string.
    """
    answer_marker = "### Assistant: "
    end_token = '<|endoftext|>'

    marker_pos = output.find(answer_marker)
    if marker_pos != -1:
        output = output[marker_pos + len(answer_marker):]

    end_pos = output.find(end_token)
    if end_pos != -1:
        output = output[:end_pos]
    return output

# Every XGen variant shares the same output post-processing.
treat_output.update({
    'legendhasit/xgen-7b-8k-inst-8bit': treat_output_XGen,
    'legendhasit/xgen-7b-8k-inst-8bit/4bit': treat_output_XGen,
    'Salesforce/xgen-7b-8k-inst': treat_output_XGen,
})

# /workspace/.miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py:1411: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use a generation configuration file (see https://huggingface.co/docs/transformers/main_classes/text_generation )
# (Warning above observed with Salesforce's XGen)

##### MPT

In [66]:
# Prompt templates; "{instruction}" receives the full instruction.
prompt_template_MPT7B = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:
"""

# Assumed to be the same template as for MPT 7B 8k, since the template used for the latter is not specified.
prompt_template_MPT30B = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n###Instruction\n{instruction}\n\n### Response\n"

prompt_templates.update({
    "mosaicml/mpt-7b-instruct": prompt_template_MPT7B,
    "Trelis/mpt-7b-instruct-hosted-inference-8bit": prompt_template_MPT7B,
    "mosaicml/mpt-7b-8k-instruct": prompt_template_MPT30B,
    "mosaicml/mpt-30b-instruct": prompt_template_MPT30B,
})

# Context lengths (only registered for mpt-7b-instruct in this cell)

context_lengths["mosaicml/mpt-7b-instruct"] = 2048 # 4096 according to MosaicML

# Models
def get_model_MPT7B():
    """Load 'mosaicml/mpt-7b-instruct' with its maximum sequence length raised to 4096.

    The configuration is fetched first so that ``max_seq_len`` can be
    overridden before the weights are loaded; (input + output) tokens can then
    be up to 4096. Devices are placed automatically and remote code is trusted.
    """
    checkpoint = "mosaicml/mpt-7b-instruct"
    config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)
    config.max_seq_len = 4096
    return AutoModelForCausalLM.from_pretrained(
        checkpoint,
        config=config,
        trust_remote_code=True,
        device_map="auto",
    )

# Register the loader for MPT-7B-instruct.
models["mosaicml/mpt-7b-instruct"] = get_model_MPT7B

# Tokenize
def get_tokenizer_MPT7B():
    """Load the tokenizer of the 'mosaicml/mpt-7b-instruct' checkpoint (remote code is trusted)."""
    checkpoint = "mosaicml/mpt-7b-instruct"
    return AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

# Register the tokenizer loader for MPT-7B-instruct.
tokenizers["mosaicml/mpt-7b-instruct"] = get_tokenizer_MPT7B
# Inference ######################################

def infer_MPT7B(tokenized_input, model):
    """Run sampled generation for MPT-7B and return what ``model.generate`` returns.

    Args:
        tokenized_input: mapping of tokenizer outputs, unpacked as keyword arguments.
        model: the loaded model.

    The sampling settings come from the module-level ``max_new_tokens``,
    ``top_k`` and ``temperature``. No repetition penalty is applied
    (the original note suggests repetition_penalty=1.2 to limit repetition).
    """
    generation_options = {
        "do_sample": True,
        "max_new_tokens": max_new_tokens,
        "top_k": top_k,
        "temperature": temperature,
        "eos_token_id": 0,  # token id that ends the generation for this model
    }
    return model.generate(**tokenized_input, **generation_options)

# Register the inference function for MPT-7B-instruct.
infer["mosaicml/mpt-7b-instruct"] = infer_MPT7B

# Treating the output ######################################

def treat_output_MPT(output): # Differs if it is N-shot. Here, it is 0-shot
    """Keep only the text generated by an MPT model from a decoded output.

    Everything up to and including the response marker is dropped, then the
    text is cut at the first '<|endoftext|>' token. Two spellings of the
    marker are recognised, because the MPT prompt templates differ:
    "### Response:\\n" (MPT-7B template) and "### Response\\n" (8k / 30B template).

    Both steps are applied only when the marker / token is actually present
    and are based on its real position, so a missing marker no longer removes
    the first characters and a token that is not at the very end no longer
    causes the wrong characters to be sliced off.

    Args:
        output: decoded model output (prompt followed by the generation).

    Returns:
        The generated answer as a string.
    """
    end_token = '<|endoftext|>'

    for answer_marker in ("### Response:\n", "### Response\n"):
        marker_pos = output.find(answer_marker)
        if marker_pos != -1:
            output = output[marker_pos + len(answer_marker):]
            break

    end_pos = output.find(end_token)
    if end_pos != -1:
        output = output[:end_pos]
    return output

# Every MPT variant shares the same output post-processing.
treat_output.update({
    "mosaicml/mpt-7b-instruct": treat_output_MPT,
    "Trelis/mpt-7b-instruct-hosted-inference-8bit": treat_output_MPT,
    "mosaicml/mpt-7b-8k-instruct": treat_output_MPT,
    "mosaicml/mpt-30b-instruct": treat_output_MPT,
})


##### Llama 2

In [67]:
# Prompt templates
# Llama-2 chat prompt: a system message wrapped in <<SYS>> tags, followed by the instruction,
# all inside an [INST] ... [/INST] block. "{instruction}" receives the full instruction.
# NOTE(review): the template hard-codes "<s>" - confirm the tokenizer does not also prepend a BOS token.
prompt_templates["meta-llama/Llama-2-7b-chat-hf"] = """<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

{instruction} [/INST]"""

# Context length registered for this model, in tokens
context_lengths["meta-llama/Llama-2-7b-chat-hf"] = 4096

def get_tokenizer_llama2_7b():
    """Load the tokenizer of the 'meta-llama/Llama-2-7b-chat-hf' checkpoint (remote code is trusted)."""
    checkpoint = "meta-llama/Llama-2-7b-chat-hf"
    return AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

# Register the tokenizer loader for Llama-2-7b-chat.
tokenizers["meta-llama/Llama-2-7b-chat-hf"] = get_tokenizer_llama2_7b

def get_model_llama2_7b():
    """Load the 'meta-llama/Llama-2-7b-chat-hf' checkpoint with automatic device placement (remote code is trusted)."""
    load_options = {"trust_remote_code": True, "device_map": "auto"}
    return AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", **load_options)

# Register the model loader for Llama-2-7b-chat.
models["meta-llama/Llama-2-7b-chat-hf"] = get_model_llama2_7b

def infer_llama2_7b(tokenized_input, model):
    """Run sampled generation for Llama-2-7b-chat and return what ``model.generate`` returns.

    Args:
        tokenized_input: mapping of tokenizer outputs, unpacked as keyword arguments.
        model: the loaded model.

    The sampling settings come from the module-level ``max_new_tokens``,
    ``top_k`` and ``temperature``.
    """
    generation_options = {
        "do_sample": True,
        "max_new_tokens": max_new_tokens,
        "top_k": top_k,
        "temperature": temperature,
        "eos_token_id": 2,  # hard-coded; the original note suggests tokenizer.eos_token_id as an alternative
    }
    return model.generate(**tokenized_input, **generation_options)

# Register the inference function for Llama-2-7b-chat.
infer["meta-llama/Llama-2-7b-chat-hf"] = infer_llama2_7b

def treat_output_llama2_7b(output): # Differs if it is N-shot. Here, it is 0-shot
    """Keep only the text generated by Llama-2 from a decoded output.

    Everything up to and including the first "[/INST]" marker is dropped,
    together with the whitespace that follows it (usually two spaces before
    the model starts answering), then the text is cut at the first '</s>' token.

    Both steps are applied only when the marker / token is actually present
    and are based on its real position: a missing marker no longer removes
    the first characters, fewer than two spaces after the marker no longer
    eat the start of the answer, and an end token that is not at the very end
    no longer causes the wrong characters to be sliced off.

    Args:
        output: decoded model output (prompt followed by the generation).

    Returns:
        The generated answer as a string.
    """
    answer_marker = "[/INST]"
    end_token = '</s>'

    marker_pos = output.find(answer_marker)
    if marker_pos != -1:
        output = output[marker_pos + len(answer_marker):].lstrip()

    end_pos = output.find(end_token)
    if end_pos != -1:
        output = output[:end_pos]
    return output

# Register the output post-processing function for Llama-2-7b-chat.
treat_output["meta-llama/Llama-2-7b-chat-hf"] = treat_output_llama2_7b

##### Falcon

In [68]:
# Prompt template: a short system sentence followed by >>QUESTION<< / >>ANSWER<< tags.
# "{instruction}" receives the full instruction.
# NOTE(review): "tiiuae/falcon-7b" does not carry an "instruct" suffix - confirm this chat-style prompt suits the checkpoint.
prompt_templates["tiiuae/falcon-7b"] = """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
>>QUESTION<<{instruction}
>>ANSWER<<"""

# Context length registered for this model, in tokens
context_lengths["tiiuae/falcon-7b"] = 2048

def get_tokenizer_falcon7b():
    """Load the tokenizer of the 'tiiuae/falcon-7b' checkpoint (remote code is trusted)."""
    checkpoint = "tiiuae/falcon-7b"
    return AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

# Register the tokenizer loader for Falcon-7B.
tokenizers["tiiuae/falcon-7b"] = get_tokenizer_falcon7b

def get_model_falcon7b():
    """Load the 'tiiuae/falcon-7b' checkpoint in bfloat16 with automatic device placement (remote code is trusted)."""
    load_options = {
        "device_map": "auto",
        "torch_dtype": torch.bfloat16,
        "trust_remote_code": True,
    }
    return AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", **load_options)

# Register the model loader for Falcon-7B.
models["tiiuae/falcon-7b"] = get_model_falcon7b

def infer_falcon7b(tokenized_input, model):
    """Run sampled generation for Falcon-7B and return what ``model.generate`` returns.

    Args:
        tokenized_input: mapping of tokenizer outputs, unpacked as keyword arguments.
        model: the loaded model.

    The sampling settings come from the module-level ``max_new_tokens``,
    ``top_k`` and ``temperature``.
    """
    generation_options = {
        "do_sample": True,
        "max_new_tokens": max_new_tokens,
        "top_k": top_k,
        "temperature": temperature,
        "eos_token_id": 11,  # hard-coded token id that ends the generation for this model
    }
    return model.generate(**tokenized_input, **generation_options)

# Register the inference function for Falcon-7B.
infer["tiiuae/falcon-7b"] = infer_falcon7b

def treat_output_falcon7b(output):
    """Keep only the text generated by Falcon from a decoded output.

    Everything up to and including the first '>>ANSWER<<' tag is dropped,
    then the text is cut at the first '<|endoftext|>' token.

    Both steps are applied only when the tag / token is actually present and
    are based on its real position: a missing tag no longer removes the first
    characters of the output, and an end token that is not at the very end no
    longer causes the wrong characters to be sliced off.

    Args:
        output: decoded model output (prompt followed by the generation).

    Returns:
        The generated answer as a string.
    """
    answer_marker = '>>ANSWER<<'
    end_token = '<|endoftext|>'

    marker_pos = output.find(answer_marker)
    if marker_pos != -1:
        output = output[marker_pos + len(answer_marker):]

    end_pos = output.find(end_token)
    if end_pos != -1:
        output = output[:end_pos]
    return output

# Register the output post-processing function for Falcon-7B.
treat_output["tiiuae/falcon-7b"] = treat_output_falcon7b

### Pre-processing the input dataset
Optional

In [82]:
inputs_folder = "0"
nb_to_tokenizer = {}

# Pre-processing pass: every document of every corpus is cut into chunks with the tokenizer of each
# selected model, and the result is stored as JSON under
#   inputs/<simplified model name>_<MAX_TOKEN_CHUNK_SIZE>/<corpus>/<corpus>_<file name>
# The right tokenizer must be used, so the whole process is repeated for each model.
# One could also imagine building the instructions in this pass, stored under instructions/<tokenizer_name>/...

dataset_folder = "datasets/KeyPointsExtractionTest"
mkdir('inputs')
for model_name in used_models_name:
    tokenizer = tokenizers[model_name]()

    # Output tree of this model; created once instead of once per file.
    model_inputs_folder = "inputs/" + simplified_names[model_name] + "_" + str(MAX_TOKEN_CHUNK_SIZE)
    mkdir(model_inputs_folder)

    for corpus in os.listdir(dataset_folder):
        corpus_folder = dataset_folder + "/" + corpus
        # Stray files next to the corpus folders are not corpora: skip them instead of crashing.
        if not os.path.isdir(corpus_folder):
            continue
        mkdir(model_inputs_folder + "/" + corpus)

        for name_file in os.listdir(corpus_folder):
            source_path = corpus_folder + "/" + name_file
            # Sub-folders (e.g. notebook checkpoint folders) are not documents.
            if os.path.isdir(source_path):
                continue
            print(model_name + " --- " + corpus + " --- " + name_file)
            with open(source_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

            new_file_name = corpus + "_" + name_file
            new_data = {}
            new_data["tokenizer"] = model_name
            new_data["original_path"] = corpus + "/" + name_file
            new_data["summaries"] = data["summaries"]
            en_chunks = chunkize(data["text_en"], tokenizer)
            # TODO: French chunks ("fr_chunks", built from data["text_fr"]) are not produced yet.
            new_data["en_chunks"] = []
            new_data["MAX_TOKEN_CHUNK_SIZE"] = str(MAX_TOKEN_CHUNK_SIZE)

            for chunk_text in en_chunks:
                chunk = {"text_en":chunk_text, "nb_words_en":str(count_words(chunk_text)), "nb_characters_en":str(len(chunk_text))}
                new_data["en_chunks"].append(chunk)

            with open(model_inputs_folder + "/" + corpus + "/" + new_file_name, 'w', encoding='utf-8') as new_file:
                json.dump(new_data, new_file, ensure_ascii=False, indent=4)


Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


Salesforce/xgen-7b-8k-inst --- ami --- 0
Salesforce/xgen-7b-8k-inst --- ami --- 1
Salesforce/xgen-7b-8k-inst --- ami --- 2
Salesforce/xgen-7b-8k-inst --- fredsum --- 0
Salesforce/xgen-7b-8k-inst --- fredsum --- 1
Salesforce/xgen-7b-8k-inst --- fredsum --- 2
Salesforce/xgen-7b-8k-inst --- icsi --- 0
Salesforce/xgen-7b-8k-inst --- icsi --- 1
Salesforce/xgen-7b-8k-inst --- icsi --- 2
Salesforce/xgen-7b-8k-inst --- mediasum --- 0
Salesforce/xgen-7b-8k-inst --- mediasum --- 1
Salesforce/xgen-7b-8k-inst --- mediasum --- 2
Salesforce/xgen-7b-8k-inst --- summre --- 0
Salesforce/xgen-7b-8k-inst --- summre --- 1
Salesforce/xgen-7b-8k-inst --- summre --- 2


Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


legendhasit/xgen-7b-8k-inst-8bit/4bit --- ami --- 0
legendhasit/xgen-7b-8k-inst-8bit/4bit --- ami --- 1
legendhasit/xgen-7b-8k-inst-8bit/4bit --- ami --- 2
legendhasit/xgen-7b-8k-inst-8bit/4bit --- fredsum --- 0
legendhasit/xgen-7b-8k-inst-8bit/4bit --- fredsum --- 1
legendhasit/xgen-7b-8k-inst-8bit/4bit --- fredsum --- 2
legendhasit/xgen-7b-8k-inst-8bit/4bit --- icsi --- 0
legendhasit/xgen-7b-8k-inst-8bit/4bit --- icsi --- 1
legendhasit/xgen-7b-8k-inst-8bit/4bit --- icsi --- 2
legendhasit/xgen-7b-8k-inst-8bit/4bit --- mediasum --- 0
legendhasit/xgen-7b-8k-inst-8bit/4bit --- mediasum --- 1
legendhasit/xgen-7b-8k-inst-8bit/4bit --- mediasum --- 2
legendhasit/xgen-7b-8k-inst-8bit/4bit --- summre --- 0
legendhasit/xgen-7b-8k-inst-8bit/4bit --- summre --- 1
legendhasit/xgen-7b-8k-inst-8bit/4bit --- summre --- 2
mosaicml/mpt-7b-instruct --- ami --- 0
mosaicml/mpt-7b-instruct --- ami --- 1
mosaicml/mpt-7b-instruct --- ami --- 2
mosaicml/mpt-7b-instruct --- fredsum --- 0
mosaicml/mpt-7b-instr

Token indices sequence length is longer than the specified maximum sequence length for this model (6433 > 2048). Running this sequence through the model will result in indexing errors


mosaicml/mpt-7b-instruct --- mediasum --- 0
mosaicml/mpt-7b-instruct --- mediasum --- 1
mosaicml/mpt-7b-instruct --- mediasum --- 2
mosaicml/mpt-7b-instruct --- summre --- 0
mosaicml/mpt-7b-instruct --- summre --- 1
mosaicml/mpt-7b-instruct --- summre --- 2


Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


legendhasit/xgen-7b-8k-inst-8bit --- ami --- 0
legendhasit/xgen-7b-8k-inst-8bit --- ami --- 1
legendhasit/xgen-7b-8k-inst-8bit --- ami --- 2
legendhasit/xgen-7b-8k-inst-8bit --- fredsum --- 0
legendhasit/xgen-7b-8k-inst-8bit --- fredsum --- 1
legendhasit/xgen-7b-8k-inst-8bit --- fredsum --- 2
legendhasit/xgen-7b-8k-inst-8bit --- icsi --- 0
legendhasit/xgen-7b-8k-inst-8bit --- icsi --- 1
legendhasit/xgen-7b-8k-inst-8bit --- icsi --- 2
legendhasit/xgen-7b-8k-inst-8bit --- mediasum --- 0
legendhasit/xgen-7b-8k-inst-8bit --- mediasum --- 1
legendhasit/xgen-7b-8k-inst-8bit --- mediasum --- 2
legendhasit/xgen-7b-8k-inst-8bit --- summre --- 0
legendhasit/xgen-7b-8k-inst-8bit --- summre --- 1
legendhasit/xgen-7b-8k-inst-8bit --- summre --- 2
tiiuae/falcon-7b --- ami --- 0
tiiuae/falcon-7b --- ami --- 1
tiiuae/falcon-7b --- ami --- 2
tiiuae/falcon-7b --- fredsum --- 0
tiiuae/falcon-7b --- fredsum --- 1
tiiuae/falcon-7b --- fredsum --- 2
tiiuae/falcon-7b --- icsi --- 0
tiiuae/falcon-7b --- icsi 

Token indices sequence length is longer than the specified maximum sequence length for this model (6636 > 2048). Running this sequence through the model will result in indexing errors


tiiuae/falcon-7b --- mediasum --- 0
tiiuae/falcon-7b --- mediasum --- 1
tiiuae/falcon-7b --- mediasum --- 2
tiiuae/falcon-7b --- summre --- 0
tiiuae/falcon-7b --- summre --- 1
tiiuae/falcon-7b --- summre --- 2
meta-llama/Llama-2-7b-chat-hf --- ami --- 0
meta-llama/Llama-2-7b-chat-hf --- ami --- 1
meta-llama/Llama-2-7b-chat-hf --- ami --- 2
meta-llama/Llama-2-7b-chat-hf --- fredsum --- 0
meta-llama/Llama-2-7b-chat-hf --- fredsum --- 1
meta-llama/Llama-2-7b-chat-hf --- fredsum --- 2
meta-llama/Llama-2-7b-chat-hf --- icsi --- 0
meta-llama/Llama-2-7b-chat-hf --- icsi --- 1
meta-llama/Llama-2-7b-chat-hf --- icsi --- 2
meta-llama/Llama-2-7b-chat-hf --- mediasum --- 0
meta-llama/Llama-2-7b-chat-hf --- mediasum --- 1
meta-llama/Llama-2-7b-chat-hf --- mediasum --- 2
meta-llama/Llama-2-7b-chat-hf --- summre --- 0
meta-llama/Llama-2-7b-chat-hf --- summre --- 1
meta-llama/Llama-2-7b-chat-hf --- summre --- 2


In [59]:
# Debug aid: show the registry mapping each model name to its tokenizer-loading function.
print(tokenizers)

{'legendhasit/xgen-7b-8k-inst-8bit': <function get_tokenizer_XGen8bit at 0x000001799D8FA5F0>, 'legendhasit/xgen-7b-8k-inst-8bit/4bit': <function get_tokenizer_XGen8bit at 0x000001799D8FA5F0>, 'Salesforce/xgen-7b-8k-inst': <function get_tokenizer_XGen at 0x000001799D8F91B0>, 'mosaicml/mpt-7b-instruct': <function get_tokenizer_MPT7B at 0x000001799C35C9D0>, 'meta-llama/Llama-2-7b-chat-hf': <function get_tokenizer_llama2_7b at 0x000001799D8FAD40>, 'tiiuae/falcon-7b': <function get_tokenizer_falcon7b at 0x000001799D8FAA70>}


In [11]:
# Every time the script is run, it must record its results in the "results" folder without overwriting the existing results.
# Each run is therefore stored in a folder named after the run number, and this folder contains a short .txt note describing the experiment.
# In such a folder, all the generations are JSON files with additional information such as the prompt used, the number of samples, and the path of the input text.

# What is specific to each model? The prompt (potentially different headers), the tokenizer, the model itself, and the way the output is post-processed.

In [None]:
mkdir("results")

archives = os.listdir("results")
output_folder_path = "results/"+str(len(archives))
mkdir(output_folder_path) # Create a folder for the last results.
desc_file = open(output_folder_path + "/desc.txt", 'w', encoding='utf-8')
desc_file.write(desc)
desc_file.close()

initial_time = time.time()

model_total = len(used_models_name)
model_index = 0
for model_name in used_models_name: # For each model
    model_index += 1
    print("---- Model : " + model_name + " (" + str(model_index) + "/" + str(model_total) +")----              (loading tokenizer and model...)")
    load_model_time = time.time()
    prompt_template = prompt_templates[model_name]
    tokenizer = tokenizers[model_name]()
    model = models[model_name]()
    load_model_time = time.time() - load_model_time
    !nvidia-smi
    print("Tokenizer and model loaded in", datetime.timedelta(seconds=int(load_model_time)), 'seconds')
    inputs_folder_path = "inputs/" + simplified_names[model_name] + "_" + str(MAX_TOKEN_CHUNK_SIZE)
    corpora = os.listdir(inputs_folder_path)
    for corpus in corpora:
        !nvidia-smi
        file_index = 0
        files_name = os.listdir(inputs_folder_path + "/" + corpus)
        file_total = len(files_name)
        for file_name in files_name:
            file_path = inputs_folder_path + "/" + corpus + "/" + file_name
            if file_name=="ipynb_checkpoints" or os.path.isdir(file_path):
                continue
            file_time = time.time()
            file_index+=1
            print("Starting inference for text " + str(file_index) + "/" + str(file_total) + " in the " + corpus + " corpus.") # Possible errors in the display: file_total also counts the .ipynb checkpoints potentially in the directory.
            file = open(file_path, 'r', encoding='utf-8')
            input_data = json.load(file)
            file.close()
            for text_key in text_keys:
                en_or_fr = text_key[-2:]
                chunks = input_data[en_or_fr + "_chunks"]
                output_data = {"chunks":[]}
                output_data["input_path"] = file_path
                output_data["model"] = model_name
                output_data["input_language"] = text_key # NE GERE PAS ENCORE BIEN FRANCAIS
                output_data["MAX_TOKEN_CHUNK_SIZE"] = input_data["MAX_TOKEN_CHUNK_SIZE"]
                # RAJOUTER UN AFFICHAGE pour toutes les chunks
                precedent_text = ""
                for i in range(len(chunks) + 1): # 1 fois le 1er chunk, puis une fois 1er et 2eme, puis une fois 2eme et 3eme etc. puis une fois avant dernier dernier puis une fois dernier
                    print("Chunk " + str(i) + "/" + str(len(chunks) + 1) + " ...")
                    if i==len(chunks):
                        chunk = chunks[-1]
                        precedent_text = ""
                    else: chunk = chunks[i]
                    chunk_data = {}
                    input_text = precedent_text + chunk[text_key]
                    precedent_text = chunk[text_key]
                    instruction_template = instruction_templates[text_key]
                    full_instruction = instruction_template.format(text=input_text)
                    prompt = prompt_template.format(instruction=full_instruction)

                    # Probably specific to the model
                    input=0
                    if model_name=="tiiuae/falcon-7b": input = tokenizer(prompt, return_token_type_ids=False, return_tensors="pt").to('cuda')# A AMELIORER: Il faut un nouveau dico
                    else: input = tokenizer(prompt, return_tensors="pt").to('cuda') # Le renvoie sur le GPU car au départ, c'est généré en CPU le tensuer des tokens

                    input_length = len(input['input_ids'][0])
                    output_name = file_name + "_" + text_key + ".json"
                    output_path = output_folder_path + "/" + output_name
                    if "instruction" not in output_data: output_data["instruction"] = instruction_template.format(text="")
                    chunk_data["success"] = "0"
                    chunk_data["input_length"] = str(input_length)

                    if input_length > token_limit: # In this case, a file is created but no inference is made
                        print("/!\ With model " + model_name + ", the input length is above the token limit for " + text_key + " input in " + file_path + " (" + str(input_length) + " > " + str(token_limit) + ")")

                    else: # In this case, the output is inferred, treated to keep only the generation and the length information
                        # PROBABLY SPECIFIC TO THE MODEL
                        sample = infer[model_name](input, model)
                        sample_length = len(sample[0])
                        ### SPECIFIC TO THE MODEL ----------------------------------------
                        if sample_length > context_lengths[model_name]: # There was a context window overflow
                            output_data["over_context"] = "1"
                        else:
                            output_data["over_context"] = "0"
                        output_length = sample_length - input_length
                        output_data["output_length"] = output_length
                        full_output = tokenizer.decode(sample[0]).strip() # prompt + output generation
                        # Ideally, treat_output is a dictionnary that associates model_name to a function taking full_output as an argument
                        output = treat_output[model_name](full_output)
                        chunk_data["text"] = output
                        chunk_data["nb_characters"] = len(output)
                        chunk_data["nb_words"] = count_words(output)
                        chunk_data["success"] = "1"
                        chunk_data["language_output"] = str(classify(output))
                    output_data["chunks"].append(chunk_data)
                    
                # SAVING THE OUTPUT
                output_file = open(output_path, 'w', encoding='utf-8')
                json.dump(output_data, output_file, indent=4, ensure_ascii=False)
                output_file.close()
            file_time = time.time() - file_time
            print("Inference done for French and English in", datetime.timedelta(seconds=int(file_time)), "seconds.")
    # HERE, clear memory... remove model, tokenizer from memory
                
delta = time.time() - initial_time
print('Done! Took', datetime.timedelta(seconds=int(delta)), 'seconds')
# Write treatOutput function / dictionary


# RETIRER le fichier template dans samsum

### Post-processing

In [15]:

# Score the generations of the most recent inference batch (ROUGE and BERTScore) and store one row
# per output file in a csv file inside the batch folder.
# TODO: handle multilingual BERT.
# English to French: expansion rate of about 20%.

rouge = evaluate.load('rouge') # Warning ! When comparing many references to a prediction, an average is performed, not a max...
bertscore = evaluate.load('bertscore') # The BERTScore calculation will take a lot of time... (if not performed in only one call)

mkdir('results')

# Batch folders are named "0", "1", ...: score the one with the highest number.
# nb_batch is kept as a string because it is concatenated into paths below.
batch_numbers = [int(entry) for entry in os.listdir("results") if entry.isdigit() and os.path.isdir("results/" + entry)]
if not batch_numbers:
    raise FileNotFoundError('No inference batch found in "results": run the inference cell first.')
nb_batch = str(max(batch_numbers))

# Define the place of storage : "results/x/scores.csv"
output_names = os.listdir("results/" + nb_batch)
# Lists used for the BERT score computation
references = {"0":[], "1":[]}
predictions = {"0":[], "1":[]}
# Columns of the csv file
output_ids, rouges, input_nb_words_list, nb_words_generated_summaries, nb_words_gold_summaries, outputs_success, outputs_over_context, languages_output, languages_input = [], [], [], [], [], [], [], [], []
print(output_names)
for output_name in output_names:
    # Makes sure the file is an output file, and opens it.
    # Generations are the only .json files of the folder: this skips desc.txt and every scores*.csv
    # written by a previous run of a scoring cell.
    output_path = "results/" + nb_batch + "/" + output_name
    if os.path.isdir(output_path) or not output_name.endswith(".json"):
        continue
    output_data = load_json_into_dict(output_path)
    generated_summary = output_data["text"]
    output_language = str(classify(generated_summary))
    # The language code is text_fr for a French output and text_en for an English output or any other
    # language. It is only used to choose the reference summaries.
    language_code = "text_en"
    if output_language in number_to_code:
        language_code = number_to_code[output_language]

    find_path = output_name.split('_') # 0: corpus, 1: name json input file, 2: text (quite useless) 3: fr.json or en.json
    input_path = "datasets/" + name_dataset + "/" + find_path[0] + "/" + find_path[1]
    input_data = load_json_into_dict(input_path)
    summaries_data = input_data["summaries"]
    # Computation of Rouge
    golds = [] # Only used for the BERTScore computation with several references (presumably filled by maxRouge - TODO confirm)
    max_rouge2, max_rougel, nb_words_closest_gold = maxRouge(summaries_data, language_code, generated_summary, golds)
    # For BertScore
    if output_language in {"0", "1"}:
        predictions[output_language].append(generated_summary)
        references[output_language].append(golds)
    # Fill the columns
    rouges.append([max_rouge2, max_rougel])
    output_ids.append(output_name)
    nb_words_gold_summaries.append(nb_words_closest_gold)
    nb_words_generated_summaries.append(output_data['nb_words'])
    outputs_success.append(output_data['success'])
    outputs_over_context.append(output_data['over_context'])
    languages_input.append(output_data['input_language'])
    input_nb_words_list.append(int(input_data['nb_words_' + language_code[-2:]]))
    languages_output.append(output_language)

print(len(predictions["0"]))
print(len(predictions["1"]))


#max_bertscores_fr = bertscore.compute(predictions=predictions["0"], references=references["0"], lang='fr', rescale_with_baseline=True, verbose=True)['f1']
# BERTScore is only computed when there is at least one English prediction.
max_bertscores_en = []
if predictions["1"]:
    max_bertscores_en = bertscore.compute(predictions=predictions["1"], references=references["1"], lang='en', rescale_with_baseline=True, verbose=True)['f1']

# The BERTScore lists only contain the outputs of one language, in file order: walk them with one
# index per language while building the rows.
index_fr = 0
index_en = 0
rows = []
for i in range(len(output_ids)):
    max_bertscore = 0 # Default for French outputs (French scoring is disabled above) and for other languages
    if languages_output[i]=="0":
        #max_bertscore = max_bertscores_fr[index_fr]
        index_fr += 1
    elif languages_output[i]=="1":
        max_bertscore = max_bertscores_en[index_en]
        index_en += 1
    rows.append([output_ids[i], rouges[i][0], rouges[i][1], max_bertscore, input_nb_words_list[i], nb_words_gold_summaries[i], nb_words_generated_summaries[i], outputs_success[i], outputs_over_context[i], languages_output[i], languages_input[i]])

header = [["input_path", "rouge2", "rougel", "bertscore", "nb_words_input", "nb_words_gold", "nb_words_generated", "success", "over_context", "output_language", "input_language"]]

# TODO: change the "scores1" file name.
with open('results/' + nb_batch + '/scores1.csv', mode='w', newline='', encoding='utf-8') as storage_file:
    csv_writer = csv.writer(storage_file)
    csv_writer.writerows(header + rows)

['desc.txt', 'dialogsum_1_text_en.json', 'dialogsum_2_text_en.json', 'dialogsum_3_text_en.json', 'dialogsum_4_text_en.json', 'dialogsum_5_text_en.json', 'fredsum_1_text_en.json', 'fredsum_2_text_en.json', 'fredsum_3_text_en.json', 'fredsum_4_text_en.json', 'fredsum_5_text_en.json', 'mediasum_1_text_en.json', 'mediasum_2_text_en.json', 'mediasum_3_text_en.json', 'mediasum_4_text_en.json', 'mediasum_5_text_en.json', 'samsum_1_text_en.json', 'samsum_2_text_en.json', 'samsum_3_text_en.json', 'samsum_4_text_en.json', 'samsum_5_text_en.json', 'scores.csv', 'xsum_1_text_en.json', 'xsum_2_text_en.json', 'xsum_3_text_en.json', 'xsum_4_text_en.json', 'xsum_5_text_en.json']
0
25


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 2/2 [01:42<00:00, 51.45s/it] 


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00,  5.11it/s]

done in 1559143.69 seconds, 0.00 sentences/sec





In [12]:
# Quick check: the last two characters of a text key give its language suffix.
a = "text_en"
print(a[len(a) - 2:])

en


### Hardware use

In [3]:
# IPython shell escape: runs the `nvidia-smi` command-line tool and shows its output in the cell.
# The command has to be available on the machine running the kernel.
!nvidia-smi
# model.hf_device_map

'nvidia-smi' n'est pas reconnu en tant que commande interne
ou externe, un programme exécutable ou un fichier de commandes.


### TESTS

In [None]:
import transformers

name = 'mosaicml/mpt-7b-instruct'
name2 = "Trelis/mpt-7b-instruct-hosted-inference-8bit"

config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096

tokenizer = AutoTokenizer.from_pretrained('legendhasit/xgen-7b-8k-inst-8bit', trust_remote_code=True)

!nvidia-smi

model = transformers.AutoModelForCausalLM.from_pretrained(
  name,
  config=config,
  trust_remote_code=True
)

In [None]:
!nvidia-smi

input_test = tokenizer(prompt, return_tensors="pt").to('cuda')
sample = model.generate(**input_test, do_sample=True, max_new_tokens=100, top_k=20, temperature=0.3)
print(tokenizer.decode(sample[0]).strip())

In [11]:
# Score the generations of one given inference batch (ROUGE and BERTScore) and store one row per
# output file in a csv file inside the batch folder.
# TODO: handle multilingual BERT.
# Keep this second program: it is adapted to the older generations.
# English to French: expansion rate of about 20%.

rouge = evaluate.load('rouge') # Warning ! When comparing many references to a prediction, an average is performed, not a max...
bertscore = evaluate.load('bertscore') # The BERTScore calculation will take a lot of time... (if not performed in only one call)

mkdir('results')

nb_batch = "2" # Number of the inference batch to score

# Define the place of storage : "results/x/scores.csv"
output_names = os.listdir("results/" + nb_batch)
# The texts for bertscore
references, predictions = [], []
# The columns of the csv file
output_ids, rouges, input_nb_words_list, nb_words_generated_summaries, nb_words_gold_summaries, outputs_success, outputs_over_context, languages_output, languages_input = [], [], [], [], [], [], [], [], []

for output_name in output_names:
    # To ensure the read file is an output.
    # Generations are the only .json files of the folder: this skips desc.txt and every scores*.csv
    # written by a previous run of a scoring cell.
    output_path = "results/" + nb_batch + "/" + output_name
    if os.path.isdir(output_path) or not output_name.endswith(".json"):
        continue
    # Load the output data dictionary
    output_data = load_json_into_dict(output_path)
    # Get the generated text, the language of the output and the reference summaries
    generated_summary = output_data["text"]
    language_output = str(classify(generated_summary))
    # Any language other than the known codes falls back to the English references.
    language_code = number_to_code.get(language_output, "text_en")
    find_path = output_name.split('_') # 0: corpus, 1: name json input file, 2: text (quite useless) 3: fr.json or en.json
    input_path = "datasets/" + name_dataset + "/" + find_path[0] + "/" + find_path[1]
    input_data = load_json_into_dict(input_path)
    summaries_data = input_data["summaries"]
    # Preparation for the max ROUGE computation
    golds = [] # Only used for the BERTScore computation with several references (presumably filled by maxRouge - TODO confirm)
    max_rouge2, max_rougel, nb_words_closest_gold = maxRouge(summaries_data, language_code, generated_summary, golds)
    # Fill in the row
    output_ids.append(output_name)
    languages_output.append(language_output)
    rouges.append([max_rouge2, max_rougel])
    predictions.append(generated_summary)
    references.append(golds)
    nb_words_gold_summaries.append(nb_words_closest_gold)
    nb_words_generated_summaries.append(output_data['nb_words'])
    outputs_success.append(output_data['success'])
    outputs_over_context.append(output_data['over_context'])
    # NOTE(review): this reads "nb_words_text_en" / "nb_words_text_fr", whereas the other scoring cell
    # reads "nb_words_en" / "nb_words_fr" - confirm which key the input files of this batch contain.
    input_nb_words_list.append(int(input_data['nb_words_' + language_code]))
    languages_input.append(key_to_language[output_data['input_language']])

max_bertscores = bertscore.compute(predictions=predictions, references=references, lang='en', rescale_with_baseline=True, verbose=True)['f1'] # PROBLEM WITH LANGUAGE USED

rows = [[output_ids[i], rouges[i][0], rouges[i][1], max_bertscores[i], input_nb_words_list[i], nb_words_gold_summaries[i], nb_words_generated_summaries[i], outputs_success[i], outputs_over_context[i], languages_output[i], languages_input[i]] for i in range(len(output_ids))]

header = [["input_path", "rouge2", "rougel", "bertscore", "nb_words_input", "nb_words_gold", "nb_words_generated", "success", "over_context", "output_language", "input_language"]]

# The file is only opened once all the scores are available, and it is closed even if writing fails.
with open('results/' + nb_batch + '/scores4.csv', mode='w', newline='', encoding='utf-8') as storage_file:
    csv_writer = csv.writer(storage_file)
    csv_writer.writerows(header + rows)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 2/2 [04:06<00:00, 123.09s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00,  3.60it/s]

done in 1107629.12 seconds, 0.00 sentences/sec





In [4]:
# Load the Llama 2 chat tokenizer and display the id of its end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    trust_remote_code=True,
)
print(tokenizer.eos_token_id)

Downloading (…)okenizer_config.json: 100%|██████████| 776/776 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 13.9MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 6.20MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 414/414 [00:00<00:00, 161kB/s]


2


In [None]:
# Manual test of the Falcon pipeline: prompt formatting, generation, then output treatment.
tokenizer = tokenizers["tiiuae/falcon-7b"]()
model = models["tiiuae/falcon-7b"]()

# The inference loop fills the prompt templates through the `instruction` keyword, so the same keyword
# is used here (a positional argument cannot fill a named placeholder).
prompt = prompt_templates["tiiuae/falcon-7b"].format(instruction="Hey, how are you my dear Falcon ?")
tokenized_input = tokenizer(prompt, return_token_type_ids=False, return_tensors="pt").to('cuda')
output = infer["tiiuae/falcon-7b"](tokenized_input, model)
# Decode once and reuse the text for both the raw and the treated display.
decoded_output = tokenizer.decode(output[0]).strip()
print(decoded_output)
print()
print("Treated ================================")
print()
print(treat_output["tiiuae/falcon-7b"](decoded_output))