In [1]:
# Import libraries
import os, sys

# Pytorch
import torch

# Import some Hugging Face Libraries
import transformers
from datasets import load_dataset, load_from_disk

# Switch on the TF32 flag for CUDA matmuls and for cuDNN in one statement
torch.backends.cuda.matmul.allow_tf32 = torch.backends.cudnn.allow_tf32 = True

# Hand cached-but-unused CUDA memory back before anything is allocated
torch.cuda.empty_cache()

# Debugging aid: tensors up to 10,000 elements are printed in full instead of summarized
torch.set_printoptions(threshold=10_000)

In [2]:
# Training parameters
# presumably the context length in tokens — it is not used in the visible cells, TODO confirm
context = 1024

# Choose the compute device: CUDA when PyTorch reports one, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device: You are using ", device)


Device: You are using  cuda


In [6]:
# --- Paths -----------------------------------------------------------------
# Every path is built with os.path.join so the notebook resolves the same
# directories on any OS (hand-written '\\' separators only worked on Windows).
path = os.getcwd()
dataset_name = 'facebook/natural_reasoning'
tokenizer_path = os.path.join(path, 'tokenizers', 'tok16384')
checkpoint_dir = os.path.join(path, 'models', '')  # trailing separator kept for string concatenation

dataset_path_1 = os.path.join(path, 'data2', 'General-Knowledge')
dataset_path_2 = os.path.join(path, 'data2', 'natural_reasoning')
# Location where THIS cell caches the tokenized dataset
dataset_path = os.path.join(path, 'data2', 'other')

# --- Tokenizer -------------------------------------------------------------
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path)

# Set the tokenizer parameters
# tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>\n' }}\n{% endif %}\n{% endfor %}"

# Reuse the end-of-sentence token as the padding token
# (ID 2 for this tokenizer according to the original note — TODO confirm)
tokenizer.pad_token = tokenizer.eos_token


def _first_response(responses):
    """Return the text of the first entry of `responses`, or "" when it is missing or not a string."""
    if not responses:
        # None or an empty list: there is nothing to index into
        return ""
    first = responses[0]
    text = first.get('response') if isinstance(first, dict) else None
    return text if isinstance(text, str) else ""


def preprocess_dataset(examples, max_length=256):
    """Tokenize one batch of question/response pairs for training.

    Args:
        examples: batch dict passed by `Dataset.map(batched=True)`; reads the
            'question' and 'responses' columns.
        max_length: number of tokens every question and every answer is
            padded/truncated to (256 by default, as before).

    Returns:
        dict with 'input_ids' and 'attention_mask' from the tokenized
        questions and 'labels' from the tokenized first response.
    """
    # Non-string (e.g. null) values are replaced by "" so the tokenizer never sees them
    questions = [q if isinstance(q, str) else "" for q in examples['question']]
    answers = [_first_response(a) for a in examples['responses']]

    input_encodings = tokenizer(
        questions, truncation=True, padding="max_length", max_length=max_length, return_tensors="np"
    )
    target_encodings = tokenizer(
        answers, truncation=True, padding="max_length", max_length=max_length, return_tensors="np"
    )

    # NOTE(review): questions and answers are tokenized separately and padded with
    # the pad/EOS token; padded label positions are not masked (e.g. with -100).
    # Confirm this is what the training loop expects.
    return {
        'input_ids': input_encodings['input_ids'],
        'labels': target_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'].tolist(),
    }


# --- Dataset ---------------------------------------------------------------
# The guard checks the directory this cell actually writes, and the cached
# branch loads it, so `dataset` is defined whichever branch runs.
if os.path.exists(dataset_path):
    dataset = load_from_disk(dataset_path)
else:
    print("Dataset not found, loading from Hugging Face")
    raw_dataset = load_dataset(dataset_name, split='train')

    # Apply the transformation, then cache the result for later runs
    dataset = raw_dataset.map(preprocess_dataset, batched=True, remove_columns=['question', 'responses'])
    dataset.save_to_disk(dataset_path)
        
    

Dataset not found, loading from Hugging Face


Map:   0%|          | 0/1145824 [00:00<?, ? examples/s]

Saving the dataset (0/6 shards):   0%|          | 0/1145824 [00:00<?, ? examples/s]

In [None]:

# Sanity check: decode the first example's question tokens, then its label tokens
first_example = dataset[0]
for field in ('input_ids', 'labels'):
    print(tokenizer.decode(first_example[field]))