In [1]:
# One-time environment setup (uncomment to run; `%pip` installs into the active kernel's environment):
# %pip install -U datasets trl accelerate peft bitsandbytes transformers huggingface_hub

In [2]:
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig, PeftModel
from huggingface_hub import login

In [3]:
import os
from datasets import Dataset,load_from_disk
import pandas as pd

# Directory holding the raw corpus: one UTF-8 text document per .txt file.
data_dir =  "documents/"

# Build one {'filename', 'text'} record per .txt file.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the dataset's row order reproducible across runs and machines.
data = []
for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith('.txt'):
        continue
    with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    data.append({'filename': filename, 'text': text})

# Fail fast: an empty record list would yield a frame with no 'text' column,
# which only surfaces later as an obscure error when the dataset is consumed.
if not data:
    raise FileNotFoundError(f"No .txt files found in {data_dir!r}")

# Records -> pandas DataFrame -> Hugging Face Dataset.
df = pd.DataFrame(data)
dataset1 = Dataset.from_pandas(df)

# Persist the dataset so it can be reloaded without re-reading the text files.
dataset1.save_to_disk('save_my_dataset')


Saving the dataset (0/1 shards):   0%|          | 0/383 [00:00<?, ? examples/s]

In [4]:
print(dataset1[0])

{'filename': '10ÈME_ÉDITION_DU_CONCOURS_FRANCOPHONE_INTERNATIONAL_MA_THÈSE_EN_180_SECONDES.txt', 'text': 'Titre: 10ÈME ÉDITION DU CONCOURS FRANCOPHONE INTERNATIONAL «\xa0MA THÈSE EN 180 SECONDES\xa0», Date: janvier 10, 2024. Pour plus de détails, veuillez consulter le lien suivant : https://fstt.ac.ma/Portail2023/10eme-edition-du-concours-francophone-international-ma-these-en-180-secondes/'}


In [5]:
# Identifier of the base checkpoint that is loaded and fine-tuned below.
model_id = "NousResearch/Llama-2-7b-chat-hf"

# Name used for the fine-tuned output (passed to save_pretrained later on).
new_model = "llama-2-7b-chat-FSTT"


In [6]:
import torch
# dtype handed to bitsandbytes as the 4-bit compute dtype.
compute_dtype = torch.float16

# 4-bit NF4 quantisation settings; double quantisation is left off.
quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

# Load the base checkpoint with the quantisation config applied.
# The {"": 0} device map assigns the root module (i.e. everything) to device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=quant_config,
)

# Config tweaks applied before training.
# NOTE(review): use_cache=False is presumably needed because gradient
# checkpointing is enabled in the training arguments — confirm.
model.config.pretraining_tp = 1
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [7]:
# Tokenizer matching the base checkpoint.
# NOTE(review): trust_remote_code=True permits executing code shipped with the
# hub repo; presumably unnecessary for a stock Llama tokenizer — confirm before
# removing.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
)

# Pad on the right, and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [8]:
# LoRA adapter configuration for causal-LM fine-tuning:
# rank 64, alpha 16, 10% adapter dropout, no bias parameters.
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
     

In [9]:
# Hyperparameters for a single-epoch fine-tune on a small, memory-constrained GPU.
#
# Two settings differ from the earlier revision of this cell:
#   * lr_scheduler_type: "constant" ignores warmup entirely, so warmup_ratio had
#     no effect; "constant_with_warmup" gives the intended warmup-then-constant
#     schedule.
#   * logging_steps: with 383 examples and an effective batch of 16 the run is
#     only ~24 optimizer steps, so logging every 1000 steps never logged anything.
training_params = TrainingArguments(
    output_dir = "results",
    num_train_epochs = 1,                        # Single pass over the dataset
    max_steps = -1,                              # -1: run length is governed by num_train_epochs
    per_device_train_batch_size = 2,             # Small micro-batch to limit memory use
    gradient_accumulation_steps = 8,             # Effective batch = 2 * 8 = 16 sequences per optimizer step
    optim = "adamw_torch",
    learning_rate = 5e-6,                        # Deliberately low to mitigate instability
    weight_decay = 0.01,                         # Mild regularization
    max_grad_norm = 0.5,                         # Gradient-norm clipping threshold
    warmup_ratio = 0.1,                          # Fraction of total steps used for linear warmup
    lr_scheduler_type = "constant_with_warmup",  # Warmup, then hold the learning rate constant
    fp16 = True,                                 # fp16 mixed-precision training
    bf16 = False,                                # bfloat16 disabled (original note: target GPU lacks bf16 support)
    group_by_length = True,                      # Batch similar-length samples together to reduce padding
    save_steps = 1000,                           # NOTE(review): exceeds the ~24 total steps, so no intermediate checkpoint is expected
    logging_steps = 5,                           # Log several times within the short run
    report_to = "tensorboard",                   # Send training logs to TensorBoard

    # Memory-related settings:
    gradient_checkpointing = True,               # Recompute activations in backward to save memory
    fp16_full_eval = True,                       # Run evaluation fully in fp16 (inert unless an eval dataset is supplied)
    dataloader_pin_memory = False,               # Do not pin host memory in the dataloader
    local_rank = -1,                             # -1: not launched as a distributed job
)

In [10]:
from trl import SFTTrainer

In [11]:
# Supervised fine-tuning trainer: wraps the quantised base model with the LoRA
# config and trains on the raw 'text' column of dataset1.
trainer = SFTTrainer(
    # What to train and how.
    model=model,
    peft_config=peft_params,
    args=training_params,
    # Data and tokenisation.
    train_dataset=dataset1,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=None,   # None: defer to the trainer's default sequence length
    packing=False,         # one document per training sequence, no packing
)



Map:   0%|          | 0/383 [00:00<?, ? examples/s]

In [15]:
import torch.cuda as cuda

# Release cached CUDA memory held by PyTorch's allocator, then launch training.
# The training result is left as the cell's last expression so it is displayed.
cuda.empty_cache()
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 250.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 6.42 GiB is allocated by PyTorch, and 476.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [12]:
# Write the trainer's model and tokenizer into the `new_model` directory.
# NOTE(review): the recorded execution counts show this cell ran (In [12])
# before the training cell (In [15]) — re-run after training completes so the
# saved weights are the trained ones.
trained_model = trainer.model
trained_model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

('llama-2-7b-chat-FSTT\\tokenizer_config.json',
 'llama-2-7b-chat-FSTT\\special_tokens_map.json',
 'llama-2-7b-chat-FSTT\\tokenizer.json')

In [None]:
# Keyword arguments later unpacked into transformers.pipeline(**config).
# NOTE(review): pipeline's `config` parameter is documented as a str or a
# PretrainedConfig; this plain dict presumably has no effect on the output
# language — confirm.
config = dict(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=250,
    config=dict(language="fr"),
)

In [None]:
# Only show CRITICAL-level messages from the transformers logger.
logging.set_verbosity(logging.CRITICAL)

# Question to ask the model, wrapped in [INST] ... [/INST] instruction tags.
prompt = "C'est quoi FSTT?"
wrapped_prompt = "[INST] " + prompt + " [/INST]"

# Build the text-generation pipeline from the shared config and run it once.
pipe = pipeline(**config)
result = pipe(wrapped_prompt)

first_generation = result[0]
print(first_generation['generated_text'])