In [1]:
pip install transformers datasets accelerate peft

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (f

In [2]:
import os
import json
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)


In [3]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The token `GGGGGGG` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-auth

In [10]:
!pip install --upgrade transformers



In [5]:
!pip install transformers datasets accelerate peft
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset, load_dataset
from transformers import DataCollatorForLanguageModeling

# 1. Configuration
MODEL_NAME = "Salesforce/codegen-350M-mono"  # Changed to codegen-350M-mono
DATASET_PATH = "/content/manim_gen.jsonl"
OUTPUT_DIR = "./codegen-350M-mono-finetuned"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Load Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token if not defined
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)

# 3. Prepare Data
def load_and_preprocess_data(file_path):
    formatted_data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                item = json.loads(line)
                instruction, output = item['text'].split("\n<output>: ")
                instruction = instruction.replace("<instruction>: ", "")
                prompt = f"Input: {instruction}\nOutput: {output}"
                formatted_data.append({"text": prompt})
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON line: {line.strip()} due to error: {e}")
            except ValueError as e:
                print(f"Skipping line due to incorrect format: {e}, line content: {line.strip()}")
    return Dataset.from_list(formatted_data)

dataset = load_and_preprocess_data(DATASET_PATH)

# 4. Tokenization
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 5. Split train/test
if len(tokenized_datasets) > 100:
    split_datasets = tokenized_datasets.train_test_split(test_size=0.1)
    train_dataset = split_datasets["train"]
    eval_dataset = split_datasets["test"]
else:
    train_dataset = tokenized_datasets
    eval_dataset = None

# 6. Training Arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Reduced batch size to 1
    per_device_eval_batch_size=1,  # Reduced batch size to 1
    gradient_accumulation_steps=4,  # Increased gradient accumulation steps
    gradient_checkpointing=True,
    fp16=DEVICE == "cuda",
    save_steps=10000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
)
# 7. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset if eval_dataset else None,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
)

# 8. Fine-tuning
trainer.train()

# 9. Save Model
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"✅ Fine-tuning terminé ! Modèle sauvegardé dans : {OUTPUT_DIR}")



Some weights of the model checkpoint at Salesforce/codegen-350M-mono were not used when initializing CodeGenForCausalLM: ['transformer.h.0.attn.causal_mask', 'transformer.h.1.attn.causal_mask', 'transformer.h.10.attn.causal_mask', 'transformer.h.11.attn.causal_mask', 'transformer.h.12.attn.causal_mask', 'transformer.h.13.attn.causal_mask', 'transformer.h.14.attn.causal_mask', 'transformer.h.15.attn.causal_mask', 'transformer.h.16.attn.causal_mask', 'transformer.h.17.attn.causal_mask', 'transformer.h.18.attn.causal_mask', 'transformer.h.19.attn.causal_mask', 'transformer.h.2.attn.causal_mask', 'transformer.h.3.attn.causal_mask', 'transformer.h.4.attn.causal_mask', 'transformer.h.5.attn.causal_mask', 'transformer.h.6.attn.causal_mask', 'transformer.h.7.attn.causal_mask', 'transformer.h.8.attn.causal_mask', 'transformer.h.9.attn.causal_mask']
- This IS expected if you are initializing CodeGenForCausalLM from the checkpoint of a model trained on another task or with another architecture (e

Map:   0%|          | 0/599 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhalimalaghrida70[0m ([33mhalimalaghrida70-ensias[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
100,0.4189
200,0.1669
300,0.1299
400,0.1


✅ Fine-tuning terminé ! Modèle sauvegardé dans : ./codegen-350M-mono-finetuned


In [12]:
import transformers
print(transformers.__version__)

4.51.3


In [8]:
import json
import torch
from transformers import CodeGenTokenizer, CodeGenForCausalLM, TrainingArguments, Trainer
from datasets import Dataset

# 1. Configuration
MODEL_NAME = "/content/salesforce-codegen-350M-multi"
DATASET_PATH = "/manim_gen.jsonl"  # Remplacez par le chemin vers votre fichier JSON
OUTPUT_DIR = "./codegen-finetuned"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Chargement du tokenizer et du modèle
tokenizer = CodeGenTokenizer.from_pretrained(MODEL_NAME)
# Add this line to define the pad token if it's not already defined:
tokenizer.pad_token = tokenizer.eos_token
model = CodeGenForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)

# 3. Préparation des données
def load_and_preprocess_data(file_path):
    formatted_data = []
    # Assuming JSON Lines format (one JSON object per line)
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                item = json.loads(line) # Load each line as a separate JSON object
                # Split the "text" field into instruction and output
                instruction, output = item['text'].split("\n<output>: ")
                instruction = instruction.replace("<instruction>: ", "") # Clean up instruction

                # Create the prompt
                prompt = f"Input: {instruction}\nOutput: {output}"
                formatted_data.append({"text": prompt})
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON line: {line.strip()} due to error: {e}") # Print an error message for any invalid lines
            except ValueError as e:  # Catch errors from splitting
                print(f"Skipping line due to incorrect format: {e}, line content: {line.strip()}")


    return Dataset.from_list(formatted_data)

dataset = load_and_preprocess_data(DATASET_PATH)
# 4. Tokenisation des données
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 5. Division en train/test (optionnel)
if len(tokenized_datasets) > 100:  # Seulement si le dataset est assez grand
    split_datasets = tokenized_datasets.train_test_split(test_size=0.1)
    train_dataset = split_datasets["train"]
    eval_dataset = split_datasets["test"]
else:
    train_dataset = tokenized_datasets
    eval_dataset = None

# 6. Configuration de l'entraînement
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="steps" if eval_dataset else "no",
    eval_steps=500 if eval_dataset else None,
    logging_dir='./logs',
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
    fp16=DEVICE == "cuda",
)

# 7. Création du Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# 8. Fine-tuning
trainer.train()

# 9. Sauvegarde du modèle finetuné
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Fine-tuning terminé! Modèle sauvegardé dans {OUTPUT_DIR}")

Map:   0%|          | 0/599 [00:00<?, ? examples/s]

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
import json
import torch
from transformers import CodeGenTokenizer, CodeGenForCausalLM, TrainingArguments, Trainer
from datasets import Dataset

# 1. Configuration
MODEL_NAME = "/content/salesforce-codegen-350M-multi"
DATASET_PATH = "/manim_gen.jsonl"  # Remplacez par le chemin vers votre fichier JSON
OUTPUT_DIR = "./codegen-finetuned"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Chargement du tokenizer et du modèle
tokenizer = CodeGenTokenizer.from_pretrained(MODEL_NAME)
model = CodeGenForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)

# 3. Préparation des données
def load_and_preprocess_data(file_path):
    formatted_data = []
    # Assuming JSON Lines format (one JSON object per line)
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                item = json.loads(line) # Load each line as a separate JSON object
                # Split the "text" field into instruction and output
                instruction, output = item['text'].split("\n<output>: ")
                instruction = instruction.replace("<instruction>: ", "") # Clean up instruction

                # Create the prompt
                prompt = f"Input: {instruction}\nOutput: {output}"
                formatted_data.append({"text": prompt})
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON line: {line.strip()} due to error: {e}") # Print an error message for any invalid lines
            except ValueError as e:  # Catch errors from splitting
                print(f"Skipping line due to incorrect format: {e}, line content: {line.strip()}")


    return Dataset.from_list(formatted_data)

dataset = load_and_preprocess_data(DATASET_PATH)

In [None]:
import os
import json
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

# Define paths
MODEL_NAME = "WizardLM/WizardCoder-Python-34B-V1.0"  # Adjust to your specific WizardCoder model
OUTPUT_DIR = "finetuned-wizardcoder-manim"
DATASET_PATH = "/content/manim_gen.jsonl"  # Path to your existing JSON file

# Load dataset
def load_dataset(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)

    # If your JSON file is already a list of dictionaries with "text" field, use:
    # return Dataset.from_list(data)

    # If your JSON file has a different structure, you may need to process it:
    processed_data = []
    # Add your processing logic here if needed
    # For example, if it's a JSONL file (one JSON object per line):
    # with open(file_path, 'r') as f:
    #     for line in f:
    #         processed_data.append(json.loads(line))

    return Dataset.from_list(data)  # Or processed_data if you need to transform it

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # Use fp16 for memory efficiency
    device_map="auto"  # Automatically distribute model across available GPUs
)

# Set tokenizer padding token if needed
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load and process the dataset
dataset = load_dataset(DATASET_PATH)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=2048)

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_dataset

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Not using masked language modeling
)

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Adjust based on your GPU memory
    gradient_accumulation_steps=8,  # Adjust based on your GPU memory
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,  # Use fp16 training
    warmup_steps=100,
    report_to="tensorboard",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Train model
trainer.train()

# Save model
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Model fine-tuned and saved to {OUTPUT_DIR}")
