In [2]:
# Configuring the character encoding
import locale


def getpreferredencoding(do_setlocale=True):
    return "UTF-8"


locale.getpreferredencoding = getpreferredencoding

!pip install -U accelerate peft bitsandbytes transformers trl datasets wandb mlflow python-dotenv pyngrok numpy==1.24.3



In [3]:
# Imports
import os
import sys
import time
import torch
import wandb
import numpy
import random
import mlflow
import hashlib
from dotenv import load_dotenv
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from datasets import Dataset, DatasetDict, Features, Value
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from pyngrok import ngrok

In [4]:
!nvidia-smi

Tue Jan  7 14:04:33 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              41W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
# Check if the notebook runs on Colab to adjust paths
on_colab = 'google.colab' in sys.modules

if on_colab:
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    test_file = "/content/drive/MyDrive/Data/docstring_test_data.txt"
    train_file = "/content/drive/MyDrive/Data/docstring_training_data.txt"
    base_output_dir = "/content/drive/MyDrive/Models/"
    print("The notebook runs on Google Colab.")

    # Load API-Token from Colab-Secrets
    huggingface_api_token = userdata.get('huggingface_api_token')
    wandb_api_token = userdata.get('wandb_api_token')
    ngrok_api_token = userdata.get('ngrok_api_token')
else:
    test_file = "../Data/docstring_test_data.txt"
    train_file = "../Data/docstring_training_data.txt"
    base_output_dir = "../Models/"
    print("The notebook is running locally.")

    # Load API-Token from .env
    load_dotenv()
    huggingface_api_token = os.getenv("HUGGINGFACE")
    wandb_api_token = os.getenv("WANDB")
    ngrok_api_token = os.getenv('NGROK')

if huggingface_api_token and wandb_api_token and ngrok_api_token:
    print("Access token loaded.")
else:
    print("Access token not found.")

Mounted at /content/drive
The notebook runs on Google Colab.
Access token loaded.


In [6]:
def file_ready(filepath, min_size):
    """Check whether the file exists and exceeds a minimum size."""
    return os.path.isfile(filepath) and os.path.getsize(filepath) >= min_size


timeout = 60
min_size = 42
start_time = time.time()

while not (file_ready(test_file, min_size) and file_ready(train_file, min_size)):
    elapsed_time = time.time() - start_time
    if elapsed_time > timeout:
        print("Timeout")
        break
else:
    print("TEST (File size", os.path.getsize(test_file), "bytes):")
    with open(test_file) as f:
        for i, line in enumerate(f):
            print(line)
            if i == 2: break

    print("TRAIN (File size", os.path.getsize(train_file), "bytes):")
    with open(train_file) as f:
        for i, line in enumerate(f):
            print(line)
            if i == 2: break

TEST (File size 6673607 bytes):
[Function] def shortest_dist(dist_mat):\n    (m, n) = dist_mat.size()[:2]\n    dist = [[0 for _ in range(n)] for _ in range(m)]\n    for i in range(m):\n        for j in range(n):\n            if i == 0 and j == 0:\n                dist[i][j] = dist_mat[i, j]\n            elif i == 0 and j > 0:\n                dist[i][j] = dist[i][j - 1] + dist_mat[i, j]\n            elif i > 0 and j == 0:\n                dist[i][j] = dist[i - 1][j] + dist_mat[i, j]\n            else:\n                dist[i][j] = torch.min(dist[i - 1][j], dist[i][j - 1]) + dist_mat[i, j]\n    dist = dist[-1][-1]\n    return dist [Docstring] Parallel version.\nArgs:\n  dist_mat: pytorch Variable, available shape:\n    1) [m, n]\n    2) [m, n, N], N is batch size\n    3) [m, n, *], * can be arbitrary additional dimensions\nReturns:\n  dist: three cases corresponding to `dist_mat`:\n    1) scalar\n    2) pytorch Variable, with shape [N]\n    3) pytorch Variable, with shape [*] [EOS]

[Fu

In [7]:
def load_dataset_from_text_files(train_file_path, test_file_path):
    """
    Load training and test datasets from text files into a DatasetDict.

    Args:
        train_file_path (str): Path to the training data text file.
        test_file_path (str): Path to the test data text file.

    Returns:
        DatasetDict: A dictionary containing 'train' and 'test' datasets with text data.
    """

    def get_lines_from_file(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = [line.strip() for line in file if line.strip()]
        print(f'Total lines loaded from {file_path}: {len(lines)}')
        return lines

    def get_dataset_generator(file_path):
        lines = get_lines_from_file(file_path)
        for line in lines:
            yield {"text": line}

    dataset_train = Dataset.from_generator(
        generator=lambda: get_dataset_generator(train_file_path),
        features=Features({'text': Value('string')})
    )
    dataset_test = Dataset.from_generator(
        generator=lambda: get_dataset_generator(test_file_path),
        features=Features({'text': Value('string')})
    )
    return DatasetDict({"train": dataset_train, "test": dataset_test})


datasets = load_dataset_from_text_files(train_file, test_file)

Generating train split: 0 examples [00:00, ? examples/s]

Total lines loaded from /content/drive/MyDrive/Data/docstring_training_data.txt: 9335


Generating train split: 0 examples [00:00, ? examples/s]

Total lines loaded from /content/drive/MyDrive/Data/docstring_test_data.txt: 4015


In [8]:
# Model selection and configuration
model_to_finetune = "meta-llama/CodeLlama-7b-Python-hf"
# model_to_finetune = "tiiuae/falcon-rw-1b"
# model_to_finetune = "tiiuae/falcon-7b"

if on_colab:
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    torch_dtype = None
    device_map = "auto"
    print("Notebook is running on Colab: Using 4-bit quantization.")
else:
    quantization_config = None
    torch_dtype = torch.bfloat16
    device_map = "cpu"
    print("Notebook is running locally: Using bfloat16 precision.")

# Load tokeniser and model with auth token
tokenizer = AutoTokenizer.from_pretrained(
    model_to_finetune,
    token=huggingface_api_token,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

model = AutoModelForCausalLM.from_pretrained(
    model_to_finetune,
    token=huggingface_api_token,
    device_map=device_map,
    quantization_config=quantization_config,
    torch_dtype=torch_dtype,
    trust_remote_code=True,
)

# Deactivating cache & setting pretraining
model.config.use_cache = False
model.config.pretraining_tp = 1

Notebook is running on Colab: Using 4-bit quantization.


tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [9]:
# Tokenisation of the data sets for training
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)


tokenized_datasets = datasets.map(tokenize_function, batched=True)

# Example output of some training examples
for i in range(5):
    index = random.randint(0, len(tokenized_datasets["train"]) - 1)
    print("Train example", index, ":", tokenized_datasets["train"][index])

Map:   0%|          | 0/9335 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/4015 [00:00<?, ? examples/s]

Train example 3091 : {'text': "[Function] def gmm_scores(X, n_components=2):\\n    gmm = GMM(n_components, 'full', n_init=10)\\n    gmm.fit(X.T)\\n    nominal_component = gmm.weights_.argmax()\\n    if n_components == 2:\\n        scores = gmm.score_samples(X.T)[1][:, 1 - nominal_component]\\n    else:\\n        scores = 1.0 - gmm.score_samples(X.T)[1][:, nominal_component]\\n    if np.ma.isMaskedArray(X):\\n        scores = np.ma.MaskedArray(scores, X.mask[0, :] if X.ndim > 1 else X.mask)\\n    return scores [Docstring] Fits a Gaussian Mixture Model to the data and detects anomalies based on that model.\\n\\nThe component with the highest weight will be considered the model for the nominal part of\\nthe time-series. If the a-priori score for a point under any other component is higher, it\\nwill be considered anomalous. [EOS]", 'input_ids': [1, 518, 6678, 29962, 822, 330, 4317, 29918, 1557, 2361, 29898, 29990, 29892, 302, 29918, 14036, 29922, 29906, 1125, 29905, 29876, 1678, 330, 4317

In [10]:
def test_model_response_pipeline(model, tokenizer, prompts, max_new_tokens=50):
    """
    Tests the model's response to a list of prompts using Hugging Face's pipeline.

    Args:
        model (PreTrainedModel): The loaded model.
        tokenizer (PreTrainedTokenizer): The tokenizer associated with the model.
        prompts (list): A list of input prompts as strings.
        max_new_tokens (int, optional): Maximum number of tokens to generate. Defaults to 50.

    Returns:
        list: A list of the model's responses to the prompts.
    """
    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )

    responses = [
        text_generator(prompt, max_new_tokens=max_new_tokens, do_sample=True, top_k=10, temperature=0.7)[0][
            "generated_text"]
        for prompt in prompts
    ]
    return responses


prompts = [
    "Write a docstring for the following Python code:\n [Function]\ndef add_numbers(a, b): return a + b \n [Docstring]\n",
    "Write a docstring for the following Python code:\n [Function]\ndef subtract_numbers(a, b): return a - b \n [Docstring]\n",
]

responses = test_model_response_pipeline(model, tokenizer, prompts, max_new_tokens=100)
for i, response in enumerate(responses):
    print(f"Prompt {i + 1} - Response:\n {response}")

Device set to use cuda:0


Prompt 1 - Response:
 Write a docstring for the following Python code:
 [Function]
def add_numbers(a, b): return a + b 
 [Docstring]
"""Adds two numbers and returns their sum.
[Args]
a (int): An integer.
b (int): Another integer.
[Returns]
(int) The sum of a and b.
"""

Write a docstring for the following Python code:
 [Function]
def print_sum(a, b=""): print(a + b) 
 [Docstring]
""" Prints the sum of two arguments.
Prompt 2 - Response:
 Write a docstring for the following Python code:
 [Function]
def subtract_numbers(a, b): return a - b 
 [Docstring]
subtract_numbers.__doc__ = """Subtracts argument b from a and returns the result""" 

Write a docstring for the following Python code:
[Function]
def get_words(text): 
  return text.split()
[Docstring]
get_words.__doc__ = """Returns a list of words in the string text""" 

Write a docstring for the following Python code:
 [Function]


In [11]:
# Show layers
for name, module in model.named_modules():
    print(name)


model
model.embed_tokens
model.layers
model.layers.0
model.layers.0.self_attn
model.layers.0.self_attn.q_proj
model.layers.0.self_attn.k_proj
model.layers.0.self_attn.v_proj
model.layers.0.self_attn.o_proj
model.layers.0.self_attn.rotary_emb
model.layers.0.mlp
model.layers.0.mlp.gate_proj
model.layers.0.mlp.up_proj
model.layers.0.mlp.down_proj
model.layers.0.mlp.act_fn
model.layers.0.input_layernorm
model.layers.0.post_attention_layernorm
model.layers.1
model.layers.1.self_attn
model.layers.1.self_attn.q_proj
model.layers.1.self_attn.k_proj
model.layers.1.self_attn.v_proj
model.layers.1.self_attn.o_proj
model.layers.1.self_attn.rotary_emb
model.layers.1.mlp
model.layers.1.mlp.gate_proj
model.layers.1.mlp.up_proj
model.layers.1.mlp.down_proj
model.layers.1.mlp.act_fn
model.layers.1.input_layernorm
model.layers.1.post_attention_layernorm
model.layers.2
model.layers.2.self_attn
model.layers.2.self_attn.q_proj
model.layers.2.self_attn.k_proj
model.layers.2.self_attn.v_proj
model.layers.2.

In [15]:
# Fine-tuning configuration
model_name = model_to_finetune

# Generate a random run ID
current_time = str(time.time()).encode('utf-8')
hash_object = hashlib.sha256(current_time)
hex_digest = hash_object.hexdigest()
random_string = hex_digest[:12]
run_id = random_string

# LoRA parameters
lora_r = 8
lora_alpha = 16
lora_dropout = 0.3

# Training parameter
num_train_epochs = 3
per_device_train_batch_size = 1
per_device_eval_batch_size = 4
gradient_accumulation_steps = 4
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 1.5e-4
weight_decay = 0.001
optim = "adamw_torch"
lr_scheduler_type = "constant_with_warmup"
max_steps = 20000
warmup_ratio = 0.01
group_by_length = True
save_steps = 100
logging_steps = 10
eval_steps = 25

# Output directory
run_name = f"{model_name}_run{run_id}"
output_dir = os.path.join(base_output_dir, run_name)
print(f"Output directory: {output_dir}")

# Fine-tuned model name
new_model = os.path.join(output_dir, "end_of_training")

# Target modules to adapt key components to the model type (Falcon / CodeLlama):
# - Attention Projections: Query, Key, Value, and Output
# - Feed-Forward Network: Input (Expansion) and Output (Reduction)
# - Embedding Matrix: Maps tokens to dense vectors
if "CodeLlama" in model_name:
    target_modules = [
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.down_proj",
        "embed_tokens",
    ]
elif "falcon" in model_name:
    target_modules = [
        "self_attention.query_key_value",
        "self_attention.dense",
        "mlp.dense_h_to_4h",
        "mlp.dense_4h_to_h",
        "word_embeddings",
    ]
else:
    target_modules = None

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="all",
    task_type="CAUSAL_LM",
    target_modules=target_modules,
)


# Monitoring
projectname='DocstringGenerator'

# Initialize MLflow
if on_colab:
    # Set ngrok API token
    from pyngrok import ngrok
    ngrok.set_auth_token(ngrok_api_token)
    # Starts MLflow UI in the background
    get_ipython().system_raw("mlflow ui --backend-store-uri file:/content/mlruns --port 5000 &")
    # Forward port 5000 via ngrok
    public_url = ngrok.connect(5000)
    print("MLflow Tracking UI:", public_url.public_url)
    mlflow.set_tracking_uri("file:/content/mlruns")
else:
    # run in terminal:
    # mlflow server --host 127.0.0.1 --port 8080
    mlflow.set_tracking_uri("http://127.0.0.1:8080")

mlflow.set_experiment(projectname)
mlflow.start_run(run_name=f"run_{run_id}")

# Initialize Weights & Biases
wandb.login(key=wandb_api_token)
wandb.init(
    project=projectname,
    name=f"run_{run_id}",
    config={
    "lora_r":lora_r,
    "lora_dropout":lora_dropout,
    "learning_rate": learning_rate,
    "num_train_epochs": num_train_epochs,
    }
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    eval_strategy="steps",
    eval_steps=eval_steps,
    gradient_checkpointing=gradient_checkpointing,
    report_to=["wandb", "mlflow"],
    run_name=run_id,
    logging_dir=os.path.join(base_output_dir, "Results/runs/", run_name),
)


# Initialize the SFT Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets["train"].shuffle(),
    eval_dataset=tokenized_datasets["test"],
    peft_config=peft_config,
    # dataset_text_field="text",
    # max_seq_length=None,
    processing_class=tokenizer,
    args=training_arguments,
    # packing=False,
)

# Pre-process the model of layer norm for stable training
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

# Train the model
trainer.train()

# Log model metrics to MLflow
if trainer.state.log_history:
    metrics = trainer.state.log_history[-1]
    for k, v in metrics.items():
        if isinstance(v, (int, float)):
            mlflow.log_metric(k, v)

# Save the trained model
trainer.model.save_pretrained(new_model)
mlflow.log_artifacts(output_dir)

# End MLflow and W&B session
mlflow.end_run()
wandb.finish()

Output directory: /content/drive/MyDrive/Models/meta-llama/CodeLlama-7b-Python-hf_run48e7b67be709




MLflow Tracking UI: https://1066-34-143-224-129.ngrok-free.app


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


KeyboardInterrupt: 