### Install the required packages for the training.

In [None]:
!pip install --quiet transformers datasets torchmetrics accelerate -Uq
!pip install --quiet evaluate wandb mlflow

In [None]:
!pip install --quiet --upgrade transformers

### Import the required environment keys.
(Example using google colab.)

In [None]:
from google.colab import userdata
import os

# Hugging Face access token, read from the Colab secret store.
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

# Keys used by the Eunomia app for the evaluation at the end of the notebook.
# (The Pinecone key used to be assigned twice; one assignment is enough.)
os.environ['PINECONE_API_KEY'] = userdata.get('PINECONE_API_KEY')
os.environ['GOOGLE_API_KEY'] = userdata.get('GOOGLE_API_KEY')
os.environ["LANGCHAIN_API_KEY"] = userdata.get("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_TRACING"] = 'true'
os.environ["LANGCHAIN_PROJECT"] = "Eunomia"

### Load the model to train.

In [None]:
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments

# Checkpoint identifier shared by the two from_pretrained calls below.
model_name = "meta-llama/Meta-Llama-3.1-8B"
# No dtype or device arguments are passed, so the library defaults apply.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd
import transformers


class PreprocessDatasetLegalkitShareGPT:
    """
    Turn the 'MaziyarPanahi/legalkit_sharegpt' dataset into tokenized
    training and validation splits.

    Attributes
    ----------
    tokenizer : transformers.PreTrainedTokenizer
        Tokenizer applied to both the questions and the responses.
    dataset_name : str
        Hub identifier of the dataset that is loaded.
    train_dataset : datasets.Dataset
        80 % split of the tokenized dataset.
    eval_dataset : datasets.Dataset
        Remaining 20 % split of the tokenized dataset.
    """

    def __init__(self, tokenizer: transformers.PreTrainedTokenizer):
        """
        Load, tokenize and split the dataset.

        Parameters
        ----------
        tokenizer : transformers.PreTrainedTokenizer
            The tokenizer used for processing text data.
        """
        self.tokenizer = tokenizer
        self.dataset_name = "MaziyarPanahi/legalkit_sharegpt"

        # Raw (text, label) pairs -> token ids; the raw strings are dropped afterwards
        pairs = self.preprocess_dataset_legalkit_sharegpt()
        encoded = pairs.map(self.tokenize_function, batched=True)
        encoded = encoded.remove_columns(["text", "label"])

        # 80/20 split (no seed is given, so the split differs between runs)
        splits = encoded.train_test_split(test_size=0.2)
        self.train_dataset = splits["train"]
        self.eval_dataset = splits["test"]

    def preprocess_dataset_legalkit_sharegpt(self) -> Dataset:
        """
        Load the dataset and flatten each conversation into two columns.

        The first turn of ``conversations_with_input`` becomes ``text`` and
        the second turn becomes ``label``.

        Returns
        -------
        Dataset
            A Hugging Face Dataset holding the ``text`` and ``label`` columns
            (plus any column other than the two conversation columns).
        """
        # Work on the 'train' split through pandas for the column manipulation
        frame = pd.DataFrame.from_dict(load_dataset(self.dataset_name)['train'])

        turns = frame["conversations_with_input"]
        frame["text"] = turns.apply(lambda conversation: conversation[0]["value"])
        frame["label"] = turns.apply(lambda conversation: conversation[1]["value"])

        # The nested conversation columns are no longer needed
        frame = frame.drop(columns=["conversations_with_input", "conversations"])

        return Dataset.from_pandas(frame)

    def tokenize_function(self, examples: dict) -> dict:
        """
        Tokenize a batch of questions and responses.

        Questions and responses are tokenized independently, each padded or
        truncated to 512 tokens. ``input_ids`` and ``attention_mask`` come
        from the questions, ``labels`` from the responses.

        Parameters
        ----------
        examples : dict
            Batch with a ``text`` (questions) and a ``label`` (responses) entry.

        Returns
        -------
        dict
            A dictionary with tokenized input_ids, labels, and attention_mask.
        """
        question_batch, response_batch = examples["text"], examples["label"]
        tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 512}

        encoded_questions = self.tokenizer(question_batch, **tokenizer_options)
        encoded_responses = self.tokenizer(response_batch, **tokenizer_options)

        return {
            "input_ids": encoded_questions["input_ids"],
            "labels": encoded_responses["input_ids"],
            "attention_mask": encoded_questions["attention_mask"],
        }


In [None]:
class ModelParser:
    """
    Inspect a model's parameters and choose which of them stay trainable.

    Every method works by reading or setting ``requires_grad`` on the
    parameters returned by ``model.parameters()`` / ``model.named_parameters()``.
    """

    def __init__(self, model):
        """
        Initialize the class with a given model.

        Parameters
        ----------
        model : object
            The model (e.g., a PyTorch or Hugging Face model).
        """
        self.model = model

    def count_parameters(self):
        """
        Print the total number of parameters, the number of trainable
        parameters, and the trainable share as a percentage.

        A model without any parameter is reported as 0.00 % trainable
        instead of raising ``ZeroDivisionError``.

        Returns
        -------
        None
        """
        total_params = 0
        trainable_params = 0
        for param in self.model.parameters():
            size = param.numel()
            total_params += size
            if param.requires_grad:
                trainable_params += size

        # Guard the division: an empty model has no parameters at all
        trainable_percentage = (trainable_params / total_params) * 100 if total_params else 0.0

        # Format numbers with a space as thousand separator
        total_params_str = f"{total_params:,}".replace(",", " ")
        trainable_params_str = f"{trainable_params:,}".replace(",", " ")

        print(f"Total parameters: {total_params_str}")
        print(f"Trainable parameters: {trainable_params_str} ({trainable_percentage:.2f}%)\n")

    def freeze_layers_by_param_count(self, max_trainable_params):
        """
        Freeze parameters so that at most ``max_trainable_params`` stay trainable.

        Parameters are visited in ``model.parameters()`` order. A parameter
        stays trainable while the running total fits in the budget; one that
        does not fit is frozen (a later, smaller one may still fit).
        Parameters that are already frozen are left untouched and do not
        consume any budget.

        Parameters
        ----------
        max_trainable_params : int
            The maximum number of trainable parameters desired.

        Returns
        -------
        None
        """
        current_trainable_params = 0

        for param in self.model.parameters():
            if not param.requires_grad:
                # Already frozen: it must not be counted as trainable
                continue
            if current_trainable_params + param.numel() > max_trainable_params:
                param.requires_grad = False
            else:
                current_trainable_params += param.numel()

        print(f"Final trainable parameters: {current_trainable_params}")

    def freeze_layers_by_name(self, layer_names):
        """
        Freeze every parameter whose name contains one of ``layer_names``.

        Parameters
        ----------
        layer_names : list of str
            Substrings of the parameter names to freeze.

        Returns
        -------
        None
        """
        for name, param in self.model.named_parameters():
            if any(layer_name in name for layer_name in layer_names):
                param.requires_grad = False

        print(f"Layers frozen: {layer_names}")

    def freeze_all_except_layer(self, layer_name_to_keep):
        """
        Freeze all parameters except those whose name contains
        ``layer_name_to_keep``, which are made trainable.

        Parameters
        ----------
        layer_name_to_keep : str
            Substring of the parameter names to keep unfrozen.

        Returns
        -------
        None
        """
        for name, param in self.model.named_parameters():
            param.requires_grad = layer_name_to_keep in name

        print(f"All layers frozen except: {layer_name_to_keep}")

    def train_specific_layers(self, layer_names):
        """
        Make the parameters matching ``layer_names`` trainable and freeze
        every other parameter.

        Parameters
        ----------
        layer_names : list of str
            Substrings of the parameter names to keep trainable.

        Returns
        -------
        None
        """
        for name, param in self.model.named_parameters():
            if any(layer_name in name for layer_name in layer_names):
                param.requires_grad = True
                print(f"Layer kept trainable: {name}")
            else:
                param.requires_grad = False

    def train_lm_head(self):
        """
        Freeze all parameters except those of the ``lm_head`` attribute.

        If the model has no ``lm_head`` attribute, every parameter ends up
        frozen and a message says so. The parameter counts are printed at
        the end in both cases.

        Returns
        -------
        None
        """
        for param in self.model.parameters():
            param.requires_grad = False

        if hasattr(self.model, 'lm_head'):
            for param in self.model.lm_head.parameters():
                param.requires_grad = True
            print("lm_head is trainable; all other parameters are frozen.")
        else:
            print("No lm_head found in the model.")

        self.count_parameters()


### Prepare the tokenizer and the embeddings to train Llama 3.1

In [None]:
from typing import Dict
import transformers


def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict[str, str],
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """
    Register special tokens and grow the model embeddings to match.

    The tokens are added to the tokenizer, the model's token embeddings are
    resized to ``len(tokenizer)``, and the rows belonging to the newly added
    tokens (input and output embeddings alike) are set to the mean of the
    pre-existing rows.

    Note: This method may result in the embedding size not being divisible by 64.

    Parameters
    ----------
    special_tokens_dict : Dict[str, str]
        Special tokens to add to the tokenizer.
    tokenizer : transformers.PreTrainedTokenizer
        Tokenizer that receives the special tokens.
    model : transformers.PreTrainedModel
        Model whose token embeddings are resized.

    Returns
    -------
    model : transformers.PreTrainedModel
        The model with resized token embeddings.
    tokenizer : transformers.PreTrainedTokenizer
        The tokenizer with added special tokens.
    """
    added = tokenizer.add_special_tokens(special_tokens_dict)
    # The resize happens even when nothing was added
    model.resize_token_embeddings(len(tokenizer))

    if added <= 0:
        return model, tokenizer

    # Input matrix first, output matrix second
    matrices = [
        model.get_input_embeddings().weight.data,
        model.get_output_embeddings().weight.data,
    ]

    # Means are computed over the old rows of both matrices before any row is overwritten
    old_row_means = [matrix[:-added].mean(dim=0, keepdim=True) for matrix in matrices]

    # Every new row starts from the mean of the old rows
    for matrix, mean_row in zip(matrices, old_row_means):
        matrix[-added:] = mean_row

    return model, tokenizer


# Fallback strings used for any special token the tokenizer does not define
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

# Collect, in pad/eos/bos/unk order, only the special tokens that are missing
special_tokens_dict = {
    attribute: fallback
    for attribute, fallback in (
        ("pad_token", DEFAULT_PAD_TOKEN),
        ("eos_token", DEFAULT_EOS_TOKEN),
        ("bos_token", DEFAULT_BOS_TOKEN),
        ("unk_token", DEFAULT_UNK_TOKEN),
    )
    if getattr(tokenizer, attribute) is None
}

# Add the missing tokens and resize the model embeddings accordingly
model, tokenizer = smart_tokenizer_and_embedding_resize(
    special_tokens_dict=special_tokens_dict,
    tokenizer=tokenizer,
    model=model,
)


In [None]:
# Loads, tokenizes and splits the dataset (all done in the constructor).
prep_data = PreprocessDatasetLegalkitShareGPT(tokenizer)

In [None]:
# Training and validation splits produced by the preprocessing class.
train_data = prep_data.train_dataset
val_data = prep_data.eval_dataset
# Default collator from transformers; the examples are already padded to a fixed length.
data_collator = transformers.data.data_collator.default_data_collator

### You might want to select specific layers to finetune.

In [None]:
model

### Count the number of parameters and freeze all layers except the `lm_head` layer

In [None]:
# Create a ModelParser instance for the loaded model
inspector = ModelParser(model)

# Print the parameter counts, then freeze everything except lm_head
# (train_lm_head prints the counts again once it is done)
inspector.count_parameters()
inspector.train_lm_head()

In [None]:
from transformers import Trainer, TrainingArguments
from transformers.integrations import WandbCallback
import torch
import numpy as np  # Added import for numpy, used in compute_metrics

class ModifiedTrainer(Trainer):
    """
    A custom Trainer that overrides ``compute_loss`` so the model is trained
    to reproduce its own ``input_ids``.
    """

    def compute_loss(self, model, inputs, return_outputs: bool = False, num_items_in_batch=None):
        """
        Compute the loss using the provided model and inputs.

        This method overrides the default loss computation by manually
        setting the attention mask to be all ones and using the input_ids
        as labels. The ``labels`` and ``attention_mask`` entries of
        ``inputs`` are not read.

        Parameters
        ----------
        model : transformers.PreTrainedModel
            The model used for forward pass and loss computation.
        inputs : Dict[str, torch.Tensor]
            The input data containing 'input_ids'.
        return_outputs : bool, optional
            If True, also return the model outputs, by default False.
        num_items_in_batch : optional
            Accepted because recent ``Trainer`` versions pass it as a keyword
            argument; it is not used here.
            NOTE(review): with gradient_accumulation_steps > 1 the loss
            scaling may need this value — confirm against the installed
            transformers version.

        Returns
        -------
        torch.Tensor or tuple
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        input_ids = inputs["input_ids"]
        outputs = model(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids).bool(),
            labels=input_ids,
        )
        loss = outputs.loss
        # Trainer's evaluation/prediction path expects the (loss, outputs) pair
        return (loss, outputs) if return_outputs else loss

# Define the training arguments for the model
training_args = TrainingArguments(
    output_dir="./Model_llama_3_1_8B",  # Directory for storing model checkpoints
    fp16=False,  # Disable 16-bit floating point precision
    gradient_accumulation_steps=1,  # Number of gradient accumulation steps
    per_device_train_batch_size=2,  # Training batch size per device
    learning_rate=1e-4,  # Learning rate for optimizer
    # Evaluation during training stays at the library default ("no"); the
    # deprecated `evaluation_strategy` keyword is deliberately not passed.
    save_strategy='no',  # Disable saving checkpoints during training
    max_steps=300,  # Maximum number of training steps
    logging_steps=5,  # Log every 5 steps
    report_to="mlflow",  # Report training metrics to MLflow
)

# Initialize the Trainer with the specified model, dataset, and training arguments
trainer = ModifiedTrainer(
    model=model,  # Model to train
    train_dataset=train_data,  # Training dataset
    args=training_args,  # Training arguments
    data_collator=data_collator,  # Data collator function
    # callbacks=[WandbCallback()],  # Optional: also log to Weights & Biases
    tokenizer=tokenizer,  # Tokenizer used for encoding the inputs
)

## Example with mlflow:

In [None]:
import mlflow
from mlflow import MlflowClient

# Address of the MLflow tracking server (a local server or an ngrok URI also works)
track_uri: str = "http://34.242.16.206:8080/"  # Replace with your specific tracking URI if needed

# Point the mlflow module at that tracking server
mlflow.set_tracking_uri(track_uri)

# Select the "Eunomia" experiment for the runs started below.
# Explicit creation through a client is left commented out:
# client.create_experiment(name="Eunomia")
mlflow.set_experiment("Eunomia")

In [None]:
# Train inside an MLflow run; `run` is kept so a later cell can reopen the same run by id.
with mlflow.start_run() as run:
    trainer.train()

In [None]:
# Register your model
from transformers import pipeline
import mlflow

# Create a text-generation pipeline using the fine-tuned model and tokenizer
tuned_pipeline = pipeline(
    task="text-generation",  # Specify the task for the pipeline
    model=trainer.model,     # Use the model from the Trainer instance
    tokenizer=tokenizer,     # Use the associated tokenizer
)

# Reopen the training run by its id so the model is logged to that same run
with mlflow.start_run(run_id=run.info.run_id):
    # NOTE(review): the experiment is set after the run was reopened — confirm this call is still needed here
    mlflow.set_experiment("Eunomia")
    # Log the fine-tuned model to MLflow and register it under the specified name
    model_info = mlflow.transformers.log_model(
        transformers_model=tuned_pipeline,       # The text-generation pipeline to log
        artifact_path="fine_tuned",              # Directory path within the artifact store
        registered_model_name="Eunomia-llama-model",  # Name of the registered model in MLflow
    )


In [None]:
# Decode one validation question. The question tokens live in "input_ids";
# "labels" hold the reference response, so they must not be used as the prompt.
inputs = tokenizer.decode(val_data[1]["input_ids"], skip_special_tokens=True)
preds = tuned_pipeline(inputs, max_length=450)
print(f"question: {inputs}\n\n")
print(f"predictions: {preds}")

In [None]:
# Slicing a Dataset (val_data[:5]) returns a dict of columns, so iterating it
# yields column names. select() keeps row-wise iteration over the first rows.
for example in val_data.select(range(min(5, len(val_data)))):
    inputs = tokenizer.decode(example["input_ids"], skip_special_tokens=True)

    # Generate the prediction. max_new_tokens bounds only the generated part;
    # max_length would also count the prompt tokens.
    generated = tuned_pipeline(inputs, max_new_tokens=50)[0]['generated_text']

    print(f"question: {inputs}\n")
    print(f"prediction: {generated}\n")


## Test on our langgraph app
### Clone the git repo with the app

In [None]:
!git clone https://github.com/H-Gelender/Eunomia.git
%cd Eunomia/app
!pip install --quiet -r requirements.txt
!pip install --quiet rapidfuzz

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize HuggingFaceEmbeddings with a specified model for multilingual text embeddings.
# These embeddings are later handed to EunomiaGraph, which passes them to its RAG node.
embeddings: HuggingFaceEmbeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v1"
)


In [None]:
from langsmith import Client

# Initialize a Client instance for interacting with the LangSmith API
# (no arguments are passed here).
client = Client()

# Name of the LangSmith dataset used for the evaluation at the end of the notebook
dataset_name: str = "eunomia-Q&A"


In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from transformers.pipelines import Pipeline  # Import Pipeline for type hinting

# Create a generation pipeline using the fine-tuned model and tokenizer.
# The model was loaded with AutoModelForCausalLM, so the causal
# "text-generation" task is used rather than the seq2seq "text2text-generation".
pipe: Pipeline = pipeline(
    task="text-generation",       # Causal language-model generation
    model=trainer.model,          # Use the model from the Trainer instance
    tokenizer=tokenizer,          # Use the associated tokenizer
    max_length=512                # Set the maximum sequence length for the generated text
)

# Wrap the Hugging Face pipeline in a LangChain HuggingFacePipeline object
llm: HuggingFacePipeline = HuggingFacePipeline(pipeline=pipe)

### You may want to adjust the LLM used in your graph

In [None]:
import json
from typing import Dict, TypedDict, Optional

from langchain_google_genai import GoogleGenerativeAI
from nodes import RAGNode, PreprocessQuestionNode, ChatNode, ChoosePathNode
from langgraph.graph import StateGraph, END
from langgraph.graph import MessageGraph
from langgraph.prebuilt.tool_node import ToolNode

# Initialize the Google Generative AI model with specific parameters.
# EunomiaGraph.init_node reads this module-level name for its ChoosePathNode.
Gllm: GoogleGenerativeAI = GoogleGenerativeAI(model="gemini-pro", temperature=0.1)

class GraphState(TypedDict):
    """Typed dictionary to define the structure of the graph state."""
    # Incoming question; EunomiaGraph.run stores its argument under this key.
    question: str
    # NOTE(review): presumably the question rewritten by the preprocess node;
    # the key spelling is kept as-is — verify against the nodes module.
    preproccess_question: str
    # The next three keys are returned by EunomiaGraph.retriever_node (as lists there).
    books: str
    documents: str
    ids: str
    # Final answer; EunomiaGraph.run returns this entry.
    answer: str
    # NOTE(review): presumably the running conversation — confirm in the nodes module.
    chat_history: str
    # Name of the next node; read by EunomiaGraph.path for the conditional edge.
    path: str

class EunomiaGraph:
    """Class to define and manage the Eunomia state graph with specific nodes."""

    def __init__(self, llm: GoogleGenerativeAI, embeddings: object):
        """
        Initialize the EunomiaGraph with a language model and embeddings.

        Args:
            llm: The language model instance to use.
            embeddings: The embeddings to use in the RAGNode.
        """
        self.embeddings = embeddings
        self.llm = llm

        # Results of the latest retrieval; created before the graph is built
        # so the attributes exist whatever happens during construction.
        self.ids: list = []
        self.books: list = []
        self.documents: list = []

        self.workflow = StateGraph(GraphState)
        self.app = self.init_graph()

    def init_node(self):
        """
        Initialize nodes and add them to the workflow.

        The path-choosing node uses the module-level ``Gllm`` model; every
        other node uses ``self.llm``.
        """
        self.rag_node = RAGNode(self.llm, "eunomia", self.embeddings)
        preprocess_node = PreprocessQuestionNode(self.llm)
        chat = ChatNode(self.llm)
        choose_path = ChoosePathNode(Gllm)

        # Add nodes to the workflow
        self.workflow.add_node("ChoosePath_node", choose_path.run)
        self.workflow.add_node("Preprocess_node", preprocess_node.run)
        self.workflow.add_node("RAG", self.rag_node.run)
        self.workflow.add_node('Retriever_node', self.retriever_node)
        # self.workflow.add_node('Final_Node', self.final_node)
        self.workflow.add_node("Chat_node", chat.run)

    def init_edges(self):
        """
        Define edges and set entry points in the workflow.

        ChoosePath_node routes either to Preprocess_node (then RAG,
        Retriever_node, Chat_node) or straight to Chat_node, which ends
        the graph.
        """
        self.workflow.set_entry_point("ChoosePath_node")
        self.workflow.add_conditional_edges(
            "ChoosePath_node",
            self.path,
            {
                "Preprocess_node": "Preprocess_node",
                "Chat_node": "Chat_node"
            }
        )
        self.workflow.add_edge("Preprocess_node", "RAG")
        self.workflow.add_edge("RAG", "Retriever_node")
        self.workflow.add_edge("Retriever_node", "Chat_node")
        self.workflow.add_edge("Chat_node", END)
        # self.workflow.add_edge("Final_Node", END)

    def init_graph(self) -> StateGraph:
        """
        Initialize nodes and edges, then compile and return the workflow graph.

        Returns:
            StateGraph: The compiled workflow graph.
        """
        self.init_node()
        self.init_edges()
        return self.workflow.compile()

    @staticmethod
    def _question_text(state) -> str:
        """Return the stripped question held in ``state``, or "" when absent."""
        raw = state.get("question")
        # The question may arrive wrapped as {"question": "..."} or as a plain string
        if isinstance(raw, dict):
            raw = raw.get("question")
        return raw.strip() if isinstance(raw, str) else ""

    def retriever_node(self, state: Dict[str, Optional[Dict[str, str]]]) -> Dict[str, list]:
        """
        Process the state to retrieve documents and their metadata.

        The question is accepted either as a plain string or wrapped in a
        ``{"question": ...}`` dict; a missing or ``None`` question is
        treated as an empty string.

        Args:
            state: The current state of the graph containing a question.

        Returns:
            dict: Contains lists of IDs, books, and documents.
        """
        question = self._question_text(state)
        docs = self.rag_node.retriever.invoke(question)

        # Collect document metadata and content
        ids: list = []
        books: list = []
        documents: list = []
        for doc in docs:
            ids.append(doc.metadata["id"])
            books.append(doc.metadata["book"])
            documents.append(doc.page_content)

        # Keep the latest retrieval on the instance as well
        self.ids = ids
        self.books = books
        self.documents = documents

        return {"ids": ids, "books": books, "documents": documents}

    def path(self, state: Dict[str, str]) -> str:
        """
        Retrieve the path from the state.

        Args:
            state: The current state of the graph.

        Returns:
            str: The path from the state ("" when the key is missing).
        """
        return state.get("path", "")

    def run(self, question: str) -> str:
        """
        Run the workflow with a given question and return the answer.

        Args:
            question: The question to be processed; it is stored unchanged
                under the "question" key of the initial state.

        Returns:
            str: The "answer" entry of the final state ("" when missing).
        """
        inputs = {"question": question}
        result = self.app.invoke(inputs)
        return result.get("answer", "")


In [None]:
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain.evaluation import EvaluatorType

# Configure the evaluation settings.
# eval_llm is the fine-tuned pipeline wrapped above, i.e. the same `llm`
# that is handed to EunomiaGraph below.
eval_config: RunEvalConfig = RunEvalConfig(
    eval_llm=llm,
    evaluators=[
        RunEvalConfig.Criteria("conciseness"),
        RunEvalConfig.Criteria("relevance"),
        RunEvalConfig.Criteria("coherence"),
        EvaluatorType.STRING_DISTANCE,
        EvaluatorType.QA,
    ]
)

# Define dataset parameters (dataset_name repeats the value set in an earlier cell)
dataset_name: str = "eunomia-Q&A"
dataset_id: str = "9f51d387-e223-4998-86b9-145d5d8252ca"

# Run evaluation on the dataset.
# A single EunomiaGraph is built here and its bound `run` method is what gets evaluated.
# NOTE(review): both dataset_name and dataset_id are passed — confirm run_on_dataset accepts dataset_id.
result = run_on_dataset(
    client=client,
    dataset_name=dataset_name,
    llm_or_chain_factory=EunomiaGraph(llm, embeddings).run,
    dataset_id=dataset_id,
    evaluation=eval_config,
)
