In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# Pins mirror the versions resolved in this notebook's recorded run; the
# resolved versions of transformers and accelerate are not visible in that
# output, so they are left unpinned.
%pip install -q transformers sentence-transformers==3.0.0 langchain==0.2.1 chromadb==0.5.0 langchain_community==0.2.1 langchain_text_splitters==0.2.0 accelerate bitsandbytes==0.43.1 xformers==0.0.26.post1

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.0-py3-none-any.whl.metadata (10 kB)
Collecting langchain
  Downloading langchain-0.2.1-py3-none-any.whl.metadata (13 kB)
Collecting chromadb
  Downloading chromadb-0.5.0-py3-none-any.whl.metadata (7.3 kB)
Collecting langchain_community
  Downloading langchain_community-0.2.1-py3-none-any.whl.metadata (8.9 kB)
Collecting langchain_text_splitters
  Downloading langchain_text_splitters-0.2.0-py3-none-any.whl.metadata (2.2 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting xformers
  Downloading xformers-0.0.26.post1-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_core-0.2.3-py3-none-any.whl.metadata (5.9 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.65-py3-none-any.whl.metadata (13 kB)
Collecting build>=1.0.3 (from chr

In [2]:
from langchain_community.vectorstores.chroma import Chroma
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema.document import Document
from torch import cuda
import os

CHROMA_PATH = "/kaggle/working/dataset"
DATA_PATH = "/kaggle/input/dataset"



def embedding_function():
    """Build the sentence-embedding model used to index and query the store.

    The model is placed on the current CUDA device when CUDA is available and
    on the CPU otherwise; the same device string is passed both when loading
    the model and when encoding.

    Returns:
        A ``HuggingFaceEmbeddings`` wrapper around
        ``sentence-transformers/distilbert-base-nli-mean-tokens`` that encodes
        with a batch size of 32.
    """
    if cuda.is_available():
        target_device = f'cuda:{cuda.current_device()}'
    else:
        target_device = 'cpu'

    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/distilbert-base-nli-mean-tokens',
        model_kwargs={'device': target_device},
        encode_kwargs={'device': target_device, 'batch_size': 32},
    )


def load_documents():
    """Load the PDF directory at ``DATA_PATH``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(DATA_PATH).load()`` produces.
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

def split_documents(documents):
    """Cut documents into overlapping character-based chunks.

    Args:
        documents: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunks produced with a chunk size of 800 and an overlap of 80,
        both measured with ``len``.
    """
    splitter_options = {
        "chunk_size": 800,
        "chunk_overlap": 80,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

def add_to_chroma(chunks: list[Document]):
    """Add chunks that are not yet stored to the persistent Chroma database.

    Every chunk is given a ``source:page:index`` ID (stored under
    ``metadata["id"]``); only chunks whose ID is absent from the database are
    inserted, so re-running the ingest does not duplicate entries.

    Args:
        chunks: Chunks to index. Their metadata is updated in place with the
            computed ``"id"``.

    Returns:
        The ``Chroma`` store opened on ``CHROMA_PATH``, so the caller can reuse
        it for retrieval instead of reopening the database.
    """
    # Open (or create) the on-disk database.
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=embedding_function()
    )

    chunks_with_ids = calculate_chunk_ids(chunks)

    # include=[] skips documents/embeddings/metadata; the IDs are still returned.
    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Keep only the chunks the database has not seen yet.
    new_chunks = [
        chunk for chunk in chunks_with_ids
        if chunk.metadata["id"] not in existing_ids
    ]

    if new_chunks:
        print(f"👉 Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
        # NOTE(review): Chroma.persist() is deprecated in recent
        # langchain_community releases (Chroma >= 0.4 persists automatically);
        # kept for compatibility with older installs - confirm before removing.
        db.persist()
    else:
        print("✅ No new documents to add")

    return db


def calculate_chunk_ids(chunks):
    """Tag every chunk with an ID of the form ``source:page:index``.

    ``source`` and ``page`` are read from the chunk's metadata (``None`` when
    missing). ``index`` counts consecutive chunks that share the same
    ``source:page`` pair, starting at 0 and resetting whenever the pair
    changes, e.g. ``"data/monopoly.pdf:6:2"``.

    Args:
        chunks: Iterable of objects exposing a mutable ``metadata`` mapping.

    Returns:
        The same ``chunks`` object, with ``metadata["id"]`` set on each item.
    """
    previous_page_key = None
    index_within_page = 0

    for piece in chunks:
        meta = piece.metadata
        page_key = f"{meta.get('source')}:{meta.get('page')}"

        # Consecutive chunks of one page are numbered 0, 1, 2, ...
        if page_key != previous_page_key:
            index_within_page = 0
        else:
            index_within_page += 1

        meta["id"] = f"{page_key}:{index_within_page}"
        previous_page_key = page_key

    return chunks

# Ingest driver: read the PDFs under DATA_PATH, chunk them, and index the
# chunks in the Chroma database at CHROMA_PATH.
# Load documents
documents = load_documents()

# Split documents
doc_splits = split_documents(documents)
# Add documents to Chroma vector store
# NOTE(review): `vectorstore` is bound to whatever add_to_chroma returns -
# confirm that is a usable store handle before relying on it in later cells.
vectorstore = add_to_chroma(doc_splits)


  from tqdm.autonotebook import tqdm, trange
2024-05-30 20:10:13.487963: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-30 20:10:13.488062: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-30 20:10:13.610849: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Number of existing documents in DB: 0
👉 Adding new documents: 57


  warn_deprecated(


In [33]:
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

# RAG prompt with two placeholders: {document} (the retrieved context) and
# {question} (the user's query). The template ends with the literal word
# "assistant", which query_rag looks for when separating the answer from the
# echoed prompt - keep the two in sync when editing this text.
prompt = PromptTemplate(
    template="""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. 
    Use three sentences maximum and keep the answer concise
    
    user
    
    Here is the retrieved document: 
    
    {document}
    
    ---
    Here is the user question: 
    
    {question} 
    
    assistant""",
    input_variables=["question", "document"],
)


def query_rag(query_text: str, model , vectorstore):
    """Answer a question with retrieval-augmented generation.

    The question is used to retrieve context from the vector store, the
    context and question are rendered into ``prompt``, and ``model`` generates
    the answer. The multi-line formatted response is printed.

    Args:
        query_text: The user's question.
        model: Callable invoked as
            ``model(prompt_text, max_new_tokens=5000, num_return_sequences=1)``
            that returns a list whose first item has a ``"generated_text"``
            key (the pipeline built by ``load_model`` is used this way).
        vectorstore: Store exposing ``as_retriever()``. When ``None``, the
            persisted Chroma database at ``CHROMA_PATH`` is opened instead.

    Returns:
        The formatted response with every newline replaced by a space.
    """
    # Reuse the caller's store when given; reopening Chroma on every query
    # also reloads the embedding model, which is slow and wastes memory.
    if vectorstore is None:
        vectorstore = Chroma(
            persist_directory=CHROMA_PATH, embedding_function=embedding_function()
        )
    retriever = vectorstore.as_retriever()
    docs = retriever.invoke(query_text)

    context = "\n\n---\n\n".join(doc.page_content for doc in docs)
    prompt_text = prompt.format(document=context, question=query_text)

    generation = model(prompt_text, max_new_tokens=5000 , num_return_sequences=1)[0]["generated_text"]
    if generation.startswith(prompt_text):
        # The generated text echoes the prompt: drop exactly that prefix so an
        # answer that itself contains the word "assistant" is not truncated.
        generation = generation[len(prompt_text):].strip()
    elif "assistant" in generation:
        # Fallback for models that do not echo the prompt verbatim.
        generation = generation.split("assistant")[-1].strip()

    formatted_response = f"""
    --> Assistant Response

    ✨User Question:
    {query_text}

    -->

    ✅Answer:
    {generation} 

    """
    print(formatted_response)
    return formatted_response.replace('\n', ' ')

In [14]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM 
from torch import cuda , bfloat16
from transformers import pipeline , BitsAndBytesConfig , AutoConfig

def load_model():
    """Load Mistral-7B-Instruct-v0.3 in 4-bit and wrap it in a generation pipeline.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable instead of being embedded in the notebook. When the variable is
    unset, ``None`` is passed as the token.

    Returns:
        A ``transformers`` ``text-generation`` pipeline built from the
        quantized model and its tokenizer.
    """
    import os  # local import so this cell does not depend on an earlier one

    # Never hardcode credentials in a notebook: they leak with every share.
    hf_auth = os.environ.get("HF_TOKEN")

    # 4-bit NF4 quantization with double quantization; compute in bfloat16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16
        )
    model_id= "mistralai/Mistral-7B-Instruct-v0.3"
    # `token` replaces the deprecated `use_auth_token` keyword.
    model_config = AutoConfig.from_pretrained(
        model_id,
        token=hf_auth
    )
    # NOTE(review): trust_remote_code=True allows repository code to run
    # locally - confirm it is actually required for this model.
    tokenizer = AutoTokenizer.from_pretrained(
        model_id , token=hf_auth ,
        trust_remote_code=True)

    # device_map='auto' lets the loader decide where the weights are placed.
    causal_lm = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        quantization_config=bnb_config,
        device_map='auto',
        token=hf_auth
    )
    model = pipeline('text-generation', model=causal_lm , tokenizer=tokenizer )
    return model

# Build the text-generation pipeline once; the query cell below passes this
# `model` to query_rag.
model = load_model()



config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [37]:
# Disabled call that would release cached GPU memory before generating:
# cuda.empty_cache()
# Sample query, in French: "define the For loop in two lines (in Arabic)".
text = "definir la boucle Pour en deux lignes (en arabe)"
# query_rag prints the formatted answer and returns it flattened to one line.
resut = query_rag(text , model, vectorstore )

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



    --> Assistant Response

    ✨User Question:
    definir la boucle Pour en deux lignes (en arabe)

    -->

    ✅Answer:
    بلوز لولية (Pour) هي تعتمد على عدد معين من الإعادات (إعادة التنفيذ) قبل بدءها. تحتوي على عدد معين من الأوامر وتتكون من عنوان البلوز والجسم. وتمكن المستخدم من تعيين عدد الإعادات وعدد الأوامر في البلوز باستخدام المتغير المحدد للعدد المعين والمتغير المحدد للعدد المتكرر.

    (Arabic translation: "The loop For depends on a specific number of repetitions (iterations) before it begins. It contains a specific number of commands and consists of the loop header and the body. The user can set the number of iterations and the number of commands in the loop using the variable specified for the specific number and the variable specified for the repeated number.") 

    
