In [None]:
pip install datasets

In [None]:
!pip install -q transformers==4.38.1 peft==0.9.0 accelerate==0.27.2 bitsandbytes==0.42.0 safetensors==0.4.2 sentencepiece==0.1.99 streamlit==1.31.1 weaviate-client langchain==0.1.10 sentence-transformers==2.5.1 tiktoken

In [3]:
import torch
import locale

# Monkeypatch: make every later call to locale.getpreferredencoding() report UTF-8.
# NOTE(review): presumably a workaround for a hosted runtime whose locale is not
# UTF-8 (shell commands can refuse to run there) — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import BitsAndBytesConfig
from transformers import pipeline

from langchain.text_splitter import TokenTextSplitter
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.vectorstores import Weaviate
import weaviate

This is where you connect to the Weaviate database. It is free and stays up for 14 days, and you can always create a new database afterwards.

In [4]:
import os
import weaviate.classes as wvc

# Read the cluster endpoint and API key from the environment so real secrets are
# never saved inside the notebook. The original 'here' placeholders are kept as
# fallbacks, so pasting the values in directly still works.
URL = os.environ.get("WEAVIATE_URL", 'here')
APIKEY = os.environ.get("WEAVIATE_API_KEY", 'here')

# Open an API-key-authenticated connection to the Weaviate cluster.
client = weaviate.Client(
    url=URL, auth_client_secret=weaviate.AuthApiKey(APIKEY)
)

This is the embedding model. The data is just text, which computers don't understand directly. RAG is built on vector databases which, as the name suggests, store vectors. Here we load an embedding model that will embed the text into vectors. You can choose any embedding model that suits your needs. Make sure to use CUDA, otherwise it will take much longer: in my case it took about 20x longer on CPU than on CUDA.

In [None]:
embedding_model_name = "intfloat/multilingual-e5-large"
# Prefer the GPU, but fall back to the CPU so this cell still runs (slowly) on a
# runtime without CUDA instead of failing when the model is moved to the device.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
# Sentence-embedding wrapper handed to the vector store to embed chunks and queries.
embeddings = HuggingFaceEmbeddings(
  model_name=embedding_model_name,
  model_kwargs=model_kwargs
)

In [6]:
from langchain_community.document_loaders import TextLoader

# Point a text loader at the merged corpus file, decoding it as UTF-8.
loader = TextLoader(
    "/content/merged_text.txt",
    encoding="UTF-8",
)

In [7]:
# Read the file configured on `loader`; the result is what gets split into chunks below.
documents = loader.load()

Here you can tune the hyperparameters. `chunk_size` decides how many tokens go into each chunk, and `chunk_overlap` is how many tokens from the previous chunk are repeated at the start of the next one, which is important to avoid losing information at chunk boundaries. It basically keeps the data connected.

LangChain supports Weaviate, so you just pass the client, the documents (from the txt file), and the embedding model you want to use, then watch the magic happen.

In [14]:
from langchain.document_loaders import YoutubeLoader


# Cut the loaded documents into 256-token chunks; neighbouring chunks share 64 tokens.
text_splitter = TokenTextSplitter(
    chunk_overlap=64,
    chunk_size=256,
)
docs = text_splitter.split_documents(documents)

# Embed the chunks with `embeddings` and store them in Weaviate through `client`.
vector_db = Weaviate.from_documents(
    docs,
    embeddings,
    by_text=False,
    client=client,
)

Here is a basic similarity search. `k` is how many results you want returned: `k=3` means it will return the 3 most similar chunks.

In [15]:
# Print the three stored chunks most similar to the Arabic query.
print(vector_db.similarity_search("اعاني من الرهاب الاجتماعي", k=3))

[Document(page_content=' اعاني من الرهاب الاجتماعي حيث انني لااصعد الي السبوره ولا اشارك في قسمي بسبب اني لو فعلت لتزايدت ضربات قلبي ولتشتت ذهني و عدم استطاعتي الكلام بصوره واضحه  ؟ الامر يحتاج العرض علي معالج سلوكي للتدريب علي مواجهه مثل هذه المواقف ولو الامر شديد قد يحتاج علاج دواءي ما هو �', metadata={'source': '/content/merged_text.txt'}), Document(page_content='ك اي استفسار يخص مرض الزهايمر اعاني من الرهاب الاجتماعي حيث انني لااصعد الي السبوره ولا اشارك في قسمي بسبب اني لو فعلت لتزايدت ضربات قلبي ولتشتت ذهني و عدم استطاعتي الكلام بصوره واضحه  ؟ الامر يحتاج العرض علي معالج سلوكي للتدريب علي مواجهه مثل هذه المواقف ولو الامر', metadata={'source': '/content/merged_text.txt'}), Document(page_content='رهاب الاجتماعي وان القلق لديك يجري تعميمه علي اماكن ومواقف مختلفه واتوقع منطقيا ان هذا الامر يعيقك بطريقه ما وانه لديك من فتره ليست بالقليله كما اوردت في مقالك فلا اعلم هل راجعت طبيبا نفسيا بهذا الصدد ليقوم بتقييم شخصي ومباشر لحالتك ويصل لتشخيص', metadata={'source': '/content/merged_text.t

Here we load the LLM we would like to use. I was using Google Colab, whose free tier has very little VRAM, which prevented me from using the bigger Arabic models, so I opted for a small one which, as you can see from the results, gave a pretty horrible response. If you use a better Arabic LLM you will get better results!

In [20]:
# Hugging Face Hub id of the causal language model used for answer generation
model_name = "Ruqiya/Fine-Tuning-Gemma-2b-it-for-Arabic"

# Loader for the 4-bit quantized causal language model
def load_quantized_model(model_name: str):
    """
    Load a causal language model with 4-bit NF4 quantization.

    :param model_name: Name or path of the model to be loaded.
    :return: The model object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    # NF4 4-bit weights with double quantization; computation runs in bfloat16.
    quantization_settings = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        load_in_4bit=True,
    )

    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_settings,
        torch_dtype=torch.bfloat16,
    )

# function for initializing tokenizer
def initialize_tokenizer(model_name: str, bos_token_id=None):
    """
    Initialize the tokenizer with the specified model_name.

    By default the special-token ids shipped with the checkpoint are left
    untouched. Earlier this function always forced ``bos_token_id = 1``, a value
    that belongs to Llama/Mistral-style vocabularies; in the Gemma vocabulary
    id 1 is ``<eos>`` (``<bos>`` is id 2), so the override mislabelled the
    beginning-of-sequence token for the model used in this notebook.

    :param model_name: Name or path of the model for tokenizer initialization.
    :param bos_token_id: Optional explicit beginning-of-sequence token id. Leave
        as ``None`` to keep the id defined by the checkpoint's tokenizer config.
    :return: Initialized tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)
    # Only override the BOS id when the caller explicitly asks for it.
    if bos_token_id is not None:
        tokenizer.bos_token_id = bos_token_id
    return tokenizer


# initialize tokenizer
tokenizer = initialize_tokenizer(model_name)
# load model
model = load_quantized_model(model_name)
# specify stop token ids
stop_token_ids = [0]


# Build the Hugging Face text-generation pipeline around the model loaded above.
# The result gets its own name instead of being assigned to `pipeline`: rebinding
# `pipeline` would overwrite the imported transformers factory function and make
# this cell fail the second time it is executed.
generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    # Reuse the EOS token as the padding token.
    pad_token_id=tokenizer.eos_token_id,
)

# Wrap the pipeline so LangChain chains can use it as their LLM
llm = HuggingFacePipeline(pipeline=generation_pipeline)

tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/522 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Here is a feature from LangChain which creates a chain. The chain combines the LLM with the vector database that stores the information: the LLM uses the information retrieved from the database to generate coherent answers.

In [21]:
# Retrieval QA chain: `vector_db` supplies the retriever, `llm` writes the answer.
qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=llm,
    retriever=vector_db.as_retriever(),
)

In [22]:
# `Chain.run` is deprecated in LangChain 0.1; `invoke` is the supported entry point.
# RetrievalQA takes the question under the "query" key and returns the generated
# answer under the "result" key.
response = qa_chain.invoke({"query": "اعاني من الرهاب الاجتماعي"})["result"]
print(response)

  warn_deprecated(


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

 اعاني من الرهاب الاجتماعي حيث انني لااصعد الي السبوره ولا اشارك في قسمي بسبب اني لو فعلت لتزايدت ضربات قلبي ولتشتت ذهني و عدم استطاعتي الكلام بصوره واضحه  ؟ الامر يحتاج العرض علي معالج سلوكي للتدريب علي مواجهه مثل هذه المواقف ولو الامر شديد قد يحتاج علاج دواءي ما هو �

ك اي استفسار يخص مرض الزهايمر اعاني من الرهاب الاجتماعي حيث انني لااصعد الي السبوره ولا اشارك في قسمي بسبب اني لو فعلت لتزايدت ضربات قلبي ولتشتت ذهني و عدم استطاعتي الكلام بصوره واضحه  ؟ الامر يحتاج العرض علي معالج سلوكي للتدريب علي مواجهه مثل هذه المواقف ولو الامر

رهاب الاجتماعي وان القلق لديك يجري تعميمه علي اماكن ومواقف مختلفه واتوقع منطقيا ان هذا الامر يعيقك بطريقه ما وانه لديك من فتره ليست بالقليله كما اوردت في مقالك فلا اعلم هل راجعت طبيبا نفسيا بهذا الصدد ليقوم بتقييم شخصي ومباشر لحالتك ويصل لتشخيص

زيد من التدريب وانت بمفردك ثم الانخراط تدريجيا مع ال