## Preliminary instructions and installation of dependencies

In [None]:
# Fetch the CharBERT fork into ./CharBERT; a later cell changes into /content/CharBERT.
# NOTE(review): presumably this checkout provides the `modeling` package imported below — confirm.
!git clone https://github.com/edoppiap/CharBERT.git

Cloning into 'CharBERT'...
remote: Enumerating objects: 362, done.[K
remote: Counting objects: 100% (173/173), done.[K
remote: Compressing objects: 100% (121/121), done.[K
remote: Total 362 (delta 91), reused 117 (delta 48), pack-reused 189[K
Receiving objects: 100% (362/362), 3.32 MiB | 13.26 MiB/s, done.
Resolving deltas: 100% (192/192), done.


In [None]:
# Mount Google Drive at /content/drive; later cells read the CharBERT weights
# and a previously saved Chroma database from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/CharBERT

/content/CharBERT


In [None]:
%%capture
# Install the notebook's dependencies; %%capture hides pip's output for this cell.
# Pinned here: langchain_community 0.0.16, google-generativeai 0.3.1,
# google-ai-generativelanguage 0.4.0 and langchain 0.1.4.
# NOTE(review): langchain is installed unpinned by the first line and then
# re-installed at 0.1.4 by the last one; the cell also mixes %pip and !pip —
# confirm both target the kernel's environment before consolidating.
%pip install --q boto3 GitPython langchain chromadb sentence_transformers
%pip install langchain_community==0.0.16
!pip -q install google-generativeai==0.3.1
!pip -q install google-ai-generativelanguage==0.4.0
!pip -q install langchain-google-genai
!pip install langchain==0.1.4

In [None]:
import os
from git import Repo
from google.colab import userdata
from IPython.display import display
from IPython.display import Markdown


from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.text_splitter import Language

from langchain_core.prompts.prompt import PromptTemplate

from modeling.modeling_charbert import CharBertTransformer
from modeling.charbert_embeddings import CharBertEmbeddings
from sentence_transformers import SentenceTransformer, models

import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAI

## Download and parse the GitHub repository

In [None]:
# Clone the GitHub repository that will be indexed and queried.
# Repo.clone_from fails when the target directory already holds a checkout, so an
# existing clone is reused; this keeps the cell re-runnable on the same runtime.
# Delete `repo_path` first if you change `github_repo` to a different repository.
repo_path = "/content/CharBERT/db_charbert"
#github_repo = "https://github.com/mawentao277/CharBERT" ## any github repository URL
github_repo = 'https://github.com/edoppiap/casl_labs'
if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(github_repo, to_path=repo_path)

In [None]:
# Load
# Load every ".py" file found under the cloned repository and parse it with
# LanguageParser configured for Python.
python_parser = LanguageParser(language=Language.PYTHON)
loader = GenericLoader.from_filesystem(
    repo_path,
    glob="**/*",
    suffixes=[".py"],
    parser=python_parser,
)
documents = loader.load()
len(documents)

118

In [None]:
import re

# Captures the identifier that follows a `def` or `class` keyword.
pattern = r'(?:def|class)\s+(\w+)\s*'

# Keep only documents that carry a 'content_type', contain at least one
# def/class name and are not of type 'simplified_code'. Each document that has a
# name is tagged with metadata['wrap_name'] = "<source>_<first matched name>".
kept_documents = []
for doc in documents:
    if 'content_type' not in doc.metadata:
        continue

    matches = re.findall(pattern, doc.page_content)
    if not matches:
        continue

    doc.metadata['wrap_name'] = doc.metadata['source'] + '_' + matches[0]
    if doc.metadata['content_type'] == 'simplified_code':
        continue

    kept_documents.append(doc)

# Replace the list contents in place so `documents` stays the same list object.
documents[:] = kept_documents

for doc in documents:
    print(doc.metadata)

In [None]:
# Split the filtered documents with a Python-aware recursive splitter
# (chunk_size=1000, chunk_overlap=0) and show how many chunks were produced.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 0}
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    **splitter_options,
)
texts = python_splitter.split_documents(documents)
len(texts)

261

## Initialize the CharBERT model for sentence embeddings

In [None]:
# Locations of the pretrained CharBERT weights and of the character vocabulary.
# The weights must be downloaded from the link in the readme.
CHARBERT_WEIGHTS_DIR = '/content/drive/MyDrive/NLP_Project/CharBERT/charbert-bert-wiki'
CHARBERT_CHAR_VOCAB = '/content/CharBERT/data/dict/bert_char_vocab'

charBertTransformer = CharBertTransformer(
    model_type='bert',
    model_name_or_path=CHARBERT_WEIGHTS_DIR,
    char_vocab=CHARBERT_CHAR_VOCAB,
)

# The pooling layer is sized at twice the transformer's reported word-embedding
# dimension. NOTE(review): presumably because CharBERT emits two channels per
# token — confirm against CharBertTransformer.
pooled_dimension = charBertTransformer.get_word_embedding_dimension() * 2
pooling_model = models.Pooling(pooled_dimension)

# Chain transformer + pooling into a SentenceTransformer and wrap it in the
# CharBertEmbeddings adapter that is handed to Chroma in later cells.
embeddings = SentenceTransformer(modules=[charBertTransformer, pooling_model])
sentenceEmbeddings = CharBertEmbeddings(embeddings)

  return self.fget.__get__(instance, owner)()


cls: <class 'modeling.configuration_bert.BertConfig'>
pretrained_model_name_or_path: /content/drive/MyDrive/NLP_Project/CharBERT/charbert-bert-wiki
cls.pretrained_config_archive_map: {'bert-base-uncased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json', 'bert-large-uncased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json', 'bert-base-cased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json', 'bert-large-cased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json', 'bert-base-multilingual-uncased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json', 'bert-base-multilingual-cased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json', 'bert-base-chinese': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json', 'bert-base-german-cased': 'htt

## Create a Chroma vector database from the GitHub repository

In [None]:
# Embed the chunks with the CharBERT sentence embeddings, store them in a Chroma
# database under ./chroma_db and persist it.
db = Chroma.from_documents(
    documents=texts,
    embedding=sentenceEmbeddings,
    persist_directory='./chroma_db',
)
db.persist()

# Retriever over the new database using MMR search (also test "similarity").
mmr_search_kwargs = {"k": 6}
retriever = db.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)

## Load an existing vector database

In [None]:
# Open a previously persisted Chroma database from Google Drive, reusing the
# same CharBERT embedding function, and expose it through an MMR retriever.
EXISTING_DB_DIR = '/content/drive/MyDrive/NLP_Project/db/casl_ema_chroma_db_giusto'
db = Chroma(
    embedding_function=sentenceEmbeddings,
    persist_directory=EXISTING_DB_DIR,
)
mmr_search_kwargs = {"k": 10}
retriever = db.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)  # Also test "similarity"

## Initialize the Google Gemini environment and LLM

In [None]:
# Read the Gemini API key from Colab's secret store instead of embedding it in
# the notebook: add a secret named GOOGLE_API_KEY in the Colab "Secrets" panel
# and grant this notebook access to it.
# SECURITY: a key was previously committed in this cell; treat it as leaked and
# revoke it in the Google AI Studio / Cloud console.
os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Two Gemini 1.0 Pro variants are kept so their answers can be compared side by side.
llm = GoogleGenerativeAI(model="models/gemini-1.0-pro-001")
llm_latest = GoogleGenerativeAI(model="models/gemini-1.0-pro-latest")

# RAG model

In [None]:
import textwrap
from IPython.display import display

# Prompt sent to the LLM. It has three placeholders: {repo}, {context} and
# {question}. The same string is used through `prompt` by the QA chains and
# through template.format(...) when the notebook calls the LLM directly.
template = """
You are an AI assistant that helps users understand Github repositories
that are provided to you via context input,
the context you are given is composed by python scripts of the repository: {repo} |
This documents are the context input Docs: {context} |
This is the question you are going to aswer: {question}

Instructions:
1. Answer based on the documents given in the context, use only the functions seen in the context.
2. Focus on repo/code.
3. Consider:
    a. Purpose/features - describe.
    b. Functions/code - provide details/samples.
    c. Setup/usage - give instructions.
4. Unsure? Say "I am not sure".
5. Tell me if you used the context to answer the question.
6. Tell me from which documents/function_classes in the context you took the answer using the index of the documents in the list of documents.

Answer:
"""

# PromptTemplate wrapper around `template`, declaring its three input variables.
# NOTE(review): presumably the "stuff" chain fills {context} with the retrieved
# documents — confirm against load_qa_chain.
prompt = PromptTemplate(
    template=template,
    input_variables=["repo","question", "context"]
)


def to_markdown(text):
  """Return `text` as an IPython Markdown object rendered as a blockquote.

  Bullet characters ('•') are turned into '  *' list markers and every line,
  blank lines included, is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

def _build_retrieval_query(question, llm, transform_in_code):
  """Return the retriever query: LLM-generated code when requested and possible, else the question."""
  if not transform_in_code:
    return question
  if llm is None:
    # Without an LLM the question cannot be converted; fall back instead of failing.
    print('No LLM provided: using the original question as the query')
    return question
  code_query = get_code_from_question(question, llm)
  print(f"This is the code query generated from the LLM:\n {code_query}\n----------------------\n")
  return code_query


def retrieve_documents(question=None, retriever=None, llm=None, what_retrieve='chunks', transform_in_code=False):
  """Retrieve context for `question` at the requested granularity.

  Args:
    question: Natural-language question; when None a message is printed and
      None is returned.
    retriever: Object exposing `get_relevant_documents(query)`; when None a
      message is printed and None is returned.
    llm: Model used to turn the question into code when `transform_in_code`
      is True. If it is None the plain question is used as the query.
    what_retrieve: One of
      'chunks' - return the retrieved documents unchanged;
      'functions_or_classes' - for every distinct metadata['wrap_name'] among
        the hits, concatenate all chunks stored under that name in the
        notebook-level `db` and return them as labelled strings;
      'entire_document' - for every distinct metadata['source'] among the
        hits, read the whole file and return it as a labelled string.
    transform_in_code: When True, query the retriever with LLM-generated code
      instead of the question.

  Returns:
    The retriever's documents for 'chunks', otherwise a list of strings.
    Distinct names/sources keep the order in which the retriever returned
    them, so the indices in the labels are stable between runs.

  Raises:
    ValueError: If `what_retrieve` is not one of the supported modes.
  """
  if question is None:
    print('Need a question to retrieve documents')
    return None

  if retriever is None:
    print('Need retriever to retrieve documents')
    return None

  supported_modes = ('chunks', 'functions_or_classes', 'entire_document')
  if what_retrieve not in supported_modes:
    raise ValueError(f'Unknown what_retrieve {what_retrieve!r}; expected one of {supported_modes}')

  query = _build_retrieval_query(question, llm, transform_in_code)
  hits = retriever.get_relevant_documents(query)

  if what_retrieve == 'chunks':
    return hits

  if what_retrieve == 'functions_or_classes':
    # dict.fromkeys de-duplicates while keeping retrieval order (a set would not).
    wrap_names = list(dict.fromkeys(d.metadata['wrap_name'] for d in hits))
    function_or_classes_list = []
    for i, wrap_name in enumerate(wrap_names):
      # Relies on the notebook-level `db` holding the same collection as `retriever`.
      stored_chunks = db.get(where={"wrap_name": wrap_name})['documents']
      function_class_content = ''.join(stored_chunks)
      function_or_classes_list.append(f'Start function/class {i}: \n'+function_class_content+f'\nEnd function/class {i}')
    return function_or_classes_list

  # what_retrieve == 'entire_document'
  sources = list(dict.fromkeys(d.metadata['source'] for d in hits))
  document_list = []
  for i, source in enumerate(sources):
    with open(source, 'r', encoding='utf-8') as file:
      file_contents = file.read()
    document_list.append(f'Start Document {i}: \n'+file_contents+f'\nEnd Document {i}')
  return document_list


def get_code_from_question(query, llm):
  """Ask `llm` to rewrite a natural-language question as Python code.

  Args:
    query: The natural-language question.
    llm: Object exposing `invoke(prompt)` and returning the reply as a string.

  Returns:
    The reply with its python code-fence markers removed when the reply
    contains an opening python fence (optionally followed by spaces or tabs
    before the line break); otherwise the original `query`, so the caller can
    still run a retrieval.
  """
  # Local import: this function must not depend on another cell having imported `re`.
  import re

  context = """You are an AI assiantant that converts natural language in python code, return only code in your answer. This is the question: """
  result = llm.invoke(context+query)
  # Opening fence with any amount of trailing horizontal whitespace; the previous
  # guard only recognised the variant with no trailing space.
  opening_fence = r"```python[ \t]*\n"
  if re.search(opening_fence, result):
    return re.sub(opening_fence, "", result).replace("\n```", "")
  return query




In [None]:
from pprint import pprint
# One "stuff" question-answering chain per Gemini model, both driven by the
# same prompt, so their answers can be compared.
stuff_chain, stuff_chain_latest = (
    load_qa_chain(model, chain_type="stuff", prompt=prompt, verbose=False)
    for model in (llm, llm_latest)
)


In [None]:
transform_in_code = False
what_retrieve = 'chunks' ## entire_document, chunks, or functions_or_classes
# Chunk retrieval fetches many small pieces (k=25); the other modes expand each
# hit into a whole function/class or file, so fewer hits are requested (k=6).
retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 25 if what_retrieve == 'chunks' else 6},
)

question = "what is this repo about??"

retrieved_documents = retrieve_documents(question=question, retriever=retriever, llm=llm, what_retrieve=what_retrieve, transform_in_code=transform_in_code)


if what_retrieve == 'chunks':
  # {repo} receives the repository URL; formatting the GitPython `repo` object
  # would insert its Python repr into the prompt.
  chain_inputs = {"repo": github_repo, "question": question, "input_documents": retrieved_documents}
  stuff_answer = stuff_chain.invoke(dict(chain_inputs), return_only_outputs=True)
  stuff_answer_latest = stuff_chain_latest.invoke(dict(chain_inputs), return_only_outputs=True)
  display(to_markdown(stuff_answer['output_text']))
  print('\n\n')
  print('_______________________________________')
  display(to_markdown(stuff_answer_latest['output_text']))
else:
  # In these modes retrieve_documents returns a list of strings; join them so
  # the prompt contains the real text rather than a list repr with escaped newlines.
  rag_prompt = template.format(repo=github_repo, context="\n\n".join(retrieved_documents), question=question)
  result = llm.invoke(rag_prompt)
  result_latest = llm_latest.invoke(rag_prompt)
  display(to_markdown(result))
  print('\n\n----------------------------------\n\n')
  display(to_markdown(result_latest))

In [None]:
#@title Chat with a GitHub repo
#repo_url = "repo url..." # @param {type:"string"}

transform_in_code = False #@param {type: "boolean"}
what_retrieve = "chunks" # @param ["chunks", "entire_document", "functions_or_classes"]
question = "what this repo is about" # @param {type:"string"}

# Chunk retrieval fetches many small pieces (k=20); the other modes expand each
# hit into a whole function/class or file, so fewer hits are requested (k=6).
retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 20 if what_retrieve == 'chunks' else 6},
)
retrieved_documents = retrieve_documents(question=question, retriever=retriever, llm=llm, what_retrieve=what_retrieve, transform_in_code=transform_in_code)

if not transform_in_code:
  print(f'This is the query : {question}\n----------------------\n')

if what_retrieve == 'chunks':
  # {repo} receives the repository URL; formatting the GitPython `repo` object
  # would insert its Python repr into the prompt.
  chain_inputs = {"repo": github_repo, "question": question, "input_documents": retrieved_documents}
  stuff_answer = stuff_chain.invoke(dict(chain_inputs), return_only_outputs=True)
  stuff_answer_latest = stuff_chain_latest.invoke(dict(chain_inputs), return_only_outputs=True)
  display(to_markdown(stuff_answer['output_text']))
  print('\n\n')
  print('_______________________________________')
  display(to_markdown(stuff_answer_latest['output_text']))
else:
  # In these modes retrieve_documents returns a list of strings; join them so
  # the prompt contains the real text rather than a list repr with escaped newlines.
  rag_prompt = template.format(repo=github_repo, context="\n\n".join(retrieved_documents), question=question)
  result = llm.invoke(rag_prompt)
  result_latest = llm_latest.invoke(rag_prompt)
  display(to_markdown(result))
  print('\n\n----------------------------------\n\n')
  display(to_markdown(result_latest))

This is the query : what this repo is about
----------------------



> This repository simulates the arrival and departure of clients in a service system. It includes classes for clients, servers, and events, as well as functions for scheduling arrivals and departures.
> 
> The main function of the repository is to simulate the behavior of a queueing system. The simulation starts with a certain number of servers and a certain arrival rate. Clients arrive at the system according to a Poisson distribution, and they are served by the servers according to a negative exponential distribution. The simulation ends when a certain number of clients have been served.
> 
> The repository includes a number of functions for scheduling arrivals and departures. The `arrival()` function schedules the arrival of a new client at a certain time. The `departure()` function schedules the departure of a client who has been served.
> 
> The repository also includes a number of classes for representing clients, servers, and events. The `Client` class represents a client who arrives at the system. The `Server` class represents a server who serves clients. The `Event` class represents an event that occurs in the system, such as an arrival or a departure.
> 
> To use the repository, you can create a new instance of the `Simulation` class. You can then call the `run()` method to start the simulation. The simulation will run until a certain number of clients have been served.
> 
> The repository is useful for simulating the behavior of a queueing system. It can be used to evaluate the performance of different queueing systems, and to identify potential bottlenecks.
> 
> I used the context to answer the question. I used the following documents/function_classes:
> 
> * Document 1: `Event` class
> * Document 2: `arrival()` function
> * Document 3: `departure()` function
> * Document 4: `Client` class
> * Document 5: `Server` class
> * Document 6: `Simulation` class




_______________________________________


> a. This repository contains code for simulating a queueing system with multiple servers and different urgency levels for clients.
> The simulation includes events such as arrivals, departures, and service completions.
> The code keeps track of various metrics, such as the number of arrivals, the average utilization of the servers, and the number of clients in the queue and paused.
> The simulation also includes a mechanism for adjusting the service time based on the urgency of the client.
> 
> b. The code includes the following functions:
> 
> - `arrival()`: This function generates a new arrival event and schedules it in the future event set (FES).
> - `departure()`: This function processes a departure event, removing the client from the queue or paused list and updating the statistics.
> - `start_service_if_possible()`: This function checks if there is a server available to serve a client. If so, it starts the service and schedules the end of service event. If not, it adds the client to the queue.
> - `calculate_service_time()`: This function calculates the service time for a given client, taking into account the urgency of the client.
> - `is_early_than()`: This function compares two events and returns True if the first event is earlier than the second event.
> 
> c. To set up and use the simulation, you need to:
> 
> 1. Import the necessary modules.
> 2. Create a `Simulation` object.
> 3. Set the parameters of the simulation, such as the arrival rate, the service rate, and the number of servers.
> 4. Run the simulation by calling the `run()` method.
> 5. Collect and analyze the results of the simulation.
> 
> I used the context to answer the question.
> I took the answer from the following documents/function_classes in the context:
> 
> - Document 1: `arrival()`, `departure()`, `start_service_if_possible()`, `calculate_service_time()`, `is_early_than()`
> - Document 2: `Simulation` class