## Preliminary instructions and installation of dependencies

In [None]:
# Fetch the CharBERT fork whose `modeling` package is imported further down in this notebook.
!git clone https://github.com/edoppiap/CharBERT.git

Cloning into 'CharBERT'...
remote: Enumerating objects: 356, done.[K
remote: Counting objects: 100% (167/167), done.[K
remote: Compressing objects: 100% (118/118), done.[K
remote: Total 356 (delta 88), reused 111 (delta 46), pack-reused 189[K
Receiving objects: 100% (356/356), 3.31 MiB | 12.47 MiB/s, done.
Resolving deltas: 100% (189/189), done.


In [None]:
# Mount Google Drive: later cells read the CharBERT checkpoint and a persisted
# Chroma database from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/CharBERT

/content/CharBERT


In [None]:
%%capture
%pip install --q boto3 GitPython langchain chromadb sentence_transformers
%pip install langchain_community==0.0.16
!pip -q install google-generativeai==0.3.1
!pip -q install google-ai-generativelanguage==0.4.0
!pip -q install langchain-google-genai
!pip install langchain==0.1.4

In [None]:
import os
from git import Repo
from google.colab import userdata
from IPython.display import display
from IPython.display import Markdown


from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.text_splitter import Language

from langchain_core.prompts.prompt import PromptTemplate

from modeling.modeling_charbert import CharBertTransformer
from modeling.charbert_embeddings import CharBertEmbeddings
from sentence_transformers import SentenceTransformer, models

import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAI

## Download and parse the Github repository

In [None]:
# Clone a github repo (or reuse the checkout left behind by a previous run of this cell)
repo_path = "/content/CharBERT/db_charbert"
#github_repo = "https://github.com/mawentao277/CharBERT" ## any github repository URL
github_repo = 'https://github.com/edoppiap/casl_labs'

# `Repo.clone_from` fails when the destination already holds a checkout, which made
# this cell impossible to re-run. NOTE: an existing checkout is reused as-is, so delete
# `repo_path` manually after pointing `github_repo` at a different repository.
if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(github_repo, to_path=repo_path)

In [None]:
# Load: walk the checkout and parse every `.py` file with the Python language parser
python_parser = LanguageParser(language=Language.PYTHON)
loader = GenericLoader.from_filesystem(
    repo_path,
    glob="**/*",
    suffixes=[".py"],
    parser=python_parser,
)
documents = loader.load()
len(documents)

118

In [None]:
import re

# Captures the identifier that follows a `def` or `class` keyword.
pattern = r'(?:def|class)\s+(\w+)\s*'

# Keep only documents that
#   * carry a 'content_type' metadata entry,
#   * contain at least one def/class name, and
#   * are not of content type 'simplified_code'.
# Every document with a def/class gets a 'wrap_name' (source path + '_' + first
# def/class name); retrieval later uses it to regroup the chunks of one definition.
kept_documents = []
for doc in documents:
    if 'content_type' not in doc.metadata:
        continue

    first_definition = re.search(pattern, doc.page_content)
    if first_definition is None:
        continue

    doc.metadata['wrap_name'] = doc.metadata['source'] + '_' + first_definition.group(1)
    if doc.metadata['content_type'] != 'simplified_code':
        kept_documents.append(doc)

# Slice assignment updates the existing list object in place, as the original pops did.
documents[:] = kept_documents

for doc in documents:
    print(doc.metadata)

In [None]:
# Split the kept documents into Python-aware chunks of at most 1000 characters, without overlap
splitter_options = {"chunk_size": 1000, "chunk_overlap": 0}
python_splitter = RecursiveCharacterTextSplitter.from_language(language=Language.PYTHON, **splitter_options)
texts = python_splitter.split_documents(documents)
len(texts)

261

## Initialize CharBert model for Sentence Embeddings

In [None]:
# Locations of the pretrained CharBERT weights and of the character vocabulary
charbert_checkpoint = '/content/drive/MyDrive/NLP_Project/CharBERT/charbert-bert-wiki'  ## download it from the link in the readme
char_vocab_path = '/content/CharBERT/data/dict/bert_char_vocab'

charBertTransformer = CharBertTransformer(
    model_type='bert',
    model_name_or_path=charbert_checkpoint,
    char_vocab=char_vocab_path,
)

# The pooling layer is sized at twice the transformer's word-embedding dimension
# (presumably word + character channels concatenated -- TODO confirm in CharBertTransformer).
pooling_dimension = charBertTransformer.get_word_embedding_dimension()*2
pooling_model = models.Pooling(pooling_dimension)
embeddings = SentenceTransformer(modules=[charBertTransformer, pooling_model])
sentenceEmbeddings = CharBertEmbeddings(embeddings)

  return self.fget.__get__(instance, owner)()


cls: <class 'modeling.configuration_bert.BertConfig'>
pretrained_model_name_or_path: /content/drive/MyDrive/NLP_Project/CharBERT/charbert-bert-wiki
cls.pretrained_config_archive_map: {'bert-base-uncased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json', 'bert-large-uncased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json', 'bert-base-cased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json', 'bert-large-cased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json', 'bert-base-multilingual-uncased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json', 'bert-base-multilingual-cased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json', 'bert-base-chinese': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json', 'bert-base-german-cased': 'htt

## Create a chroma vector database given the github repository

In [None]:
# Embed every chunk with the CharBERT sentence embeddings and store the vectors on disk
chroma_directory = './chroma_db'
db = Chroma.from_documents(texts, sentenceEmbeddings, persist_directory=chroma_directory)
db.persist()

# Also test "similarity" as search_type
mmr_search_kwargs = {"k": 6}
retriever = db.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)

## Load an existing vector database

In [None]:
# Re-open a database persisted earlier on Drive instead of re-embedding the repository
existing_db_directory = '/content/drive/MyDrive/NLP_Project/db/casl_ema_chroma_db_giusto'
db = Chroma(persist_directory=existing_db_directory, embedding_function=sentenceEmbeddings)

# Also test "similarity" as search_type
mmr_search_kwargs = {"k": 10}
retriever = db.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)

## Initialize Google gemini environment and LLM

In [None]:
# SECURITY: never commit an API key in the notebook. The key that used to be hardcoded
# here must be considered leaked and should be revoked.
# The key is taken from the environment when already set, otherwise from the Colab
# secret named GOOGLE_API_KEY (left sidebar -> "Secrets").
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Two Gemini variants are queried side by side in the RAG cells below.
llm = GoogleGenerativeAI(model="models/gemini-1.0-pro-001")
llm_latest = GoogleGenerativeAI(model="models/gemini-1.0-pro-latest")

# RAG MODEL

In [None]:
import textwrap
from IPython.display import display

# Prompt sent to the LLM. It has three placeholders:
#   {repo}     - identifies the repository the context comes from
#   {context}  - the retrieved documents
#   {question} - the user's question
# The same string is used both through `PromptTemplate` and through a direct
# `template.format(...)` call, so the placeholder names must stay in sync with both.
template = """
You are an AI assistant that helps users understand Github repositories
that are provided to you via context input,
the context you are given is composed by python scripts of the repository: {repo} |
This documents are the context input Docs: {context} |
This is the question you are going to aswer: {question}

Instructions:
1. Answer based on the documents given in the context, use only the functions seen in the context.
2. Focus on repo/code.
3. Consider:
    a. Purpose/features - describe.
    b. Functions/code - provide details/samples.
    c. Setup/usage - give instructions.
4. Unsure? Say "I am not sure".
5. Tell me if you used the context to answer the question.
6. Tell me from which documents/function_classes in the context you took the answer using the index of the documents in the list of documents.

Answer:
"""

# LangChain wrapper around the template, passed to `load_qa_chain` in a later cell.
prompt = PromptTemplate(
    template=template,
    input_variables=["repo","question", "context"]
)


def to_markdown(text):
  """Wrap `text` in a Markdown block quote for notebook display.

  Bullet characters ('•') are turned into Markdown list markers and every line,
  blank ones included, is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda line: True)
  return Markdown(quoted)

def _resolve_query(question, llm, transform_in_code):
  """Return the text sent to the retriever: LLM-generated code or the plain question."""
  if not transform_in_code:
    return question
  if llm is None:
    # Without an LLM the question cannot be converted; fall back instead of crashing.
    print('Need an llm to transform the question in code, using the plain question instead')
    return question
  code_query = get_code_from_question(question, llm)
  print(f"This is the code query generated from the LLM:\n {code_query}\n----------------------\n")
  return code_query


def _unique_metadata(documents, key):
  """Collect `metadata[key]` of every document, de-duplicated in retrieval order."""
  return list(dict.fromkeys(d.metadata[key] for d in documents))


def retrieve_documents(question=None, retriever=None, llm=None, what_retrieve='chunks', transform_in_code=False):
  """Retrieve the context for `question` at the requested granularity.

  Args:
    question: natural-language question; when None a message is printed and None is returned.
    retriever: object exposing `get_relevant_documents(query)`; when None a message
      is printed and None is returned.
    llm: model used to rewrite the question as Python code when `transform_in_code`
      is True. If it is None the plain question is used instead.
    what_retrieve: one of
      'chunks'               - return the retrieved documents unchanged;
      'functions_or_classes' - for each distinct 'wrap_name' among the hits, fetch all
                               chunks with that 'wrap_name' from the module-level `db`
                               and concatenate them into one string;
      'entire_document'      - for each distinct 'source' among the hits, read the whole
                               file from disk.
    transform_in_code: when True, query the retriever with code generated from the question.

  Returns:
    The list of retrieved documents ('chunks'), or a list of strings wrapped in
    'Start ... i' / 'End ... i' markers (other modes), or None when `question` or
    `retriever` is missing.

  Raises:
    ValueError: if `what_retrieve` is not one of the three supported modes.
  """
  if question is None:
    print('Need a question to retrieve documents')
    return None

  if retriever is None:
    print('Need retriever to retrieve documents')
    return None

  if what_retrieve not in ('functions_or_classes', 'chunks', 'entire_document'):
    raise ValueError(
      f"Unknown what_retrieve {what_retrieve!r}: expected 'chunks', 'functions_or_classes' or 'entire_document'")

  query = _resolve_query(question, llm, transform_in_code)
  retrieved_documents = retriever.get_relevant_documents(query)

  if what_retrieve == 'chunks':
    return retrieved_documents

  # The indices below are the ones the prompt asks the LLM to cite, so duplicates are
  # removed in retrieval order (a plain set() would give a different order on every run).
  if what_retrieve == 'functions_or_classes':
    function_or_classes_list = []
    for i, wrap_name in enumerate(_unique_metadata(retrieved_documents, 'wrap_name')):
      # NOTE: relies on the notebook-level vector store `db`, not on `retriever`.
      function_class_content = ''.join(db.get(where={"wrap_name": wrap_name})['documents'])
      function_or_classes_list.append(f'Start function/class {i}: \n'+function_class_content+f'\nEnd function/class {i}')
    return function_or_classes_list

  # what_retrieve == 'entire_document'
  document_list = []
  for i, source in enumerate(_unique_metadata(retrieved_documents, 'source')):
    with open(source, 'r', encoding='utf-8') as file:
      file_contents = file.read()
    document_list.append(f'Start Document {i}: \n'+file_contents+f'\nEnd Document {i}')
  return document_list


def get_code_from_question(query, llm):
  """Ask `llm` to rewrite a natural-language question as Python code.

  Args:
    query: the natural-language question.
    llm: object exposing `invoke(prompt)` that returns the answer as a string.

  Returns:
    The LLM answer with its Markdown python-fence markers removed, or `query`
    unchanged when the answer contains no opening python fence.
  """
  context = """You are an AI assiantant that converts natural language in python code, return only code in your answer. This is the question: """
  result = llm.invoke(context+query)

  # Opening-fence spellings seen in answers (zero, one or two trailing spaces).
  # All of them are checked up front: previously only the first was tested, so an
  # answer using a space-padded fence was discarded in favour of the raw question.
  opening_fences = ("```python\n", "```python \n", "```python  \n")
  if not any(fence in result for fence in opening_fences):
    return query

  for fence in opening_fences:
    result = result.replace(fence, "")
  return result.replace("\n```", "")




In [None]:
from pprint import pprint
# One "stuff" QA chain per Gemini variant, so their answers can be compared side by side.
stuff_chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt, verbose=True)
stuff_chain_latest = load_qa_chain(llm_latest, chain_type="stuff", prompt=prompt, verbose=True)

transform_in_code = True
what_retrieve = 'functions_or_classes' ## entire_document, chunks, or functions_or_classes
# 'chunks' uses the hits as they are, so more of them are requested; the other modes
# expand every hit to a whole function/class or file.
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 25 if what_retrieve == 'chunks' else 6})

question = "hoe do you open a csv file using pandas?"

retrieved_documents = retrieve_documents(question=question, retriever=retriever, llm=llm, what_retrieve=what_retrieve, transform_in_code=transform_in_code)

# The {repo} slot of the prompt tells the model which repository the context belongs to:
# pass the repository URL, not the GitPython `Repo` object (which would be rendered as
# its object repr).
if what_retrieve == 'chunks':
  chain_inputs = {"repo": github_repo, "question": question, "input_documents": retrieved_documents}
  stuff_answer = stuff_chain.invoke(dict(chain_inputs), return_only_outputs=True)
  stuff_answer_latest = stuff_chain_latest.invoke(dict(chain_inputs), return_only_outputs=True)
  display(to_markdown(stuff_answer['output_text']))
  print('\n\n')
  print('_______________________________________')
  display(to_markdown(stuff_answer_latest['output_text']))
else:
  # The retrieved items are already plain strings here, so the template is filled directly.
  filled_prompt = template.format(repo = github_repo, context = retrieved_documents, question = question)
  result = llm.invoke(filled_prompt)
  result_latest = llm_latest.invoke(filled_prompt)
  display(to_markdown(result))
  print('\n\n----------------------------------\n\n')
  display(to_markdown(result_latest))

This is the code query generated from the LLM:
 import pandas as pd

# Open a csv file
df = pd.read_csv('filename.csv')

# Print the contents of the csv file
print(df)
----------------------



> The function `read_csv_exams_files` in the index 4 of the context is used to read a csv file using pandas.
> 
> The function `read_csv_exams_files` reads a csv file called `input_exams_.csv` and returns a list of `Exam` objects.
> The function uses the `pd.read_csv` function from the pandas library to read the csv file.
> The `pd.read_csv` function takes the path to the csv file as its first argument and the separator as its second argument.
> The `read_csv_exams_files` function uses a semicolon as the separator.
> The function then iterates over the rows of the DataFrame and creates an `Exam` object for each row.
> The `Exam` object has the following attributes:
> 
> * `name`: The name of the exam.
> * `year`: The year the exam was taken.
> * `semester`: The semester the exam was taken.
> * `passed`: A boolean value indicating whether the exam was passed.
> * `tot`: The total number of points possible on the exam.
> * `cfu`: The number of credits the exam is worth.
> * `optional`: A boolean value indicating whether the exam is optional.
> * `max_stud`: The maximum number of students allowed to take the exam.
> * `grade_distr`: A list of the grades that were given on the exam.
> 
> I used the context to answer the question.



----------------------------------




> I am not sure, the provided context does not contain any information on how to open a csv file using pandas.
> 
> I used the documents in the context to answer: No

In [None]:
# Size of the retrieved context in characters, divided by 4
# (presumably the usual ~4-characters-per-token rule of thumb -- TODO confirm intent).
# A dedicated name is used instead of `sum`, which shadowed the builtin for the rest
# of the kernel session.
if what_retrieve != 'chunks':
  # Non-chunk modes return plain strings.
  total_chars = sum(len(doc) for doc in retrieved_documents)
else:
  # Chunk mode returns documents whose text is in `page_content`.
  total_chars = sum(len(doc.page_content) for doc in retrieved_documents)
total_chars/4

3058.75

In [None]:
# Send a prompt made of a single space and render whatever the model answers.
blank_prompt_reply = llm.invoke(" ")
display(to_markdown(blank_prompt_reply))