## LLM (Phi-2) with Retrieval Agent (ColBERTv2)

The collection represents the knowledge base and is loaded into the retrieval model. An indexer and a searcher are then initialized.
Next, the retrieved passages are processed and a prompt is generated for each individual interaction. The four interactions are:

1. **Compressed Prompt with limitation on the provided knowledge:** The compressed content must be used to generate an answer.
2. **Prompt with limitation on the provided knowledge**: The content must be used to generate an answer.
3. **Compressed Prompt without limitation on the provided knowledge:** The compressed content is presented as an inspiration to formulate an answer.
4. **Prompt without limitation on the provided knowledge:** The content is presented as an inspiration to formulate an answer.
 

For the interactions that are limited to the provided knowledge (1 and 2), the sources of the external knowledge are listed together with the generated answer.

In [None]:
# Import ColBERT libraries
from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection

# Import PyTorch and HuggingFace libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmlingua import PromptCompressor

# Import default Python libraries
import csv
import ast

In [None]:
# Check for an available GPU. If one is present, select it as the default device,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    # Make GPU 0 the current CUDA device (change the index according to your system).
    # The former bare `torch.cuda.device(device)` call was dropped: it only builds a
    # context manager, which has no effect unless it is entered with `with`.
    torch.cuda.set_device(0)
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU instead.")

Uncomment the commands in the next three cells to download and extract a pre-trained checkpoint for the ColBERT indexer.

In [None]:
#!mkdir -p downloads/

In [None]:
#!wget https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/colbertv2.0.tar.gz -P downloads/

In [None]:
#!tar -xvzf downloads/colbertv2.0.tar.gz -C downloads/

In [None]:
# Load the knowledge base from the TSV file into a ColBERT `Collection`.
# The same object is later passed to both the indexer and the searcher.
collection = Collection(path='kb/collection1024token.tsv')

In [None]:
# Indexing parameters (values adopted from the ColBERT documentation), the
# checkpoint location and the name under which the index is stored.
checkpoint = 'downloads/colbertv2.0'

# Number of bits used to encode each dimension.
nbits = 2
# Passages are truncated at this many tokens.
doc_maxlen = 300

# The index name embeds the bit width, e.g. "hitl.2bits".
index_name = 'hitl.' + str(nbits) + 'bits'

In [None]:
# Either download a sample index from HuggingFace or build a custom index with the
# pre-defined checkpoint.
# NOTE(review): despite its name, APPLY_INDEXING = True downloads the pre-built sample
# index, while False (the default) builds a new index over `collection`. Confirm whether
# the flag is meant to be inverted before changing the default.
APPLY_INDEXING = False

if APPLY_INDEXING:
    import os

    from huggingface_hub import snapshot_download

    # Portable replacement for the `!mkdir "index"` shell escape: valid Python outside
    # IPython and does not complain when the directory already exists on a re-run.
    os.makedirs("index", exist_ok=True)
    # In this branch `indexer` holds the value returned by snapshot_download (it is
    # concatenated with a string below), not a ColBERT Indexer object.
    indexer = snapshot_download(repo_id="colbert-ir/indexes", local_dir="index")
    index_name = indexer + "/intro_colbert"
else:
    # `checkpoint`, `nbits`, `doc_maxlen` and `index_name` come from the parameter cell above.
    # nranks specifies the number of GPUs to use.
    with Run().context(RunConfig(nranks=1, experiment='notebook')):
        # kmeans_niters specifies the number of iterations of k-means clustering;
        # 4 is a good and fast default. Consider larger numbers for small datasets.
        config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4)

        indexer = Indexer(checkpoint=checkpoint, config=config)
        indexer.index(name=index_name, collection=collection, overwrite=True)

In [None]:
# Check the path of the index reported by the indexer.
# NOTE(review): this only works when `indexer` is a ColBERT `Indexer` (custom indexing
# branch). In the download branch `indexer` is the value returned by `snapshot_download`,
# which presumably has no `get_index` method — confirm.
indexer.get_index()

In [None]:
# Create the ColBERT searcher for the index named `index_name` over the loaded
# `collection`, inside a ColBERT run context for the 'notebook' experiment.
with Run().context(RunConfig(experiment='notebook')):
    searcher = Searcher(index=index_name, collection=collection)

In [None]:
# The user question: it drives the passage retrieval and is embedded in the LLM prompt.
query = (
    'How can Large Language Models be refined '
    'by external knowledge?'
)

In [None]:
# Retrieve the three best-matching passages for the query.
results = searcher.search(query, k=3)

# Show rank, score (one decimal place) and passage text for every retrieved hit.
for pid, rank, score in zip(*results):
    passage_text = searcher.collection[pid]
    print(f"\t [{rank}] \t\t {score:.1f} \t\t {passage_text}")

In [None]:
content_list = []
metadata_list = []

# Only keep title, author, date and DOI for the metadata output.
# Defined once here because it does not change between passages.
top_keys = ['title', 'author', 'publish_date', 'doi']

# Extract the text and metadata from each retrieved passage into separate lists.
# The parsing below assumes every stored passage looks like
#   page_content='<text>' metadata={...}
# (presumably the string form of the document objects used to build the TSV — confirm).
for passage_id, _rank, _score in zip(*results):
    passage = searcher.collection[passage_id]

    # Split at the first " metadata=" marker only, so everything after it is
    # handed to the metadata parser in one piece.
    raw_content, marker, metadata_str = passage.partition(" metadata=")
    if not marker:
        raise ValueError(
            f"Passage {passage_id} has no ' metadata=' section: {passage[:80]!r}"
        )

    # Drop the "page_content=" prefix and the quotes wrapping the text; maxsplit=1
    # keeps the text intact if it happens to contain the prefix string itself.
    page_content_str = raw_content.split("page_content=", 1)[1].strip("'")
    content_list.append(page_content_str)

    # literal_eval only accepts Python literals, so the stored dict is parsed
    # without executing arbitrary code.
    metadata_dict = ast.literal_eval(metadata_str)
    # Keys missing from the metadata become None in the output row.
    metadata_list.append([metadata_dict.get(key) for key in top_keys])

In [None]:
# Drop duplicate sources: keep only the first metadata row for each title
# (the title is the first element of every row).
seen = set()
unique_meta = []
for entry in metadata_list:
    title = entry[0]
    if title in seen:
        continue
    seen.add(title)
    unique_meta.append(entry)

In [None]:
# Initialize the tokenizer and the causal language model for "microsoft/phi-2".
# Both are loaded with trust_remote_code=True; the model weights use float32.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/phi-2",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    trust_remote_code=True,
    torch_dtype=torch.float32,
)

In [None]:
# Instruction text that opens every prompt; the query is appended directly after it.
instruction = (
    'Instruction: You are an Expert on topics covering natural language processing '
    'and must answer the question:'
)

In [None]:
# declare if you only want the model to use the external data
external_knowledge_only = False
# declare if you want to compress the prompt
prompt_compressed = False

# Step 1 - choose the knowledge text that goes into the prompt.
# Interactions 1 and 3 compress the retrieved passages with LLMLingua; interactions
# 2 and 4 insert the retrieved passages unchanged.
if prompt_compressed:
    # The compressor is created whenever compression is requested, so it is also
    # available when external_knowledge_only is False (interaction 3).
    llmlingua = PromptCompressor()
    compressed_prompt = llmlingua.compress_prompt(
        content_list,
        # `instruction` already starts with "Instruction:" and ends with ":", so it
        # is inserted as-is to avoid a doubled prefix and a "::" in the text.
        instruction=f"{instruction} {query} Below are facts that might be meaningful to answer the given question:",
        question="",
        target_token=300,
    )
    compressed_content = compressed_prompt['compressed_prompt']
    knowledge = compressed_content
else:
    knowledge = content_list

# Step 2 - choose the prompt wording and whether sources are reported.
if external_knowledge_only:
    # Interactions 1 and 2: the answer must be based on the provided text, so the
    # metadata of the retrieved sources is kept for the output.
    prompt = f'{instruction} {query} \nYou confidently answer the question based on the text abstract below. If the question is not related to the text abstract, output an <missing information> message. \n{knowledge}'
    metad = unique_meta
else:
    # Interactions 3 and 4: the text is offered only as supporting facts.
    prompt = f'{instruction} {query} \nBelow are facts that might be meaningful to answer the given question: \n{knowledge}'
    # return empty metadata list
    metad = []

In [None]:
# Generate the answer for the prepared prompt. Generation is the same for all
# interactions; only the source listing afterwards depends on `metad`.
inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
# Keep the input tensors on the same device as the model (a no-op while the model is on the CPU).
inputs = inputs.to(model.device)
# NOTE(review): max_length presumably counts the prompt tokens as well; confirm that
# 1000 leaves room for an answer when long passages are inserted uncompressed.
outputs = model.generate(**inputs, max_length=1000)
text = tokenizer.batch_decode(outputs)[0]
print(text)

# output with metadata: list the sources when the answer was limited to the external knowledge
if metad:
    print('Source:')
    for imeta in metad:
        # Each row holds title, author, publication date and DOI, in that order.
        print(f"Title: {imeta[0]}\nAuthor: {imeta[1]}\nPublication Date: {imeta[2]}\nDOI: {imeta[3]}\n")