In [16]:
from langchain_community.document_loaders.csv_loader import CSVLoader

def encode_csv(path, model, chunk_size=1000, chunk_overlap=0, content_columns=None):
    """Load a defect-report CSV into LangChain documents, one per row.

    Every row is parsed into the four columns ``DR#``, ``Problem Summary``,
    ``Problem Description`` and ``Notes & Resolution``. All four are copied
    into each document's metadata; only ``content_columns`` end up in the
    document's ``page_content``.

    Args:
        path: Path of the CSV file to read (UTF-8, comma-delimited,
            double-quote quoting).
        model: Currently unused; kept so existing callers keep working.
        chunk_size: Currently unused; kept so existing callers keep working.
        chunk_overlap: Currently unused; kept so existing callers keep working.
        content_columns: Names of the columns that form ``page_content``.
            Defaults to ``['Problem Summary']``. Pass e.g.
            ``['Problem Summary', 'Problem Description', 'Notes & Resolution']``
            to put the full report text into the content.

    Returns:
        Whatever ``CSVLoader.load()`` returns for the file (a list of
        ``Document`` objects).
    """
    # Single source of truth for the column names, shared by the CSV parser
    # and the metadata selection so the two cannot drift apart.
    columns = ['DR#', 'Problem Summary', 'Problem Description', 'Notes & Resolution']
    if content_columns is None:
        content_columns = ['Problem Summary']

    # NOTE(review): because 'fieldnames' is passed explicitly, the first line
    # of the file is expected to be a data row, not a header -- confirm that
    # the CSV export really has no header line.
    loader = CSVLoader(
        file_path=path,
        csv_args={
            'delimiter': ',',
            'quotechar': '"',
            'fieldnames': columns,
        },
        metadata_columns=list(columns),
        content_columns=list(content_columns),
        encoding='utf-8',
    )
    return loader.load()

# Parse the defect-report export; as the last expression of the cell, the
# returned list of documents is displayed as the cell output.
encode_csv(path="data/mantis.csv", model="llama3.2")

[Document(metadata={'source': 'data/mantis.csv', 'row': 0, 'DR#': '25120', 'Problem Summary': 'Simulator Freeze - October 16th 2024', 'Problem Description': 'Simulator froze unexpectedly second of two LORP runs this morning.\r\nFollowing indications of ICCM datalink failure, simulator board controls and ovation were unresponsive for approximately 2 minutes while simulator status indicated "Running" on IS station. After which, the simulator continued to work as normal', 'Notes & Resolution': '@ppham 01/14/2025\nFreeze was not reproducible during further validation and did not occur during actual exam run. Closing DR\n=-=\n@jclark 10/17/2024\nIf this issue emerges again on a future validation run of the same scenario, please collect the ICs and data needed for troubleshooting.\n'}, page_content='Problem Summary: Simulator Freeze - October 16th 2024'),
 Document(metadata={'source': 'data/mantis.csv', 'row': 1, 'DR#': '25119', 'Problem Summary': 'Thor Abort - October 16th 2024', 'Problem D

In [17]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from tqdm.autonotebook import tqdm
import gzip
import os
import torch

# Warn (but do not stop) when PyTorch cannot see a CUDA device.
if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")


# Bi-encoder: embeds every passage (and, later, each query) for semantic search.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256     # Passages longer than 256 tokens are truncated
top_k = 32                          # Number of candidates retrieved with the bi-encoder

# Cross-encoder: re-scores the top_k bi-encoder candidates to improve the
# quality of the final ranking.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Corpus: Simple English Wikipedia (about 170k articles, far smaller than the
# full English Wikipedia). The dump is downloaded only if it is not already
# present on disk.
wikipedia_filepath = 'simplewiki-2020-11-01.jsonl.gz'

if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)

# Each line of the dump is one JSON article. Only its first paragraph is kept;
# to index every paragraph instead, extend the list with the whole
# 'paragraphs' entry of each article.
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as wiki_file:
    passages = [json.loads(row.strip())['paragraphs'][0] for row in wiki_file]

print("Passages:", len(passages))

# Embed all passages into the vector space. The original author notes this
# takes about 5 minutes, depending on GPU speed.
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)


Passages: 169597


Batches:   9%|▉         | 470/5300 [00:20<03:30, 22.99it/s]


KeyboardInterrupt: 

In [None]:
# We also compare the results to lexical search (keyword search). Here, we use 
# the BM25 algorithm which is implemented in the rank_bm25 package.

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np


# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    """Split ``text`` into lower-cased BM25 index terms.

    The text is lower-cased and split on whitespace; leading and trailing
    punctuation is stripped from each piece. Pieces that become empty or
    that are English stop words are dropped.

    Returns:
        The remaining terms as a list, in their original order.
    """
    stripped_words = (word.strip(string.punctuation) for word in text.lower().split())
    return [
        word
        for word in stripped_words
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]


# Tokenize every passage (tqdm shows progress) and build the BM25 index
# over the tokenized corpus.
tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(passages)]

bm25 = BM25Okapi(tokenized_corpus)


  0%|          | 0/169597 [00:00<?, ?it/s]

100%|██████████| 169597/169597 [00:02<00:00, 68928.61it/s]


In [26]:
# This function will search all wikipedia articles for passages that
# answer the query
def search(query, num_results=3, verbose=False):
    """Return the passages that best answer ``query``.

    Candidates are retrieved with the bi-encoder (the ``top_k`` nearest
    entries of ``corpus_embeddings``) and then re-scored with the
    cross-encoder; the best passages by cross-encoder score are returned.

    Args:
        query: The question to search for.
        num_results: Number of re-ranked passages to return (default 3).
        verbose: When True, additionally print the top BM25 (lexical),
            bi-encoder and cross-encoder hits for comparison. BM25 is only
            evaluated in this mode because its scores never influence the
            returned passages.

    Returns:
        A list of passage strings, best match first, with newlines replaced
        by spaces. Empty if the semantic search yields no hits.
    """
    print("Input question:", query)

    def show_hits(title, ranked_hits, score_key):
        # Print the leading hits of one ranking as "<score>\t<passage>".
        print("\n-------------------------\n")
        print(title)
        for hit in ranked_hits[:num_results]:
            print("\t{:.3f}\t{}".format(hit[score_key], passages[hit['corpus_id']].replace("\n", " ")))

    ##### BM25 search (lexical search, comparison only) #####
    if verbose:
        bm25_scores = bm25.get_scores(bm25_tokenizer(query))
        # Clamp to the corpus size: argpartition rejects a kth outside the array.
        n_lexical = min(5, len(bm25_scores))
        bm25_hits = []
        if n_lexical:
            top_n = np.argpartition(bm25_scores, -n_lexical)[-n_lexical:]
            bm25_hits = sorted(
                ({'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n),
                key=lambda hit: hit['score'],
                reverse=True,
            )
        show_hits("Top lexical search (BM25) hits", bm25_hits, 'score')

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Follow the corpus embeddings' device instead of forcing CUDA, so the
    # search also works on machines without a GPU.
    question_embedding = question_embedding.to(corpus_embeddings.device)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query
    if not hits:
        return []

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Order by bi-encoder score first; the sort below is stable, so this
    # decides the order of passages whose cross-encoder scores tie.
    hits = sorted(hits, key=lambda hit: hit['score'], reverse=True)
    if verbose:
        show_hits("Top Bi-Encoder Retrieval hits", hits, 'score')

    # Final ranking by cross-encoder score
    hits = sorted(hits, key=lambda hit: hit['cross-score'], reverse=True)
    if verbose:
        show_hits("Top Cross-Encoder Re-ranker hits", hits, 'cross-score')

    return [passages[hit['corpus_id']].replace("\n", " ") for hit in hits[:num_results]]


In [27]:
import ollama
# Retrieval-augmented generation: fetch supporting passages with search(),
# then have the llama3.2 model answer the question using those passages.
query = "What is the capital of the United States?"
data = search(query=query)

prompt = f"Using this data: {data}. Respond to this prompt: {query}"
output = ollama.generate(model="llama3.2", prompt=prompt)

print(output['response'])


Input question: What is the capital of the United States?
According to the provided data, the capital of the United States is not explicitly stated as a single city or location. However, it can be inferred that Washington, D.C. (also known as simply Washington) serves as the political center and is the official home of many major national government offices, including the President of the USA.

Furthermore, the text mentions the United States Capitol, which is located in Washington, D.C., suggesting a strong connection between the capital city and the legislative branch of the federal government. It can be concluded that Washington, D.C. (or simply Washington) is often considered as the capital of the United States.


In [None]:
# NOTE(review): this cell repeats the retrieval + generation steps of the
# previous cell verbatim; consider wrapping them in a function that takes
# the query as a parameter.
query = "What is the capital of the United States?"
data = search(query=query)

prompt = f"Using this data: {data}. Respond to this prompt: {query}"
output = ollama.generate(model="llama3.2", prompt=prompt)

print(output['response'])
