In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader
import sys
import csv
# Raise the csv module's per-field size limit as far as this platform accepts,
# so rows with very long text fields can be parsed.
maxInt = sys.maxsize

while True:
    # Try the current candidate; if csv.field_size_limit rejects it with
    # OverflowError, shrink the candidate tenfold and try again.
    try:
        csv.field_size_limit(maxInt)
        break
    except OverflowError:
        # Floor division keeps the value an exact int; the previous
        # int(maxInt/10) went through a float and lost precision.
        maxInt //= 10

def encode_csv(path, chunk_size=1000, chunk_overlap=0):
    """Load a CSV file through LangChain's CSVLoader and return its documents.

    Args:
        path: Path of the CSV file to read (opened as UTF-8).
        chunk_size: Accepted for interface compatibility; not used by this function.
        chunk_overlap: Accepted for interface compatibility; not used by this function.

    Returns:
        Whatever ``CSVLoader.load()`` returns for the file.
    """
    # The three free-text columns form the document content; the same columns
    # plus the 'DR#' identifier are also requested as metadata.
    text_columns = ['Problem Summary', 'Problem Description', 'Notes & Resolution']
    all_columns = ['DR#'] + text_columns

    # NOTE(review): because 'fieldnames' is supplied explicitly, csv.DictReader
    # does not consume the first row as a header. If the file starts with a
    # header row it will presumably be loaded as a document too - confirm
    # against the data.
    reader_options = {
        'delimiter': ',',
        'quotechar': '"',
        'fieldnames': list(all_columns),
    }

    loader = CSVLoader(
        file_path=path,
        csv_args=reader_options,
        metadata_columns=list(all_columns),
        content_columns=text_columns,
        encoding='utf-8',
    )
    return loader.load()

# Load data/mantis.csv once; `docs` holds the documents returned by encode_csv
# and is reused by the later cells that build the search corpus.
docs = encode_csv("data/mantis.csv")

# Debug aid (disabled): uncomment to print the three text metadata fields of
# every loaded document.
# for doc in docs:
#     print(doc.metadata['Problem Summary'], doc.metadata['Problem Description'], doc.metadata['Notes & Resolution'])

In [2]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from tqdm.autonotebook import tqdm
import gzip
import os
import torch
import ollama

if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")

# The bi-encoder embeds every passage so that semantic search can be run
# against the embeddings.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256     # Truncate long passages to 256 tokens
top_k = 32                          # Number of passages to retrieve with the bi-encoder

# The cross-encoder re-ranks the top_k passages returned by the bi-encoder to
# improve the quality of the final result list.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# The corpus is built from `docs` (the CSV rows loaded earlier): one passage per
# document, namely the string form of [summary, description, notes & resolution].
# The Simple English Wikipedia download that used to live here was removed
# because nothing in the notebook read the downloaded file.
passages = [
    str([
        doc.metadata['Problem Summary'],
        doc.metadata['Problem Description'],
        doc.metadata['Notes & Resolution'],
    ])
    for doc in docs
]

# Encode all passages into the vector space (duration depends on the hardware).
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)


  from .autonotebook import tqdm as notebook_tqdm





Batches: 100%|██████████| 681/681 [00:40<00:00, 16.66it/s]


In [3]:
# We also compare the results to lexical search (keyword search). Here, we use 
# the BM25 algorithm which is implemented in the rank_bm25 package.

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np


# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    """Split ``text`` into lower-cased tokens suitable for BM25 indexing.

    The text is lower-cased and split on whitespace; leading and trailing
    punctuation is stripped from every piece. Pieces that end up empty or that
    are English stop words are dropped.

    Returns:
        The list of remaining tokens, in their original order.
    """
    stripped = (piece.strip(string.punctuation) for piece in text.lower().split())
    # `word and ...` skips the stop-word lookup for empty strings.
    return [
        word
        for word in stripped
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]


# Tokenize every passage (tqdm shows progress) and build the BM25 index over
# the tokenized corpus.
tokenized_corpus = [bm25_tokenizer(text) for text in tqdm(passages)]

bm25 = BM25Okapi(tokenized_corpus)


100%|██████████| 21792/21792 [00:01<00:00, 17251.43it/s]


In [6]:
# Search the encoded passages for entries that answer the query, printing the
# result of every retrieval stage along the way.
def search(query):
    """Run lexical, semantic and re-ranked retrieval for ``query``.

    Relies on the module-level objects built in earlier cells: ``bm25``,
    ``bm25_tokenizer``, ``bi_encoder``, ``cross_encoder``, ``corpus_embeddings``,
    ``passages`` and ``top_k``.

    Three rankings are printed: the BM25 hits, the bi-encoder hits, and the
    cross-encoder re-ranked hits.

    Args:
        query: The question to search for.

    Returns:
        A list with the (at most) five best passages according to the
        cross-encoder, best first, with newlines replaced by spaces.
    """
    bm25_top_n = 10    # number of lexical hits to select and print
    rerank_top_n = 5   # number of re-ranked hits to print and return

    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # Partition on the same k that is sliced off, so the selected indices really
    # are the k best scores, and cap k at the corpus size so argpartition does
    # not raise on small corpora.
    n_lexical = min(bm25_top_n, len(bm25_scores))
    if n_lexical > 0:
        top_n = np.argpartition(bm25_scores, -n_lexical)[-n_lexical:]
    else:
        top_n = []
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    print("Top-N lexical search (BM25) hits")
    for hit in bm25_hits:
        print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages.
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Follow the corpus embeddings' device instead of forcing CUDA, so the
    # search also works on machines without a GPU.
    question_embedding = question_embedding.to(corpus_embeddings.device)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder
    # and attach the score to its hit.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Output of all retrieved hits, ordered by bi-encoder score
    print("\n-------------------------\n")
    print("Top-N Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    for hit in hits:
        print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))

    # Output of the best hits after re-ranking by cross-encoder score
    print("\n-------------------------\n")
    print("Top-N Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    result = []
    for hit in hits[:rerank_top_n]:
        passage_text = passages[hit['corpus_id']].replace("\n", " ")
        print("\t{:.3f}\t{}".format(hit['cross-score'], passage_text))
        result.append(passage_text)

    return result


In [8]:
# Retrieve supporting passages for the question, then ask the llama3.2 model
# (through ollama) to answer it using those passages as context.
query = "How to fix rod drift?"
data = search(query=query)

llm_prompt = f"Using this data: {data}. Respond to this prompt: {query}"
output = ollama.generate(model="llama3.2", prompt=llm_prompt)

# Separator, heading and the model's answer, one per line.
print("\n-------------------------\n", "LLM Response", output['response'], sep="\n")


Input question: How to fix rod drift?
Top-N lexical search (BM25) hits
	17.472	['Combined load - When inserting control rods in gang mode the control rod drift alarm (P680-4A2-E4) alarms', 'When inserting control rods in gang mode the control rod drift alarm (P680-4A2-E4) alarms. The light for control rod drift on P680-6C does not illuminate.\r\nThe alarm clears itself when control rod movement is stopped and should not. To reset a drift you have to depress reset drift 0n P680-6C', '@mmcadory 08/07/2024\nFound out that this occurs on the current training load. DR 24-0130 initiated. mzm 08/07/2024\n=-=\n@dofarrell 08/05/2024\nat first glance, it would not seem that the rod control system was impacted by the addition of the new THOR3G hydraulic models.  This is assigned to RGoldman to take an initial look to see what is going on.  If the root of the problem is in the THOR models, reassign backk to CORYS\n']
	17.447	['Malf RD03: Unable to withdraw a rod with rod drift inward active.', 'Du

In [None]:
# query = "Scram is not working"
# data = search(query=query)

# output = ollama.generate(
#   model="deepseek-r1:14b",
#   prompt=f"Using this data: {data}. Respond to this prompt: {query}"
# )

# print(output['response'])


Input question: Scram is not working
Top-3 lexical search (BM25) hits
	10.269	['Hydraulic ATWS Malfunction CRD014 functions differently and is not predictable', 'Malfunction CRD014 inserts and causes a SCRAM immediately. This initially sat in the background and would not affect scenarios until a SCRAM occurred.\r\n\r\nA second issue is now rods are randomly inserting with the hydraulic lock as to where it was repeatable in the past. This needs to be repeatable to ensure the same rods are out during the same severity of ATWS.', '@bturner 06/05/2023\nMalfunction works differently, but is repeatable. This tested SAT after working through scenarios.\n=-=\n@bturner 05/30/2023\nFurther discussion is to be had with Instructors, as this appears to e repeatable and the different functionality seems to be fine.\n']
	10.262	['Need to test DRE SWR 14620 at FAT - CRD Drift in not working', "SWR 14620 Problem description:\r\nWhile driving power down to troubleshoot another SWR noticed the Drift In m