### Loading Libraries

In [1]:
#Init Libraries

#Fitting Libraries
from components import Components
from llama_index.core import Settings
import chromadb

#Additional Core Libraries
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex
from llama_index.core import PromptTemplate,Document 
from llama_index.core.node_parser import SentenceSplitter

import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#initialise components, reranker and retriever
rag_components = Components("Snowflake/snowflake-arctic-embed-s","mixedbread-ai/mxbai-embed-large-v1","llama3-8b-8192")

In [3]:
rag_components.model_name

'llama3-8b-8192'

Setting Contexts


In [4]:
Settings.embed_model = rag_components.get_embedding_model()
Settings.llm = rag_components.get_groq_llm()

Embedding model loaded!


In [5]:
db = chromadb.PersistentClient(path="../CMU_LTI_ChromaDB")
chroma_collection = db.get_collection("CMU_LTI")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

Loading Papers

In [5]:
knowledge_directory = 'LTI_Neural_Navigator-main/final_data'
contents = os.listdir(knowledge_directory)

In [7]:
from llama_index.core import SimpleDirectoryReader

In [8]:
documents = SimpleDirectoryReader(knowledge_directory).load_data()

In [9]:
pipeline = IngestionPipeline(
    transformations=[
          SentenceSplitter(chunk_size=1000),
    ],
    vector_store=vector_store,
)

In [10]:
chunks = pipeline.run(documents=documents)

In [11]:
chunks

[TextNode(id_='99b0a6e3-8c9f-494e-931a-3f82c451d3af', embedding=None, metadata={'file_path': "e:\\Github Repositories\\RAG-FYP2024\\LTI_Neural_Navigator-main\\final_data\\2023-24_Tartan_Men's_Basketball_Roster_-__Carnegie_Mellon_University_Athletics.txt", 'file_name': "2023-24_Tartan_Men's_Basketball_Roster_-__Carnegie_Mellon_University_Athletics.txt", 'file_type': 'text/plain', 'file_size': 2120, 'creation_date': '2024-08-12', 'last_modified_date': '2024-03-18'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b0b671c1-971f-4fbe-84bb-0555476b12d0', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': "e:\\Github Repositories\\RAG-FYP2024\\LTI_Neural_Navigator-main\\final_data\\2023-24_Tartan_Men's_

In [6]:
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context,similarity_top_k=5,node_postprocessors=[rag_components.get_reranker()])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mixedbread-ai/mxbai-embed-large-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reranker model loaded!


In [7]:
query_engine = index.as_query_engine(streaming=True,similarity_top_k=3)

In [8]:
qa_prompt_template_str = """
Context: {context_str}
Question: {query_str}
"""


qa_prompt_template = PromptTemplate(qa_prompt_template_str)
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template":qa_prompt_template}
)


In [10]:
response = query_engine.query("What is the main topic of the article posted on January 8?")
response.print_response_stream()

I don't know. The provided text appears to be a schedule layout for courses, not an article.

In [24]:
response.source_nodes

[NodeWithScore(node=TextNode(id_='1db00e1b-febe-4826-8872-a17a223383dd', embedding=None, metadata={'file_path': 'e:\\Github Repositories\\RAG-FYP2024\\LTI_Neural_Navigator-main\\final_data\\sched_layout_fall.txt', 'file_name': 'sched_layout_fall.txt', 'file_type': 'text/plain', 'file_size': 637349, 'creation_date': '2024-08-12', 'last_modified_date': '2024-03-18'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6980c30a-74de-4962-a7e5-32ec17a215b5', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'e:\\Github Repositories\\RAG-FYP2024\\LTI_Neural_Navigator-main\\final_data\\sched_layout_fall.txt', 'file_name': 'sched_layout_fall.txt', 'file_type': 'text/plain', 'file_size': 637349, 'creation_da

In [None]:
question_list = []
for questions in df['question'][:128]:
    question_list.append(questions)
    

In [None]:
import time
for i in range(len(question_list)): 
    time.sleep(3)
    print(question_list[i])
    response = query_engine.query(question_list[i])
    response.print_response_stream()
    rowIndex = df.index[i]

    df.at[rowIndex, 'gen_answer'] = response.response_txt
    print('')
    print("--"*100)
    print(f'{i}/{128}')
 


Who is the music director of the Quebec Symphony Orchestra?
According to the provided context, Fabien Gabel is the music director of the Quebec Symphony Orchestra.
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
0/128
Who were the four students of the University of Port Harcourt that were allegedly murdered?
According to the article, the four students of the University of Port Harcourt who were allegedly murdered are:

1. Chiadika Lordson
2. Ugonna Kelechi Obusor
3. Mike Lloyd Toku
4. Tekena Elkanah
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1/128
What did Paul Wall offer to all U.S. Olympic Medalists?
According to the article, Paul Wall offered to give free gold grills to all U.S

In [None]:
df[:128]

Unnamed: 0,context,question,answer,gen_answer
0,"HOUSTON (Jan. 23, 2018) – Fabien Gabel, music ...",Who is the music director of the Quebec Sympho...,The music director of the Quebec Symphony Orch...,"According to the provided context, Fabien Gabe..."
1,Port Harcourt – The Rivers police command on W...,Who were the four students of the University o...,The four students of the University of Port Ha...,"According to the article, the four students of..."
2,Channels\nMusic\nStyle\nPop Culture\nSports\nS...,What did Paul Wall offer to all U.S. Olympic M...,Paul Wall wants to give free gold grills to al...,"According to the article, Paul Wall offered to..."
3,The crazy level at which African countries imp...,What are the main agricultural products that A...,"African countries mainly export cocoa, edible ...","According to the text, African countries mainl..."
4,"CHI 2010 Workshop May 7 or 8, 2011 (final date...",What is the main goal of the CHI 2011 workshop...,The main goal of this one-day CHI 2011 worksho...,The main goal of the CHI 2011 workshop on larg...
...,...,...,...,...
123,The video below was found yesterday on the WaP...,What was the long-term study published in the ...,The long-term study published in the journal P...,What a lovely morning! I'm still basking in th...
124,Trip Notes\nA little bit about India\nClimate:...,"What is the climate like in Goa, India?",The climate in Goa is humid and tropical with ...,"According to the trip notes, the climate in Go..."
125,Intuition is more than just a characteristic o...,What is the relationship between intuition and...,Intuition is more than just a characteristic o...,"According to the context, intuition is a corne..."
126,It looks like you're using an Ad Blocker.\nPle...,What action did Gov. Jan Brewer take in relati...,Brewer signed the 2011 legislative budget whic...,"According to the text, Gov. Jan Brewer signed ..."


In [None]:
df = df[:128]

In [None]:
df.to_csv(f'data/results/{rag_components.model_name}_128Q_Run1.csv',index=False)