In [1]:
from langchain.embeddings import GPT4AllEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema.document import Document
import pandas as pd
import json
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_community.vectorstores import FAISS
from langchain_community.document_transformers import LongContextReorder

## Import tables and corresponding metadata

In [2]:
# Load the ACF tables (acf_0.csv ... acf_10.csv) into a list, in file-index order.
imported_dfs = [
    pd.read_csv(f'\\Users\\HP\\Downloads\\csv_with_metadata\\acf\\acf_{i}.csv')
    for i in range(11)
]

# The metadata for the tables is stored separately in one JSON file.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform default encoding (which is not UTF-8 on every Windows setup).
with open('\\Users\\HP\\Downloads\\csv_with_metadata\\acf\\metadata.json', 'r', encoding='utf-8') as f:
    imported_metadata = json.load(f)

## Create documents from tables and metadata

In [None]:
# Build one Document per table, with the table's metadata appended to its text.
documents = []

# Tables and metadata entries are paired positionally by zip().
# NOTE(review): zip() stops at the shorter input, so a length mismatch would
# silently drop tables. Presumably imported_metadata holds one entry per
# acf_<i>.csv in the same order -- confirm against metadata.json.
for df, meta in zip(imported_dfs, imported_metadata):
    # Render the whole table as plain text (row index omitted).
    text = df.to_string(index=False)

    # Append the metadata so it is embedded and retrieved together with the table.
    text_with_meta = f"{text}\n\nMetadata: {meta}"

    # Use the combined text. Previously the bare table text was passed here and
    # text_with_meta was computed but never used, so the metadata was lost.
    doc = Document(page_content=text_with_meta)

    documents.append(doc)

In [None]:
# Show the `documents` list as the cell output for a visual check.
documents

## Combine faculty, FAQ, program announcement, student handbook and BSCS text files

In [2]:
# Paths of the six text files that make up the corpus.
file_paths = ['\\Users\\HP\\ITA\\faculty_data_CS_FullTime_Detailed.txt', '\\Users\\HP\\ITA\\faculty_data_CS_PartTime_Detailed.txt',
            '\\Users\\HP\\ITA\\faqs.txt', '\\Users\\HP\\ITA\\pa_text.txt', '\\Users\\HP\\ITA\\student_handbook_text.txt',
            '\\Users\\HP\\ITA\\bscs.txt']

# Read every file and concatenate the contents, in list order, into one string.
# The pieces are joined with no separator, exactly as the previous += loop did;
# collecting them in a list and joining once avoids repeated string copying.
file_contents = []
for file_path in file_paths:
    # Decode explicitly as UTF-8 rather than with the platform default encoding.
    # The recorded retrieval output further down shows non-ASCII characters
    # (an 'fi' ligature), which a legacy Windows code page cannot decode.
    with open(file_path, 'r', encoding='utf-8') as file:
        file_contents.append(file.read())
combined_text = "".join(file_contents)

In [3]:
# Show the combined corpus as the cell output.
# NOTE(review): this echoes the entire string (a later cell reports a length of
# 522123); a slice such as combined_text[:1000] would keep the notebook small.
combined_text



In [4]:
# combined_text is already a single string, so use it directly as the text to
# be chunked. ("".join over a str only re-joins its individual characters and
# produces an equal string, at the cost of a full extra pass.)
text = combined_text
len(text)

522123

## load program announcement and handbook via PyPDF

In [42]:
# NOTE(review): this import sits mid-notebook; the other imports live in the first cell.
from langchain_community.document_loaders import PyPDFLoader
# NOTE(review): pages_list is not read by any cell visible here -- confirm whether it is still needed.
pages_list = []

# Load the 2023-24 program announcement PDF; load_and_split() yields the
# documents kept in `pages`, which the "no chunking" store below indexes.
loader = PyPDFLoader("\\Users\\HP\\ITA\\pa-2023-24.pdf")
pages = loader.load_and_split()

In [None]:
# Concatenate the page_content of every entry in `pages` into a single string.
# NOTE(review): this rebinds `text`, which an earlier cell sets from
# combined_text and which the splitter cells below read. On a top-to-bottom run
# the splitters would therefore receive the PDF text instead of the combined
# text files -- confirm which source is intended.
text = "".join([page.page_content for page in pages])
len(text)

## no chunking

In [43]:
# Directory in which this Chroma store is persisted.
persist_directory = 'chroma/plain_text'


# Embed the loaded `pages` with GPT4All embeddings and index them in Chroma.
# NOTE(review): re-running this cell against an existing persist directory
# presumably adds the same documents again -- confirm, or clear the directory first.
plain_db = Chroma.from_documents(
    documents=pages,
    embedding=GPT4AllEmbeddings(),
    persist_directory=persist_directory
) 

# Save the database so it can be reopened later.
plain_db.persist()

# Check that the database has been created: print the document count reported
# by the store's private `_collection` attribute.
print(plain_db._collection.count())

166


In [44]:
# Re-open the store persisted under 'chroma/plain_text', supplying a GPT4All
# embedding function for queries.
plain_db = Chroma(persist_directory='chroma/plain_text', embedding_function=GPT4AllEmbeddings())

# Print the document count reported by the store's private `_collection` attribute.
print(plain_db._collection.count())

166


In [None]:
question = "what is the pre-requisite for financial accounting?"
# Embed the question up front; only the by-vector search below consumes this.
# A fresh GPT4AllEmbeddings instance is constructed for the call.
question_vector = GPT4AllEmbeddings().embed_query(question)
# Query the store four ways with k=5 so the retrieval methods can be compared:
# scored similarity, relevance-scored similarity, search by the precomputed
# vector, and max-marginal-relevance search.
docs1 = plain_db.similarity_search_with_score(question,k=5)
docs2 = plain_db.similarity_search_with_relevance_scores(question,k=5)
docs3 = plain_db.similarity_search_by_vector(question_vector,k=5)
docs4 = plain_db.max_marginal_relevance_search(question,k=5)

In [36]:
# Drop the collection behind plain_db (cleanup before the next experiment).
# NOTE(review): presumably plain_db must be rebuilt before it can be queried again -- confirm.
plain_db.delete_collection()

## character text splitter

In [5]:
from langchain_text_splitters import CharacterTextSplitter

# Splitter that cuts on "." with chunk_size=1000 and chunk_overlap=200.
# NOTE(review): the recorded output warns about chunks longer than 1000, so
# chunk_size is evidently not a hard cap with this separator.
text_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=1000,
    chunk_overlap=200
)

# Split the full text into chunks; `splits` also feeds the BM25 and FAISS cells below.
splits = text_splitter.split_text(text)

Created a chunk of size 1229, which is longer than the specified 1000
Created a chunk of size 1718, which is longer than the specified 1000
Created a chunk of size 1081, which is longer than the specified 1000
Created a chunk of size 1361, which is longer than the specified 1000
Created a chunk of size 1254, which is longer than the specified 1000
Created a chunk of size 1117, which is longer than the specified 1000
Created a chunk of size 1278, which is longer than the specified 1000
Created a chunk of size 1034, which is longer than the specified 1000
Created a chunk of size 1178, which is longer than the specified 1000
Created a chunk of size 1037, which is longer than the specified 1000
Created a chunk of size 1328, which is longer than the specified 1000
Created a chunk of size 2184, which is longer than the specified 1000
Created a chunk of size 1187, which is longer than the specified 1000
Created a chunk of size 1994, which is longer than the specified 1000
Created a chunk of s

In [6]:
# Number of chunks produced by the splitter.
len(splits)

611

In [None]:
# Wrap every text chunk in a Document so the vector store can index it,
# then report how many were created.
split_docs = [Document(page_content=chunk) for chunk in splits]
len(split_docs)

In [None]:
# Directory in which this Chroma store is persisted.
persist_directory = 'chroma/char_txt'


# Embed the character-split documents with GPT4All embeddings and index them in Chroma.
char_db = Chroma.from_documents(
    documents=split_docs,
    embedding=GPT4AllEmbeddings(),
    persist_directory=persist_directory
) 

# NOTE(review): the explicit persist() call is commented out in this cell,
# unlike the other store-building cells. Confirm the store is still written to
# disk before the next cell reopens 'chroma/char_txt'.
# char_db.persist()

# Check that the database has been created: print the document count reported
# by the store's private `_collection` attribute.
print(char_db._collection.count())

In [None]:
# Re-open the store persisted under 'chroma/char_txt', supplying a GPT4All
# embedding function for queries.
char_db = Chroma(persist_directory='chroma/char_txt', embedding_function=GPT4AllEmbeddings())

# Print the document count reported by the store's private `_collection` attribute.
print(char_db._collection.count())

In [None]:
question = "What is the email of sir sajjad?"
# Embed the question up front; only the by-vector search below consumes this.
# A fresh GPT4AllEmbeddings instance is constructed for the call.
question_vector = GPT4AllEmbeddings().embed_query(question)
# Query the store four ways with k=5 so the retrieval methods can be compared:
# scored similarity, relevance-scored similarity, search by the precomputed
# vector, and max-marginal-relevance search.
docs1 = char_db.similarity_search_with_score(question,k=5)
docs2 = char_db.similarity_search_with_relevance_scores(question,k=5)
docs3 = char_db.similarity_search_by_vector(question_vector,k=5)
docs4 = char_db.max_marginal_relevance_search(question,k=5)

In [None]:
# Show the results of the scored similarity search as the cell output.
docs1

In [None]:
# Drop the collection behind char_db (cleanup before the next experiment).
# NOTE(review): presumably char_db must be rebuilt before it can be queried again -- confirm.
char_db.delete_collection()

## Recursive Text Splitter

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive splitter with chunk_size=1000 and chunk_overlap=200; no separators
# are passed, so the library defaults apply.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 200

)

# Split the full text into chunks. This rebinds `text_splitter` and `splits`
# from the character-splitter cell above.
splits = text_splitter.split_text(text)

In [None]:
# Wrap every text chunk in a Document so the vector store can index it,
# then report how many were created.
split_docs = [Document(page_content=chunk) for chunk in splits]
len(split_docs)

In [None]:
# Directory in which this Chroma store is persisted.
persist_directory = 'chroma/recur_txt'


# Embed the recursively split documents with GPT4All embeddings and index them in Chroma.
recur_db = Chroma.from_documents(
    documents=split_docs,
    embedding=GPT4AllEmbeddings(),
    persist_directory=persist_directory
) 

# Save the database so it can be reopened later.
recur_db.persist()

# Check that the database has been created: print the document count reported
# by the store's private `_collection` attribute.
print(recur_db._collection.count())

In [None]:
# Re-open the store persisted under 'chroma/recur_txt', supplying a GPT4All
# embedding function for queries.
recur_db = Chroma(persist_directory='chroma/recur_txt', embedding_function=GPT4AllEmbeddings())

# Print the document count reported by the store's private `_collection` attribute.
print(recur_db._collection.count())

In [None]:
question = "Which financial assistance packages are offered by IBA?"
# Embed the question up front; only the by-vector search below consumes this.
# A fresh GPT4AllEmbeddings instance is constructed for the call.
question_vector = GPT4AllEmbeddings().embed_query(question)
# Query the store four ways so the retrieval methods can be compared.
# NOTE(review): docs1 asks for k=5 while the other three ask for k=3 -- confirm
# the difference is intentional (a later cell reads five entries from docs1).
docs1 = recur_db.similarity_search_with_score(question,k=5)
docs2 = recur_db.similarity_search_with_relevance_scores(question,k=3)
docs3 = recur_db.similarity_search_by_vector(question_vector,k=3)
docs4 = recur_db.max_marginal_relevance_search(question,k=3)

In [None]:
# Show the results of the scored similarity search as the cell output.
docs1

In [None]:
# Drop the collection behind recur_db (cleanup before the next experiment).
# NOTE(review): presumably recur_db must be rebuilt before it can be queried again -- confirm.
recur_db.delete_collection()

## Semantic Chunker

In [None]:
from langchain_experimental.text_splitter import SemanticChunker

# Semantic splitter driven by GPT4All embeddings, using the "interquartile"
# breakpoint threshold type. Rebinds `text_splitter` from the earlier cells.
text_splitter = SemanticChunker(GPT4AllEmbeddings(), breakpoint_threshold_type="interquartile")

In [None]:
# Chunk the full text with the semantic splitter; the result is what the
# semantic Chroma store below indexes.
docs = text_splitter.create_documents([text])

In [None]:
# Number of documents produced by the semantic chunker.
len(docs)

In [None]:
# Directory in which this Chroma store is persisted.
persist_directory = 'chroma/semantic_chunk'


# Embed the semantically chunked documents with GPT4All embeddings and index them in Chroma.
semantic_db = Chroma.from_documents(
    documents=docs,
    embedding=GPT4AllEmbeddings(),
    persist_directory=persist_directory
) 

# Save the database so it can be reopened later.
semantic_db.persist()

# Check that the database has been created: print the document count reported
# by the store's private `_collection` attribute.
print(semantic_db._collection.count())

In [None]:
# Re-open the store persisted under 'chroma/semantic_chunk', supplying a
# GPT4All embedding function for queries.
semantic_db = Chroma(persist_directory='chroma/semantic_chunk', embedding_function=GPT4AllEmbeddings())

# Print the document count reported by the store's private `_collection` attribute.
print(semantic_db._collection.count())

In [None]:
question = "Which financial assistance packages are offered by IBA?"
# Embed the question up front; only the by-vector search below consumes this.
# A fresh GPT4AllEmbeddings instance is constructed for the call.
question_vector = GPT4AllEmbeddings().embed_query(question)
# Query the store four ways so the retrieval methods can be compared.
# NOTE(review): docs1 asks for k=5 while the other three ask for k=3 -- confirm
# the difference is intentional (a later cell reads five entries from docs1).
docs1 = semantic_db.similarity_search_with_score(question,k=5)
docs2 = semantic_db.similarity_search_with_relevance_scores(question,k=3)
docs3 = semantic_db.similarity_search_by_vector(question_vector,k=3)
docs4 = semantic_db.max_marginal_relevance_search(question,k=3)

In [None]:
# Show the results of the scored similarity search as the cell output.
docs1

In [None]:
# Drop the collection behind semantic_db (cleanup before the next experiment).
# NOTE(review): presumably semantic_db must be rebuilt before it can be queried again -- confirm.
semantic_db.delete_collection()

## Record responses in a text file 'responses.txt' and upload to Kaggle

In [None]:
# Pull the page_content out of the first five entries of docs1. Each entry is
# indexed with [0] to reach the object that carries page_content.
# NOTE(review): docs1 is whatever the most recently executed query cell
# assigned, so the saved responses depend on which store was queried last.
# NOTE(review): five entries are hard-coded; fewer results would presumably
# raise IndexError here.
response1 = docs1[0][0].page_content
response2 = docs1[1][0].page_content
response3 = docs1[2][0].page_content
response4 = docs1[3][0].page_content
response5 = docs1[4][0].page_content

In [None]:
# File that receives the five retrieved responses.
file_path = 'responses.txt'

# Responses in the order they are numbered in the output file.
responses = [response1, response2, response3, response4, response5]

# Write each response under a "Response N:" heading, followed by a blank line.
# The encoding is fixed to UTF-8 because the retrieved text contains non-ASCII
# characters that a legacy platform default code page may be unable to encode.
with open(file_path, 'w', encoding='utf-8') as file:
    for number, response in enumerate(responses, start=1):
        file.write(f"Response {number}:\n")
        file.write(response + '\n\n')

print("Responses saved to:", file_path)


## BM25

In [7]:
# Build the keyword-based BM25 retriever over the text chunks,
# then cap it at the top 5 hits per query.
bm25_retriever = BM25Retriever.from_texts(splits)
bm25_retriever.k = 5

## FAISS

In [8]:
# Embed the same text chunks with GPT4All embeddings into a FAISS index,
# and expose it as a retriever that returns 5 hits per query.
faiss_vectorstore = FAISS.from_texts(texts=splits, embedding=GPT4AllEmbeddings())
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 5})

## Ensemble

In [9]:
# Combine the BM25 (keyword) and FAISS (embedding) retrievers, weighting
# each one equally.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5],
)

In [50]:
# Question passed to the ensemble retriever in the next cell.
question = "What is the prerequisite for data structures"

In [51]:
# Run the question through the ensemble retriever (BM25 + FAISS) and keep the
# retrieved documents. No embedding is computed explicitly in this cell.
ensemble_docs = ensemble_retriever.invoke(question)

In [52]:
# Show the documents returned by the ensemble retriever as the cell output.
ensemble_docs

[Document(page_content='Offered by the School of \nMathematics and Computer Science (SMCS), the MS in Data Science (MSDS) \nprogram prepares students to extract valuable insights from data through a \nunique and comprehensive methodology. The program is designed for students \nwho want to begin or advance their careers in the ﬁeld of data science. It provides \na powerful base in subjects including statistical modeling, probabilistic \nreasoning, machine learning, management of massive data sets, data \nvisualization, and software engineering. The program targets both CS and STEM \n(Science, Technology, Engineering and Mathematics) students and prepares \nthem to apply the knowledge of data science to a wide range of corporate \ndomains. Non-CS/SE/IT students are required to take non-credit foundation-level \ncourses in algorithms, data management and application development. These \ncourses prepare them for the core and more advanced data science courses \nalongside students with a CS

In [47]:
# Reorder the retrieved documents with LongContextReorder:
# less relevant documents end up in the middle of the list and more
# relevant ones at the beginning / end.
reordering = LongContextReorder()
# ensemble_docs itself is left bound as-is; the reordered result is stored separately.
reordered_docs = reordering.transform_documents(ensemble_docs)

In [48]:
# Show the reordered documents as the cell output.
reordered_docs

[Document(page_content='Offered by the School of \nMathematics and Computer Science (SMCS), the MS in Data Science (MSDS) \nprogram prepares students to extract valuable insights from data through a \nunique and comprehensive methodology. The program is designed for students \nwho want to begin or advance their careers in the ﬁeld of data science. It provides \na powerful base in subjects including statistical modeling, probabilistic \nreasoning, machine learning, management of massive data sets, data \nvisualization, and software engineering. The program targets both CS and STEM \n(Science, Technology, Engineering and Mathematics) students and prepares \nthem to apply the knowledge of data science to a wide range of corporate \ndomains. Non-CS/SE/IT students are required to take non-credit foundation-level \ncourses in algorithms, data management and application development. These \ncourses prepare them for the core and more advanced data science courses \nalongside students with a CS

In [53]:

# Save the page content of every ensemble-retrieved document to responses.txt,
# each followed by a blank line.
# 'w' mode replaces any responses.txt written by an earlier cell.
# The encoding is fixed to UTF-8 because the retrieved text contains non-ASCII
# characters (the recorded output shows an 'fi' ligature) that a legacy
# platform default code page may be unable to encode.
with open('responses.txt', 'w', encoding='utf-8') as file:
    for doc in ensemble_docs:
        file.write(doc.page_content)
        # Blank-line separator between document contents.
        file.write('\n\n')