In [2]:
from langchain.embeddings import GPT4AllEmbeddings
import pandas as pd
import json
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_community.vectorstores import FAISS

## Approach 1: Use raw data
combine faculty, faq, program announcement and student handbook text files

In [2]:
# Define the file paths for your three text files
file_paths = ['\\Users\\HP\\ITA\\faculty_data_CS_FullTime_Detailed.txt', '\\Users\\HP\\ITA\\faculty_data_CS_PartTime_Detailed.txt',
            '\\Users\\HP\\ITA\\faqs.txt', '\\Users\\HP\\ITA\\pa_text.txt', '\\Users\\HP\\ITA\\student_handbook_text.txt',
            '\\Users\\HP\\ITA\\bscs.txt']

# Combine the content of the three text files into a single string
combined_text = ""
for file_path in file_paths:
    with open(file_path, 'r') as file:
        content = file.read()
        combined_text += content

In [3]:
text_chunks = [chunk for chunk in combined_text.split('\n\n') if chunk.strip()]

In [4]:
# combined_text = combined_text.split("\n\n")
len(text_chunks)

208

## Approach 2: Add preprocessed tables and metadata

Import tables and corresponding metadata

In [2]:
#Import dataframes and metadata separately
imported_dfs = []
for i in range(11):
    df = pd.read_csv(f'\\Users\\HP\\Downloads\\csv_with_metadata\\acf\\acf_{i}.csv')
    imported_dfs.append(df)

with open('\\Users\\HP\\Downloads\\csv_with_metadata\\acf\\metadata.json', 'r') as f:
    imported_metadata = json.load(f)

Create documents from tables and metadata

In [3]:
#Create a list to hold the Document objects
documents = []

# Loop through each dataframe and its corresponding metadata
for df, meta in zip(imported_dfs, imported_metadata):
    # Convert the dataframe to a string (assuming it's already preprocessed and tokenized)
    text = df.to_string(index=False)

    # Append the metadata to the text
    text_with_meta = f"{text}\n\nMetadata: {meta}"

    # # Create a Document object with combined text and metadata
    # doc = Document(page_content=text)

    # # Append the Document object to the list
    # documents.append(doc)

In [4]:
text_with_meta

"             Semester-8 Course\\ncode  Credit\\nhours Pre-\\nrequisite                    Course\\ntype\n Accounting\\nelective V         None              3            None            Accounting\\nElective\nAccounting\\nelective VI         None              3            None            Accounting\\nElective\n   Finance elective VII         None              3            None               Finance\\nElective\n                    ELP       PRJ491              6            None Experiential\\nLearning\\nProject\n\nMetadata: {'table_name': 'Bachelor of Science (BS) Accounting and Finance semester-wise sequence of courses', 'description': 'Senior Semester 8'}"

Combine preprocessed tables (with metadata) with other text files

In [5]:
# Define the file paths for your text files
file_paths = ['\\Users\\HP\\ITA\\faculty_data_CS_FullTime_Detailed.txt', '\\Users\\HP\\ITA\\faculty_data_CS_PartTime_Detailed.txt',
            '\\Users\\HP\\ITA\\faqs.txt', '\\Users\\HP\\ITA\\pa_text.txt', '\\Users\\HP\\ITA\\student_handbook_text.txt',
            '\\Users\\HP\\ITA\\bscs.txt']

# Combine the content of the text files into a single string
combined_text = ""
for file_path in file_paths:
    with open(file_path, 'r') as file:
        content = file.read()
        combined_text += content

combined_text += text_with_meta

In [6]:
len(combined_text)

522769

## Approach 3: Further preprocess pdfs

Load pdfs

In [3]:
from langchain.document_loaders import PyPDFLoader
from collections import defaultdict
import re

def load_pdfs(file_paths):
    documents = []
    for path in file_paths:
        loader = PyPDFLoader(path)
        documents.extend(loader.load())
    return documents

Retain only lines that end with a terminal punctuation mark

In [4]:
def retain_terminal_punctuation_lines(text):
    terminal_punctuation_pattern = re.compile(r'.*[\.\!\?\"\']$')
    lines = text.split('\n')
    return '\n'.join([line for line in lines if terminal_punctuation_pattern.match(line.strip())])

Retain lines with atleast 5 words

In [5]:

def retain_lines_with_min_words(text, min_words=5):
    lines = text.split('\n')
    return '\n'.join([line for line in lines if len(line.split()) >= min_words])


In [6]:

def clean_text(text):
    text = retain_terminal_punctuation_lines(text)
    text = retain_lines_with_min_words(text)
    return text


discard pages with fewer than 3 sentences

In [7]:

def discard_short_pages(documents, min_sentences=3):
    cleaned_documents = []
    for doc in documents:
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', doc.page_content)
        if len(sentences) >= min_sentences:
            cleaned_documents.append(doc)
    return cleaned_documents


Deduplicate the data set by discarding all but one of any three-sentence spans occurring more than once.

In [8]:

def deduplicate_three_sentence_spans(documents):
    span_counts = defaultdict(int)
    for doc in documents:
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', doc.page_content)
        for i in range(len(sentences) - 2):
            span = ' '.join(sentences[i:i+3])
            span_counts[span] += 1
    
    unique_documents = []
    for doc in documents:
        unique_sentences = []
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', doc.page_content)
        for i in range(len(sentences) - 2):
            span = ' '.join(sentences[i:i+3])
            if span_counts[span] == 1 or span not in unique_sentences:
                unique_sentences.append(span)
        unique_content = ' '.join(unique_sentences)
        if unique_content:
            doc.page_content = unique_content
            unique_documents.append(doc)
    
    return unique_documents


preprocess student handbook and program announcement

In [9]:

def process_pdfs(file_paths):
    documents = load_pdfs(file_paths)
    documents = [doc for doc in documents if clean_text(doc.page_content)]
    documents = discard_short_pages(documents)
    documents = deduplicate_three_sentence_spans(documents)
    return documents

# Example usage
file_paths = ['\\Users\\HP\\ITA\\pa-2023-24.pdf', '\\Users\\HP\\ITA\\student-handbook2023-24.pdf']
processed_documents = process_pdfs(file_paths)

convert documents into string and combine with other text files

In [10]:
text_content = "\n".join([doc.page_content for doc in processed_documents])

# Define the file paths for your text files
file_paths = ['\\Users\\HP\\ITA\\faculty_data_CS_FullTime_Detailed.txt', '\\Users\\HP\\ITA\\faculty_data_CS_PartTime_Detailed.txt',
            '\\Users\\HP\\ITA\\faqs.txt', '\\Users\\HP\\ITA\\bscs.txt']

# Combine the content of the text files into a single string
combined_text = ""
for file_path in file_paths:
    with open(file_path, 'r') as file:
        content = file.read()
        combined_text += content

combined_text += text_content

## For each of the above approaches, perform the following steps iteratively

character text splitter

In [160]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=700,
    chunk_overlap=200
)

splits = text_splitter.split_text(combined_text)

Created a chunk of size 912, which is longer than the specified 700
Created a chunk of size 715, which is longer than the specified 700
Created a chunk of size 865, which is longer than the specified 700
Created a chunk of size 1301, which is longer than the specified 700
Created a chunk of size 862, which is longer than the specified 700
Created a chunk of size 810, which is longer than the specified 700
Created a chunk of size 909, which is longer than the specified 700
Created a chunk of size 974, which is longer than the specified 700
Created a chunk of size 1081, which is longer than the specified 700
Created a chunk of size 754, which is longer than the specified 700
Created a chunk of size 731, which is longer than the specified 700
Created a chunk of size 709, which is longer than the specified 700
Created a chunk of size 963, which is longer than the specified 700
Created a chunk of size 1062, which is longer than the specified 700
Created a chunk of size 1155, which is longer

In [161]:
len(splits)

1945

Recursive Text Splitter

In [184]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap = 200

)

splits = text_splitter.split_text(combined_text)

In [185]:
len(splits)

1972

Semantic Chunker

In [11]:
from langchain_experimental.text_splitter import SemanticChunker

text_splitter = SemanticChunker(GPT4AllEmbeddings(), breakpoint_threshold_type="standard_deviation")

In [12]:
docs = text_splitter.create_documents([combined_text])

In [197]:
splits = [doc.page_content for doc in docs]
# Filter out any None or empty strings from splits
splits = [text for text in splits if text and text.strip()]

BM25

In [198]:
# initialize the bm25 retriever and faiss retriever
bm25_retriever = BM25Retriever.from_texts(
    splits
)
bm25_retriever.k = 5

FAISS

In [199]:
faiss_vectorstore = FAISS.from_texts(
    texts=splits, embedding=GPT4AllEmbeddings()
)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 5})

Ensemble

In [200]:
# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5]
)

Run parrotParaphrasing notebook on Kaggle to obtain the following list of paraphrased questions

In [201]:
paraphrased_questions = [('show the eligibility criteria for bcs programs?', 'What are the eligibility criteria for candidates applying for the BSCS program?'), ('what are the eligibility criteria for bscs candidates?', 'What are the eligibility criteria for candidates applying for the BSCS program?'), ('list the eligibility criteria for the bscs program?', 'What are the eligibility criteria for candidates applying for the BSCS program?'), ('tell me the eligibility criteria for bscs candidate?', 'What are the eligibility criteria for candidates applying for the BSCS program?'), ('show the eligibility criteria for bscs program applicants?', 'What are the eligibility criteria for candidates applying for the BSCS program?'), ('tell me the eligibility criteria for applicants of bscs?', 'What are the eligibility criteria for candidates applying for the BSCS program?'), (('show the prerequisites to the data structures course?', 21), 'What is the pre-requisite for the Data Structures course?'), (('which is the prerequisite for the data structure course?', 16), 'What is the pre-requisite for the Data Structures course?'), (('what is a prerequisite for the data structures course?', 16), 'What is the pre-requisite for the Data Structures course?'), ("what is sir sajjad haider's email address?", 'What is the email address of Sir Sajjad Haider?'), ("what is sir sajjad haider's email?", 'What is the email address of Sir Sajjad Haider?'), ('tell me the email address of sajjad haider?', 'What is the email address of Sir Sajjad Haider?'), ('tell me the email address of a computer support team?', 'What is the email address of the IT help desk?'), ('which is the email address of the technical help desk?', 'What is the email address of the IT help desk?'), ('tell me the email address of the it help desk?', 'What is the email address of the IT help desk?'), (('is there any possibility that you can exempt the iba aptitude test from this?', 38), 'Is it possible to exempt the IBA aptitude test?'), (('please send a quick e-mail to sir sajjad haider that i need a extension for the submission of the project report', 51), 'Write an email to Sir Sajjad Haider that I need extension for Project report submission.'), (('write an email to sir sajjad haider that i need an extension for project report submission', 16), 'Write an email to Sir Sajjad Haider that I need extension for Project report submission.')]

Perform ensemble retrieval for each paraphrased question

In [202]:
# Initialize a set to keep track of invoked questions
invoked_questions = set()

top_docs = {}

# Iterate over the list of tuples and perform ensemble retrieval
for para_question, orig_question in paraphrased_questions:
    # Invoke the ensemble retriever only if the original question hasn't been invoked before
    if orig_question not in invoked_questions:
        orig_docs = ensemble_retriever.invoke(orig_question)
        top_docs[orig_question] = orig_docs

        # Print results for the original question
        print("-" * 100)
        print("Original Question:", orig_question)
        print("Results:")
        for doc in orig_docs:
            print(doc)
        print("-" * 100)

        # Add the original question to the set of invoked questions
        invoked_questions.add(orig_question)

    para_docs = ensemble_retriever.invoke(orig_question)
    top_docs[orig_question].extend(para_docs)
    # Print results for the paraphrased question
    print("-" * 100)
    print("Paraphrased Question:", para_question)
    print("Results:")
    for doc in para_docs:
        print(doc)
    print("-" * 100)


----------------------------------------------------------------------------------------------------
Original Question: What are the eligibility criteria for candidates applying for the BSCS program?
Results:
page_content="A. When the admission process starts, it is announced on IBA website and advertised in all the major national newspapers. Normally, admissions activity commences in the month of October for Spring Semester Admissions and January for Fall Semester Admissions. Q. Does the IBA allow the students to apply for admission whose final result has not been announced? A. IBA allows the students to apply for admission whose final results have not been announced subject to the condition that they have no deficiency in the previous years of their academic career. The admission may be conditional pending submission of the required results. Q. Is the IBA aptitude test compulsory for every student seeking admission in IBA? A. No, it's not, an S.A.T. / GRE / GMAT required scores allow

Export top_docs.json file which contains a dictionary. Keys are original questions, and values are retrieved documents of the original questions and its paraphrased counterparts

In [203]:
import json

# Function to convert Document objects to JSON-serializable dictionaries
def convert_to_dict(documents):
    doc_list = []
    for doc in documents:
        doc_dict = {
            "page_content": doc.page_content,
            "metadata": doc.metadata
        }
        doc_list.append(doc_dict)
    return doc_list

# Your code to populate top_docs goes here

# Convert Document objects in top_docs to JSON-serializable dictionaries
json_serializable_top_docs = {key: convert_to_dict(docs) for key, docs in top_docs.items()}

# Define the file path for the JSON export
json_file_path = "top_docs.json"

# Export top_docs to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(json_serializable_top_docs, json_file)

print(f"Top docs exported to {json_file_path}")


Top docs exported to top_docs.json


Now run the cohereRerank.js file to select top 5 most relevant documents for each original question