# Approach Explanation:

The plan is as follows: given multiple questions over a long context, we split the context into chunks of roughly 800 words (used here as a rough stand-in for tokens), pass all of the questions along with each chunk, and ask the LLM to answer only the questions that are relevant to that chunk and leave the rest blank. Then, depending on the responses we observe, we will either interleave the answers manually or ask an LLM to do so.

### Initial Code:

This is the setup code from the Experiments notebook: it downloads the datasets and performs the initial setup for the class.

In [1]:
!pip install gdown #to download the dataset from google drive
!pip install -U langchain-community # to work with the langchain llm



import gdown # to download dataset from google drive
import pandas as pd #to import dataset as
from collections import defaultdict # to store the context map
from langchain.llms import HuggingFaceHub # to initialize the LLM



# Google Drive download links for the validation and training CSVs.
validation_dataset_url = 'https://drive.google.com/uc?id=1bGNCDoItbvFbBWoDgBF8YcLFI1Te5RmV'
training_dataset_url = 'https://drive.google.com/uc?id=1-vsbRTLKgo2PalcG3AargcBblytlGQd1'

# Local file names the downloaded CSVs are written to.
validation_dataset_filename = 'validation_dataset.csv'
training_dataset_filename = 'training_dataset.csv'

# Download both files from Google Drive to the local names above.
gdown.download(validation_dataset_url, validation_dataset_filename, quiet=True)
gdown.download(training_dataset_url, training_dataset_filename, quiet=True)


# Only the training CSV is loaded in this cell; the validation file is
# downloaded but not read here.
df = pd.read_csv(training_dataset_filename)


# Group the rows by context: each distinct context string maps to the list of
# (question, answer) tuples found for it, in row order. The answer is taken
# from the 'text' column.
context_map = defaultdict(list)
for _, row in df.iterrows():
    context = row['context']
    question = row['question']
    answer = row['text']
    context_map[context].append((question, answer))






Collecting langchain-community
  Downloading langchain_community-0.3.23-py3-none-any.whl.metadata (2.5 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

### Approach Class:

This is the class continued from notebook1

In [None]:
class Agent2:
    """Prototype agent that builds a multi-question prompt and chunks its context.

    A prompt is assembled from the first contexts in ``context_map`` together
    with their questions. The context part of that prompt can then be cut into
    fixed-size word chunks, each of which is re-attached to the full
    instruction/question block.
    """

    # Instruction lines placed at the top of every prompt.
    _INSTRUCTION_LINES = (
        "answer these questions:",
        "Answer as precisely as you can, in as minimum words as you can",
        "Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer",
    )
    # Exact text that separates the question block from the context.
    _CONTEXT_SEPARATOR = "\nfrom this context:\n"

    # same context overload notebook
    def __init__(self, context_map, model_name="mistralai/Mistral-7B-Instruct-v0.3", access_token="TOKEN-HERE"):
        """Store the dataset mapping and create the Hugging Face Hub LLM client.

        Args:
            context_map: Mapping of context text to a list of
                ``(question, answer)`` tuples.
            model_name: Hugging Face repository id of the model to query.
            access_token: Hugging Face API token. The default is only a
                placeholder string.
        """
        self.context_map = context_map
        # NOTE(review): the notebook output shows a warning raised at this
        # call -- presumably a deprecation notice for HuggingFaceHub; confirm
        # and migrate once the replacement package is available.
        self.llm = HuggingFaceHub(
            repo_id=model_name,
            model_kwargs={"temperature": 0.3, "max_length": 200},
            huggingfacehub_api_token=access_token
        )
        # NEW: ask user to input context and questions separately; chunks
        # will only be made of context.
        self.current_prompt = None
        self.chunked_prompts = []
        self.responses = []

    def generate_response(self, prompt):
        """Send ``prompt`` to the LLM and return its response.

        Can be called once per chunk to collect the answers chunk by chunk.
        """
        # ``invoke`` replaces the deprecated direct call ``self.llm(prompt)``.
        return self.llm.invoke(prompt)

    def get_context_questions(self, k=2, q=4):
        """Return the first ``k`` contexts and the first ``q`` QA pairs of each.

        Returns:
            A tuple ``(contexts, questions_answers)``: a list of context
            strings and a parallel list holding, per context, its selected
            ``(question, answer)`` tuples.
        """
        # same as context overload notebook
        selected = list(self.context_map.items())[:k]
        contexts = [context for context, _ in selected]
        questions_answers = [qa_pairs[:q] for _, qa_pairs in selected]
        return contexts, questions_answers

    def generate_Dataset_prompt(self, k=2, q=4):
        """Build a prompt from the dataset and store it in ``self.current_prompt``.

        This imitates a user prompt; later on, questions and context are meant
        to be taken as input from the user. The prompt consists of the
        instruction lines, the numbered questions (``Q1:``, ``Q2:``, ...), a
        ``from this context:`` line and all selected contexts joined by blank
        lines.

        Args:
            k: Number of contexts to include.
            q: Maximum number of questions taken from each context.
        """
        contexts, questions_answers_by_context = self.get_context_questions(k, q)
        all_questions_answers = [
            pair for qa_pairs in questions_answers_by_context for pair in qa_pairs
        ]

        prompt_lines = list(self._INSTRUCTION_LINES)
        prompt_lines.extend(
            f"Q{i}: {question}"
            for i, (question, _) in enumerate(all_questions_answers, 1)
        )
        # The leading newline leaves a blank line between questions and context.
        prompt_lines.append("\nfrom this context:")
        prompt_lines.append("\n\n".join(contexts))
        self.current_prompt = "\n".join(prompt_lines)

    def chunk_current_prompt(self, max_words_per_chunk=800):
        """Split the context of the current prompt into word chunks.

        Each chunk is combined with the full instruction/question block and
        the resulting prompts are stored in ``self.chunked_prompts``.

        Args:
            max_words_per_chunk: Maximum number of whitespace-separated words
                per context chunk.

        Returns:
            The list of chunked prompts (empty when the context has no words).

        Raises:
            ValueError: If no prompt has been generated, if
                ``max_words_per_chunk`` is not positive, or if the prompt does
                not contain the ``from this context:`` separator.
        """
        if getattr(self, "current_prompt", None) is None:
            raise ValueError("current_prompt not set. Please run generate_Dataset_prompt first.")
        if max_words_per_chunk < 1:
            raise ValueError("max_words_per_chunk must be a positive integer.")

        # Split on the FIRST separator only, so a context that happens to
        # contain the same line is not rejected as malformed.
        question_block, separator, context_block = self.current_prompt.partition(
            self._CONTEXT_SEPARATOR
        )
        if not separator:
            raise ValueError("Prompt is not formatted correctly.")

        # The blank line before the separator leaves a trailing newline on the
        # question block; drop it so chunk prompts match the original layout.
        question_block = question_block.rstrip("\n")

        context_words = context_block.split()
        self.chunked_prompts = []
        for start in range(0, len(context_words), max_words_per_chunk):
            chunk = " ".join(context_words[start:start + max_words_per_chunk])
            self.chunked_prompts.append(
                f"{question_block}\n{self._CONTEXT_SEPARATOR}{chunk}"
            )
        return self.chunked_prompts

    def print_current_prompt(self):
        """Print the current prompt followed by its character and word counts."""
        if getattr(self, "current_prompt", None) is None:
            # Nothing to measure yet; avoid calling len() on None.
            print("No prompt found. Please run generate_Dataset_prompt() first.")
            return
        print("=======================================================================PRINTING PROMPT======================================================================")
        print("\n🧠 Current Prompt:")
        print(f"{self.current_prompt}\n")
        char_length = len(self.current_prompt)
        # Whitespace-separated words are used as a rough token estimate.
        token_count = len(self.current_prompt.split())
        print("=======================================================================PROMPT COUNTS======================================================================")
        print(f"Length: {char_length} characters, approximately {token_count} tokens")
        print("========================================================================END PRINTING======================================================================")

    def print_chunks(self):
        """Print every prompt in ``self.chunked_prompts`` with its size."""
        if not getattr(self, "chunked_prompts", None):
            print("No chunked prompts found. Please run chunk_current_prompt() first.")
            return
        print("=======================================================================PRINTING CHUNKED PROMPTS======================================================================")
        for i, chunk in enumerate(self.chunked_prompts, 1):
            print(f"\n🧠 Prompt Chunk {i}:")
            print(chunk)
            char_length = len(chunk)
            token_count = len(chunk.split())
            print("-----------------------------------------")
            print(f"Length: {char_length} characters, approximately {token_count} tokens")
            print("============================================================================================================================================================")
        print("=======================================================================END OF CHUNKS======================================================================")



In [None]:
# Playground: build a prompt from the first 5 contexts, print it, split its
# context into 400-word chunks and print the resulting chunked prompts.
# print(help(list))
testing = Agent2(context_map)
testing.generate_Dataset_prompt(k=5)
testing.print_current_prompt()
testing.chunk_current_prompt(max_words_per_chunk=400)
testing.print_chunks()
# Okay, so the chunking works fine :)


  self.llm = HuggingFaceHub(



🧠 Current Prompt:
answer these questions:
Answer as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
Q1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Q2: What is in front of the Notre Dame Main Building?
Q3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Q4: What is the Grotto at Notre Dame?
Q5: When did the Scholastic Magazine of Notre dame begin publishing?
Q6: How often is Notre Dame's the Juggler published?
Q7: What is the daily student paper at Notre Dame called?
Q8: How many student news papers are found at Notre Dame?
Q9: Where is the headquarters of the Congregation of the Holy Cross?
Q10: What is the primary seminary of the Congregation of the Holy Cross?
Q11: What is the oldest structure at Notre Dame?
Q12: What individuals live at Fatima House at Notre Dame?
Q13: How many BS level degrees are offered in the College of Engineering at Notre 

### Next Steps:

Okay, now that we have written the code to chunk the prompt into the desired length, we move on to actually performing some experiments.

In the next code block, we will complete the class.

In [None]:
class Agent2:
    """Prototype agent that chunks a multi-question prompt and queries the LLM per chunk.

    A prompt is assembled from the first contexts in ``context_map`` together
    with their questions. The context part is cut into fixed-size word chunks,
    each chunk is re-attached to the full instruction/question block, and the
    LLM is called once per chunked prompt.
    """

    # Instruction lines placed at the top of every prompt (modified prompt).
    _INSTRUCTION_LINES = (
        "Answer these questions as precisely as you can, in as minimum words as you can",
        "Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer",
        "If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.",
    )
    # Exact text that separates the question block from the context.
    _CONTEXT_SEPARATOR = "\nfrom this context:\n"

    # same context overload notebook
    def __init__(self, context_map, model_name="mistralai/Mistral-7B-Instruct-v0.3", access_token="TOKEN-HERE"):
        """Store the dataset mapping and create the Hugging Face Hub LLM client.

        Args:
            context_map: Mapping of context text to a list of
                ``(question, answer)`` tuples.
            model_name: Hugging Face repository id of the model to query.
            access_token: Hugging Face API token. The default is only a
                placeholder string.
        """
        self.context_map = context_map
        self.llm = HuggingFaceHub(
            repo_id=model_name,
            model_kwargs={"temperature": 0.3, "max_length": 200},
            huggingfacehub_api_token=access_token
        )
        # NEW: ask user to input context and questions separately; chunks
        # will only be made of context.
        self.current_prompt = None
        self.chunked_prompts = []
        # Filled by run_all_chunks(); initialised here so the attribute
        # always exists.
        self.responses = []

    def generate_response(self, prompt):
        """Send ``prompt`` to the LLM and return its response.

        Can be called once per chunk to collect the answers chunk by chunk.
        """
        # ``invoke`` replaces the deprecated direct call ``self.llm(prompt)``.
        return self.llm.invoke(prompt)

    def get_context_questions(self, k=2, q=4):
        """Return the first ``k`` contexts and the first ``q`` QA pairs of each.

        Returns:
            A tuple ``(contexts, questions_answers)``: a list of context
            strings and a parallel list holding, per context, its selected
            ``(question, answer)`` tuples.
        """
        # same as context overload notebook
        selected = list(self.context_map.items())[:k]
        contexts = [context for context, _ in selected]
        questions_answers = [qa_pairs[:q] for _, qa_pairs in selected]
        return contexts, questions_answers

    def generate_Dataset_prompt(self, k=2, q=4):
        """Build a prompt from the dataset and store it in ``self.current_prompt``.

        This imitates a user prompt; later on, questions and context are meant
        to be taken as input from the user. The prompt consists of the
        instruction lines, the numbered questions (``Q1:``, ``Q2:``, ...), a
        ``from this context:`` line and all selected contexts joined by blank
        lines.

        Args:
            k: Number of contexts to include.
            q: Maximum number of questions taken from each context.
        """
        contexts, questions_answers_by_context = self.get_context_questions(k, q)
        all_questions_answers = [
            pair for qa_pairs in questions_answers_by_context for pair in qa_pairs
        ]

        prompt_lines = list(self._INSTRUCTION_LINES)
        prompt_lines.extend(
            f"Q{i}: {question}"
            for i, (question, _) in enumerate(all_questions_answers, 1)
        )
        # The leading newline leaves a blank line between questions and context.
        prompt_lines.append("\nfrom this context:")
        prompt_lines.append("\n\n".join(contexts))
        self.current_prompt = "\n".join(prompt_lines)

    def chunk_current_prompt(self, max_words_per_chunk=800):
        """Split the context of the current prompt into word chunks.

        Each chunk is combined with the full instruction/question block and
        the resulting prompts are stored in ``self.chunked_prompts``.

        Args:
            max_words_per_chunk: Maximum number of whitespace-separated words
                per context chunk.

        Returns:
            The list of chunked prompts (empty when the context has no words).

        Raises:
            ValueError: If no prompt has been generated, if
                ``max_words_per_chunk`` is not positive, or if the prompt does
                not contain the ``from this context:`` separator.
        """
        if getattr(self, "current_prompt", None) is None:
            raise ValueError("current_prompt not set. Please run generate_Dataset_prompt first.")
        if max_words_per_chunk < 1:
            raise ValueError("max_words_per_chunk must be a positive integer.")

        # Split on the FIRST separator only, so a context that happens to
        # contain the same line is not rejected as malformed.
        question_block, separator, context_block = self.current_prompt.partition(
            self._CONTEXT_SEPARATOR
        )
        if not separator:
            raise ValueError("Prompt is not formatted correctly.")

        # The blank line before the separator leaves a trailing newline on the
        # question block; drop it so chunk prompts match the original layout.
        question_block = question_block.rstrip("\n")

        context_words = context_block.split()
        self.chunked_prompts = []
        for start in range(0, len(context_words), max_words_per_chunk):
            chunk = " ".join(context_words[start:start + max_words_per_chunk])
            self.chunked_prompts.append(
                f"{question_block}\n{self._CONTEXT_SEPARATOR}{chunk}"
            )
        return self.chunked_prompts

    def run_all_chunks(self, verbose=True):
        """Call the LLM once per chunked prompt and collect the responses.

        Args:
            verbose (bool): If True, prints each response.

        Returns:
            List of responses from the model, one per chunked prompt; the
            same list is stored in ``self.responses``.

        Raises:
            ValueError: If there are no chunked prompts to run.
        """
        if not self.chunked_prompts:
            raise ValueError("chunked_prompts not set. Please run chunk_current_prompt first.")

        self.responses = []
        for idx, prompt in enumerate(self.chunked_prompts, 1):
            response = self.generate_response(prompt)
            self.responses.append(response)
            if verbose:
                print(f"\n--- Response for Chunk {idx} ---\n")
                print(response)
                print("\n-------------------------------\n")
        return self.responses

    def print_current_prompt(self):
        """Print the current prompt followed by its character and word counts."""
        if getattr(self, "current_prompt", None) is None:
            # Nothing to measure yet; avoid calling len() on None.
            print("No prompt found. Please run generate_Dataset_prompt() first.")
            return
        print("=======================================================================PRINTING PROMPT======================================================================")
        print("\n🧠 Current Prompt:")
        print(f"{self.current_prompt}\n")
        char_length = len(self.current_prompt)
        # Whitespace-separated words are used as a rough token estimate.
        token_count = len(self.current_prompt.split())
        print("=======================================================================PROMPT COUNTS======================================================================")
        print(f"Length: {char_length} characters, approximately {token_count} tokens")
        print("========================================================================END PRINTING======================================================================")

    def print_chunks(self):
        """Print every prompt in ``self.chunked_prompts`` with its size."""
        if not getattr(self, "chunked_prompts", None):
            print("No chunked prompts found. Please run chunk_current_prompt() first.")
            return
        print("=======================================================================PRINTING CHUNKED PROMPTS======================================================================")
        for i, chunk in enumerate(self.chunked_prompts, 1):
            print(f"\n🧠 Prompt Chunk {i}:")
            print(chunk)
            char_length = len(chunk)
            token_count = len(chunk.split())
            print("-----------------------------------------")
            print(f"Length: {char_length} characters, approximately {token_count} tokens")
            print("============================================================================================================================================================")
        print("=======================================================================END OF CHUNKS======================================================================")



In [None]:
#Playground:
# agent = Agent2(context_map)
# agent.generate_Dataset_prompt(k=5)
# agent.chunk_current_prompt(max_words_per_chunk=200)
# agent.run_all_chunks()

# Conclusion: this approach doesn't work with this specific LLM, as it is probably instruction-tuned in a way that keeps it from answering from the chunk.




--- Response for Chunk 1 ---

Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Q2: What is in front of the Notre Dame Main Building?
Q3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Q4: What is the Grotto at Notre Dame?
Q5: When did the Scholastic Magazine of Notre dame begin publishing?
Q6: How often is Notre Dame's the Juggler published?
Q7: What is the daily student paper at Notre Dame called?
Q8: How many student news papers are found at Notre Dame?
Q9: Where is the headquarters of the Congregation of the Holy Cross?
Q10: What is the primary seminary of the Congregation of the Holy Cross?
Q11: What is the oldest structure at Notre Dame?
Q12: Wha




--- Response for Chunk 2 ---

Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Q2: What is in front of the Notre Dame Main Building?
Q3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Q4: What is the Grotto at Notre Dame?
Q5: When did the Scholastic Magazine of Notre dame begin publishing?
Q6: How often is Notre Dame's the Juggler published?
Q7: What is the daily student paper at Notre Dame called?
Q8: How many student news papers are found at Notre Dame?
Q9: Where is the headquarters of the Congregation of the Holy Cross?
Q10: What is the primary seminary of the Congregation of the Holy Cross?
Q11: What is the oldest structure at Notre Dame?
Q12: Wha



KeyboardInterrupt: 

## Approach 2:

Initially, I was thinking of post-processing the answers from the LLM to gauge whether they actually come from the context or not, but this approach won't work, because an LLM will answer something from whatever little context is provided to it.

For example, both of these answers to question 20:

A20: The First Year of Studies program at Notre Dame was recognized as outstanding by U.S. News & World Report.

A20: The First Year of Studies program at Notre Dame was declared "outstanding" by the American Council on Education.

are correct, given their own contexts.

So, for the updated approach, instead of post-processing we will try pre-processing: based on keywords, we assign each question to its most relevant chunk, and only then call the LLM. This requires restructuring some of the code as well.

This simple keyword-matching approach seems to work surprisingly well. We could take it further and use TF-IDF or cosine similarity between embeddings, but we leave that for future work.

In [None]:
import re
from collections import defaultdict
from typing import List, Tuple


class Agent2:
    """Prototype agent that routes each question to its best-matching context chunk.

    A prompt is assembled from the first contexts in ``context_map`` together
    with their questions. The context is cut into fixed-size word chunks,
    every question is assigned to the chunk that contains the most of its
    keywords, and one prompt per chunk (holding only its assigned questions)
    is built and sent to the LLM.
    """

    # Instruction lines placed at the top of every prompt.
    _INSTRUCTION_LINES = (
        "Answer these questions as precisely as you can, in as minimum words as you can",
        "Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer",
        "If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.",
    )
    # Exact text that separates the question block from the context.
    _CONTEXT_SEPARATOR = "\nfrom this context:\n"
    # Common words ignored when extracting keywords from a question.
    _STOPWORDS = frozenset({
        "the", "is", "at", "which", "on", "in", "a", "an", "of", "to", "for", "with",
        "and", "by", "from", "what", "who", "when", "where", "how", "does", "do",
        "did", "was", "were", "has", "have", "had",
    })
    # Matches every character that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    # same context overload notebook
    def __init__(self, context_map, model_name="mistralai/Mistral-7B-Instruct-v0.3", access_token="TOKEN-HERE"):
        """Store the dataset mapping and create the Hugging Face Hub LLM client.

        Args:
            context_map: Mapping of context text to a list of
                ``(question, answer)`` tuples.
            model_name: Hugging Face repository id of the model to query.
            access_token: Hugging Face API token. The default is only a
                placeholder string.
        """
        self.context_map = context_map
        self.llm = HuggingFaceHub(
            repo_id=model_name,
            model_kwargs={"temperature": 0.3, "max_length": 200},
            huggingfacehub_api_token=access_token
        )
        # NEW: ask user to input context and questions separately; chunks
        # will only be made of context.
        self.current_prompt = None
        self.chunked_prompts = []
        self.chunk_responses = []
        # Questions, context chunks and gold answers of the current prompt.
        self.current_context_chunks = []
        self.current_questions = []
        self.current_gold_answers = []
        # Chunk index -> questions assigned to that chunk.
        self.chunk_assignments = defaultdict(list)
        # Per-chunk prompts built by construct_individual_prompts_from_assignments().
        self.individual_prompts = []

    def generate_response(self, prompt):
        """Send ``prompt`` to the LLM and return its response.

        Can be called once per chunk to collect the answers chunk by chunk.
        """
        # ``invoke`` replaces the deprecated direct call ``self.llm(prompt)``.
        return self.llm.invoke(prompt)

    def get_context_questions(self, k=2, q=4):
        """Return the first ``k`` contexts and the first ``q`` QA pairs of each.

        Returns:
            A tuple ``(contexts, questions_answers)``: a list of context
            strings and a parallel list holding, per context, its selected
            ``(question, answer)`` tuples.
        """
        # same as context overload notebook
        selected = list(self.context_map.items())[:k]
        contexts = [context for context, _ in selected]
        questions_answers = [qa_pairs[:q] for _, qa_pairs in selected]
        return contexts, questions_answers

    def generate_Dataset_prompt(self, k=2, q=4):
        """Build a prompt from the dataset and store it in ``self.current_prompt``.

        This imitates a user prompt; later on, questions and context are meant
        to be taken as input from the user. Besides the prompt itself, the
        selected questions and their gold answers are stored in
        ``self.current_questions`` and ``self.current_gold_answers``.

        Args:
            k: Number of contexts to include.
            q: Maximum number of questions taken from each context.
        """
        contexts, questions_answers_by_context = self.get_context_questions(k, q)
        all_questions_answers = [
            pair for qa_pairs in questions_answers_by_context for pair in qa_pairs
        ]
        self.current_questions = [question for question, _ in all_questions_answers]
        self.current_gold_answers = [answer for _, answer in all_questions_answers]

        prompt_lines = list(self._INSTRUCTION_LINES)
        prompt_lines.extend(
            f"Q{i}: {question}" for i, question in enumerate(self.current_questions, 1)
        )
        # The leading newline leaves a blank line between questions and context.
        prompt_lines.append("\nfrom this context:")
        prompt_lines.append("\n\n".join(contexts))
        self.current_prompt = "\n".join(prompt_lines)

    def extract_keywords(self, questions: List[str]) -> List[List[str]]:
        """Return, for each question, its lowercase keywords in order.

        Punctuation is removed, then stopwords and words of two characters or
        fewer are dropped. Repeated words are kept.
        """
        all_keywords = []
        for question in questions:
            words = self._PUNCTUATION.sub('', question).lower().split()
            all_keywords.append(
                [word for word in words if word not in self._STOPWORDS and len(word) > 2]
            )
        return all_keywords

    def map_questions_to_chunks(self) -> List[Tuple[str, List[str]]]:
        """Assign each question to the chunk that contains the most of its keywords.

        A keyword counts as matched when it occurs anywhere in the lowercased
        chunk text (substring match). Ties go to the earliest chunk, and a
        question whose keywords match no chunk is left unassigned. The
        assignments are stored in ``self.chunk_assignments`` and printed.

        Returns:
            A list of ``(context_chunk, assigned_questions)`` pairs, ordered by
            when each chunk first received a question.

        Raises:
            ValueError: If there are no context chunks or no questions.
        """
        if not self.current_context_chunks:
            raise ValueError("No context chunks available. Please run chunk_current_prompt first.")
        if not self.current_questions:
            raise ValueError("No questions available. Please populate current_questions first.")

        keywords_per_question = self.extract_keywords(self.current_questions)
        # One keyword list per question, in question order.
        print("keywords per question: ", keywords_per_question)
        self.chunk_assignments = defaultdict(list)

        # Lowercase the chunks once instead of once per question.
        lowered_chunks = [chunk.lower() for chunk in self.current_context_chunks]

        for question, keywords in zip(self.current_questions, keywords_per_question):
            match_counts = [
                sum(1 for keyword in keywords if keyword in chunk)
                for chunk in lowered_chunks
            ]
            best_match_count = max(match_counts)
            if best_match_count > 0:
                # list.index returns the first maximum, i.e. the earliest chunk.
                best_chunk_idx = match_counts.index(best_match_count)
                self.chunk_assignments[best_chunk_idx].append(question)

        print("chunk assignments: ", self.chunk_assignments)
        return [
            (self.current_context_chunks[idx], questions)
            for idx, questions in self.chunk_assignments.items()
        ]

    def chunk_current_prompt(self, max_words_per_chunk=800):
        """Split the context of the current prompt into word chunks.

        The bare chunks are stored in ``self.current_context_chunks``; each
        chunk combined with the full instruction/question block is stored in
        ``self.chunked_prompts``.

        Args:
            max_words_per_chunk: Maximum number of whitespace-separated words
                per context chunk.

        Returns:
            The list of chunked prompts (empty when the context has no words).

        Raises:
            ValueError: If no prompt has been generated, if
                ``max_words_per_chunk`` is not positive, or if the prompt does
                not contain the ``from this context:`` separator.
        """
        if getattr(self, "current_prompt", None) is None:
            raise ValueError("current_prompt not set. Please run generate_Dataset_prompt first.")
        if max_words_per_chunk < 1:
            raise ValueError("max_words_per_chunk must be a positive integer.")

        # Split on the FIRST separator only, so a context that happens to
        # contain the same line is not rejected as malformed.
        question_block, separator, context_block = self.current_prompt.partition(
            self._CONTEXT_SEPARATOR
        )
        if not separator:
            raise ValueError("Prompt is not formatted correctly.")

        # The blank line before the separator leaves a trailing newline on the
        # question block; drop it so chunk prompts match the original layout.
        question_block = question_block.rstrip("\n")

        context_words = context_block.split()
        self.current_context_chunks = [
            " ".join(context_words[start:start + max_words_per_chunk])
            for start in range(0, len(context_words), max_words_per_chunk)
        ]
        self.chunked_prompts = [
            f"{question_block}\n{self._CONTEXT_SEPARATOR}{chunk}"
            for chunk in self.current_context_chunks
        ]
        return self.chunked_prompts

    def construct_individual_prompts_from_assignments(self):
        """Build one prompt per assigned chunk containing only its questions.

        Questions are renumbered ``Q1``, ``Q2``, ... within each prompt. The
        prompts are stored in ``self.individual_prompts`` and also replace
        ``self.chunked_prompts`` so that ``run_all_chunks`` uses them.

        Returns:
            The list of prompt strings, one per chunk with assigned questions.

        Raises:
            ValueError: If there are no chunk assignments or no context chunks.
        """
        if not getattr(self, "chunk_assignments", None):
            raise ValueError("chunk_assignments not set. Please run map_questions_to_chunks first.")
        if not getattr(self, "current_context_chunks", None):
            raise ValueError("No context chunks available.")

        self.individual_prompts = []
        for idx, questions in self.chunk_assignments.items():
            chunk = self.current_context_chunks[idx]
            prompt_lines = list(self._INSTRUCTION_LINES)
            prompt_lines.extend(f"Q{i}: {q}" for i, q in enumerate(questions, 1))
            header = "\n".join(prompt_lines)
            self.individual_prompts.append(f"{header}\n{self._CONTEXT_SEPARATOR}{chunk}")

        # Copy so later edits to one list do not silently change the other.
        self.chunked_prompts = list(self.individual_prompts)
        return self.individual_prompts

    def run_all_chunks(self, verbose=True):
        """Call the LLM once per chunked prompt and collect the responses.

        Args:
            verbose (bool): If True, prints each response.

        Returns:
            List of responses from the model, one per chunked prompt; the
            same list is stored in ``self.chunk_responses``.

        Raises:
            ValueError: If there are no chunked prompts to run.
        """
        if not self.chunked_prompts:
            raise ValueError("chunked_prompts not set. Please run chunk_current_prompt first.")

        self.chunk_responses = []
        for idx, prompt in enumerate(self.chunked_prompts, 1):
            response = self.generate_response(prompt)
            self.chunk_responses.append(response)
            if verbose:
                print(f"\n--- Response for Chunk {idx} ---\n")
                print(response)
                print("\n-------------------------------\n")
        return self.chunk_responses

    def print_current_prompt(self):
        """Print the current prompt followed by its character and word counts."""
        if getattr(self, "current_prompt", None) is None:
            # Nothing to measure yet; avoid calling len() on None.
            print("No prompt found. Please run generate_Dataset_prompt() first.")
            return
        print("=======================================================================PRINTING PROMPT======================================================================")
        print("\n🧠 Current Prompt:")
        print(f"{self.current_prompt}\n")
        char_length = len(self.current_prompt)
        # Whitespace-separated words are used as a rough token estimate.
        token_count = len(self.current_prompt.split())
        print("=======================================================================PROMPT COUNTS======================================================================")
        print(f"Length: {char_length} characters, approximately {token_count} tokens")
        print("========================================================================END PRINTING======================================================================")

    def print_chunks(self):
        """Print every prompt in ``self.chunked_prompts`` with its size."""
        if not getattr(self, "chunked_prompts", None):
            print("No chunked prompts found. Please run chunk_current_prompt() first.")
            return
        print("=======================================================================PRINTING CHUNKED PROMPTS======================================================================")
        for i, chunk in enumerate(self.chunked_prompts, 1):
            print(f"\n🧠 Prompt Chunk {i}:")
            print(chunk)
            char_length = len(chunk)
            token_count = len(chunk.split())
            print("-----------------------------------------")
            print(f"Length: {char_length} characters, approximately {token_count} tokens")
            print("============================================================================================================================================================")
        print("=======================================================================END OF CHUNKS======================================================================")




In [None]:
# Playground: exercise the chunk-assignment pipeline end to end without calling the LLM.
# NOTE(review): this cell uses the Agent2 class defined in an earlier cell; confirm that
# map_questions_to_chunks() and construct_individual_prompts_from_assignments() return
# values there, otherwise `assignments` / `individual_prompts` are None.
agent = Agent2(context_map)
agent.generate_Dataset_prompt(k=5)
agent.print_current_prompt()
agent.chunk_current_prompt(max_words_per_chunk=400)
assignments = agent.map_questions_to_chunks()
# Optional inspection of the assignments (disabled):
# for chunk, assigned_questions in assignments:
#     print("Context Preview:", chunk)
#     print("Questions:")
#     for q in assigned_questions:
#         print("-", q)
#     print("\n")

individual_prompts = agent.construct_individual_prompts_from_assignments()

# Show each per-chunk prompt; these are the strings that would be sent to the LLM.
for prompt in individual_prompts:
    print("=== Prompt ===")
    print(prompt)
    print("==============\n")

# LLM call left disabled in this playground cell.
# agent.run_all_chunks()



🧠 Current Prompt:
Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Q2: What is in front of the Notre Dame Main Building?
Q3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Q4: What is the Grotto at Notre Dame?
Q5: When did the Scholastic Magazine of Notre dame begin publishing?
Q6: How often is Notre Dame's the Juggler published?
Q7: What is the daily student paper at Notre Dame called?
Q8: How many student news papers are found at Notre Dame?
Q9: Where is the headquarters of the Congregation of the Holy Cross?
Q10: What is the primary seminary of the Congregation of the Holy Cross?
Q11: What is the oldest structure at Notre Dame?
Q12: What individual

  return self.llm(prompt)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



--- Response for Chunk 1 ---

Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Q2: What is in front of the Notre Dame Main Building?
Q3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Q4: What is the Grotto at Notre Dame?
Q5: When did the Scholastic Magazine of Notre dame begin publishing?
Q6: How often is Notre Dame's the Juggler published?
Q7: What is the daily student paper at Notre Dame called?
Q8: How many student news papers are found at Notre Dame?
Q9: Where is the headquarters of the Congregation of the Holy Cross?
Q10: What is the primary seminary of the Congregation of the Holy Cross?
Q11: What is the oldest structure at Notre Dame?

from th




--- Response for Chunk 2 ---

Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: What individuals live at Fatima House at Notre Dame?
Q2: How many BS level degrees are offered in the College of Engineering at Notre Dame?
Q3: In what year was the College of Engineering at Notre Dame formed?
Q4: Before the creation of the College of Engineering similar studies were carried out at which Notre Dame college?
Q5: How many departments are within the Stinson-Remick Hall of Engineering?
Q6: What entity provides help with the management of time for new students at Notre Dame?
Q7: How many colleges for undergraduates are at Notre Dame?
Q8: What was created at Notre Dame in 1962 to assist first year students?
Q9: Which organization declared the First Year of Studie

['Answer these questions as precisely as you can, in as minimum words as you can\nFormat your answers with \'A1:\', \'A2:\', etc. at the beginning of each answer\nIf you cannot find enough information in the provided context to answer a question, respond with \'Not specified\'. Do NOT guess.\nQ1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?\nQ2: What is in front of the Notre Dame Main Building?\nQ3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?\nQ4: What is the Grotto at Notre Dame?\nQ5: When did the Scholastic Magazine of Notre dame begin publishing?\nQ6: How often is Notre Dame\'s the Juggler published?\nQ7: What is the daily student paper at Notre Dame called?\nQ8: How many student news papers are found at Notre Dame?\nQ9: Where is the headquarters of the Congregation of the Holy Cross?\nQ10: What is the primary seminary of the Congregation of the Holy Cross?\nQ11: What is the oldest structure at Notre Dame?\n\nfrom this cont

### Refining class

Now that assigning questions to chunks appears to work well, we will clean up the process and its output.

In [None]:
import re
from collections import defaultdict
from typing import List, Tuple


class Agent2:
    """Answer many questions over a long context by routing each question to one context chunk.

    Intended call order:
    generate_Dataset_prompt -> chunk_current_prompt -> map_questions_to_chunks ->
    construct_individual_prompts_from_assignments -> run_all_chunks ->
    extract_questions_and_answers.
    """

    # Common words that carry no signal when matching a question to a chunk.
    _STOPWORDS = frozenset([
        "the", "is", "at", "which", "on", "in", "a", "an", "of", "to", "for", "with",
        "and", "by", "from", "what", "who", "when", "where", "how", "does", "do",
        "did", "was", "were", "has", "have", "had",
    ])

    # Instructions placed ahead of the numbered questions in every prompt.
    _INSTRUCTION_LINES = (
        "Answer these questions as precisely as you can, in as minimum words as you can",
        "Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer",
        "If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.",
    )

    # Separator between the question block and the context inside a prompt.
    _CONTEXT_MARKER = "\nfrom this context:\n"

    # One answer: an 'A<n>:' label at the start of a line, then everything up to the
    # next such label or the end of the text. Group 1 is <n>, group 2 the answer text.
    _ANSWER_PATTERN = re.compile(
        r'^[ \t]*A(\d+):\s*(.*?)(?=^[ \t]*A\d+:|\Z)',
        re.MULTILINE | re.DOTALL,
    )

    def __init__(self, context_map, model_name="mistralai/Mistral-7B-Instruct-v0.3",
                 access_token="TOKEN-HERE", chunk_responses=None):
        """Create the agent and its HuggingFaceHub LLM client.

        Args:
            context_map: mapping of context text -> list of (question, answer) pairs.
            model_name: Hugging Face repo id of the model to query.
            access_token: Hugging Face API token (the default is only a placeholder).
            chunk_responses: optional list of previously obtained raw LLM responses,
                one per prompt in ascending chunk-index order. Lets
                extract_questions_and_answers() be exercised without calling the LLM.
                Defaults to no responses.
        """
        self.context_map = context_map
        self.llm = HuggingFaceHub(
            repo_id=model_name,
            model_kwargs={"temperature": 0.3, "max_length": 200},
            huggingfacehub_api_token=access_token
        )
        self.current_prompt = None
        self.chunked_prompts = []
        self.individual_prompts = []
        # Start empty instead of shipping responses from an old run: stale responses
        # would silently be parsed against whatever questions are currently assigned.
        self.chunk_responses = list(chunk_responses) if chunk_responses is not None else []
        # Context chunks, questions and gold answers of the current prompt.
        self.current_context_chunks = []
        self.current_questions = []
        self.current_gold_answers = []
        # chunk index -> questions routed to that chunk.
        self.chunk_assignments = defaultdict(list)
        # question text -> answer parsed from the LLM responses.
        self.question_answer_map = {}

    def generate_response(self, prompt):
        """Send one prompt to the LLM and return its raw response."""
        # Prefer `invoke` when the client offers it (the notebook output shows a
        # warning on the direct call); fall back to calling the client directly.
        invoke = getattr(self.llm, "invoke", None)
        if callable(invoke):
            return invoke(prompt)
        return self.llm(prompt)

    def get_context_questions(self, k=2, q=4):
        """Return the first `k` contexts and, for each, its first `q` (question, answer) pairs.

        Returns:
            (contexts, questions_answers): two parallel lists; questions_answers[i]
            holds the selected pairs for contexts[i].
        """
        contexts = []
        questions_answers = []
        for context, qa_pairs in list(self.context_map.items())[:k]:
            contexts.append(context)
            questions_answers.append(qa_pairs[:q])
        return contexts, questions_answers

    def generate_Dataset_prompt(self, k=2, q=4):
        """Build a prompt from the dataset and store it in self.current_prompt.

        Uses the first `k` contexts with up to `q` questions each. Also fills
        self.current_questions and self.current_gold_answers, in prompt order.
        """
        contexts, questions_answers_by_context = self.get_context_questions(k, q)
        combined_context = "\n\n".join(contexts)

        self.current_questions = []
        self.current_gold_answers = []
        prompt_lines = list(self._INSTRUCTION_LINES)
        number = 0
        for qa_pairs in questions_answers_by_context:
            for question, answer in qa_pairs:
                number += 1
                self.current_questions.append(question)
                self.current_gold_answers.append(answer)
                prompt_lines.append(f"Q{number}: {question}")
        prompt_lines.append("\nfrom this context:")
        prompt_lines.append(combined_context)
        self.current_prompt = "\n".join(prompt_lines)

    def extract_keywords(self, questions: List[str]) -> List[List[str]]:
        """Return, for each question, its lowercase keywords.

        Punctuation is removed, then stopwords and words of one or two characters
        are dropped.
        """
        all_keywords = []
        for question in questions:
            words = re.sub(r'[^\w\s]', '', question).lower().split()
            all_keywords.append(
                [word for word in words if word not in self._STOPWORDS and len(word) > 2]
            )
        return all_keywords

    def map_questions_to_chunks(self) -> List[Tuple[str, List[str]]]:
        """Assign each question to the chunk containing most of its keywords.

        A keyword counts when it occurs as a substring of the lowercased chunk; ties
        go to the earliest chunk. Questions with no keyword hit in any chunk are left
        unassigned. Results are stored in self.chunk_assignments.

        Returns:
            (context_chunk, assigned_questions) pairs in ascending chunk-index order,
            for chunks that received at least one question.

        Raises:
            ValueError: if there are no context chunks or no questions yet.
        """
        if not self.current_context_chunks:
            raise ValueError("No context chunks available. Please run chunk_current_prompt first.")
        if not self.current_questions:
            raise ValueError("No questions available. Please populate current_questions first.")

        keywords_per_question = self.extract_keywords(self.current_questions)
        lowered_chunks = [chunk.lower() for chunk in self.current_context_chunks]
        self.chunk_assignments = defaultdict(list)
        for question, keywords in zip(self.current_questions, keywords_per_question):
            best_chunk_idx = None
            best_match_count = 0
            for idx, chunk in enumerate(lowered_chunks):
                match_count = sum(1 for keyword in keywords if keyword in chunk)
                if match_count > best_match_count:
                    best_match_count = match_count
                    best_chunk_idx = idx
            if best_chunk_idx is not None:
                self.chunk_assignments[best_chunk_idx].append(question)

        return [
            (self.current_context_chunks[idx], list(self.chunk_assignments[idx]))
            for idx in sorted(self.chunk_assignments)
        ]

    def chunk_current_prompt(self, max_words_per_chunk=800):
        """Split the context of self.current_prompt into chunks of at most `max_words_per_chunk` words.

        Stores the chunks in self.current_context_chunks and, for each chunk, a prompt
        made of the original question block plus that chunk in self.chunked_prompts.

        Raises:
            ValueError: if no prompt is set, the prompt does not contain exactly one
                context marker, or `max_words_per_chunk` is not positive.
        """
        if getattr(self, 'current_prompt', None) is None:
            raise ValueError("current_prompt not set. Please run generate_Dataset_prompt first.")
        if max_words_per_chunk <= 0:
            raise ValueError("max_words_per_chunk must be a positive integer.")

        prompt_parts = self.current_prompt.split(self._CONTEXT_MARKER)
        if len(prompt_parts) != 2:
            raise ValueError("Prompt is not formatted correctly.")
        question_block, context_block = prompt_parts  # instructions + questions / context text

        context_words = context_block.split()
        self.current_context_chunks = [
            " ".join(context_words[i:i + max_words_per_chunk])
            for i in range(0, len(context_words), max_words_per_chunk)
        ]
        self.chunked_prompts = [
            f"{question_block}\n{self._CONTEXT_MARKER}{chunk}"
            for chunk in self.current_context_chunks
        ]

    def construct_individual_prompts_from_assignments(self):
        """Build one prompt per chunk that has questions assigned to it.

        Each prompt holds the instructions, that chunk's questions renumbered from
        Q1, and the chunk text. Prompts are produced in ascending chunk-index order,
        which is the order extract_questions_and_answers() relies on to pair
        responses with questions. The result is stored in self.individual_prompts
        and also replaces self.chunked_prompts.

        Returns:
            The list of prompt strings.

        Raises:
            ValueError: if no assignments or no context chunks exist yet.
        """
        if not getattr(self, 'chunk_assignments', None):
            raise ValueError("chunk_assignments not set. Please run map_questions_to_chunks first.")
        if not getattr(self, 'current_context_chunks', None):
            raise ValueError("No context chunks available.")

        header = "\n".join(self._INSTRUCTION_LINES)
        self.individual_prompts = []
        for idx in sorted(self.chunk_assignments):
            chunk = self.current_context_chunks[idx]
            question_block = "\n".join(
                f"Q{i}: {q}" for i, q in enumerate(self.chunk_assignments[idx], 1)
            )
            self.individual_prompts.append(
                f"{header}\n{question_block}\n{self._CONTEXT_MARKER}{chunk}"
            )
        self.chunked_prompts = self.individual_prompts
        return self.individual_prompts

    def run_all_chunks(self, verbose=True):
        """Call the LLM once per prompt in self.chunked_prompts.

        Args:
            verbose (bool): If True, prints each response.

        Returns:
            List of responses from the model, in prompt order (also stored in
            self.chunk_responses).

        Raises:
            ValueError: if there are no prompts to run.
        """
        if not self.chunked_prompts:
            raise ValueError("chunked_prompts not set. Please run chunk_current_prompt first.")
        self.chunk_responses = []
        for idx, prompt in enumerate(self.chunked_prompts, 1):
            response = self.generate_response(prompt)
            self.chunk_responses.append(response)
            if verbose:
                print(f"\n--- Response for Chunk {idx} ---\n")
                print(response)
                print("\n-------------------------------\n")
        return self.chunk_responses

    def extract_questions_and_answers(self, verbose=True):
        """Parse 'A<n>:' answers out of self.chunk_responses and pair them with questions.

        Response i belongs to the i-th chunk index (ascending) that has questions
        assigned. Within a response, answer 'A<n>' is paired with the n-th question
        of that chunk, so a skipped answer does not shift the later ones.

        Args:
            verbose (bool): If True, prints warnings about unusable or incomplete responses.

        Returns:
            Dict mapping question text to its stripped answer text (also stored in
            self.question_answer_map).
        """
        self.question_answer_map = {}
        chunk_order = sorted(self.chunk_assignments)

        for position, response in enumerate(self.chunk_responses):
            if position >= len(chunk_order):
                if verbose:
                    print(f"[Warning] No question assignment for response {position}, skipping")
                continue
            chunk_idx = chunk_order[position]
            questions = self.chunk_assignments[chunk_idx]

            # When the model echoes the prompt, drop everything before the first 'Q1:'
            # so the 'A1:' mentioned in the instructions cannot be taken for an answer.
            question_block_start = re.search(r'\bQ1:', response)
            if question_block_start:
                cleaned_response = response[question_block_start.start():]
            else:
                if verbose:
                    print(f"[Warning] No 'Q1:' found in chunk {chunk_idx}, parsing the whole response")
                cleaned_response = response

            answered = set()
            for number, answer in self._ANSWER_PATTERN.findall(cleaned_response):
                question_pos = int(number) - 1
                # Ignore labels outside the question range and repeated labels
                # (the first occurrence of each label wins).
                if not 0 <= question_pos < len(questions) or question_pos in answered:
                    continue
                answered.add(question_pos)
                self.question_answer_map[questions[question_pos]] = answer.strip()

            if verbose and len(answered) != len(questions):
                print(f"[Warning] Mismatch: {len(questions)} questions vs {len(answered)} answers in chunk {chunk_idx}")

        return self.question_answer_map

    def print_current_prompt(self):
        """Print the current prompt with its character and approximate token counts."""
        if getattr(self, 'current_prompt', None) is None:
            print("No prompt found. Please run generate_Dataset_prompt() first.")
            return
        print("=======================================================================PRINTING PROMPT======================================================================")
        print(f"\n🧠 Current Prompt:")
        print(f"{self.current_prompt}\n")
        char_length = len(self.current_prompt)
        token_count = len(self.current_prompt.split())  # whitespace-separated words
        print("=======================================================================PROMPT COUNTS======================================================================")
        print(f"Length: {char_length} characters, approximately {token_count} tokens")
        print("========================================================================END PRINTING======================================================================")

    def print_chunks(self):
        """Print each prompt stored in self.chunked_prompts with its size statistics."""
        if not getattr(self, 'chunked_prompts', None):
            print("No chunked prompts found. Please run chunk_current_prompt() first.")
            return
        print("=======================================================================PRINTING CHUNKED PROMPTS======================================================================")
        for i, chunk in enumerate(self.chunked_prompts, 1):
            print(f"\n🧠 Prompt Chunk {i}:")
            print(chunk)
            char_length = len(chunk)
            token_count = len(chunk.split())  # whitespace-separated words
            print("-----------------------------------------")
            print(f"Length: {char_length} characters, approximately {token_count} tokens")
            print("============================================================================================================================================================")
        print("=======================================================================END OF CHUNKS======================================================================")

    def print_parsed_qa_pairs(self):
        """Print every parsed question with its predicted answer."""
        if not getattr(self, 'question_answer_map', None):
            print("No question-answer pairs found. Please run extract_questions_and_answers() first.")
            return

        print("Parsed Question-Answer Pairs:\n")
        for idx, (question, answer) in enumerate(self.question_answer_map.items(), 1):
            print(f"{idx}. Q: {question}\n   A: {answer}\n")

    def print_qa_pair_comparison(self):
        """Print each parsed question with its predicted answer and its gold answer.

        The gold answer is looked up by question text, because the parsed map is
        ordered by chunk and may omit questions, so its order does not match
        self.current_gold_answers.
        """
        if not getattr(self, 'question_answer_map', None):
            print("No question-answer pairs found. Please run extract_questions_and_answers() first.")
            return

        if not getattr(self, 'current_gold_answers', None):
            print("No gold answers found. Please set self.current_gold_answers before comparing.")
            return

        gold_by_question = {}
        for question, gold in zip(self.current_questions, self.current_gold_answers):
            gold_by_question.setdefault(question, gold)  # first occurrence wins on duplicates

        print("Question | Predicted Answer | Gold Answer")
        print("-" * 80)

        for idx, (question, predicted) in enumerate(self.question_answer_map.items()):
            gold = gold_by_question.get(question, "N/A")

            print(f"Q{idx+1}: {question}")
            print(f"  🤖 Predicted: {predicted}")
            print(f"  ✅ Gold     : {gold}")
            print()

    def print_qa_pairs(self):
        """Print every current question with its gold answer."""
        if not getattr(self, 'current_gold_answers', None):
            print("No gold answers found. Please set self.current_gold_answers before comparing.")
            return

        gold_answers = self.current_gold_answers

        # Only gold answers are printed here, so the header names just those columns.
        print("Question | Gold Answer")
        print("-" * 80)

        for idx, question in enumerate(self.current_questions):
            gold = gold_answers[idx] if idx < len(gold_answers) else "N/A"

            print(f"Q{idx+1}: {question}")
            print(f"  ✅ Gold     : {gold}")
            print()







# K8Q5

In [None]:
# Driver: full pipeline on the first 8 contexts with up to 5 questions each.
agent = Agent2(context_map)
agent.generate_Dataset_prompt(k=8, q=5)
# agent.print_current_prompt()
# Split the combined context into chunks of at most 700 words.
agent.chunk_current_prompt(max_words_per_chunk=700)
# Route each question to the chunk sharing the most keywords, then build one prompt per chunk.
agent.map_questions_to_chunks()
agent.construct_individual_prompts_from_assignments()
# Query the LLM once per prompt, then parse the 'A<n>:' answers out of the responses.
agent.run_all_chunks()
agent.extract_questions_and_answers()
# agent.print_parsed_qa_pairs()
agent.print_qa_pair_comparison()


  self.llm = HuggingFaceHub(
  return self.llm(prompt)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



--- Response for Chunk 1 ---

Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Q2: What is in front of the Notre Dame Main Building?
Q3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Q4: What is the Grotto at Notre Dame?
Q5: What sits on top of the Main Building at Notre Dame?
Q6: When did the Scholastic Magazine of Notre dame begin publishing?
Q7: How often is Notre Dame's the Juggler published?
Q8: What is the daily student paper at Notre Dame called?
Q9: How many student news papers are found at Notre Dame?
Q10: In what year did the student paper Common Sense begin publication at Notre Dame?
Q11: Where is the headquarters of the Congregation of th




--- Response for Chunk 2 ---

Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: Which program at Notre Dame offers a Master of Education degree?
Q2: What institute at Notre Dame studies  the reasons for violent conflict?
Q3: What is the title of Notre Dame's Theodore Hesburgh?
Q4: In what year was the Joan B. Kroc Institute for International Peace Studies founded?
Q5: To whom was John B. Kroc married?
Q6: What company did Ray Kroc own?
Q7: How many stories tall is the main library at Notre Dame?
Q8: What is the name of the main library at Notre Dame?
Q9: In what year was the Theodore M. Hesburgh Library at Notre Dame finished?
Q10: Which artist created the mural on the Theodore M. Hesburgh Library?
Q11: What is a common name to reference the mural crea

### Experiments

The previous cells were about validating the code and our technique. Now we will run experiments as we did before, then compare, contrast, and document their results.

# K16Q5

In [None]:
#driver code:
# K16Q5 experiment: drive the chunked-QA pipeline end to end with k=16, q=5.
# The calls are order-dependent: each step reads state stored on `agent` by the one before it.
agent = Agent2(context_map)
agent.generate_Dataset_prompt(k=16, q=5)
# agent.print_current_prompt()
agent.chunk_current_prompt(max_words_per_chunk=700)
agent.map_questions_to_chunks()
agent.construct_individual_prompts_from_assignments()
agent.print_chunks()
# NOTE(review): the LLM call is disabled here, so the extraction step below
# parses whatever is already stored in agent.chunk_responses rather than fresh output.
# agent.run_all_chunks()
agent.extract_questions_and_answers()
# agent.print_parsed_qa_pairs()
agent.print_qa_pair_comparison()



🧠 Prompt Chunk 1:
Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Q2: What is in front of the Notre Dame Main Building?
Q3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Q4: What is the Grotto at Notre Dame?
Q5: What sits on top of the Main Building at Notre Dame?
Q6: When did the Scholastic Magazine of Notre dame begin publishing?
Q7: How often is Notre Dame's the Juggler published?
Q8: What is the daily student paper at Notre Dame called?
Q9: How many student news papers are found at Notre Dame?
Q10: In what year did the student paper Common Sense begin publication at Notre Dame?
Q11: Where is the headquarters of the Congregation of the Holy Cross



HfHubHTTPError: 402 Client Error: Payment Required for url: https://router.huggingface.co/hf-inference/models/mistralai/Mistral-7B-Instruct-v0.3 (Request ID: Root=1-6813114f-7effc2a3650cc66360af9d47;b40fb9c7-a353-4ea3-a555-5cb4f3256024)

You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.

In [None]:
# Same K16Q5 setup, but with no LLM call and no answer extraction:
# the prompts are built and then only print_qa_pairs() is invoked.
agent = Agent2(context_map)
agent.generate_Dataset_prompt(k=16, q=5)
# agent.print_current_prompt()
agent.chunk_current_prompt(max_words_per_chunk=700)
agent.map_questions_to_chunks()
agent.construct_individual_prompts_from_assignments()
agent.print_qa_pairs()

Question | Predicted Answer | Gold Answer
--------------------------------------------------------------------------------
Q1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
  ✅ Gold     : Saint Bernadette Soubirous

Q2: What is in front of the Notre Dame Main Building?
  ✅ Gold     : a copper statue of Christ

Q3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
  ✅ Gold     : the Main Building

Q4: What is the Grotto at Notre Dame?
  ✅ Gold     : a Marian place of prayer and reflection

Q5: What sits on top of the Main Building at Notre Dame?
  ✅ Gold     : a golden statue of the Virgin Mary

Q6: When did the Scholastic Magazine of Notre dame begin publishing?
  ✅ Gold     : September 1876

Q7: How often is Notre Dame's the Juggler published?
  ✅ Gold     : twice

Q8: What is the daily student paper at Notre Dame called?
  ✅ Gold     : The Observer

Q9: How many student news papers are found at Notre Dame?
  ✅ Gold     : three

Q10

### Fixing the parsing issue:



In [None]:
import re
from collections import defaultdict
from typing import List, Tuple


class Agent2:
    """Answer many questions over a long context by splitting it into chunks.

    Pipeline (each step stores its result on the instance for the next one):

    1. ``generate_Dataset_prompt``  - build one big prompt from ``context_map``.
    2. ``chunk_current_prompt``     - split the context into word-count chunks.
    3. ``map_questions_to_chunks``  - assign each question to its best chunk.
    4. ``construct_individual_prompts_from_assignments`` - one prompt per chunk.
    5. ``run_all_chunks``           - send every prompt to the LLM.
    6. ``extract_questions_and_answers`` - parse ``Qn:``/``An:`` lines.
    7. ``print_*``                  - report the results.
    """

    # Instruction header shared by every prompt this class builds.
    _INSTRUCTION_LINES = (
        "Answer these questions as precisely as you can, in as minimum words as you can",
        "Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer",
        "If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.",
    )

    # Separator between the question block and the context text inside a prompt.
    _CONTEXT_MARKER = "\nfrom this context:\n"

    # Words ignored when extracting keywords from a question.
    _STOPWORDS = frozenset([
        "the", "is", "at", "which", "on", "in", "a", "an", "of", "to", "for", "with",
        "and", "by", "from", "what", "who", "when", "where", "how", "does", "do",
        "did", "was", "were", "has", "have", "had",
    ])

    # Two previously captured model responses (prompt echo followed by answers).
    # NOTE(review): presumably kept so the parsing steps can be exercised without
    # calling the inference API; run_all_chunks() replaces them with fresh output.
    _CACHED_CHUNK_RESPONSES = (
        "\n".join(_INSTRUCTION_LINES) + "\n" + (
            'Q1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?\n'
            'Q2: What is in front of the Notre Dame Main Building?\n'
            'Q3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?\n'
            'Q4: What is the Grotto at Notre Dame?\n'
            'Q5: When did the Scholastic Magazine of Notre dame begin publishing?\n'
            'Q6: How often is Notre Dame\'s the Juggler published?\n'
            'Q7: What is the daily student paper at Notre Dame called?\n'
            'Q8: How many student news papers are found at Notre Dame?\n'
            'Q9: Where is the headquarters of the Congregation of the Holy Cross?\n'
            'Q10: What is the primary seminary of the Congregation of the Holy Cross?\n'
            '\n\nfrom this context:\n'
            'Architecturally, the school has a Catholic character. '
            'Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. '
            'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". '
            'Next to the Main Building is the Basilica of the Sacred Heart. '
            'Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. '
            'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. '
            'At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary. '
            'As at most other universities, Notre Dame\'s students run a number of news media outlets. '
            'The nine student-run outlets include three newspapers, both a radio and television station, and several magazines and journals. '
            'Begun as a one-page journal in September 1876, the Scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the United States. '
            'The other magazine, The Juggler, is released twice a year and focuses on student literature and artwork. '
            'The Dome yearbook is published annually. '
            'The newspapers have varying publication interests, with The Observer published daily and mainly reporting university and other news, and staffed by students from both Notre Dame and Saint Mary\'s College. '
            'Unlike Scholastic and The Dome, The Observer is an independent publication and does not have a faculty advisor or any editorial oversight from the University. '
            'In 1987, when some students believed that The Observer began to show a conservative bias, a liberal newspaper, Common Sense was published. '
            'Likewise, in 2003, when other students believed that the paper showed a liberal bias, the conservative paper Irish Rover went into production. '
            'Neither paper is published as often as The Observer; however, all three are distributed to all students. '
            'Finally, in Spring 2008 an undergraduate journal for political science research, Beyond Politics, made its debut. '
            'The university is the major seat of the Congregation of Holy Cross (albeit not its official headquarters, which are in Rome). '
            'Its main seminary, Moreau Seminary, is located on the campus across St. Joseph lake from the Main Building. '
            'Old College, the oldest building on campus and located near the shore of St. Mary lake, houses undergraduate seminarians. '
            'Retired priests live in Moreau Seminary.\n\n'
            'A1: The Virgin Mary allegedly appeared to Saint Bernadette Soubirous in 1858 in Lourdes, France.\n'
            'A2: A copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes" is in front of the Notre Dame Main Building.\n'
            'A3: The Basilica of the Sacred Heart is beside the Notre Dame Main Building.\n'
            'A4: The Grotto at Notre Dame is a replica of the grotto at Lourdes, France where the Virgin Mary allegedly appeared to Saint Bernadette Soubirous in 1858.\n'
            'A5: The Scholastic Magazine of Notre Dame began publishing in September 1876.\n'
            'A6: The Juggler is published twice a year.\n'
            'A7: The daily student paper at Notre Dame is called The Observer.\n'
            'A8: There are 3 student news papers found at Notre Dame.\n'
            'A9: The headquarters of the Congregation of the Holy Cross is in Rome.\n'
            'A10: The primary seminary of the Congregation of the Holy Cross is Moreau Seminary.'
        ),
        "\n".join(_INSTRUCTION_LINES) + "\n" + (
            'Q1: What individuals live at Fatima House at Notre Dame?\n'
            'Q2: How many BS level degrees are offered in the College of Engineering at Notre Dame?\n'
            'Q3: In what year was the College of Engineering at Notre Dame formed?\n'
            'Q4: Before the creation of the College of Engineering similar studies were carried out at which Notre Dame college?\n'
            'Q5: How many departments are within the Stinson-Remick Hall of Engineering?\n'
            'Q6: What entity provides help with the management of time for new students at Notre Dame?\n'
            'Q7: How many colleges for undergraduates are at Notre Dame?\n'
            'Q8: What was created at Notre Dame in 1962 to assist first year students?\n'
            'Q9: Which organization declared the First Year of Studies program at Notre Dame "outstanding?"\n'
            'Q10: What is the oldest structure at Notre Dame?\n'
            '\nfrom this context:\n'
            'and brothers reside in Fatima House (a former retreat center), Holy Cross House, as well as Columba Hall near the Grotto. '
            'The university through the Moreau Seminary has ties to theologian Frederick Buechner. '
            'While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching. '
            'The College of Engineering was established in 1920, however, early courses in civil and mechanical engineering were a part of the College of Science since the 1870s. '
            'Today the college, housed in the Fitzpatrick, Cushing, and Stinson-Remick Halls of Engineering, includes five departments of study – aerospace and mechanical engineering, chemical and biomolecular engineering, civil engineering and geological sciences, computer science and engineering, and electrical engineering – with eight B.S. degrees offered. '
            'Additionally, the college offers five-year dual degree programs with the Colleges of Arts and Letters and of Business awarding additional B.A. and Master of Business Administration (MBA) degrees, respectively. '
            'All of Notre Dame\'s undergraduate students are a part of one of the five undergraduate colleges at the school or are in the First Year of Studies program. '
            'The First Year of Studies program was established in 1962 to guide incoming freshmen in their first year at the school before they have declared a major. '
            'Each student is given an academic advisor from the program who helps them to choose classes that give them exposure to any major in which they are interested. '
            'The program also includes a Learning Resource Center which provides time management, collaborative learning, and subject tutoring. '
            'This program has been recognized previously, by U.S. News & World Report, as outstanding.\n\n'
            'A1: Individuals who live at Fatima House at Notre Dame are brothers.\n'
            'A2: The College of Engineering at Notre Dame offers 8 BS level degrees.\n'
            'A3: The College of Engineering at Notre Dame was formed in 1920.\n'
            'A4: Before the creation of the College of Engineering, similar studies were carried out at the College of Science.\n'
            'A5: There are 5 departments within the Stinson-Remick Hall of Engineering.\n'
            'A6: The entity that provides help with the management of time for new students at Notre Dame is the First Year of Studies program.\n'
            'A7: There are 6 colleges for undergraduates at Notre Dame (including the First Year of Studies program).\n'
            'A8: The First Year of Studies program was created at Notre Dame in 1962.\n'
            'A9: The First Year of Studies program at Notre Dame was declared "outstanding" by U.S. News & World Report.\n'
            'A10: Old College is the oldest structure at Notre Dame.'
        ),
    )

    def __init__(self, context_map, model_name="mistralai/Mistral-7B-Instruct-v0.3", access_token="TOKEN-HERE"):
        """Create the agent and its LLM client.

        :param context_map: Mapping of context text -> list of (question, gold answer) pairs.
        :param model_name: Hugging Face repository id of the model to query.
        :param access_token: Hugging Face API token passed to the LLM client.
        """
        self.context_map = context_map
        self.llm = HuggingFaceHub(
            repo_id=model_name,
            model_kwargs={"temperature": 0.3, "max_length": 200},
            huggingfacehub_api_token=access_token
        )
        # Planned: take context and questions separately from the user; only the context is chunked.
        self.current_prompt = None
        self.chunked_prompts = []
        self.individual_prompts = []
        # Fresh list per instance so one agent cannot mutate another's responses.
        self.chunk_responses = list(self._CACHED_CHUNK_RESPONSES)
        # Context chunks, questions and gold answers of the current prompt.
        self.current_context_chunks = []
        self.current_questions = []
        self.current_gold_answers = []
        # chunk index -> questions assigned to that chunk
        self.chunk_assignments = defaultdict(list)
        # question -> (predicted answer, gold answer), filled by extract_questions_and_answers
        self.question_answer_map = {}

    def generate_response(self, prompt):
        """Send one prompt to the LLM and return its raw response."""
        return self.llm(prompt)

    def get_context_questions(self, k=2, q=4):
        """Select the first ``k`` contexts and the first ``q`` QA pairs of each.

        :return: ``(contexts, questions_answers)`` where ``questions_answers[i]``
                 is the list of (question, answer) pairs chosen for ``contexts[i]``.
        """
        contexts = []
        questions_answers = []
        for context, qa_pairs in list(self.context_map.items())[:k]:
            contexts.append(context)
            questions_answers.append(qa_pairs[:q])
        return contexts, questions_answers

    def generate_Dataset_prompt(self, k=2, q=4):
        """Build a single prompt from the dataset and store it in ``current_prompt``.

        Also resets and fills ``current_questions`` / ``current_gold_answers`` in the
        same order as the ``Q1:``, ``Q2:``, ... lines of the prompt.
        """
        contexts, questions_answers_by_context = self.get_context_questions(k, q)
        combined_context = "\n\n".join(contexts)
        all_questions_answers = [
            pair for qa_pairs in questions_answers_by_context for pair in qa_pairs
        ]

        prompt_lines = list(self._INSTRUCTION_LINES)
        self.current_questions = []
        self.current_gold_answers = []
        for i, (question, answer) in enumerate(all_questions_answers, 1):
            self.current_questions.append(question)
            self.current_gold_answers.append(answer)
            prompt_lines.append(f"Q{i}: {question}")
        # Joined with "\n" this yields "...\n\nfrom this context:\n<context>".
        prompt_lines.append("\nfrom this context:")
        prompt_lines.append(combined_context)
        self.current_prompt = "\n".join(prompt_lines)

    def extract_keywords(self, questions: List[str]) -> List[List[str]]:
        """Return, for each question, its lower-cased keywords.

        Punctuation is removed, then stopwords and words of two characters or
        fewer are dropped.
        """
        all_keywords = []
        for question in questions:
            question_clean = re.sub(r'[^\w\s]', '', question)
            words = question_clean.lower().split()
            keywords = [word for word in words if word not in self._STOPWORDS and len(word) > 2]
            all_keywords.append(keywords)
        return all_keywords

    def map_questions_to_chunks(self) -> List[Tuple[str, List[str]]]:
        """Assign every question to the chunk containing most of its keywords.

        A keyword counts as matched when it occurs as a substring of the
        lower-cased chunk. Ties go to the earliest chunk; a question with no
        matching keyword in any chunk is left unassigned. The result is stored
        in ``chunk_assignments`` (chunk index -> questions).

        :return: ``(context_chunk, assigned_questions)`` pairs, in the order the
                 chunks first received a question.
        :raises ValueError: If no chunks or no questions are available.
        """
        if not self.current_context_chunks:
            raise ValueError("No context chunks available. Please run chunk_current_prompt first.")
        if not self.current_questions:
            raise ValueError("No questions available. Please populate current_questions first.")

        keywords_per_question = self.extract_keywords(self.current_questions)
        self.chunk_assignments = defaultdict(list)
        lowered_chunks = [chunk.lower() for chunk in self.current_context_chunks]
        for question, keywords in zip(self.current_questions, keywords_per_question):
            best_chunk_idx = None
            best_match_count = 0
            for idx, chunk in enumerate(lowered_chunks):
                match_count = sum(1 for keyword in keywords if keyword in chunk)
                if match_count > best_match_count:
                    best_match_count = match_count
                    best_chunk_idx = idx
            if best_chunk_idx is not None:
                self.chunk_assignments[best_chunk_idx].append(question)

        return [
            (self.current_context_chunks[idx], list(questions))
            for idx, questions in self.chunk_assignments.items()
        ]

    def chunk_current_prompt(self, max_words_per_chunk=800):
        """Split the context of ``current_prompt`` into word-count chunks.

        Fills ``current_context_chunks`` with the chunk texts and
        ``chunked_prompts`` with one prompt per chunk (all questions + that chunk).

        :param max_words_per_chunk: Maximum number of whitespace-separated words per chunk.
        :raises ValueError: If no prompt is set, the prompt lacks the context
                            marker, or ``max_words_per_chunk`` is not positive.
        """
        if getattr(self, 'current_prompt', None) is None:
            raise ValueError("current_prompt not set. Please run generate_Dataset_prompt first.")
        if max_words_per_chunk <= 0:
            raise ValueError("max_words_per_chunk must be a positive integer.")

        # Split at the first marker only, so a context that happens to contain
        # the marker text is still accepted.
        question_block, marker, context_block = self.current_prompt.partition(self._CONTEXT_MARKER)
        if not marker:
            raise ValueError("Prompt is not formatted correctly.")

        context_words = context_block.split()
        chunks = [
            " ".join(context_words[i:i + max_words_per_chunk])
            for i in range(0, len(context_words), max_words_per_chunk)
        ]
        self.current_context_chunks = list(chunks)
        self.chunked_prompts = [
            f"{question_block}\n\nfrom this context:\n{chunk}" for chunk in chunks
        ]

    def construct_individual_prompts_from_assignments(self):
        """Build one prompt per chunk containing only the questions assigned to it.

        Questions are renumbered ``Q1``, ``Q2``, ... within each prompt. The
        prompts are stored in ``individual_prompts`` and also replace
        ``chunked_prompts``.

        :raises ValueError: If assignments or chunks are missing.
        """
        if not getattr(self, 'chunk_assignments', None):
            raise ValueError("chunk_assignments not set. Please run map_questions_to_chunks first.")
        if not getattr(self, 'current_context_chunks', None):
            raise ValueError("No context chunks available.")

        header = "\n".join(self._INSTRUCTION_LINES)
        self.individual_prompts = []
        for idx, questions in self.chunk_assignments.items():
            chunk = self.current_context_chunks[idx]
            question_block = "\n".join(f"Q{i}: {q}" for i, q in enumerate(questions, 1))
            self.individual_prompts.append(
                f"{header}\n{question_block}\n\nfrom this context:\n{chunk}"
            )
        self.chunked_prompts = self.individual_prompts

    def run_all_chunks(self, verbose=True):
        """Call the LLM once per chunked prompt.

        Args:
            verbose (bool): If True, prints each response and a final log of all of them.

        Returns:
            List of responses from the model, one per chunked prompt
            (the same list stored in ``chunk_responses``).

        Raises:
            ValueError: If there are no chunked prompts to run.
        """
        if not self.chunked_prompts:
            raise ValueError("chunked_prompts not set. Please run chunk_current_prompt first.")
        self.chunk_responses = []
        for idx, prompt in enumerate(self.chunked_prompts, 1):
            response = self.generate_response(prompt)
            self.chunk_responses.append(response)
            if verbose:
                print(f"\n--- Response for Chunk {idx} ---\n")
                print(response)
                print("\n-------------------------------\n")
        if verbose:
            print("LOGGING chunk responses:")
            print(self.chunk_responses)
        return self.chunk_responses

    def get_gold_answer_for_question(self, question):
        """
        Finds the gold answer for a given question.

        :param question: The question string to look up.
        :return: The corresponding gold answer string.
        :raises ValueError: If the question is not found in the current_questions list.
        """
        try:
            index = self.current_questions.index(question)
        except ValueError as err:
            raise ValueError("Question not found in current_questions.") from err
        return self.current_gold_answers[index]

    def extract_questions_and_answers(self, verbose=True):
        """Parse ``Qn:`` / ``An:`` lines out of every stored chunk response.

        Each response is expected to echo the prompt's questions followed by the
        model's answers. Answers are matched to questions by their number, so a
        skipped answer does not shift the ones after it. A question without an
        answer is recorded with an empty predicted answer; a parsed question
        that is not in ``current_questions`` is skipped.

        :param verbose: If True, prints the parsed lists and any warnings.
        :return: ``question_answer_map``: question -> (predicted answer, gold answer).
        """
        self.question_answer_map = {}

        # Stripped question text -> (question as stored, gold answer); the first
        # occurrence wins, mirroring list.index().
        known_questions = {}
        for question, gold in zip(self.current_questions, self.current_gold_answers):
            known_questions.setdefault(question.strip(), (question, gold))

        for chunk_idx, response in enumerate(self.chunk_responses):
            question_block_start = re.search(r'\bQ1:', response)
            if not question_block_start:
                if verbose:
                    print(f"[Warning] No 'Q1:' found in chunk {chunk_idx}, skipping")
                continue
            # Drop the instruction header, which itself mentions 'A1:' and 'A2:'.
            cleaned_response = response[question_block_start.start():]
            # Capture only up to the end of the line; otherwise the last question
            # would swallow the whole context.
            questions = re.findall(r'Q(\d+):\s*([^\n]*)', cleaned_response)
            answers = re.findall(r'A(\d+):\s*([^\n]*)', cleaned_response)
            if verbose:
                print("answers: ", [text for _, text in answers])
                print("questions: ", [text for _, text in questions])
                if len(answers) != len(questions):
                    print(f"[Warning] Mismatch: {len(questions)} questions vs {len(answers)} answers in chunk {chunk_idx}")

            # If the model repeats a number, keep its first answer.
            answer_by_number = {}
            for number, text in answers:
                answer_by_number.setdefault(int(number), text.strip())

            seen_numbers = set()
            for number, text in questions:
                number = int(number)
                if number in seen_numbers:
                    continue
                seen_numbers.add(number)
                match = known_questions.get(text.strip())
                if match is None:
                    if verbose:
                        print(f"[Warning] Unknown question in chunk {chunk_idx}, skipping: {text}")
                    continue
                question, gold_answer = match
                self.question_answer_map[question] = (answer_by_number.get(number, ""), gold_answer)

        return self.question_answer_map

    def print_current_prompt(self):
        """Print the current prompt with its character and word counts."""
        if getattr(self, 'current_prompt', None) is None:
            print("No prompt found. Please run generate_Dataset_prompt() first.")
            return
        print("=======================================================================PRINTING PROMPT======================================================================")
        print("\n🧠 Current Prompt:")
        print(f"{self.current_prompt}\n")
        char_length = len(self.current_prompt)
        # Whitespace-separated words are used as a rough token estimate.
        token_count = len(self.current_prompt.split())
        print("=======================================================================PROMPT COUNTS======================================================================")
        print(f"Length: {char_length} characters, approximately {token_count} tokens")
        print("========================================================================END PRINTING======================================================================")

    def print_chunks(self):
        """Print every prompt in ``chunked_prompts`` with its size."""
        if not getattr(self, 'chunked_prompts', None):
            print("No chunked prompts found. Please run chunk_current_prompt() first.")
            return
        print("=======================================================================PRINTING CHUNKED PROMPTS======================================================================")
        for i, chunk in enumerate(self.chunked_prompts, 1):
            print(f"\n🧠 Prompt Chunk {i}:")
            print(chunk)
            char_length = len(chunk)
            token_count = len(chunk.split())
            print("-----------------------------------------")
            print(f"Length: {char_length} characters, approximately {token_count} tokens")
            print("============================================================================================================================================================")
        print("=======================================================================END OF CHUNKS======================================================================")

    def print_parsed_qa_pairs(self):
        """Print each parsed question with its predicted answer."""
        if not getattr(self, 'question_answer_map', None):
            print("No question-answer pairs found. Please run extract_questions_and_answers() first.")
            return

        print("Parsed Question-Answer Pairs:\n")
        # Values are (predicted, gold) tuples; only the prediction is shown here.
        for idx, (question, (predicted_answer, _gold)) in enumerate(self.question_answer_map.items(), 1):
            print(f"{idx}. Q: {question}\n   A: {predicted_answer}\n")

    def print_qa_pair_comparison(self):
        """Print predicted vs gold answers, ordered as in ``current_questions``."""
        if not getattr(self, 'question_answer_map', None):
            print("No question-answer pairs found. Please run extract_questions_and_answers() first.")
            return

        if not getattr(self, 'current_gold_answers', None):
            print("No gold answers found. Please set self.current_gold_answers before comparing.")
            return

        print("Question | Predicted Answer | Gold Answer")
        print("-" * 80)

        # Position of each question's first occurrence; unknown questions sort last.
        position = {}
        for index, question in enumerate(self.current_questions):
            position.setdefault(question, index)
        sorted_items = sorted(
            self.question_answer_map.items(),
            key=lambda item: position.get(item[0], float('inf'))
        )

        for idx, (question, (predicted_answer, gold_answer)) in enumerate(sorted_items):
            print(f"Q{idx+1}: {question}")
            print(f"LLM Answer: {predicted_answer}")
            print(f"Ground Truth: {gold_answer}")
            print()

    def print_qa_pairs(self):
        """Print every current question with its gold answer (no predictions)."""
        if not getattr(self, 'current_gold_answers', None):
            print("No gold answers found. Please set self.current_gold_answers before comparing.")
            return

        gold_answers = self.current_gold_answers

        print("Question | Predicted Answer | Gold Answer")
        print("-" * 80)

        for idx, question in enumerate(self.current_questions):
            gold = gold_answers[idx] if idx < len(gold_answers) else "N/A"

            print(f"Q{idx+1}: {question}")
            print(f"  ✅ Gold     : {gold}")
            print()







In [34]:
#playground:
# Scratch run of the pipeline without calling the LLM: run_all_chunks() is never
# invoked, so extract_questions_and_answers() parses the responses that
# Agent2.__init__ preloads into chunk_responses.
agent = Agent2(context_map)
agent.generate_Dataset_prompt(k=5)
# agent.print_current_prompt()
agent.chunk_current_prompt(max_words_per_chunk=400)
agent.map_questions_to_chunks()
agent.construct_individual_prompts_from_assignments()
# agent.print_chunks()
agent.extract_questions_and_answers()
# print("self.questions: ", agent.current_questions)
# print("self.answerrs: ", agent.current_gold_answers)
agent.print_qa_pair_comparison()


answers:  ['The Virgin Mary allegedly appeared to Saint Bernadette Soubirous in 1858 in Lourdes, France.', 'A copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes" is in front of the Notre Dame Main Building.', 'The Basilica of the Sacred Heart is beside the Notre Dame Main Building.', 'The Grotto at Notre Dame is a replica of the grotto at Lourdes, France where the Virgin Mary allegedly appeared to Saint Bernadette Soubirous in 1858.', 'The Scholastic Magazine of Notre Dame began publishing in September 1876.', 'The Juggler is published twice a year.', 'The daily student paper at Notre Dame is called The Observer.', 'There are 3 student news papers found at Notre Dame.', 'The headquarters of the Congregation of the Holy Cross is in Rome.', 'The primary seminary of the Congregation of the Holy Cross is Moreau Seminary.']
questions:  ['To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'What is in front of the Notre Dame Main Building?', 

# K32 Q5

In [37]:
#driver code:
# K32 Q5 experiment: full pipeline with k=32, q=5, including the live LLM calls.
# The calls are order-dependent: each step reads state stored on `agent` by the one before it.
agent = Agent2(context_map)
agent.generate_Dataset_prompt(k=32, q=5)
agent.print_current_prompt()
agent.chunk_current_prompt(max_words_per_chunk=700)
agent.map_questions_to_chunks()
agent.construct_individual_prompts_from_assignments()
agent.print_chunks()
# One model request per chunk prompt; the responses replace agent.chunk_responses.
agent.run_all_chunks()
agent.extract_questions_and_answers()
agent.print_qa_pair_comparison()



🧠 Current Prompt:
Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Q2: What is in front of the Notre Dame Main Building?
Q3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Q4: What is the Grotto at Notre Dame?
Q5: What sits on top of the Main Building at Notre Dame?
Q6: When did the Scholastic Magazine of Notre dame begin publishing?
Q7: How often is Notre Dame's the Juggler published?
Q8: What is the daily student paper at Notre Dame called?
Q9: How many student news papers are found at Notre Dame?
Q10: In what year did the student paper Common Sense begin publication at Notre Dame?
Q11: Where is the headquarters of the Congregation of the Holy Cross




--- Response for Chunk 1 ---

Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Q2: What is in front of the Notre Dame Main Building?
Q3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Q4: What is the Grotto at Notre Dame?
Q5: What sits on top of the Main Building at Notre Dame?
Q6: When did the Scholastic Magazine of Notre dame begin publishing?
Q7: How often is Notre Dame's the Juggler published?
Q8: What is the daily student paper at Notre Dame called?
Q9: How many student news papers are found at Notre Dame?
Q10: In what year did the student paper Common Sense begin publication at Notre Dame?
Q11: Where is the headquarters of the Congregation of th




--- Response for Chunk 2 ---

Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: In what year was the College of Engineering at Notre Dame formed?
Q2: Which department at Notre Dame is the only one to not offer a PhD program?
Q3: To whom was John B. Kroc married?
Q4: What person was the Director of the Science Museum at Notre Dame in the late 19th century?
Q5: What was the lifespan of John Augustine Zahm?
Q6: What program did John Augustine Zahm come to co-direct at Nore Dame?
Q7: What book did John Zahm write in 1896?
Q8: What professorship did Father Josh Carrier hold at Notre Dame?
Q9: In what year did Albert Zahm begin comparing aeronatical models at Notre Dame?
Q10: Which professor sent the first wireless message in the USA?
Q11: In what year did J




--- Response for Chunk 3 ---

Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: How many departments are within the Stinson-Remick Hall of Engineering?
Q2: How many student housing areas are reserved for Notre Dame's graduate students?
Q3: What did the Science Hall at Notre Dame come to be known as?
Q4: Catholic people identified with Notre Dame, what religious group did people feel Yale represented?
Q5: Which arena was constructed under Jenkins at Notre Dame?
Q6: What structure is found on the location of the original church of Father Sorin at Notre Dame?
Q7: Which individual painted the inside of the Basilica of the Sacred Heart at Notre Dame?
Q8: In what year was the Grotto of Our Lady of Lourdes at Notre Dame constructed?
Q9: Which person oversaw t




--- Response for Chunk 4 ---

Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: How many colleges for undergraduates are at Notre Dame?
Q2: How many stories tall is the main library at Notre Dame?
Q3: What is the name of the main library at Notre Dame?
Q4: How many teams participate in the Notre Dame Bookstore Basketball tournament?
Q5: For what cause is money raised at the Bengal Bouts tournament at Notre Dame?
Q6: What percentage of undergrads live on the Notre Dame campus?
Q7: How many dorms for males are on the Notre Dame campus?
Q8: What amount of the graduate student body at Notre Dame live on the campus?
Q9: There are how many dorms for females at Notre Dame?
Q10: What is Congregation of Holy Cross in Latin?
Q11: How often is Catholic mass held 




--- Response for Chunk 5 ---

Answer these questions as precisely as you can, in as minimum words as you can
Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer
If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.
Q1: The granting of Doctorate degrees first occurred in what year at Notre Dame?
Q2: Which program at Notre Dame offers a Master of Education degree?
Q3: What institute at Notre Dame studies  the reasons for violent conflict?
Q4: What is the title of Notre Dame's Theodore Hesburgh?
Q5: In what year was the Joan B. Kroc Institute for International Peace Studies founded?
Q6: What company did Ray Kroc own?
Q7: In what year was the Theodore M. Hesburgh Library at Notre Dame finished?
Q8: Which artist created the mural on the Theodore M. Hesburgh Library?
Q9: What is a common name to reference the mural created by Millard Sheets at Notre Dame?
Q10: How many incoming students did Not



HfHubHTTPError: 504 Server Error: Gateway Time-out for url: https://router.huggingface.co/hf-inference/models/mistralai/Mistral-7B-Instruct-v0.3

In [38]:
# Dump whatever is stored in agent.chunk_responses for manual inspection.
# NOTE(review): presumably one entry per context chunk, and the run above ended
# with a 504 from the inference endpoint, so the list may be incomplete - confirm.
print(agent.chunk_responses)

['Answer these questions as precisely as you can, in as minimum words as you can\nFormat your answers with \'A1:\', \'A2:\', etc. at the beginning of each answer\nIf you cannot find enough information in the provided context to answer a question, respond with \'Not specified\'. Do NOT guess.\nQ1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?\nQ2: What is in front of the Notre Dame Main Building?\nQ3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?\nQ4: What is the Grotto at Notre Dame?\nQ5: What sits on top of the Main Building at Notre Dame?\nQ6: When did the Scholastic Magazine of Notre dame begin publishing?\nQ7: How often is Notre Dame\'s the Juggler published?\nQ8: What is the daily student paper at Notre Dame called?\nQ9: How many student news papers are found at Notre Dame?\nQ10: In what year did the student paper Common Sense begin publication at Notre Dame?\nQ11: Where is the headquarters of the Congregation of the Holy Cr

In [39]:
# Dump agent.chunked_prompts so the prompts can be compared by eye with the
# stored responses.
# NOTE(review): presumably these are the prompts that were sent per chunk - confirm.
print(agent.chunked_prompts)

['Answer these questions as precisely as you can, in as minimum words as you can\nFormat your answers with \'A1:\', \'A2:\', etc. at the beginning of each answer\nIf you cannot find enough information in the provided context to answer a question, respond with \'Not specified\'. Do NOT guess.\nQ1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?\nQ2: What is in front of the Notre Dame Main Building?\nQ3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?\nQ4: What is the Grotto at Notre Dame?\nQ5: What sits on top of the Main Building at Notre Dame?\nQ6: When did the Scholastic Magazine of Notre dame begin publishing?\nQ7: How often is Notre Dame\'s the Juggler published?\nQ8: What is the daily student paper at Notre Dame called?\nQ9: How many student news papers are found at Notre Dame?\nQ10: In what year did the student paper Common Sense begin publication at Notre Dame?\nQ11: Where is the headquarters of the Congregation of the Holy Cr

In [None]:
# Now parse the responses returned for each chunk:
import re
from collections import defaultdict
from typing import List, Tuple


class Agent2:
    # Same as in the context overload notebook.
    def __init__(self, context_map, model_name="mistralai/Mistral-7B-Instruct-v0.3", access_token="TOKEN-HERE"):
        self.context_map = context_map
        self.llm = HuggingFaceHub(
            repo_id=model_name,
            model_kwargs={"temperature": 0.3, "max_length": 200},
            huggingfacehub_api_token=access_token
        )
        self.current_prompt = None # NEW: ask user to input context and questions separately. Chunks will only be made of context.
        self.chunked_prompts = []
        self.chunk_responses = ['Answer these questions as precisely as you can, in as minimum words as you can\nFormat your answers with \'A1:\', \'A2:\', etc. at the beginning of each answer\nIf you cannot find enough information in the provided context to answer a question, respond with \'Not specified\'. Do NOT guess.\nQ1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?\nQ2: What is in front of the Notre Dame Main Building?\nQ3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?\nQ4: What is the Grotto at Notre Dame?\nQ5: What sits on top of the Main Building at Notre Dame?\nQ6: When did the Scholastic Magazine of Notre dame begin publishing?\nQ7: How often is Notre Dame\'s the Juggler published?\nQ8: What is the daily student paper at Notre Dame called?\nQ9: How many student news papers are found at Notre Dame?\nQ10: In what year did the student paper Common Sense begin publication at Notre Dame?\nQ11: Where is the headquarters of the Congregation of the Holy Cross?\nQ12: What is the primary seminary of the Congregation of the Holy Cross?\nQ13: What is the oldest structure at Notre Dame?\nQ14: What individuals live at Fatima House at Notre Dame?\nQ15: Which prize did Frederick Buechner create?\nQ16: How many BS level degrees are offered in the College of Engineering at Notre Dame?\nQ17: Before the creation of the College of Engineering similar studies were carried out at which Notre Dame college?\nQ18: The College of Science began to offer civil engineering courses beginning at what time at Notre Dame?\nQ19: What entity provides help with the management of time for new students at Notre Dame?\nQ20: What was created at Notre Dame in 1962 to assist first year students?\nQ21: Which organization declared the First Year of Studies program at Notre Dame "outstanding?"\nQ22: What type of degree is an M.Div.?\nQ23: In what year was a Master of Arts course first offered at Notre Dame?\nQ24: Forbes.com placed Notre Dame 
at what position compared to other US research universities?\nQ25: Which individual worked on projects at Notre Dame that eventually created neoprene?\nQ26: What percentage of students at Notre Dame are the children of former Notre Dame students?\nQ27: Irvin Abell was given what award by Notre Dame?\nQ28: Which year was the Laetare Medal first given out at Notre Dame?\nQ29: Which congregation is in charge of the Old College at Notre Dame?\nQ30: In which architectural style is the Basilica of the Sacred Heart at Notre Dame made?\n\nfrom this context:\nArchitecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary. As at most other universities, Notre Dame\'s students run a number of news media outlets. The nine student-run outlets include three newspapers, both a radio and television station, and several magazines and journals. Begun as a one-page journal in September 1876, the Scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the United States. The other magazine, The Juggler, is released twice a year and focuses on student literature and artwork. The Dome yearbook is published annually. 
The newspapers have varying publication interests, with The Observer published daily and mainly reporting university and other news, and staffed by students from both Notre Dame and Saint Mary\'s College. Unlike Scholastic and The Dome, The Observer is an independent publication and does not have a faculty advisor or any editorial oversight from the University. In 1987, when some students believed that The Observer began to show a conservative bias, a liberal newspaper, Common Sense was published. Likewise, in 2003, when other students believed that the paper showed a liberal bias, the conservative paper Irish Rover went into production. Neither paper is published as often as The Observer; however, all three are distributed to all students. Finally, in Spring 2008 an undergraduate journal for political science research, Beyond Politics, made its debut. The university is the major seat of the Congregation of Holy Cross (albeit not its official headquarters, which are in Rome). Its main seminary, Moreau Seminary, is located on the campus across St. Joseph lake from the Main Building. Old College, the oldest building on campus and located near the shore of St. Mary lake, houses undergraduate seminarians. Retired priests and brothers reside in Fatima House (a former retreat center), Holy Cross House, as well as Columba Hall near the Grotto. The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching. The College of Engineering was established in 1920, however, early courses in civil and mechanical engineering were a part of the College of Science since the 1870s. 
Today the college, housed in the Fitzpatrick, Cushing, and Stinson-Remick Halls of Engineering, includes five departments of study – aerospace and mechanical engineering, chemical and biomolecular engineering, civil engineering and geological sciences, computer science and engineering, and electrical engineering – with eight B.S. degrees offered. Additionally, the college offers five-year dual degree programs with the Colleges of Arts and Letters and of Business awarding additional B.A. and Master of Business Administration (MBA) degrees, respectively. All of Notre Dame\'s undergraduate students are a part of one of the five undergraduate colleges at the school or are in the First Year of Studies program. The First Year of Studies program was established in 1962 to guide incoming freshmen in their first year at the school before they have declared a major. Each student is given an academic advisor from the program who helps them to choose classes that give them exposure to any major in which they are interested. The program also includes a Learning Resource Center which provides time management, collaborative learning, and subject tutoring. This program has been recognized previously, by U.S. News & World Report, as outstanding. The university first offered graduate degrees, in the form of a Master of Arts (MA), in the 1854–1855 academic year. The program expanded to include Master of Laws (LL.M.) and Master of Civil Law (MCL) degrees in 1869. The university was ranked 18th among U.S. research universities by Forbes.com in 2011. 
The university\'s roots in the chemical industry began with the work of chemist Irvin Abell, who worked on projects at Notre Dame that eventually created neoprene.\n\nA1: The Virgin Mary allegedly appeared to Bernadette Soubirous in 1858 in Lourdes, France.\nA2: In front of the Notre Dame Main Building is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".\nA3: The Basilica of the Sacred heart at Notre Dame is beside the Main Building.\nA4: The Grotto at Notre Dame is a Marian place of prayer and reflection, a replica of the grotto at Lourdes, France.\nA5: A golden statue of the Virgin Mary sits on top of the Main Building at Notre Dame.\nA6: The Scholastic Magazine of Notre Dame began publishing in September 1876.\nA7: The Juggler is published twice a year.\nA8: The daily student paper at Notre Dame is called The Observer.\nA9: There are 9 student-run news outlets at Notre Dame.\nA10: The student paper Common Sense began publication at Notre Dame in 1987.\nA11: The headquarters of the Congregation of the Holy Cross is in Rome.\nA12: The primary seminary of the Congregation of the Holy Cross is Moreau Seminary.\nA13: The oldest structure at Notre Dame is Old College.\nA14: Individuals who live at Fatima House at Notre Dame are retired priests and brothers.\nA15: Frederick Buechner created the Buechner Prize for Preaching.\nA16: The College of Engineering at Notre Dame offers 8 BS level degrees.\nA17: Similar studies were carried out at the College of Science before the creation of the College of Engineering.\nA18: The College of Science began to offer civil engineering courses in the 1870s at Notre Dame.\nA19: The entity that provides help with the management of time for new students at Notre Dame is the First Year of Studies program.\nA20: In 1962, a program called the Learning Resource Center was created at Notre Dame to assist first year students.\nA21: The First Year of Studies program at Notre Dame was recognized as "outstanding" 
by U.S. News & World Report.\nA22: An M.Div. is a Master of Divinity degree.\nA23: A Master of Arts course was first offered at Notre Dame in the 1854–1855 academic year.\nA24: Forbes.com placed Notre Dame at 18th compared to other US research universities.\nA25: Irvin Abell worked on projects at Notre Dame that eventually created neoprene.\nA26: 20% of students at Notre Dame are the children of former Notre Dame students.\nA27: Irvin Abell was given the Laetare Medal by Notre Dame.\nA28: The Laetare Medal was first given out at Notre Dame in 1883.\nA29: The Old College at Notre Dame is in charge of the Congregation of the Holy Cross.\nA30: The Basilica of the Sacred Heart at Notre Dame is made in the French Gothic architectural style.', 'Answer these questions as precisely as you can, in as minimum words as you can\nFormat your answers with \'A1:\', \'A2:\', etc. at the beginning of each answer\nIf you cannot find enough information in the provided context to answer a question, respond with \'Not specified\'. Do NOT guess.\nQ1: In what year was the College of Engineering at Notre Dame formed?\nQ2: Which department at Notre Dame is the only one to not offer a PhD program?\nQ3: To whom was John B. 
Kroc married?\nQ4: What person was the Director of the Science Museum at Notre Dame in the late 19th century?\nQ5: What was the lifespan of John Augustine Zahm?\nQ6: What program did John Augustine Zahm come to co-direct at Nore Dame?\nQ7: What book did John Zahm write in 1896?\nQ8: What professorship did Father Josh Carrier hold at Notre Dame?\nQ9: In what year did Albert Zahm begin comparing aeronatical models at Notre Dame?\nQ10: Which professor sent the first wireless message in the USA?\nQ11: In what year did Jerome Green send his first wireless message?\nQ12: What did the brother of John Zahm construct at Notre Dame?\nQ13: Work on a germ-free-life ended up in the creation of which Notre Dame institute?\nQ14: Around what time did Lobund of Notre Dame become independent?\nQ15: In what year did Lobund at Notre Dame become an Institute?\nQ16: The Lobund Institute was merged into the Department of Biology at Notre Dame in what year?\nQ17: Gurian created what in 1939 at Notre Dame?\nQ18: What was the Review of Politics inspired by?\nQ19: Over how many years did Gurian edit the Review of Politics at Notre Dame?\nQ20: Thomas Stritch was an editor of which publican from Notre Dame?\nQ21: Who was the president of Notre Dame in 2012?\nQ22: The Kellogg Institute for International Studies is part of which university?\nQ23: In what year did Notre Dame begin to host the Global Adaptation Index?\nQ24: What threat does the Global Adaptation Index study?\nQ25: How many undergrads were attending Notre Dame in 2014?\nQ26: How many students in total were at Notre Dame in 2014?\nQ27: Which college did Notre Dame add in 1921?\nQ28: Over how many years did the change to national standards undertaken at Notre Dame in the early 20th century take place?\nQ29: Which institute involving animal life did Cavanaugh create at Notre Dame?\nQ30: Outside of an institute studying animals, what other institute did Cavanugh create at Notre Dame?\nQ31: When did John Jenkins become the president of 
Notre Dame?\nQ32: In terms of the amount of presidents Notre Dame has had, where is John Jenkins on the list?\nQ33: Who was the Notre Dame president that preceded John Jenkins?\nQ34: How much money was spent on enhancing Notre Dame Stadium under John Jenkins?\n\nfrom this context:\namong colleges and universities in the United States. The median starting salary of $55,300 ranked 58th in the same peer group. Father Joseph Carrier, C.S.C. was Director of the Science Museum and the Library and Professor of Chemistry and Physics until 1874. Carrier taught that scientific research and its promise for progress were not antagonistic to the ideals of intellectual and moral culture endorsed by the Church. One of Carrier\'s students was Father John Augustine Zahm (1851–1921) who was made Professor and Co-Director of the Science Department at age 23 and by 1900 was a nationally prominent scientist and naturalist. Zahm was active in the Catholic Summer School movement, which introduced Catholic laity to contemporary intellectual issues. His book Evolution and Dogma (1896) defended certain aspects of evolutionary theory as true, and argued, moreover, that even the great Church teachers Thomas Aquinas and Augustine taught something like it. The intervention of Irish American Catholics in Rome prevented Zahm\'s censure by the Vatican. In 1913, Zahm and former President Theodore Roosevelt embarked on a major expedition through the Amazon. In 1882, Albert Zahm (John Zahm\'s brother) built an early wind tunnel used to compare lift to drag of aeronautical models. Around 1899, Professor Jerome Green became the first American to send a wireless message. In 1931, Father Julius Nieuwland performed early work on basic reactions that was used to create neoprene. Study of nuclear physics at the university began with the building of a nuclear accelerator in 1936, and continues now partly through a partnership in the Joint Institute for Nuclear Astrophysics. 
The Lobund Institute grew out of pioneering research in germ-free-life which began in 1928. This area of research originated in a question posed by Pasteur as to whether animal life was possible without bacteria. Though others had taken up this idea, their research was short lived and inconclusive. Lobund was the first research organization to answer definitively, that such life is possible and that it can be prolonged through generations. But the objective was not merely to answer Pasteur\'s question but also to produce the germ free animal as a new tool for biological and medical research. This objective was reached and for years Lobund was a unique center for the study and production of germ free animals and for their use in biological and medical investigations. Today the work has spread to other universities. In the beginning it was under the Department of Biology and a program leading to the master\'s degree accompanied the research program. In the 1940s Lobund achieved independent status as a purely research organization and in 1950 was raised to the status of an Institute. In 1958 it was brought back into the Department of Biology as integral part of that department, but with its own program leading to the degree of PhD in Gnotobiotics. The Review of Politics was founded in 1939 by Gurian, modeled after German Catholic journals. It quickly emerged as part of an international Catholic intellectual revival, offering an alternative vision to positivist philosophy. For 44 years, the Review was edited by Gurian, Matthew Fitzsimons, Frederick Crosson, and Thomas Stritch. Intellectual leaders included Gurian, Jacques Maritain, Frank O\'Malley, Leo Richard Ward, F. A. Hermens, and John U. Nef. It became a major forum for political ideas and modern political concerns, especially from a Catholic and scholastic tradition. As of 2012[update] research continued in many fields. 
The university president, John Jenkins, described his hope that Notre Dame would become "one of the pre–eminent research institutions in the world" in his inaugural address. The university has many multi-disciplinary institutes devoted to research in varying fields, including the Medieval Institute, the Kellogg Institute for International Studies, the Kroc Institute for International Peace studies, and the Center for Social Concerns. Recent research includes work on family conflict and child development, genome mapping, the increasing trade deficit of the United States with China, studies in fluid mechanics, computational science and engineering, and marketing trends on the Internet. As of 2013, the university is home to the Notre Dame Global Adaptation Index which ranks countries annually based on how vulnerable they are to climate change and how prepared they are to adapt. In 2014 the Notre Dame Global Adaptation Index ranked 178 countries. In 2014, there were 8,462 undergraduate students and 4,651 graduate students at the university. In 2014, the university spent $35 million on enhancing Notre Dame Stadium. In 2014, the university had 13,113 total students. In 1921, the College of Architecture was added to the university. In the early 20th century, the university undertook a change to national standards of education, a process that took over 20 years to complete. In 1939, Gurian created the Medieval Institute at the university. In 1967, Notre Dame added the College of Arts and Letters. In 1974, Notre Dame added the College of Science. In 1987, Notre Dame added the College of Engineering. In 2007, Notre Dame added the Keough School of Global Affairs. In 2012, John Jenkins became the president of the university. As of 2014[update], John Jenkins was the 18th president of the university. The president who preceded John Jenkins was Father Edward Malloy. The Kellogg Institute for International Studies is part of the University of Notre Dame. 
In 2013, Notre Dame began to host the Global Adaptation Index. The Global Adaptation Index studies the threat of climate change.\n\nA1: The College of Engineering at Notre Dame was formed in 1893.\nA2: The Department of Art, Art History, and Design at Notre Dame is the only one to not offer a PhD program.\nA3: John B. Kroc was married to Ruth Munroe.\nA4: The person who was the Director of the Science Museum at Notre Dame in the late 19th century was Father Joseph Carrier, C.S.C.\nA5: John Augustine Zahm had a lifespan of 1851-1921.\nA6: John Augustine Zahm came to co-direct the Science Department at Notre Dame.\nA7: John Zahm wrote the book "Evolution and Dogma" in 1896.\nA8: Father Josh Carrier held the professorship of Chemistry and Physics at Notre Dame.\nA9: Albert Zahm began comparing aeronatical models at Notre Dame in 1882.\nA10: The first wireless message in the USA was sent by Professor Jerome Green.\nA11: Jerome Green sent his first wireless message in 1899.\nA12: John Zahm\'s brother constructed the Lobund Institute at Notre Dame.\nA13: The work on a germ-free-life ended up in the creation of the Lobund Institute.\nA14: Lobund of Notre Dame became independent around the 1920s.\nA15: In 1950, Lobund at Notre Dame became an Institute.\nA16: The Lobund Institute was merged into the Department of Biology at Notre Dame in 1958.\nA17: In 1939, Gurian created the Medieval Institute at Notre Dame.\nA18: The Review of Politics was inspired by German Catholic journals.\nA19: Gurian edited the Review of Politics at Notre Dame for 44 years.\nA20: Thomas Stritch was an editor of the Review of Politics.\nA21: The president of Notre Dame in 2012 was John Jenkins.\nA22: The Kellogg Institute for International Studies is part of the University of Notre Dame.\nA23: Notre Dame began to host the Global Adaptation Index in 2013.\nA24: The Global Adaptation Index studies the threat of climate change.\nA25: In 2014, there were 8,462 undergrads attending Notre Dame.\nA26: In 
2014, there were a total of 13,113 students at Notre Dame.\nA27: The College of Architecture was added to Notre Dame in 1921.\nA28: The change to national standards undertaken at Notre Dame in the early 20th century took place over 20 years.\nA29: Cavanaugh created the Lobund Institute involving animal life at Notre Dame.\nA30: Outside of an institute studying animals, Cavanugh created the Institute for Scholasticism at Notre Dame.\nA31: John Jenkins became the president of Notre Dame in 2012.\nA32: John Jenkins is the 18th president of Notre Dame.\nA33: The president who preceded John Jenkins was Father Edward Malloy.\nA34: $35 million was spent on enhancing Notre Dame Stadium under John Jenkins.', 'Answer these questions as precisely as you can, in as minimum words as you can\nFormat your answers with \'A1:\', \'A2:\', etc. at the beginning of each answer\nIf you cannot find enough information in the provided context to answer a question, respond with \'Not specified\'. Do NOT guess.\nQ1: How many departments are within the Stinson-Remick Hall of Engineering?\nQ2: How many student housing areas are reserved for Notre Dame\'s graduate students?\nQ3: What did the Science Hall at Notre Dame come to be known as?\nQ4: Catholic people identified with Notre Dame, what religious group did people feel Yale represented?\nQ5: Which arena was constructed under Jenkins at Notre Dame?\nQ6: What structure is found on the location of the original church of Father Sorin at Notre Dame?\nQ7: Which individual painted the inside of the Basilica of the Sacred Heart at Notre Dame?\nQ8: In what year was the Grotto of Our Lady of Lourdes at Notre Dame constructed?\nQ9: Which person oversaw the creation of a science hall at Notre Dame in 1883?\nQ10: In what year did the student union building at Notre Dame get renamed to LaFortune Center?\nQ11: After which individual was the LaFortune Center Notre Dame named?\nQ12: How large in square feet is the LaFortune Center at Notre Dame?\nQ13: What 
is the annual budget of Notre Dame\'s LaFortune Center?\nQ14: Which library was built at Notre Dame in 1963?\nQ15: Which baseball stadium is found at Notre Dame?\nQ16: In what year did Notre Dame create the Office of Sustainability?\nQ17: What percentage of the food served at Notre Dame is locally grown?\nQ18: Notre Dame got a "B" for its sustainability practices from which entity?\n\nfrom this context:\nand undergone many construction projects on campus, including Compton Family Ice Arena, a new architecture hall, additional residence halls, and the Campus Crossroads, a $400m enhancement and expansion of Notre Dame Stadium. Because of its Catholic identity, a number of religious buildings stand on campus. The Old College building has become one of two seminaries on campus run by the Congregation of Holy Cross. The current Basilica of the Sacred Heart is located on the spot of Fr. Sorin\'s original church, which became too small for the growing college. It is built in French Revival style and it is decorated by stained glass windows imported directly from France. The interior was painted by Luigi Gregori, an Italian painter invited by Fr. Sorin to be artist in residence. The Basilica also features a bell tower with a carillon. Inside the church there are also sculptures by Ivan Mestrovic. The Grotto of Our Lady of Lourdes, which was built in 1896, is a replica of the original in Lourdes, France. It is very popular among students and alumni as a place of prayer and meditation, and it is considered one of the most beloved spots on campus. A Science Hall was built in 1883 under the direction of Fr. Zahm, but in 1950 it was converted to a student union building and named LaFortune Center, after Joseph LaFortune, an oil executive from Tulsa, Oklahoma. Commonly known as "LaFortune" or "LaFun," it is a 4-story building of 83,000 square feet that provides the Notre Dame community with a meeting place for social, recreational, cultural, and educational activities. 
LaFortune employs 35 part-time student staff and 29 full-time non-student staff and has an annual budget of $1.2 million. Many businesses, services, and divisions of The Office of Student Affairs are found within. The building also houses restaurants from national restaurant chains. Since the construction of its oldest buildings, the university\'s physical plant has grown substantially. Over the years 29 residence halls have been built to accommodate students and each has been constructed with its own chapel. Many academic building were added together with a system of libraries, the most prominent of which is the Theodore Hesburgh Library, built in 1963 and today containing almost 4 million books. Since 2004, several buildings have been added, including the DeBartolo Performing Arts Center, the Guglielmino Complex, and the Jordan Hall of Science. Additionally, a new residence for men, Duncan Hall, was begun on March 8, 2007, and began accepting residents for the Fall 2008 semester. Ryan Hall was completed and began housing undergraduate women in the fall of 2009. A new engineering building, Stinson-Remick Hall, a new combination Center for Social Concerns/Institute for Church Life building, Geddes Hall, and a law school addition have recently been completed as well. Additionally the new hockey arena opened in the fall of 2011. The Stayer Center for Executive Education, which houses the Mendoza College of Business Executive Education Department opened in March 2013 just South of the Mendoza College of Business building. Because of its long athletic tradition, the university features also many building dedicated to sport. The most famous is Notre Dame Stadium, home of the Fighting Irish football team; it has been renovated several times and today it can hold more than 80 thousand people. Prominent venues include also the Edmund P. Joyce Center, with indoor basketball and volleyball courts, and the Compton Family Ice Arena, a two-rink facility dedicated to hockey. 
Also, there are many outdoor fields, as the Frank Eck Stadium for baseball. The University of Notre Dame has made being a sustainability leader an integral part of its mission, creating the Office of Sustainability in 2008 to achieve a number of goals in the areas of power generation, design and construction, waste reduction, procurement, food services, transportation, and water.As of 2012[update] four building construction projects were pursuing LEED-Certified status and three were pursuing LEED Silver. Notre Dame\'s dining services sources 40% of its food locally and offers sustainably caught seafood as well as many organic, fair-trade, and vegan options. On the Sustainable Endowments Institute\'s College Sustainability Report Card 2010, University of Notre Dame received a "B" grade. The university also houses the Kroc Institute for International Peace Studies, the Kellogg Institute for International Studies, and the Notre Dame Environmental Change Initiative.\n\nA1: Not specified\nA2: Not specified\nA3: Science Hall came to be known as the LaFortune Center\nA4: Protestant people identified with Yale\nA5: Compton Family Ice Arena\nA6: The Basilica of the Sacred Heart\nA7: Luigi Gregori\nA8: The Grotto of Our Lady of Lourdes was constructed in 1896\nA9: Fr. 
Zahm\nA10: The student union building at Notre Dame got renamed to LaFortune Center in 1950\nA11: The LaFortune Center Notre Dame was named after Joseph LaFortune\nA12: The LaFortune Center at Notre Dame is 83,000 square feet\nA13: The annual budget of Notre Dame\'s LaFortune Center is $1.2 million\nA14: The Hesburgh Library was built at Notre Dame in 1963\nA15: Notre Dame Stadium\nA16: The Office of Sustainability was created at Notre Dame in 2008\nA17: 40% of the food served at Notre Dame is locally grown\nA18: Notre Dame got a "B" for its sustainability practices from the Sustainable Endowments Institute.', 'Answer these questions as precisely as you can, in as minimum words as you can\nFormat your answers with \'A1:\', \'A2:\', etc. at the beginning of each answer\nIf you cannot find enough information in the provided context to answer a question, respond with \'Not specified\'. Do NOT guess.\nQ1: How many colleges for undergraduates are at Notre Dame?\nQ2: How many stories tall is the main library at Notre Dame?\nQ3: What is the name of the main library at Notre Dame?\nQ4: How many teams participate in the Notre Dame Bookstore Basketball tournament?\nQ5: For what cause is money raised at the Bengal Bouts tournament at Notre Dame?\nQ6: What percentage of undergrads live on the Notre Dame campus?\nQ7: How many dorms for males are on the Notre Dame campus?\nQ8: What amount of the graduate student body at Notre Dame live on the campus?\nQ9: There are how many dorms for females at Notre Dame?\nQ10: What is Congregation of Holy Cross in Latin?\nQ11: How often is Catholic mass held at Notre Dame in a week?\nQ12: How many chapels are on the Notre Dame campus?\nQ13: What amount of the student body of Notre Dame identifies as Catholic?\nQ14: In what year was the Main Building at Notre Dame razed in a fire?\nQ15: Who was the president of Notre Dame in 1879?\nQ16: On what date was the rebuilding of The Main Building begun at Notre Dame after the fire that claimed the 
previous?\nQ17: What is O\'Shaughnessy Hall of Notre Dame formerly known as?\nQ18: How many faculty members were at Notre Dame when Hesburgh left the role of president?\nQ19: In what year did Notre Dame have its earliest undergraduate that was female?\nQ20: How many halls are at Notre Dame that house students?\nQ21: How many books are housed at the Theodore Hesburgh Library?\nQ22: Construction for which hall started on March 8th 2007 at Notre Dame?\n\nfrom this context:\nDame student body consisted of 12,179 students, with 8,448 undergraduates, 2,138 graduate and professional and 1,593 professional (Law, M.Div., Business, M.Ed.) students. Around 21–24% of students are children of alumni, and although 37% of students come from the Midwestern United States, the student body represents all 50 states and 100 countries. As of March 2007[update] The Princeton Review ranked the school as the fifth highest \'dream school\' for parents to send their children. As of March 2015[update] The Princeton Review ranked Notre Dame as the ninth highest. The school has been previously criticized for its lack of diversity, and The Princeton Review ranks the university highly among schools at which "Alternative Lifestyles [are] Not an Alternative." It has also been commended by some diversity oriented publications; Hispanic Magazine in 2004 ranked the university ninth on its list of the top–25 colleges for Latinos, and The Journal of Blacks in Higher Education recognized the university in 2006 for raising enrollment of African-American students. With 6,000 participants, the university\'s intramural sports program was named in 2004 by Sports Illustrated as the best program in the country, while in 2007 The Princeton Review named it as the top school where "Everyone Plays Intramural Sports." 
The annual Bookstore Basketball tournament is the largest outdoor five-on-five tournament in the world with over 700 teams participating each year, while the Notre Dame Men\'s Boxing Club hosts the annual Bengal Bouts tournament that raises money for the Holy Cross Missions in Bangladesh. About 80% of undergraduates and 20% of graduate students live on campus. The majority of the graduate students on campus live in one of four graduate housing complexes on campus, while all on-campus undergraduates live in one of the 29 residence halls. Because of the religious affiliation of the university, all residence halls are single-sex, with 15 male dorms and 14 female dorms. The university maintains a visiting policy (known as parietal hours) for those students who live in dormitories, specifying times when members of the opposite sex are allowed to visit other students\' dorm rooms; however, all residence halls have 24-hour social spaces for students regardless of gender. Many residence halls have at least one nun and/or priest as a resident. There are no traditional social fraternities or sororities at the university, but a majority of students live in the same residence hall for all four years. Some intramural sports are based on residence hall teams, where the university offers the only non-military academy program of full-contact intramural American football. At the end of the intramural season, the championship game is played on the field in Notre Dame Stadium. The university is affiliated with the Congregation of Holy Cross (Latin: Congregatio a Sancta Cruce, abbreviated postnominals: "CSC"). While religious affiliation is not a criterion for admission, more than 93% of students identify as Christian, with over 80% of the total being Catholic. Collectively, Catholic Mass is celebrated over 100 times per week on campus, and a large campus ministry program provides for the faith needs of the community. 
There are multitudes of religious statues and artwork around campus, most prominent of which are the statue of Mary on the Main Building, the Notre Dame Grotto, and the Word of Life mural on Hesburgh Library depicting Christ as a teacher. Additionally, every classroom displays a crucifix. There are many religious clubs (catholic and non-Catholic) at the school, including Council #1477 of the Knights of Columbus (KOC), Baptist Collegiate Ministry (BCM), Jewish Club, Muslim Student Association, Orthodox Christian Fellowship, The Mormon Club, and many more. The Notre Dame KofC are known for being the first collegiate council of KofC, operating a charitable concession stand during every home football game and owning their own building on campus which can be used as a cigar lounge. Fifty-seven chapels are located throughout the campus. This Main Building, and the library collection, was entirely destroyed by a fire in April 1879, and the school closed immediately and students were sent home. The university founder, Fr. Sorin and the president at the time, the Rev. William Corby, immediately planned for the rebuilding of the structure that had housed virtually the entire University. Construction was started on the 17th of May and the Main Building was rebuilt in a Gothic Revival style. The Main Building was razed again in 1887 due to a fire, but was rebuilt once more, and was finally torn down in 1931 to make way for the current Main Building. The Main Building was originally known as the Academic Building, but was renamed in 1923 to honor the university\'s first president, Fr. Edward Sorin. The Main Building is the administrative center of the university, housing the offices of the president, the provost, and the dean of students. The Main Building is also home to the Notre Dame Bookstore, the Snack Bar, and the Notre Dame Federal Credit Union. 
O\'Shaughnessy Hall, originally known as the Administration Building, was built in 1931 and was the first building on campus to be built in the modern style. The building was named after Rev. John O\'Shaughnessy, who was the president of the university from 1905 to 1912. The building was originally used as the administrative center of the university, but is now used as a classroom building. The building was renovated in 1999 and reopened in 2000. The university had 1,100 faculty members when Fr. Theodore Hesburgh became president in 1952.\n\nA1: Not specified\nA2: Not specified\nA3: Main Library\nA4: 700\nA5: For the Holy Cross Missions in Bangladesh\nA6: 80%\nA7: 15\nA8: Not specified\nA9: 14\nA10: Congregatio a Sancta Cruce\nA11: Over 100 times\nA12: 57\nA13: Over 93%\nA14: 1879\nA15: The Rev. William Corby\nA16: May 17th\nA17: Administration Building\nA18: Not specified\nA19: Not specified\nA20: 29\nA21: Not specified\nA22: Jenkins Nanovic Halls\nA23: Not specified', 'Answer these questions as precisely as you can, in as minimum words as you can\nFormat your answers with \'A1:\', \'A2:\', etc. at the beginning of each answer\nIf you cannot find enough information in the provided context to answer a question, respond with \'Not specified\'. Do NOT guess.\nQ1: The granting of Doctorate degrees first occurred in what year at Notre Dame?\nQ2: Which program at Notre Dame offers a Master of Education degree?\nQ3: What institute at Notre Dame studies  the reasons for violent conflict?\nQ4: What is the title of Notre Dame\'s Theodore Hesburgh?\nQ5: In what year was the Joan B. Kroc Institute for International Peace Studies founded?\nQ6: What company did Ray Kroc own?\nQ7: In what year was the Theodore M. Hesburgh Library at Notre Dame finished?\nQ8: Which artist created the mural on the Theodore M. 
Hesburgh Library?\nQ9: What is a common name to reference the mural created by Millard Sheets at Notre Dame?\nQ10: How many incoming students did Notre Dame admit in fall 2015?\nQ11: What percentage of students were admitted to Notre Dame in fall 2015?\nQ12: Where does Notre Dame rank in terms of academic profile among research universities in the US?\nQ13: What percentage of students at Notre Dame participated in the Early Action program?\nQ14: How many miles does the average student at Notre Dame travel to study there?\nQ15: Where did U.S. News & World Report rank Notre Dame in its 2015-2016 university rankings?\nQ16: The undergrad school at the Mendoza College of Business was ranked where according to BusinessWeek?\nQ17: In 2014 what entity named Notre Dame 10th best of all American universities?\nQ18: What percentage of Notre Dame students decide to study abroad?\nQ19: When did study of a germ-free-life begin at Notre Dame?\nQ20: What does the Kroc Institute at Notre Dame focus on?\nQ21: What percentage of Notre Dame students feel they are Christian?\nQ22: What type of education was pushed at Notre Dame before its embracing of national standards?\nQ23: Which president did Notre Dame have in 1947?\nQ24: What was the lifespan of Theodore Hesburgh?\nQ25: During what years was Theodor Hesburgh president of Notre Dame?\n\nfrom this context:\nEngineering in its early stages of growth, before a formal graduate school education was developed with a thesis not required to receive the degrees. This changed in 1924 with formal requirements developed for graduate degrees, including offering Doctorate (PhD) degrees. Today each of the five colleges offer graduate education. Most of the departments from the College of Arts and Letters offer PhD programs, while a professional Master of Divinity (M.Div.) program also exists. All of the departments in the College of Science offer PhD programs, except for the Department of Pre-Professional Studies. 
The School of Architecture offers a Master of Architecture, while each of the departments of the College of Engineering offer PhD programs. The College of Business offers multiple professional programs including MBA and Master of Science in Accountancy programs. It also operates facilities in Chicago and Cincinnati for its executive MBA program. Additionally, the Alliance for Catholic Education program offers a Master of Education program where students study at the university during the summer and teach in Catholic elementary schools, middle schools, and high schools across the Southern United States for two school years. The Joan B. Kroc Institute for International Peace Studies at the University of Notre Dame is dedicated to research, education and outreach on the causes of violent conflict and the conditions for sustainable peace. It offers PhD, Master\'s, and undergraduate degrees in peace studies. It was founded in 1986 through the donations of Joan B. Kroc, the widow of McDonald\'s owner Ray Kroc. The institute was inspired by the vision of the Rev. Theodore M. Hesburgh CSC, President Emeritus of the University of Notre Dame. The institute has contributed to international policy discussions about peace building practices. The library system of the university is divided between the main library and each of the colleges and schools. The main building is the 14-story Theodore M. Hesburgh Library, completed in 1963, which is the third building to house the main collection of books. The front of the library is adorned with the Word of Life mural designed by artist Millard Sheets. This mural is popularly known as "Touchdown Jesus" because of its proximity to Notre Dame Stadium and Jesus\' arms appearing to make the signal for a touchdown. Notre Dame is known for its competitive admissions, with the incoming class enrolling in fall 2015 admitting 3,577 from a pool of 18,156 (19.7%). 
The academic profile of the enrolled class continues to rate among the top 10 to 15 in the nation for national research universities. The university practices a non-restrictive early action policy that allows admitted students to consider admission to Notre Dame as well as any other colleges to which they were accepted. 1,400 of the 3,577 (39.1%) were admitted under the early action plan. Admitted students came from 1,311 high schools and the average student traveled more than 750 miles to Notre Dame, making it arguably the most representative university in the United States. While all entering students begin in the College of the First Year of Studies, 25% have indicated they plan to study in the liberal arts or social sciences, 24% in engineering, 24% in business, 24% in science, and 3% in architecture. In 2015-2016, Notre Dame ranked 18th overall among "national universities" in the United States in U.S. News & World Report\'s Best Colleges 2016. In 2014, USA Today ranked Notre Dame 10th overall for American universities based on data from College Factual. Forbes.com\'s America\'s Best Colleges ranks Notre Dame 13th among colleges in the United States in 2015, 8th among Research Universities, and 1st in the Midwest. U.S. News & World Report also lists Notre Dame Law School as 22nd overall. BusinessWeek ranks Mendoza College of Business undergraduate school as 1st overall. It ranks the MBA program as 20th overall. The Philosophical Gourmet Report ranks Notre Dame\'s graduate philosophy program as 15th nationally, while ARCHITECT Magazine ranked the undergraduate architecture program as 12th nationally. Additionally, the study abroad program ranks sixth in highest participation percentage in the nation, with 57.6% of students choosing to study abroad in 17 countries. According to payscale.com, undergraduate alumni of University of Notre Dame have a mid-career median salary $110,000, making it the 24th highest in the nation. 
The university has a 93% graduation rate, the highest among private universities in the United States.\n\nA1: The granting of Doctorate degrees first occurred in 1924 at Notre Dame.\nA2: The Alliance for Catholic Education program offers a Master of Education degree.\nA3: The Joan B. Kroc Institute for International Peace Studies studies the reasons for violent conflict.\nA4: Theodore Hesburgh is the title of Notre Dame\'s Theodore M. Hesburgh.\nA5: The Joan B. Kroc Institute for International Peace Studies was founded in 1986.\nA6: Ray Kroc owned McDonald\'s.\nA7: The Theodore M. Hesburgh Library at Notre Dame was finished in 1963.\nA8: Millard Sheets created the mural on the Theodore M. Hesburgh Library.\nA9: The common name for the mural created by Millard Sheets at Notre Dame is "Touchdown Jesus".\nA10: Notre Dame admitted 3,577 incoming students in fall 2015.\nA11: 19.7% of students were admitted to Notre Dame in fall 2015.\nA12: Notre Dame ranks 10 to 15 in terms of academic profile among research universities in the US.\nA13: 39.1% of students at Notre Dame participated in the Early Action program.\nA14: The average student at Notre Dame travels more than 750 miles to study there.\nA15: U.S. 
News & World Report ranked Notre Dame 18th in its 2015-2016 university rankings.\nA16: The undergrad school at the Mendoza College of Business was ranked 1st overall by BusinessWeek.\nA17: In 2014, USA Today named Notre Dame 10th best of all American universities.\nA18: 57.6% of Notre Dame students decide to study abroad.\nA19: The study of a germ-free-life began at Notre Dame in Not specified.\nA20: The Kroc Institute at Notre Dame focuses on peace studies.\nA21: 81% of Notre Dame students feel they are Christian.\nA22: Before its embracing of national standards, Notre Dame pushed a classical education.\nA23: The president of Notre Dame in 1947 was John O\'Hara.\nA24: Theodore Hesburgh lived from 1917 to 2015.\nA25: Theodore Hesburgh was president of Notre Dame from 1952 to 1987.']

# new variables for storing questions, chunks and gold answers respectively.
        self.current_context_chunks = []
        self.current_questions = []
        self.current_gold_answers = []
        # this is for chunk assignment to each question
        self.chunk_assignments = defaultdict(list)
        # for parsing and storing question answer pairs from chunk prompts:
        self.question_answer_map = {}

    def generate_response(self, prompt):
        """Send one prompt to the configured LLM and hand back its reply.

        Meant to be called once per chunk prompt, so the chunk responses can
        be collected one after another before any further processing.

        :param prompt: The prompt text passed to ``self.llm``.
        :return: Whatever ``self.llm`` returns for that prompt.
        """
        llm_reply = self.llm(prompt)
        return llm_reply

    def get_context_questions(self, k=2, q=4):
        """Pick the first ``k`` contexts and up to ``q`` QA pairs for each one.

        Same selection logic as the context overload notebook.

        :param k: Number of contexts to take from the front of ``self.context_map``.
        :param q: Maximum number of (question, answer) pairs kept per context.
        :return: ``(contexts, questions_answers)`` where ``questions_answers[i]``
                 holds the pairs kept for ``contexts[i]``.
        """
        leading_entries = list(self.context_map.items())[:k]
        contexts = [context for context, _ in leading_entries]
        questions_answers = [qa_pairs[:q] for _, qa_pairs in leading_entries]
        return contexts, questions_answers

    def generate_Dataset_prompt(self, k=2, q=4):
        ''' Build a multi-question prompt from the dataset to imitate user input.

        Later on, questions and context are expected to come from the user instead.

        Takes the contexts and QA pairs chosen by ``get_context_questions(k, q)``,
        numbers the questions Q1, Q2, ... across all contexts, and appends the
        contexts (joined by blank lines) after a "from this context:" marker.

        Side effects: overwrites ``self.current_questions``,
        ``self.current_gold_answers`` and ``self.current_prompt``.
        Returns nothing.
        '''
        contexts, questions_answers_by_context = self.get_context_questions(k, q)
        combined_context = "\n\n".join(contexts)

        # Flatten the per-context QA lists into one ordered sequence.
        flat_pairs = [pair for qa_pairs in questions_answers_by_context for pair in qa_pairs]
        question_texts = [question for question, _ in flat_pairs]
        gold_texts = [answer for _, answer in flat_pairs]
        self.current_questions = question_texts
        self.current_gold_answers = gold_texts

        # modified the prompt
        instructions = [
            "Answer these questions as precisely as you can, in as minimum words as you can",
            "Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer",
            "If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.",
        ]
        numbered_questions = [
            f"Q{number}: {question}"
            for number, question in enumerate(question_texts, start=1)
        ]
        context_section = ["\nfrom this context:", combined_context]
        self.current_prompt = "\n".join(instructions + numbered_questions + context_section)


    def extract_keywords(self, questions: List[str]) -> List[List[str]]:
        """
        Turn each question into its list of content words.

        Punctuation is removed, the text is lowercased and split on whitespace,
        and then stopwords and words of two characters or fewer are dropped.
        The result has one keyword list per input question, in the same order.
        """
        ignored_words = {
            "the", "is", "at", "which", "on", "in", "a", "an", "of", "to", "for", "with",
            "and", "by", "from", "what", "who", "when", "where", "how", "does", "do",
            "did", "was", "were", "has", "have", "had",
        }

        def keywords_of(text):
            # Strip punctuation first so "Dame?" and "Dame" yield the same token.
            tokens = re.sub(r'[^\w\s]', '', text).lower().split()
            return [token for token in tokens if len(token) > 2 and token not in ignored_words]

        return [keywords_of(question) for question in questions]

    def map_questions_to_chunks(self) -> List[Tuple[str, List[str]]]:
        """
        Assign every current question to the context chunk it most likely belongs to.

        For each question, the keywords from ``extract_keywords`` are counted
        against every lowercased chunk; a keyword counts when it occurs anywhere
        in the chunk text (substring match). The question goes to the chunk with
        the highest count, the earliest chunk winning ties. A question whose
        keywords match no chunk at all is left unassigned.

        Side effect: rebuilds ``self.chunk_assignments`` as
        ``{chunk_index: [questions...]}``.

        :return: A list of ``(context_chunk, assigned_questions)`` pairs, one per
                 chunk that received at least one question, in the same order as
                 ``self.chunk_assignments``.
        :raises ValueError: If there are no context chunks or no questions yet.
        """
        if not self.current_context_chunks:
            raise ValueError("No context chunks available. Please run chunk_current_prompt first.")
        if not self.current_questions:
            raise ValueError("No questions available. Please populate current_questions first.")

        keywords_per_question = self.extract_keywords(self.current_questions)
        lowered_chunks = [chunk.lower() for chunk in self.current_context_chunks]
        self.chunk_assignments = defaultdict(list)

        for question, keywords in zip(self.current_questions, keywords_per_question):
            match_counts = [
                sum(1 for keyword in keywords if keyword in chunk)
                for chunk in lowered_chunks
            ]
            # max() returns the first maximal index, so ties go to the earliest chunk.
            best_chunk_idx = max(range(len(match_counts)), key=match_counts.__getitem__)
            if match_counts[best_chunk_idx] > 0:
                self.chunk_assignments[best_chunk_idx].append(question)

        # Previously nothing was returned although the signature and docstring
        # promised the (chunk, questions) pairs.
        return [
            (self.current_context_chunks[chunk_idx], list(questions))
            for chunk_idx, questions in self.chunk_assignments.items()
        ]

    def chunk_current_prompt(self, max_words_per_chunk=800):
        """
        Cut the context of ``self.current_prompt`` into word-limited chunks.

        The prompt is split at the "from this context:" marker into the
        instruction/question block and the context. The context is split on
        whitespace and regrouped into chunks of at most ``max_words_per_chunk``
        words, each joined with single spaces. Every chunk is stored in
        ``self.current_context_chunks`` and, prefixed with the full question
        block, in ``self.chunked_prompts``.

        :raises ValueError: If ``current_prompt`` is unset, or if it does not
                            contain the context marker exactly once.
        """
        if getattr(self, 'current_prompt', None) is None:
            raise ValueError("current_prompt not set. Please run generate_Dataset_prompt first.")

        pieces = self.current_prompt.split("\nfrom this context:\n")
        if len(pieces) != 2:
            raise ValueError("Prompt is not formatted correctly.")
        # First piece: instructions + questions; second piece: full context text.
        question_block, context_block = pieces

        words = context_block.split()
        chunk_starts = range(0, len(words), max_words_per_chunk)

        self.chunked_prompts = []
        self.current_context_chunks = []
        for start in chunk_starts:
            chunk_text = " ".join(words[start:start + max_words_per_chunk])
            self.current_context_chunks.append(chunk_text)
            self.chunked_prompts.append(f"{question_block}\n\nfrom this context:\n{chunk_text}")


    def construct_individual_prompts_from_assignments(self):
        """
        Build one prompt per chunk that has questions assigned to it.

        For every ``(chunk_index, questions)`` entry in ``self.chunk_assignments``
        the questions are renumbered Q1, Q2, ... and combined with the answering
        instructions and the text of that chunk. The prompts are stored in
        ``self.individual_prompts``; ``self.chunked_prompts`` is pointed at the
        same list.

        :raises ValueError: If ``chunk_assignments`` or ``current_context_chunks``
                            is missing or empty.
        """
        if not getattr(self, 'chunk_assignments', None):
            raise ValueError("chunk_assignments not set. Please run map_questions_to_chunks first.")

        if not getattr(self, 'current_context_chunks', None):
            raise ValueError("No context chunks available.")

        instructions = (
            "Answer these questions as precisely as you can, in as minimum words as you can\n"
            "Format your answers with 'A1:', 'A2:', etc. at the beginning of each answer\n"
            "If you cannot find enough information in the provided context to answer a question, respond with 'Not specified'. Do NOT guess.\n"
        )

        prompts = []
        self.individual_prompts = prompts
        for chunk_index, assigned in self.chunk_assignments.items():
            context_text = self.current_context_chunks[chunk_index]
            # Numbering restarts at Q1 inside every chunk prompt.
            numbered = "\n".join(
                f"Q{number}: {text}" for number, text in enumerate(assigned, start=1)
            )
            prompts.append(f"{instructions}{numbered}\n\nfrom this context:\n{context_text}")
        self.chunked_prompts = self.individual_prompts


    # def run_all_chunks(self, verbose=True):
    #     """
    #     Calls the LLM for each chunked prompt and prints the responses.

    #     Args:
    #         verbose (bool): If True, prints each response.

    #     Returns:
    #         List of responses from the model for each chunked prompt.
    #     """
    #     if not self.chunked_prompts:
    #         raise ValueError("chunked_prompts not set. Please run chunk_current_prompt first.")
    #     self.chunk_responses = []
    #     for idx, prompt in enumerate(self.chunked_prompts, 1):
    #         response = self.generate_response(prompt)
    #         self.chunk_responses.append(response)
    #         if verbose:
    #             print(f"\n--- Response for Chunk {idx} ---\n")
    #             print(response)
    #             print("\n-------------------------------\n")
    #     print("LOGGING chunk responses:")
    #     print(self.chunk_responses)

    def get_gold_answer_for_question(self, question):
        """
        Look up the gold answer stored for a question.

        The question is located in ``self.current_questions`` and the entry at
        the same position of ``self.current_gold_answers`` is returned.

        :param question: The question string to look up.
        :return: The corresponding gold answer, or the string
                 "Question not found" when the question is not in
                 ``self.current_questions`` (no exception is raised for that).
        """
        if question not in self.current_questions:
            return "Question not found"
        position = self.current_questions.index(question)
        return self.current_gold_answers[position]

    def extract_questions_and_answers(self, verbose=True):
        """
        Parse every chunk response into question -> (LLM answer, gold answer).

        Each response in ``self.chunk_responses`` is expected to echo the prompt
        (with its "Q<n>: ..." lines) followed by "A<n>: ..." lines. Everything
        before the first "Q1:" is ignored. Questions and answers are paired by
        their number, so a skipped or blank answer does not shift the answers
        that follow; a question without a matching answer gets an empty string.
        Responses without a "Q1:" marker are skipped.

        :param verbose: When True, print the parsed questions/answers and
                        warnings about skipped chunks or count mismatches.
        :return: ``self.question_answer_map``, mapping question text to
                 ``(stripped_llm_answer, gold_answer)``. If the same question
                 text occurs in several responses, the last one wins.
        """
        # Capture only up to the end of the line. [ \t]* is used instead of \s*
        # because \s* crosses newlines, which made a blank "A3:" line swallow
        # the following line.
        question_pattern = re.compile(r'\bQ(\d+):[ \t]*([^\n]*)')
        answer_pattern = re.compile(r'\bA(\d+):[ \t]*([^\n]*)')

        self.question_answer_map = {}
        for chunk_idx, response in enumerate(self.chunk_responses):
            question_block_start = re.search(r'\bQ1:', response)
            if not question_block_start:
                if verbose:
                    print(f"[Warning] No 'Q1:' found in chunk {chunk_idx}, skipping")
                continue
            # Drop the instruction lines so only the text from Q1 onwards is parsed.
            cleaned_response = response[question_block_start.start():]

            # Key by number; the first occurrence of a number wins.
            questions_by_number = {}
            for number, text in question_pattern.findall(cleaned_response):
                questions_by_number.setdefault(int(number), text)
            answers_by_number = {}
            for number, text in answer_pattern.findall(cleaned_response):
                answers_by_number.setdefault(int(number), text)

            if verbose:
                print("answers: ", list(answers_by_number.values()))
                print("questions: ", list(questions_by_number.values()))
                if len(answers_by_number) != len(questions_by_number):
                    print(f"[Warning] Mismatch: {len(questions_by_number)} questions vs {len(answers_by_number)} answers in chunk {chunk_idx}")

            for number, question in questions_by_number.items():
                # A missing answer becomes "" instead of raising IndexError.
                llm_answer = answers_by_number.get(number, "")
                gold_answer = self.get_gold_answer_for_question(question)
                self.question_answer_map[question] = (llm_answer.strip(), gold_answer)

        return self.question_answer_map


    def print_current_prompt(self):
        ''' Print the generated or inputted prompt, followed by its size.

        The size is reported as the character count and the number of
        whitespace-separated words (used as an approximate token count).
        '''
        print("=======================================================================PRINTING PROMPT======================================================================")
        print(f"\n🧠 Current Prompt:")
        prompt_text = self.current_prompt
        print(f"{prompt_text}\n")
        size_in_chars = len(prompt_text)
        size_in_words = len(prompt_text.split())
        print("=======================================================================PROMPT COUNTS======================================================================")
        print(f"Length: {size_in_chars} characters, approximately {size_in_words} tokens")
        print("========================================================================END PRINTING======================================================================")


    def print_chunks(self):
        ''' Print every prompt stored in self.chunked_prompts with its size.

        Prints a hint and returns early when no chunked prompts exist yet.
        The size is the character count plus the number of whitespace-separated
        words (used as an approximate token count).
        '''
        stored_prompts = getattr(self, 'chunked_prompts', None)
        if not stored_prompts:
            print("No chunked prompts found. Please run chunk_current_prompt() first.")
            return
        print("=======================================================================PRINTING CHUNKED PROMPTS======================================================================")
        for number, prompt_text in enumerate(stored_prompts, start=1):
            size_in_chars = len(prompt_text)
            size_in_words = len(prompt_text.split())
            print(f"\n🧠 Prompt Chunk {number}:")
            print(prompt_text)
            print("-----------------------------------------")
            print(f"Length: {size_in_chars} characters, approximately {size_in_words} tokens")
            print("============================================================================================================================================================")
        print("=======================================================================END OF CHUNKS======================================================================")

    def print_parsed_qa_pairs(self):
        ''' Print every entry of self.question_answer_map as a numbered list.

        Prints a hint and returns early when the map is missing or empty.
        Each stored value is printed as-is after "A:".
        '''
        parsed_pairs = getattr(self, 'question_answer_map', None)
        if not parsed_pairs:
            print("No question-answer pairs found. Please run extract_questions_and_answers() first.")
            return

        print("Parsed Question-Answer Pairs:\n")
        for position, entry in enumerate(parsed_pairs.items(), start=1):
            question, answer = entry
            print(f"{position}. Q: {question}\n   A: {answer}\n")

    def print_qa_pair_comparison(self):
        ''' Print each parsed question with the LLM answer next to the gold answer.

        Entries of self.question_answer_map are listed in the order their
        questions appear in self.current_questions; questions not found there
        are listed last. Prints a hint and returns early when the parsed map
        or the gold answers are missing or empty.
        '''
        if not getattr(self, 'question_answer_map', None):
            print("No question-answer pairs found. Please run extract_questions_and_answers() first.")
            return

        if not getattr(self, 'current_gold_answers', None):
            print("No gold answers found. Please set self.current_gold_answers before comparing.")
            return

        print("Question | Predicted Answer | Gold Answer")
        print("-" * 80)

        def prompt_position(item):
            # Unknown questions sort after every known one.
            question_text = item[0]
            if question_text in self.current_questions:
                return self.current_questions.index(question_text)
            return float('inf')

        ordered_items = sorted(self.question_answer_map.items(), key=prompt_position)

        for number, (question, answer_pair) in enumerate(ordered_items, start=1):
            predicted_answer, gold_answer = answer_pair
            print(f"Q{number}: {question}")
            print(f"LLM Answer: {predicted_answer}")
            print(f"Ground Truth: {gold_answer}")
            print()

    def print_qa_pairs(self):
        ''' Print every current question together with its gold answer.

        Prints a hint and returns early when no gold answers are stored.
        A question without a gold answer at its position is shown with "N/A".
        '''
        gold_answers = getattr(self, 'current_gold_answers', None)
        if not gold_answers:
            print("No gold answers found. Please set self.current_gold_answers before comparing.")
            return

        questions = list(self.current_questions)

        print("Question | Predicted Answer | Gold Answer")
        print("-" * 80)

        for number, question in enumerate(questions, start=1):
            position = number - 1
            if position < len(gold_answers):
                gold = gold_answers[position]
            else:
                gold = "N/A"

            print(f"Q{number}: {question}")
            print(f"  ✅ Gold     : {gold}")
            print()







In [56]:
#playground:
# End-to-end dry run of the chunk-and-assign pipeline; the call order matters,
# since each step reads attributes set by the previous one.
agent = Agent2(context_map)
# Build one combined prompt from the first 32 contexts, at most 5 questions each.
agent.generate_Dataset_prompt(k=32, q=5)
# agent.print_current_prompt()
# Re-split the combined context into chunks of at most 700 words.
agent.chunk_current_prompt(max_words_per_chunk=700)
# Route each question to the chunk sharing the most keywords with it.
agent.map_questions_to_chunks()
# Build one prompt per chunk that received questions (replaces agent.chunked_prompts).
agent.construct_individual_prompts_from_assignments()
# agent.print_chunks()
# Parses agent.chunk_responses. NOTE(review): nothing in this cell fills that
# attribute (run_all_chunks is commented out), so it is presumably set
# elsewhere, e.g. in __init__ — verify before relying on the results.
agent.extract_questions_and_answers()
# print("self.questions: ", agent.current_questions)
# print("self.answerrs: ", agent.current_gold_answers)
# Print LLM answers next to gold answers, ordered as in agent.current_questions.
agent.print_qa_pair_comparison()


answers:  ['The Virgin Mary allegedly appeared to Bernadette Soubirous in 1858 in Lourdes, France.', 'In front of the Notre Dame Main Building is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".', 'The Basilica of the Sacred heart at Notre Dame is beside the Main Building.', 'The Grotto at Notre Dame is a Marian place of prayer and reflection, a replica of the grotto at Lourdes, France.', 'A golden statue of the Virgin Mary sits on top of the Main Building at Notre Dame.', 'The Scholastic Magazine of Notre Dame began publishing in September 1876.', 'The Juggler is published twice a year.', 'The daily student paper at Notre Dame is called The Observer.', 'There are 9 student-run news outlets at Notre Dame.', 'The student paper Common Sense began publication at Notre Dame in 1987.', 'The headquarters of the Congregation of the Holy Cross is in Rome.', 'The primary seminary of the Congregation of the Holy Cross is Moreau Seminary.', 'The oldest structure 