In [None]:
#derived from Nov8_RFPWorkflow_pypdf2_llamaindex.ipynb
pip install PyPDF2 llama-index-vector-stores-chroma nest-asyncio



In [None]:
!pip install llama-index-llms-openai



In [None]:
pip install openai



In [None]:
import nest_asyncio
import PyPDF2
from pathlib import Path
import pickle
from typing import List, Optional
from pydantic import BaseModel
import logging
import json
import os
import asyncio
import aiohttp
import urllib.parse
from llama_index.core.workflow import (
    Event,
    StartEvent,
    StopEvent,
    Context,
    Workflow,
    step,
)
from llama_index.core import Document, VectorStoreIndex, SummaryIndex
from llama_index.core.schema import NodeWithScore
from llama_index.core.tools import FunctionTool
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.openai import OpenAI
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

# Patch asyncio at import time. NOTE(review): presumably needed so that the
# asyncio.run(...) call at the bottom of this notebook works inside Jupyter's
# already-running event loop — confirm before removing.
nest_asyncio.apply()

# Set up logging
# Module-level logger used by PDFParser and RFPWorkflow for error reporting.
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.INFO)

# Default system prompt for the per-question research agent built in
# RFPWorkflow.handle_question. It contains no template placeholders and is
# passed verbatim as `system_prompt`.
AGENT_SYSTEM_PROMPT = """\
You are a senior research agent tasked with filling out a specific form key/question with the appropriate value, given a bank of context.
You must approach each question with the following priorities:
1. Competitive Differentiation: Always highlight unique strengths, market position, and key differentiators
2. Quantifiable Evidence: Use specific metrics, case studies, and achievements to support all claims
3. Value Proposition: Emphasize ROI, cost savings, and tangible benefits
4. Recent Experience: Prioritize recent and relevant experience, especially in the target industry

For each question:
1. Think step-by-step and use the existing tools to find relevant information
2. MUST use at least one tool to answer each question
3. When using multiple sources, synthesize information to present the strongest possible response
4. Always include:
   - Specific metrics and achievements
   - Relevant case studies or examples
   - Competitive advantages
   - Value proposition elements
5. Only after exhausting all tools should you reason from first principles
6. When forced to reason without direct evidence, maintain alignment with known company strengths and market position
7. Never say 'I don't know' - provide a strategic response based on available information

Remember: Each answer contributes to the overall win probability. Make every response compelling and evidence-based.
"""

# Default prompt used by RFPWorkflow.extract_questions to turn the RFP template
# into a list of questions. Placeholders filled at call time:
#   {file_metadata} - name/description of every retrieval tool
#   {rfp_text}      - full text of the parsed RFP template
# NOTE(review): the prompt hard-codes company-specific facts ($1.2B revenue,
# WaveAI Pro, EcoSensor, TechInnovate, 20% carbon reduction); it must be edited
# before reuse with a different knowledge base.
EXTRACT_KEYS_PROMPT = """\
You are a strategic bid manager analyzing an RFP document to generate a winning response that aligns with company strengths and RFP requirements.

Your task is to extract a comprehensive list of "questions" that will:
1. Address all RFP requirements
2. Highlight company differentiators
3. Demonstrate clear competitive advantages
4. Showcase relevant experience and capabilities

When extracting questions, you MUST:
1. Create questions that specifically target information available in the knowledge base
2. Ensure questions will elicit responses that highlight:
   - Market position and financial strength ($1.2B revenue, growth metrics)
   - Technical capabilities and proprietary solutions (WaveAI Pro, EcoSensor)
   - Strategic partnerships and ecosystem relationships
   - Sustainability achievements (20% carbon reduction, etc.)
   - Industry-specific experience and success metrics
   - ROI and value proposition elements
3. Make questions specific and targeted. Instead of "Describe security measures", use "Detail how WaveAI Pro's security architecture meets HIPAA compliance requirements while leveraging our partnership with TechInnovate"
4. Ensure questions will generate responses that:
   - Include quantifiable metrics and achievements
   - Highlight competitive advantages
   - Demonstrate clear value proposition
   - Showcase relevant experience
   - Address specific RFP requirements

Additional Requirements:
- Questions must be comprehensive and cover all RFP sections
- Each question must provide sufficient context for downstream processing
- Questions should encourage integration of multiple knowledge base sources
- Ensure questions will generate responses that maintain consistent win themes

Knowledge Base Files:
{file_metadata}
RFP Full Template:
{rfp_text}
"""

# Default prompt used by RFPWorkflow.generate_output to write the final
# markdown response. Placeholders filled at call time:
#   {output_template} - text of the parsed RFP template (metadata excluded)
#   {answers}         - newline-joined JSON question/answer records
# NOTE(review): like EXTRACT_KEYS_PROMPT, this text hard-codes company-specific
# facts and must be adapted for any other knowledge base.
GENERATE_OUTPUT_PROMPT = """\
You are a senior bid strategist with extensive experience in winning complex technical proposals.
Your task is to generate a compelling RFP response that achieves a minimum 75% win probability.

Input Materials:
<rfp_document>
{output_template}
</rfp_document>
<question_answer_pairs>
{answers}
</question_answer_pairs>

Response Requirements:
1. Market Leadership
   - Weave $1.2B revenue and market position throughout
   - Highlight global presence and growth trajectory
   - Emphasize industry rankings and recognition

2. Technical Excellence
   - Feature WaveAI Pro platform capabilities
   - Showcase EcoSensor technology
   - Detail proprietary algorithms and innovations
   - Emphasize edge computing and integration expertise

3. Sustainability Leadership
   - Highlight 20% carbon reduction achievement
   - Detail energy efficiency metrics
   - Showcase environmental certifications
   - Quantify sustainability benefits

4. Value Proposition
   - Include detailed ROI analysis
   - Quantify operational benefits
   - Highlight cost savings opportunities
   - Present clear total cost of ownership

5. Partnerships & Experience
   - Feature strategic partnerships (TechInnovate, GreenTech)
   - Highlight research collaborations
   - Showcase relevant case studies
   - Include success metrics

For sections without direct answers:
1. Generate responses that align with known company strengths
2. Maintain consistency with established win themes
3. Include specific metrics and achievements from similar projects
4. Ensure alignment with overall value proposition

Output Format:
- Generate in markdown format
- Follow template structure precisely
- Integrate answers seamlessly
- Maintain professional tone
- Ensure consistent win themes throughout

Begin output with direct markdown content, no additional text or markdown indicators.
"""

class PDFParser:
    """Page-by-page PDF reader built on PyPDF2.

    Each page of the PDF becomes one ``Document`` whose metadata records the
    1-based page number and the path the file was read from.
    """

    def __init__(self):
        # Kept for interface compatibility; no method in this class reads it.
        self.reader = None

    async def aload_data(self, file_path: str) -> List[Document]:
        """Async-compatible wrapper that delegates to the synchronous ``load_data``."""
        return self.load_data(file_path)

    def load_data(self, file_path: str) -> List[Document]:
        """Read ``file_path`` and return one ``Document`` per page.

        Any exception raised while opening or parsing the file is logged and
        then re-raised unchanged.
        """
        try:
            with open(file_path, 'rb') as handle:
                pdf = PyPDF2.PdfReader(handle)
                # Text must be extracted while the file handle is still open.
                return [
                    Document(
                        text=page.extract_text(),
                        metadata={
                            'page_num': number,
                            'file_path': file_path,
                            'source': file_path
                        }
                    )
                    for number, page in enumerate(pdf.pages, start=1)
                ]
        except Exception as e:
            _logger.error(f"Error reading PDF {file_path}: {str(e)}")
            raise

# Structured-output schema passed to the LLM in RFPWorkflow.extract_questions.
# NOTE(review): the class docstring below is presumably surfaced to the LLM as
# the schema description, so treat its wording as runtime text — confirm before
# editing it.
class OutputQuestions(BaseModel):
    """List of keys that make up the sections of the RFP response."""
    # One entry per question the workflow should answer.
    questions: List[str]

class OutputTemplateEvent(Event):
    """Carries the parsed RFP template from parse_output_template to extract_questions."""
    # Documents produced by the parser (or reloaded from the cached .jsonl file).
    docs: List[Document]

class QuestionsExtractedEvent(Event):
    """Holds a list of extracted questions.

    No step visible in this file emits or consumes this event.
    """
    questions: List[str]

class HandleQuestionEvent(Event):
    """Request to answer one question; sent once per question by extract_questions."""
    question: str

class QuestionAnsweredEvent(Event):
    """Result of handle_question: the question together with the agent's answer text."""
    question: str
    answer: str

class CollectedAnswersEvent(Event):
    """Emitted by combine_answers once every expected answer has arrived."""
    # Newline-joined JSON dumps of the collected QuestionAnsweredEvent objects.
    combined_answers: str

class LogEvent(Event):
    """Progress message written to the workflow's event stream."""
    msg: str
    # True for streamed fragments of the final output; the consumer at the
    # bottom of this file prints those without a trailing newline.
    delta: bool = False

class RFPWorkflow(Workflow):
    """Workflow that drafts a markdown RFP response from an RFP template PDF.

    Event flow:

    1. ``parse_output_template`` parses the template (or reloads it from cache).
    2. ``extract_questions`` asks the LLM which questions to answer and sends
       one ``HandleQuestionEvent`` per question.
    3. ``handle_question`` answers a question with a tool-calling agent.
    4. ``combine_answers`` waits for every answer and joins them.
    5. ``generate_output`` streams the final response and writes it to disk.

    Intermediate artifacts are cached under ``<output_dir>/workflow_output``.
    The caches are keyed only by file name, so they are reused even when a
    different ``rfp_template_path`` is supplied; delete the directory to force
    a fresh run.
    """

    def __init__(
        self,
        tools,
        parser: PDFParser,
        llm: Optional[OpenAI] = None,
        similarity_top_k: int = 20,
        output_dir: str = "data_out_rfp",
        agent_system_prompt: str = AGENT_SYSTEM_PROMPT,
        generate_output_prompt: str = GENERATE_OUTPUT_PROMPT,
        extract_keys_prompt: str = EXTRACT_KEYS_PROMPT,
        **kwargs,
    ) -> None:
        """Store configuration and make sure the output directory exists.

        Args:
            tools: Retrieval tools handed to the research agent; each must
                expose ``metadata.name`` and ``metadata.description``.
            parser: Object providing ``aload_data(path)`` for the RFP template.
            llm: LLM to use; defaults to ``OpenAI(model="gpt-4-turbo")``.
            similarity_top_k: Stored on the instance; not read by any step here.
            output_dir: Base directory for cached and final artifacts.
            agent_system_prompt: System prompt for the research agent.
            generate_output_prompt: Template with ``{output_template}`` and
                ``{answers}`` placeholders.
            extract_keys_prompt: Template with ``{file_metadata}`` and
                ``{rfp_text}`` placeholders.
            **kwargs: Forwarded to ``Workflow.__init__`` (e.g. ``verbose``,
                ``timeout``).
        """
        super().__init__(**kwargs)
        self.tools = tools
        self.parser = parser
        self.llm = llm or OpenAI(model="gpt-4-turbo")
        self.similarity_top_k = similarity_top_k
        self.output_dir = output_dir
        self.agent_system_prompt = agent_system_prompt
        self.extract_keys_prompt = extract_keys_prompt

        # Create output directory if it doesn't exist
        out_path = Path(self.output_dir) / "workflow_output"
        if not out_path.exists():
            out_path.mkdir(parents=True, exist_ok=True)
            # NOTE(review): world-writable permissions are applied only when
            # the directory is newly created — confirm this is intended.
            os.chmod(str(out_path), 0o0777)

        self.generate_output_prompt = PromptTemplate(generate_output_prompt)

    def _artifact_path(self, name: str) -> Path:
        """Return the path of artifact ``name`` inside the workflow output directory."""
        return Path(self.output_dir) / "workflow_output" / name

    @step
    async def parse_output_template(
        self, ctx: Context, ev: StartEvent
    ) -> OutputTemplateEvent:
        """Load the RFP template as documents, using the on-disk cache if present.

        On a cache miss the template at ``ev.rfp_template_path`` is parsed and
        written to ``output_template.jsonl`` (one JSON document per line).
        The documents are also stored in the context as ``"output_template"``.
        """
        out_template_path = self._artifact_path("output_template.jsonl")
        if out_template_path.exists():
            with open(out_template_path, "r", encoding="utf-8") as f:
                # Skip blank lines so a stray trailing newline cannot break parsing.
                docs = [
                    Document.model_validate_json(line) for line in f if line.strip()
                ]
        else:
            docs = await self.parser.aload_data(ev.rfp_template_path)
            # Save output template to file
            with open(out_template_path, "w", encoding="utf-8") as f:
                for doc in docs:
                    f.write(doc.model_dump_json())
                    f.write("\n")

        await ctx.set("output_template", docs)
        return OutputTemplateEvent(docs=docs)

    @step
    async def extract_questions(
        self, ctx: Context, ev: OutputTemplateEvent
    ) -> HandleQuestionEvent:
        """Determine the questions to answer and fan them out.

        Questions are read from ``all_keys.txt`` when that file holds at least
        one non-blank line; otherwise they are extracted with the LLM and the
        file is (re)written. One ``HandleQuestionEvent`` is sent per question
        and the count is stored in the context as ``"num_to_collect"``.

        Raises:
            Exception: Whatever the LLM call raised, after logging it.
            ValueError: If the LLM returned no usable question, because the
                workflow could otherwise never reach ``combine_answers``.
        """
        docs = ev.docs
        out_keys_path = self._artifact_path("all_keys.txt")

        output_qs: List[str] = []
        if out_keys_path.exists():
            with open(out_keys_path, "r", encoding="utf-8") as f:
                output_qs = [line.strip() for line in f if line.strip()]

        # An empty cache file (e.g. left by an earlier failed run) is treated
        # as a cache miss instead of as "zero questions".
        if not output_qs:
            # Try stuffing all text into the prompt
            all_text = "\n\n".join([d.get_content(metadata_mode="all") for d in docs])
            prompt = PromptTemplate(template=self.extract_keys_prompt)

            file_metadata = "\n\n".join(
                [
                    f"Name:{t.metadata.name}\nDescription:{t.metadata.description}"
                    for t in self.tools
                ]
            )

            if self._verbose:
                ctx.write_event_to_stream(
                    LogEvent(msg=">> Extracting questions from LLM")
                )

            try:
                extracted = await self.llm.astructured_predict(
                    OutputQuestions,
                    prompt,
                    file_metadata=file_metadata,
                    rfp_text=all_text,
                )
            except Exception as e:
                _logger.error(f"Error extracting questions from page: {all_text}")
                _logger.error(e)
                # Re-raise: continuing without questions would leave the
                # workflow waiting for answers that can never arrive.
                raise

            # Collapse internal whitespace so each question occupies exactly
            # one line of the cache file and survives the round trip.
            output_qs = [
                " ".join(q.split()) for q in extracted.questions if q.strip()
            ]
            if not output_qs:
                raise ValueError(
                    "No questions could be extracted from the RFP template."
                )

            if self._verbose:
                qs_text = "\n".join([f"* {q}" for q in output_qs])
                ctx.write_event_to_stream(LogEvent(msg=f">> Questions:\n{qs_text}"))

            # Save all questions to file only after extraction succeeded.
            with open(out_keys_path, "w", encoding="utf-8") as f:
                f.write("\n".join(output_qs))

        await ctx.set("num_to_collect", len(output_qs))

        for question in output_qs:
            ctx.send_event(HandleQuestionEvent(question=question))

        # Events were dispatched through ctx.send_event above.
        return None

    @step
    async def handle_question(
        self, ctx: Context, ev: HandleQuestionEvent
    ) -> QuestionAnsweredEvent:
        """Answer a single question with a freshly built function-calling agent."""
        question = ev.question

        # Initialize a Function Calling "research" agent
        research_agent = FunctionCallingAgentWorker.from_tools(
            self.tools, llm=self.llm, verbose=False, system_prompt=self.agent_system_prompt
        ).as_agent()

        response = await research_agent.aquery(question)

        if self._verbose:
            msg = f">> Asked question: {question}\n>> Got response: {str(response)}"
            ctx.write_event_to_stream(LogEvent(msg=msg))

        return QuestionAnsweredEvent(question=question, answer=str(response))

    @step
    async def combine_answers(
        self, ctx: Context, ev: QuestionAnsweredEvent
    ) -> CollectedAnswersEvent:
        """Wait for all answers, then join and persist them.

        Returns ``None`` until ``num_to_collect`` answers have been collected;
        afterwards writes ``combined_answers.jsonl`` and emits the joined text.
        """
        num_to_collect = await ctx.get("num_to_collect")
        results = ctx.collect_events(ev, [QuestionAnsweredEvent] * num_to_collect)
        if results is None:
            return None

        combined_answers = "\n".join([result.model_dump_json() for result in results])
        # Save combined_answers to file
        with open(
            self._artifact_path("combined_answers.jsonl"), "w", encoding="utf-8"
        ) as f:
            f.write(combined_answers)

        return CollectedAnswersEvent(combined_answers=combined_answers)

    @step
    async def generate_output(
        self, ctx: Context, ev: CollectedAnswersEvent
    ) -> StopEvent:
        """Stream the final RFP response, save it as ``final_output.md`` and stop.

        Every streamed fragment is also written to the event stream as a
        ``LogEvent`` with ``delta=True``, regardless of the verbose setting.
        """
        output_template = await ctx.get("output_template")
        output_template = "\n".join(
            [doc.get_content("none") for doc in output_template]
        )

        if self._verbose:
            ctx.write_event_to_stream(LogEvent(msg=">> GENERATING FINAL OUTPUT"))

        resp = await self.llm.astream(
            self.generate_output_prompt,
            output_template=output_template,
            answers=ev.combined_answers,
        )

        # Collect fragments and join once instead of repeated concatenation.
        fragments: List[str] = []
        async for r in resp:
            ctx.write_event_to_stream(LogEvent(msg=r, delta=True))
            fragments.append(r)
        final_output = "".join(fragments)

        # Save final_output to file
        with open(self._artifact_path("final_output.md"), "w", encoding="utf-8") as f:
            f.write(final_output)

        return StopEvent(result=final_output)

def generate_tool(
    file: str,
    file_description: Optional[str] = None,
    similarity_top_k: int = 5,
):
    """Build a retrieval tool restricted to chunks of a single file.

    The tool queries the module-level ``index`` (looked up at call time, so it
    must exist before the tool is first used) with a metadata filter requiring
    ``file_path == file``.

    Args:
        file: Value of the ``file_path`` metadata to filter on; also used to
            derive the tool name and description.
        file_description: Optional summary appended to the tool description.
        similarity_top_k: Number of chunks returned per query (default 5).

    Returns:
        A ``FunctionTool`` named ``<sanitised file stem>_retrieve``.
    """
    import re  # local import: only needed to sanitise the tool name

    filters = MetadataFilters(
        filters=[
            MetadataFilter(key="file_path", operator=FilterOperator.EQ, value=file),
        ]
    )

    # No docstring is added here on purpose, so the callable handed to
    # FunctionTool keeps its original name, signature and (absent) docstring.
    def chunk_retriever_fn(query: str) -> str:
        retriever = index.as_retriever(
            similarity_top_k=similarity_top_k, filters=filters
        )
        nodes = retriever.retrieve(query)

        return "\n\n========================\n\n".join(
            [n.get_content(metadata_mode="all") for n in nodes]
        )

    # Define name as a function of the file. Characters outside
    # [a-zA-Z0-9_-] are replaced because function-calling APIs reject tool
    # names containing e.g. spaces or dots; already-valid stems are unchanged.
    fn_name = re.sub(r"[^a-zA-Z0-9_-]", "_", Path(file).stem) + "_retrieve"

    tool_description = f"Retrieves a small set of relevant document chunks from {file}."
    if file_description is not None:
        tool_description += f"\n\nFile Description: {file_description}"

    return FunctionTool.from_defaults(
        fn=chunk_retriever_fn, name=fn_name, description=tool_description
    )

In [None]:
pip install llama-index-embeddings-openai



In [None]:
!pip install llama_index.utils.workflow



In [None]:
from pathlib import Path
import os
import tiktoken
from typing import List, Dict, Any
from tqdm import tqdm
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

class ChunkedPDFProcessor:
    """Splits documents into sentence-based chunks annotated with token counts.

    ``max_tokens_per_chunk`` is stored on the instance but is not consulted by
    any method of this class.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        max_tokens_per_chunk: int = 2000
    ):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.max_tokens_per_chunk = max_tokens_per_chunk
        self.tokenizer = tiktoken.encoding_for_model("gpt-4-turbo")

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens ``text`` encodes to with the tokenizer."""
        encoded = self.tokenizer.encode(text)
        return len(encoded)

    def chunk_document(self, doc: Document) -> List[Document]:
        """Return one ``Document`` per non-blank chunk of ``doc``.

        Each chunk copies the source metadata and adds ``chunk_id`` (position
        among all chunks, blank ones included), ``token_count`` and
        ``total_chunks`` (count before blank chunks are dropped).
        """
        pieces = SentenceSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap
        ).split_text(doc.get_content())
        total = len(pieces)

        return [
            Document(
                text=piece,
                metadata={
                    **doc.metadata,
                    "chunk_id": position,
                    "token_count": self.count_tokens(piece),
                    "total_chunks": total
                }
            )
            for position, piece in enumerate(pieces)
            if piece.strip()  # whitespace-only chunks are dropped
        ]

def setup_environment(
    data_dir: str = "/content/data",
    data_out_dir: str = "data_out_rfp",
    persist_dir: str = "storage_rfp_chroma",
):
    """Create the working directories and return their paths.

    Args:
        data_dir: Directory holding the input PDFs.
        data_out_dir: Directory for workflow output.
        persist_dir: Directory for the persisted vector store.

    Returns:
        The tuple ``(data_dir, data_out_dir, persist_dir)`` exactly as given.
    """
    # Create directories. parents=True lets nested paths such as the default
    # "/content/data" be created when the parent directory is missing.
    for dir_path in (data_dir, data_out_dir, persist_dir):
        Path(dir_path).mkdir(parents=True, exist_ok=True)

    return data_dir, data_out_dir, persist_dir

def process_files_in_batches(
    files: List[str],
    parser: Any,
    processor: ChunkedPDFProcessor,
    data_dir: str,
    batch_size: int = 5
) -> Dict[str, Any]:
    """Parse, chunk and summarise each file.

    For every file name in ``files`` the file is parsed with ``parser``, each
    parsed document is chunked with ``processor``, the chunks are summarised
    ``batch_size`` at a time, and the batch summaries are condensed into one
    overall summary.

    Args:
        files: File names relative to ``data_dir``.
        parser: Object providing ``load_data(path)`` returning documents.
        processor: Chunker providing ``chunk_document(doc)``.
        data_dir: Directory containing the files.
        batch_size: Number of chunks summarised per LLM query; must be >= 1.

    Returns:
        Mapping of file name to a dict with keys ``file_path``, ``docs``,
        ``summary``, ``chunk_count`` and ``total_tokens``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    summary_llm = OpenAI(model="gpt-4-turbo")
    file_dicts = {}

    for file_name in tqdm(files, desc="Processing files"):
        print(f"\n>> Processing file {file_name}")
        full_file_path = str(Path(data_dir) / file_name)

        # Parse the file
        file_docs = parser.load_data(full_file_path)

        # Process each document with chunking
        chunked_docs = []
        for doc in tqdm(file_docs, desc="Chunking documents"):
            # Store the bare file name as "file_path": it is the value the
            # retrieval tools filter on.
            doc.metadata["file_path"] = file_name
            # Prefer a parser-supplied "page_label"; otherwise keep the page
            # number the parser already recorded instead of resetting it to 0.
            doc.metadata["page_num"] = doc.metadata.get(
                "page_label", doc.metadata.get("page_num", 0)
            )

            # Chunk the document
            chunked_docs.extend(processor.chunk_document(doc))

        # Process chunks in batches for summary generation
        summaries = []
        for start in range(0, len(chunked_docs), batch_size):
            batch = chunked_docs[start:start + batch_size]

            # Generate summary for batch
            summary_index = SummaryIndex(batch)
            response = summary_index.as_query_engine(llm=summary_llm).query(
                "Generate a short 1-2 line summary of this section."
            )
            summaries.append(str(response))

        # Combine summaries
        combined_summary = "\n".join(summaries)
        final_summary = SummaryIndex([Document(text=combined_summary)]).as_query_engine(
            llm=summary_llm
        ).query("Combine these summaries into a coherent 1-2 line overview.")

        # Store results
        file_dicts[file_name] = {
            "file_path": full_file_path,
            "docs": chunked_docs,
            "summary": str(final_summary),
            "chunk_count": len(chunked_docs),
            "total_tokens": sum(doc.metadata["token_count"] for doc in chunked_docs)
        }

        print(f">> Generated summary: {str(final_summary)}")
        print(f">> Chunks created: {len(chunked_docs)}")
        print(f">> Total tokens: {file_dicts[file_name]['total_tokens']}")

    return file_dicts

# Main execution
# Script entry point: builds the knowledge-base index from the PDFs, creates one
# retrieval tool per file and runs the RFP workflow, printing streamed events.
if __name__ == "__main__":
    # Setup environment
    data_dir, data_out_dir, persist_dir = setup_environment()

    # Initialize components
    # Keep a key that is already exported in the environment; the placeholder
    # is only a reminder and must be replaced with a real key to run.
    os.environ.setdefault("OPENAI_API_KEY", "your_api_key_here")
    files = ["inno_wave_customer_feedback_2023.pdf", "innowave_annual_report_2023.pdf", "innowave_competitor_analysis_2023.pdf","innowave_innovation_strategy_roadmap_2023.pdf", "innowave_market_research_2023.pdf" ]
    parser = PDFParser()
    processor = ChunkedPDFProcessor()

    # Initialize vector store
    vector_store = ChromaVectorStore.from_params(
        collection_name="rfp_docs",
        persist_dir=persist_dir
    )
    # `index` must stay a module-level name: generate_tool's retriever function
    # looks it up globally at query time.
    index = VectorStoreIndex.from_vector_store(vector_store)

    # Process files
    file_dicts = process_files_in_batches(files, parser, processor, data_dir)

    # Load the chunked documents into the index; without this step the
    # retrieval tools search an index this script never populated.
    # NOTE(review): the collection is persisted in `persist_dir`, so re-running
    # presumably inserts the same chunks again — clear the directory between
    # runs or confirm de-duplication.
    for file_info in file_dicts.values():
        for chunk_doc in file_info["docs"]:
            index.insert(chunk_doc)

    # Generate tools and workflow
    tools = [
        generate_tool(f, file_description=file_dicts[f]["summary"])
        for f in files
    ]

    # Initialize and run workflow
    llm = OpenAI(model="gpt-4-turbo")
    workflow = RFPWorkflow(
        tools,
        parser=parser,
        llm=llm,
        output_dir=data_out_dir,
        verbose=True,
        timeout=None
    )

    # Run the workflow
    async def run_workflow():
        """Run the workflow, echoing LogEvents as they stream, then print the result."""
        handler = workflow.run(
            rfp_template_path=str(Path(data_dir) / "RFP_sample.pdf")
        )
        async for event in handler.stream_events():
            if isinstance(event, LogEvent):
                if event.delta:
                    # Streamed fragments of the final output: no newline.
                    print(event.msg, end="")
                else:
                    print(event.msg)
        response = await handler
        print(str(response))

    # Execute workflow
    import asyncio
    asyncio.run(run_workflow())

Processing files:   0%|          | 0/5 [00:00<?, ?it/s]


>> Processing file inno_wave_customer_feedback_2023.pdf



Chunking documents: 100%|██████████| 2/2 [00:00<00:00, 290.91it/s]
Processing files:  20%|██        | 1/5 [00:03<00:14,  3.54s/it]

>> Generated summary: In 2023, InnoWave Inc. received positive feedback for its AI and IoT products, customer service, and sustainability, but was advised to enhance user interfaces, expand product options for SMEs, and improve third-party integration.
>> Chunks created: 2
>> Total tokens: 840

>> Processing file innowave_annual_report_2023.pdf



Chunking documents: 100%|██████████| 4/4 [00:00<00:00, 317.14it/s]
Processing files:  40%|████      | 2/5 [00:08<00:12,  4.10s/it]

>> Generated summary: InnoWave Inc.'s 2023 Annual Report reveals a 15% revenue increase to $1.2 billion, attributed to advancements in AI and IoT, sustainability initiatives, and global expansion, with future goals focusing on further technological and market growth.
>> Chunks created: 4
>> Total tokens: 1686

>> Processing file innowave_competitor_analysis_2023.pdf



Chunking documents: 100%|██████████| 6/6 [00:00<00:00, 646.99it/s]
Processing files:  60%|██████    | 3/5 [00:12<00:08,  4.24s/it]

>> Generated summary: The InnoWave Inc. Competitor Analysis 2023 examines the AI and IoT markets, identifying key competitors and market shares, and uses the SWOT framework to analyze strategic opportunities for differentiation and growth.
>> Chunks created: 6
>> Total tokens: 2191

>> Processing file innowave_innovation_strategy_roadmap_2023.pdf



Chunking documents: 100%|██████████| 5/5 [00:00<00:00, 199.62it/s]
Processing files:  80%|████████  | 4/5 [00:15<00:03,  3.88s/it]

>> Generated summary: InnoWave Inc.'s Innovation Strategy Roadmap for 2023 aims to position the company as a leader in the AI and IoT markets by developing innovative, sustainable solutions and focusing on customer-centric products over the next five years.
>> Chunks created: 5
>> Total tokens: 1876

>> Processing file innowave_market_research_2023.pdf



Chunking documents: 100%|██████████| 4/4 [00:00<00:00, 568.43it/s]
Processing files: 100%|██████████| 5/5 [00:18<00:00,  3.80s/it]

>> Generated summary: The InnoWave Inc. Market Research Report 2023 offers an in-depth analysis of the global AI and IoT markets, highlighting growth trends, competitive dynamics, and strategic recommendations for market positioning and opportunity exploitation.
>> Chunks created: 4
>> Total tokens: 1654
Running step parse_output_template
Step parse_output_template produced event OutputTemplateEvent
Running step extract_questions
Step extract_questions produced no event
Running step handle_question
Running step handle_question
Running step handle_question
Running step handle_question





Step handle_question produced event QuestionAnsweredEvent
Running step handle_question
>> Asked question: 2. What are the sustainability requirements specified in the RFP for the project?
>> Got response: To provide an accurate and compelling response to this question, I will need to access the specific RFP document or any related materials that outline the sustainability requirements for the project. Since I currently do not have direct access to such documents in this environment, I will proceed by outlining a general approach based on typical sustainability requirements often seen in RFPs, especially in sectors related to technology and innovation.

Typical sustainability requirements in RFPs may include:

1. **Environmental Impact Reduction**: Requirements to demonstrate how the project will minimize environmental impact, including reductions in energy use, waste, and greenhouse gas emissions.

2. **Sustainable Materials and Resources**: Expectations to use sustainable, recycled, o

In [None]:
# Render the step/event graph of RFPWorkflow to an HTML file.
from llama_index.utils.workflow import draw_all_possible_flows

# NOTE(review): the file name contains ':' characters, which are not valid in
# file names on Windows — rename if this needs to run there.
draw_all_possible_flows(RFPWorkflow, filename="rfp_workflow_Nov10_6:39am_2024.html")

<class 'NoneType'>
<class '__main__.CollectedAnswersEvent'>
<class '__main__.HandleQuestionEvent'>
<class 'llama_index.core.workflow.events.StopEvent'>
<class '__main__.QuestionAnsweredEvent'>
<class '__main__.OutputTemplateEvent'>
rfp_workflow_Nov10_6:39am_2024.html
