In [1]:
!pip install llama-index --quiet
!pip install llama-index-core --quiet
!pip install llama-index-postprocessor-flag-embedding-reranker --quiet
!pip install git+https://github.com/FlagOpen/FlagEmbedding.git --quiet
!pip install llama-parse --quiet

!pip install llama-index-embeddings-gemini -q
!pip install llama-index-llms-gemini -q
!pip install langchain -q 

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 15.0.0 which is incompatible.
kfp 2.5.0 requires google-cloud-storage<3,>=2.2.1, but you have google-cloud-storage 1.44.0 which is incompatible.
kfp 2.5.0 requires kubernetes<27,>=8.0.0, but you have kubernetes 29.0.0 which is incompatible.
libpysal 4.9.2 requires packaging>=22, but you have packaging 21.3 which is incompatible.
libpysal 4.9.2 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.
momepy 0.7.0 requires shapely>=2, but you have shapely 1.8.5.post1 which is incompatible.
preprocessing 0.1.13 req

In [2]:
import pandas as pd

import gc
import torch
from llama_index.core import Settings
from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex
from llama_index.core.prompts import PromptTemplate

from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [3]:
import nest_asyncio
nest_asyncio.apply()

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
GOOGLE_API_KEY = user_secrets.get_secret("GOOGLE_API_KEY")
LLAMA_CLOUD_API_KEY = user_secrets.get_secret("llama_cloud")

import os
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [4]:
data_dir = "/kaggle/input/unifi-data"

amkey_syn = pd.read_csv(f"{data_dir}/ActivityMetricsSynonyms.csv")
amkey_df = pd.read_csv(f"{data_dir}/AMKEY_GoldenStandard.csv")
sub_df = pd.read_csv(f"{data_dir}/SampleSubmission.csv")

In [5]:
import asyncio
from abc import abstractmethod
from typing import Any, Dict, List, Optional, Sequence, Tuple, cast

from tqdm import tqdm

from llama_index.core.async_utils import DEFAULT_NUM_WORKERS, run_jobs
from llama_index.core.base.response.schema import PydanticResponse
from llama_index.core.bridge.pydantic import BaseModel, Field, ValidationError
from llama_index.core.callbacks.base import CallbackManager
from llama_index.core.llms.llm import LLM
from llama_index.core.node_parser.interface import NodeParser
from llama_index.core.schema import BaseNode, Document, IndexNode, TextNode
from llama_index.core.utils import get_tqdm_iterable

DEFAULT_SUMMARY_QUERY_STR = """print NO INPUT"""

class TableColumnOutput(BaseModel):
    """Output from analyzing a table column."""

    col_name: str
    col_type: str
    summary: Optional[str] = None

    def __str__(self) -> str:
        """Convert to string representation."""
        return (
            f"Column: {self.col_name}\nType: {self.col_type}\nSummary: {self.summary}"
        )


class TableOutput(BaseModel):
    """Output from analyzing a table."""

    summary: str
    table_title: Optional[str] = None
    table_id: Optional[str] = None
    columns: List[TableColumnOutput]


class Element(BaseModel):
    """Element object."""

    id: str
    type: str
    element: Any
    title_level: Optional[int] = None
    table_output: Optional[TableOutput] = None
    table: Optional[pd.DataFrame] = None

    class Config:
        arbitrary_types_allowed = True


class BaseElementNodeParser(NodeParser):
    """
    Splits a document into Text Nodes and Index Nodes corresponding to embedded objects.

    Supports text and tables currently.
    """

    callback_manager: CallbackManager = Field(
        default_factory=CallbackManager, exclude=True
    )
    llm: Optional[LLM] = Field(
        default=None, description="LLM model to use for summarization."
    )
    summary_query_str: str = Field(
        default=DEFAULT_SUMMARY_QUERY_STR,
        description="Query string to use for summarization.",
    )
    num_workers: int = Field(
        default=DEFAULT_NUM_WORKERS,
        description="Num of works for async jobs.",
    )

    show_progress: bool = Field(default=True, description="Whether to show progress.")

    @classmethod
    def class_name(cls) -> str:
        return "BaseStructuredNodeParser"

    @classmethod
    def from_defaults(
        cls,
        callback_manager: Optional[CallbackManager] = None,
        **kwargs: Any,
    ) -> "BaseElementNodeParser":
        callback_manager = callback_manager or CallbackManager([])

        return cls(
            callback_manager=callback_manager,
            **kwargs,
        )

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")

        for node in nodes_with_progress:
            nodes = self.get_nodes_from_node(node)
            all_nodes.extend(nodes)

        return all_nodes

    @abstractmethod
    def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """Get nodes from node."""

    @abstractmethod
    def extract_elements(self, text: str, **kwargs: Any) -> List[Element]:
        """Extract elements from text."""

    def get_table_elements(self, elements: List[Element]) -> List[Element]:
        """Get table elements."""
        return [e for e in elements if e.type == "table" or e.type == "table_text"]

    def get_text_elements(self, elements: List[Element]) -> List[Element]:
        """Get text elements."""
        # TODO: There we should maybe do something with titles
        # and other elements in the future?
        return [e for e in elements if e.type != "table"]

    def extract_table_summaries(self, elements: List[Element]) -> None:
        """Go through elements, extract out summaries that are tables."""
        from llama_index.core.indices.list.base import SummaryIndex
        from llama_index.core.service_context import ServiceContext

        llm = self.llm
        ## Changes
#         if self.llm:
#             llm = self.llm
#         else:
#             try:
#                 from llama_index.llms.openai import OpenAI  # pants: no-infer-dep
#             except ImportError as e:
#                 raise ImportError(
#                     "`llama-index-llms-openai` package not found."
#                     " Please install with `pip install llama-index-llms-openai`."
#                 )
#             llm = OpenAI()
        llm = cast(LLM, llm)

        service_context = ServiceContext.from_defaults(llm=llm, embed_model=None)

        table_context_list = []
        for idx, element in tqdm(enumerate(elements)):
            if element.type not in ("table", "table_text"):
                continue
            table_context = str(element.element)
            if idx > 0 and str(elements[idx - 1].element).lower().strip().startswith(
                "table"
            ):
                table_context = str(elements[idx - 1].element) + "\n" + table_context
            if idx < len(elements) + 1 and str(
                elements[idx - 1].element
            ).lower().strip().startswith("table"):
                table_context += "\n" + str(elements[idx + 1].element)

            table_context_list.append(table_context)

        ## Changes
        async def _get_table_output(table_context: str, summary_query_str: str) -> Any:
#             index = SummaryIndex.from_documents(
#                 [Document(text=table_context)], service_context=service_context
#             )
#             query_engine = index.as_query_engine(llm=llm, output_cls=TableOutput)
#             try:
#                 response = await query_engine.aquery(summary_query_str)
#                 return cast(PydanticResponse, response).response
#             except ValidationError:
#                 # There was a pydantic validation error, so we will run with text completion
#                 # fill in the summary and leave other fields blank
#                 query_engine = index.as_query_engine()
#                 response_txt = await query_engine.aquery(summary_query_str)
            return TableOutput(summary=str(table_context), columns=[])

        summary_jobs = [
            _get_table_output(table_context, self.summary_query_str)
            for table_context in table_context_list
        ]
        summary_outputs = asyncio.run(
            run_jobs(
                summary_jobs, show_progress=self.show_progress, workers=self.num_workers
            )
        )
        for element, summary_output in zip(elements, summary_outputs):
            element.table_output = summary_output

    def get_base_nodes_and_mappings(
        self, nodes: List[BaseNode]
    ) -> Tuple[List[BaseNode], Dict]:
        """Get base nodes and mappings.

        Given a list of nodes and IndexNode objects, return the base nodes and a mapping
        from index id to child nodes (which are excluded from the base nodes).

        """
        node_dict = {node.node_id: node for node in nodes}

        node_mappings = {}
        base_nodes = []

        # first map index nodes to their child nodes
        nonbase_node_ids = set()
        for node in nodes:
            if isinstance(node, IndexNode):
                node_mappings[node.index_id] = node_dict[node.index_id]
                nonbase_node_ids.add(node.index_id)
            else:
                pass

        # then add all nodes that are not children of index nodes
        for node in nodes:
            if node.node_id not in nonbase_node_ids:
                base_nodes.append(node)

        return base_nodes, node_mappings

    def get_nodes_and_objects(
        self, nodes: List[BaseNode]
    ) -> Tuple[List[BaseNode], List[IndexNode]]:
        base_nodes, node_mappings = self.get_base_nodes_and_mappings(nodes)

        nodes = []
        objects = []
        for node in base_nodes:
            if isinstance(node, IndexNode):
                node.obj = node_mappings[node.index_id]
                objects.append(node)
            else:
                nodes.append(node)

        return nodes, objects

    def _get_nodes_from_buffer(
        self, buffer: List[str], node_parser: NodeParser
    ) -> List[BaseNode]:
        """Get nodes from buffer."""
        doc = Document(text="\n\n".join(list(buffer)))
        return node_parser.get_nodes_from_documents([doc])

    def get_nodes_from_elements(self, elements: List[Element]) -> List[BaseNode]:
        """Get nodes and mappings."""
        from llama_index.core.node_parser import SentenceSplitter

        node_parser = SentenceSplitter()

        nodes = []
        cur_text_el_buffer: List[str] = []
        for element in elements:
            if element.type == "table" or element.type == "table_text":
                # flush text buffer for table
                if len(cur_text_el_buffer) > 0:
                    cur_text_nodes = self._get_nodes_from_buffer(
                        cur_text_el_buffer, node_parser
                    )
                    nodes.extend(cur_text_nodes)
                    cur_text_el_buffer = []

                table_output = cast(TableOutput, element.table_output)
                table_md = ""
                if element.type == "table":
                    table_df = cast(pd.DataFrame, element.table)
                    # We serialize the table as markdown as it allow better accuracy
                    # We do not use the table_df.to_markdown() method as it generate
                    # a table with a token hungry format.
                    table_md = "|"
                    for col_name, col in table_df.items():
                        table_md += f"{col_name}|"
                    table_md += "\n|"
                    for col_name, col in table_df.items():
                        table_md += f"---|"
                    table_md += "\n"
                    for row in table_df.itertuples():
                        table_md += "|"
                        for col in row[1:]:
                            table_md += f"{col}|"
                        table_md += "\n"
                elif element.type == "table_text":
                    # if the table is non-perfect table, we still want to keep the original text of table
                    table_md = str(element.element)
                table_id = element.id + "_table"
                table_ref_id = element.id + "_table_ref"

                col_schema = "\n\n".join([str(col) for col in table_output.columns])

                # We build a summary of the table containing the extracted summary, and a description of the columns
                table_summary = str(table_output.summary)
                if table_output.table_title:
                    table_summary += ",\nwith the following table title:\n"
                    table_summary += str(table_output.table_title)

                table_summary += ",\nwith the following columns:\n"

                for col in table_output.columns:
                    table_summary += f"- {col.col_name}: {col.summary}\n"

                index_node = IndexNode(
                    text=table_summary,
                    metadata={"col_schema": col_schema},
                    excluded_embed_metadata_keys=["col_schema"],
                    id_=table_ref_id,
                    index_id=table_id,
                )

                table_str = table_summary + "\n" + table_md

                text_node = TextNode(
                    text=table_str,
                    id_=table_id,
                    metadata={
                        # serialize the table as a dictionary string for dataframe of perfect table
                        "table_df": (
                            str(table_df.to_dict())
                            if element.type == "table"
                            else table_md
                        ),
                        # add table summary for retrieval purposes
                        "table_summary": table_summary,
                    },
                    excluded_embed_metadata_keys=["table_df", "table_summary"],
                    excluded_llm_metadata_keys=["table_df", "table_summary"],
                )
                nodes.extend([index_node, text_node])
            else:
                cur_text_el_buffer.append(str(element.element))
        # flush text buffer
        if len(cur_text_el_buffer) > 0:
            cur_text_nodes = self._get_nodes_from_buffer(
                cur_text_el_buffer, node_parser
            )
            nodes.extend(cur_text_nodes)
            cur_text_el_buffer = []

        # remove empty nodes
        return [node for node in nodes if len(node.text) > 0]

In [6]:
from io import StringIO
from typing import Any, Callable, List, Optional

import pandas as pd
from llama_index.core.node_parser.relational.base_element import (
#     BaseElementNodeParser,
    Element,
)
from llama_index.core.schema import BaseNode, TextNode


def md_to_df(md_str: str) -> pd.DataFrame:
    """Convert Markdown to dataframe."""
    # Replace " by "" in md_str
    md_str = md_str.replace('"', '""')

    # Replace markdown pipe tables with commas
    md_str = md_str.replace("|", '","')

    # Remove the second line (table header separator)
    lines = md_str.split("\n")
    md_str = "\n".join(lines[:1] + lines[2:])

    # Remove the first and last second char of the line (the pipes, transformed to ",")
    lines = md_str.split("\n")
    md_str = "\n".join([line[2:-2] for line in lines])

    # Check if the table is empty
    if len(md_str) == 0:
        return None

    # Use pandas to read the CSV string into a DataFrame
    return pd.read_csv(StringIO(md_str))


class MarkdownElementNodeParser(BaseElementNodeParser):
    """Markdown element node parser.

    Splits a markdown document into Text Nodes and Index Nodes corresponding to embedded objects
    (e.g. tables).

    """

    @classmethod
    def class_name(cls) -> str:
        return "MarkdownElementNodeParser"

    def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """Get nodes from node."""
        elements = self.extract_elements(
            node.get_content(),
            table_filters=[self.filter_table],
            node_id=node.id_,
        )
        table_elements = self.get_table_elements(elements)
        # extract summaries over table elements
        self.extract_table_summaries(table_elements)
        # convert into nodes
        # will return a list of Nodes and Index Nodes
        return self.get_nodes_from_elements(elements)

    def extract_elements(
        self,
        text: str,
        node_id: Optional[str] = None,
        table_filters: Optional[List[Callable]] = None,
        **kwargs: Any,
    ) -> List[Element]:
        # get node id for each node so that we can avoid using the same id for different nodes
        """Extract elements from text."""
        lines = text.split("\n")
        currentElement = None

        elements: List[Element] = []
        # Then parse the lines
        for line in lines:
            if line.startswith("```"):
                # check if this is the end of a code block
                if currentElement is not None and currentElement.type == "code":
                    elements.append(currentElement)
                    currentElement = None
                    # if there is some text after the ``` create a text element with it
                    if len(line) > 3:
                        elements.append(
                            Element(
                                id=f"id_{len(elements)}",
                                type="text",
                                element=line.lstrip("```"),
                            )
                        )

                elif line.count("```") == 2 and line[-3] != "`":
                    # check if inline code block (aka have a second ``` in line but not at the end)
                    if currentElement is not None:
                        elements.append(currentElement)
                    currentElement = Element(
                        id=f"id_{len(elements)}",
                        type="code",
                        element=line.lstrip("```"),
                    )
                elif currentElement is not None and currentElement.type == "text":
                    currentElement.element += "\n" + line
                else:
                    if currentElement is not None:
                        elements.append(currentElement)
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="text", element=line
                    )

            elif currentElement is not None and currentElement.type == "code":
                currentElement.element += "\n" + line

            elif line.startswith("|"):
                if currentElement is not None and currentElement.type != "table":
                    if currentElement is not None:
                        elements.append(currentElement)
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="table", element=line
                    )
                elif currentElement is not None:
                    currentElement.element += "\n" + line
                else:
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="table", element=line
                    )
            elif line.startswith("#"):
                if currentElement is not None:
                    elements.append(currentElement)
                currentElement = Element(
                    id=f"id_{len(elements)}",
                    type="title",
                    element=line.lstrip("#"),
                    title_level=len(line) - len(line.lstrip("#")),
                )
            else:
                if currentElement is not None and currentElement.type != "text":
                    elements.append(currentElement)
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="text", element=line
                    )
                elif currentElement is not None:
                    currentElement.element += "\n" + line
                else:
                    currentElement = Element(
                        id=f"id_{len(elements)}", type="text", element=line
                    )
        if currentElement is not None:
            elements.append(currentElement)

        for idx, element in enumerate(elements):
            if element.type == "table":
                should_keep = True
                perfect_table = True

                # verify that the table (markdown) have the same number of columns on each rows
                table_lines = element.element.split("\n")
                table_columns = [len(line.split("|")) for line in table_lines]
                if len(set(table_columns)) > 1:
                    # if the table have different number of columns on each rows, it's not a perfect table
                    # we will store the raw text for such tables instead of converting them to a dataframe
                    perfect_table = False

                # verify that the table (markdown) have at least 2 rows
                if len(table_lines) < 2:
                    should_keep = False

                # apply the table filter, now only filter empty tables
                if should_keep and perfect_table and table_filters is not None:
                    should_keep = all(tf(element) for tf in table_filters)

                # if the element is a table, convert it to a dataframe
                if should_keep:
                    if perfect_table:
                        table = md_to_df(element.element)

                        elements[idx] = Element(
                            id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
                            type="table",
                            element=element,
                            table=table,
                        )
                    else:
                        # for non-perfect tables, we will store the raw text
                        # and give it a different type to differentiate it from perfect tables
                        elements[idx] = Element(
                            id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
                            type="table_text",
                            element=element.element,
                            # table=table
                        )
                else:
                    elements[idx] = Element(
                        id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
                        type="text",
                        element=element.element,
                    )
            else:
                # if the element is not a table, keep it as to text
                elements[idx] = Element(
                    id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
                    type="text",
                    element=element.element,
                )

        # merge consecutive text elements together for now
        merged_elements: List[Element] = []
        for element in elements:
            if (
                len(merged_elements) > 0
                and element.type == "text"
                and merged_elements[-1].type == "text"
            ):
                merged_elements[-1].element += "\n" + element.element
            else:
                merged_elements.append(element)
        elements = merged_elements
        return merged_elements

    def filter_table(self, table_element: Any) -> bool:
        """Filter tables."""
        table_df = md_to_df(table_element.element)

        # check if table_df is not None, has more than one row, and more than one column
        return table_df is not None and not table_df.empty and len(table_df.columns) > 1

In [7]:
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory

safety_settings = {
HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,

}

Settings.embed_model = GeminiEmbedding(model_name="models/embedding-001")
Settings.llm = Gemini(model="models/gemini-pro",
                             temperature=0.0,
                             safety_settings=safety_settings,
                             )

Settings.chunk_size = 200
Settings.chunk_overlap = 20

In [8]:
big_dict = {
    "Absa": {"path": f"{data_dir}/Absa.pdf","retriever": None},
    "Clicks": {"path": f"{data_dir}/Clicks.pdf","retriever": None},
    "Distell": {"path": f"{data_dir}/Distell.pdf","retriever": None},
    "Oceana1&2": {"path": [f"{data_dir}/Oceana1.pdf",f"{data_dir}/Oceana2.pdf"],"retriever": None}, 
    "Ssw": {"path": f"{data_dir}/Ssw.pdf","retriever": None},
    "Picknpay": {"path": f"{data_dir}/Picknpay.pdf","retriever": None},
    "Sasol": {"path": f"{data_dir}/Sasol.pdf","retriever": None},
    "Impala": {"path": f"{data_dir}/Impala.pdf","retriever": None},
}

In [9]:
parser = LlamaParse(
        result_type="markdown",  # "markdown" and "text" are available
        num_workers=4, # if multiple files passed, split in `num_workers` API calls
        api_key=LLAMA_CLOUD_API_KEY
    )

node_parser = MarkdownElementNodeParser(llm = None, num_workers=8)

for group, _ in big_dict.items():
    print(group)
    documents = parser.load_data(big_dict[group]['path'])

    ## NODES
    nodes = node_parser.get_nodes_from_documents(documents)

    ## nodes and mappings
    base_nodes, node_mappings = node_parser.get_base_nodes_and_mappings(nodes)

    index = VectorStoreIndex(nodes=base_nodes)
    index_ret = index.as_retriever(top_k=5)

    ## recursive query engine
    recursive_index = RecursiveRetriever(
        "vector",
        retriever_dict={"vector": index_ret},
        node_dict=node_mappings,
        verbose=False,
    )
    
    big_dict[group]['retriever'] = recursive_index
    
    del documents, nodes, base_nodes, node_mappings, index, index_ret, recursive_index
    gc.collect()

Absa
Started parsing the file under job_id dcceb915-a5c9-42c5-a60e-eb0ca356d48c
...............................................................

  service_context = ServiceContext.from_defaults(llm=llm, embed_model=None)


LLM is explicitly disabled. Using MockLLM.
Embeddings have been explicitly disabled. Using MockEmbedding.


46it [00:00, 11943.67it/s]
100%|██████████| 46/46 [00:00<00:00, 16173.86it/s]


Clicks
Started parsing the file under job_id 2e5cbb2c-4177-4b54-b8e2-066103dc7147
..

  service_context = ServiceContext.from_defaults(llm=llm, embed_model=None)


LLM is explicitly disabled. Using MockLLM.
Embeddings have been explicitly disabled. Using MockEmbedding.


31it [00:00, 11728.61it/s]
100%|██████████| 31/31 [00:00<00:00, 9799.04it/s]


Distell
Started parsing the file under job_id 6dd4f65e-14c1-48bb-8e65-7e873bdf1375
..

  service_context = ServiceContext.from_defaults(llm=llm, embed_model=None)


LLM is explicitly disabled. Using MockLLM.
Embeddings have been explicitly disabled. Using MockEmbedding.


22it [00:00, 10390.12it/s]
100%|██████████| 22/22 [00:00<00:00, 14366.29it/s]


Oceana1&2
Started parsing the file under job_id 7724d73a-b8ab-4e76-8c39-136ff8ef08e9
Started parsing the file under job_id e3d7ba7a-25e5-43bb-acef-3d9f334dac38


  service_context = ServiceContext.from_defaults(llm=llm, embed_model=None)


LLM is explicitly disabled. Using MockLLM.
Embeddings have been explicitly disabled. Using MockEmbedding.


56it [00:00, 14408.11it/s]
100%|██████████| 56/56 [00:00<00:00, 11715.35it/s]
  service_context = ServiceContext.from_defaults(llm=llm, embed_model=None)


LLM is explicitly disabled. Using MockLLM.
Embeddings have been explicitly disabled. Using MockEmbedding.


14it [00:00, 35416.32it/s]
100%|██████████| 14/14 [00:00<00:00, 9419.35it/s]


Ssw
Started parsing the file under job_id 0a9466e9-e44a-4507-9b5a-084f0ed08975


  service_context = ServiceContext.from_defaults(llm=llm, embed_model=None)


LLM is explicitly disabled. Using MockLLM.
Embeddings have been explicitly disabled. Using MockEmbedding.


363it [00:00, 18941.21it/s]
100%|██████████| 363/363 [00:00<00:00, 16903.88it/s]


Picknpay
Started parsing the file under job_id 7698065c-b17f-4fe3-8fb5-e36afb9df17c


  service_context = ServiceContext.from_defaults(llm=llm, embed_model=None)


LLM is explicitly disabled. Using MockLLM.
Embeddings have been explicitly disabled. Using MockEmbedding.


25it [00:00, 9189.97it/s]
100%|██████████| 25/25 [00:00<00:00, 14048.45it/s]


Sasol
Started parsing the file under job_id daf2be7c-acd8-411b-a5fd-3c73927f430e


  service_context = ServiceContext.from_defaults(llm=llm, embed_model=None)


LLM is explicitly disabled. Using MockLLM.
Embeddings have been explicitly disabled. Using MockEmbedding.


85it [00:00, 18606.33it/s]
100%|██████████| 85/85 [00:00<00:00, 17333.52it/s]


Impala
Started parsing the file under job_id 5e6a3fbb-2268-4316-a6eb-5eac17b68965


  service_context = ServiceContext.from_defaults(llm=llm, embed_model=None)


LLM is explicitly disabled. Using MockLLM.
Embeddings have been explicitly disabled. Using MockEmbedding.


81it [00:00, 20024.67it/s]
100%|██████████| 81/81 [00:00<00:00, 16380.84it/s]


In [10]:
qa_prompt = """Retrieve and present accurate factual numeric value.
Very Important Rules to remember before answering:
1. Output should be integer or float value only, no alphabets or symbols are allowed.
2. False positives can cause big loss, so if you don't find exact Input in the context, then just return 0.0 float value.
4. Data should be from 2022 annual reports only.

Here is the context: {context_str}
Input: {query_str}"""

qa_template = PromptTemplate(qa_prompt)

In [11]:
with_2022_data = ["Absa",
        "Clicks",
        "Distell",
        "Oceana1&2",
        "Ssw",
        "Picknpay",
        "Sasol",
        "Impala"]

In [12]:
import time

def generate(id_):
    amkey, group = id_.split("_X_")
    print(f'Group: {group}')
    try:
        question = amkey_syn[(amkey_syn["AMKEY"]==int(amkey)) & (amkey_syn["Group"]==group)]['ClientMetric'].values[0]
    except:
        question = amkey_df[amkey_df["AMKEY"]==int(amkey)]["ActivityMetric"].values[0]

    print(f'Question: {question}\n')

    if group not in with_2022_data:
        return 0.0

    recursive_index = big_dict[group]["retriever"]
    recursive_query_engine = RetrieverQueryEngine.from_args(recursive_index,
                                                            verbose=False)

    recursive_query_engine.update_prompts(
        {"response_synthesizer:text_qa_template": qa_template}
        )
    
    while True:
        try:
            
            result = recursive_query_engine.query(question)
            response = result.response
            break
        except Exception as e:
            print(f"Error: {e}")

    return response

In [13]:
def clean_output(text):
    try:
        return float(text)
    
    except:
        text = text.replace(" ","").replace(",","")
        return float(text)

In [14]:
import time
for i, id_ in enumerate(sub_df.ID):
    result = generate(id_)
    print(f'idx: {i} ---- value: {result}')
    sub_df.loc[i, "2022_Value"] = clean_output(result)

Group: Absa
Question: Discussion of strategies to reduce the environmental impact of packaging

idx: 0 ---- value: 0.0
Group: Clicks
Question: Discussion of strategies to reduce the environmental impact of packaging

idx: 1 ---- value: 0.0
Group: Distell
Question: Discussion of strategies to reduce the environmental impact of packaging

idx: 2 ---- value: 0.0
Group: Impala
Question: Discussion of strategies to reduce the environmental impact of packaging

idx: 3 ---- value: 0.0
Group: Oceana1&2
Question: Discussion of strategies to reduce the environmental impact of packaging

idx: 4 ---- value: 0.0
Group: Picknpay
Question: Discussion of strategies to reduce the environmental impact of packaging

idx: 5 ---- value: 0.0
Group: Sasol
Question: Discussion of strategies to reduce the environmental impact of packaging

idx: 6 ---- value: 0.0
Group: Ssw
Question: Discussion of strategies to reduce the environmental impact of packaging

idx: 7 ---- value: 0.0
Group: Tongaat
Question: Discuss

In [15]:
sub_df.to_csv("submission.csv", index=False)