In [1]:
import sys

sys.path.append("../Scripts/")

import pandas as pd
from pdf_parser import sentence_parser
from sentence_transformers import SentenceTransformer, util

# Specify the model
mvp_v2_model = SentenceTransformer("all-mpnet-base-v2")
# Declare file path to the Report
file_path = "../SampleReports/2022_BBunting_Report.pdf"
maximum_lines = 5

# Retieve Sentences from the report
pdf_sentences = sentence_parser(
    file_path, print_sentences=False, num_sentences=10, parsing_method="pypdf"
)

  from .autonotebook import tqdm as notebook_tqdm


Report located in specified directory

Summary:
Total number of sentences: 1071

 Baby Bunting Annual Report 2022a Annual Report 2022 Baby Bunting Group Limited ABN 58 128 533 693 The 2022 Baby Bunting Annual Report reflects Baby Bunting’s performance for the 52 week period from 28 June 2021 to 26 June 2022.

 The Baby Bunting Group Limited Annual Report is available online at babybunting.com.au/investor.

 Hard copies can be obtained by contacting the Company’s share registry.

 Contents 8 Chair and CEO’s report14 Store network16 Sustainability20 The board23 Corporate governance statement36 Directors’ report55 Remuneration report77 Auditor’s independence declaration78 Financial report124 Directors’ declaration125 Independent auditor’s report130 Shareholder information133 Corporate directory Baby Bunting Annual Report 20221Notice of 2022 Annual General Meeting 10.00am (Melbourne time) Tuesday, 11 October 2022Further details will be contained in the Notice of Annual General Meeting that

In [2]:
# Declare test query here
queries = [
    "This statement is strongly related to Sustainable product packaging"
    "This statement talks about Modern Slavery",
    "This statement talks about Scope 1 emissions",
    "This statement talks about Scope 2 emissions",
    "This statement talks about Scope 3 emissions",
    "This statement talks about carbon offset",
    "This statement talks about carbon offsetting",
    "This statement talks about ethical sourcing",
    "This statement talks about recycled materials",
]

In [3]:
import time


def execute_query(parsed_sentences, current_query):
    # Record the end time
    start_time = time.time()

    # Create Embeddings for query
    query_embedding = mvp_v2_model.encode(current_query)

    # Create Embeddings for retrieved sentences
    sentence_embedding = mvp_v2_model.encode(parsed_sentences)

    # Compute similarity scores
    scores = util.cos_sim(query_embedding, sentence_embedding)[0]

    # Record the end time
    end_time = time.time()
    # Calculate the total time taken
    execution_time = end_time - start_time

    print("Execution Time:", execution_time, "seconds")

    # Create a DataFrame
    df = pd.DataFrame(
        {
            "Sentence_ID": range(1, len(parsed_sentences) + 1),
            "Sentence": parsed_sentences,
            "Score": scores.tolist(),
        }
    )

    sorted_df = df.sort_values(by="Score", ascending=False)
    sorted_df = sorted_df.head(maximum_lines)
    # Uncomment to save the scores
    sorted_df.to_csv(f"tmp/Bbunting_{current_query}.csv", index=False)


for q in queries:
    execute_query(pdf_sentences, q)

Execution Time: 29.68019723892212 seconds
Execution Time: 25.18332600593567 seconds
Execution Time: 30.06203603744507 seconds
Execution Time: 27.223752975463867 seconds
Execution Time: 27.68509316444397 seconds
Execution Time: 35.673964977264404 seconds
Execution Time: 32.29630494117737 seconds
Execution Time: 29.408658981323242 seconds
