In [1]:
import os
import sys
import time

sys.path.append("../Scripts/")

import pandas as pd
from pdf_parser import sentence_parser
from sentence_transformers import SentenceTransformer, util

# Specify the model
# Sentence-embedding model used by execute_query to encode both the query and
# the parsed report sentences.
mvp_v2_model = SentenceTransformer("all-mpnet-base-v2")
# Number of highest-scoring sentences kept per query (applied with .head() in
# execute_query before the results are written to CSV).
maximum_lines = 5



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Retrieve sentences from the report
def parse(file_path):
    """Return the sentence strings extracted from the PDF report at ``file_path``.

    Delegates to ``sentence_parser`` (with the "pypdf" parsing method and
    sentence printing turned off) and keeps only the ``'Sentence'`` entry of
    each record it returns.

    Args:
        file_path: Location of the PDF report, forwarded to ``sentence_parser``.

    Returns:
        list: The ``'Sentence'`` value of every record, in the order produced
        by the parser.
    """
    parser_options = {
        "print_sentences": False,
        "num_sentences": 10,
        "parsing_method": "pypdf",
    }
    records = sentence_parser(file_path, **parser_options)

    sentences = []
    for record in records:
        sentences.append(record["Sentence"])
    return sentences

def execute_query(parsed_sentences, current_query, filename, sentence_embedding=None):
    """Rank ``parsed_sentences`` against ``current_query`` and save the best matches.

    Encodes the query (and, unless supplied, the sentences) with the
    notebook-level ``mvp_v2_model``, scores every sentence by cosine similarity
    to the query, prints the time spent on encoding and scoring, and writes the
    ``maximum_lines`` highest-scoring rows to
    ``tmp/{filename}_{current_query}.csv``.

    Args:
        parsed_sentences: Sentences to rank, e.g. the list returned by ``parse``.
        current_query: Query text. Also used in the output file name; path
            separators in it are replaced with ``_`` so the file always lands
            directly inside ``tmp/``.
        filename: Prefix of the output CSV file name.
        sentence_embedding: Optional precomputed result of
            ``mvp_v2_model.encode(parsed_sentences)``. Supply it when running
            several queries over the same sentences so they are not re-encoded
            on every call. When ``None`` (the default) the sentences are
            encoded here.

    Returns:
        None. The ranked rows are written to disk as a CSV file.
    """
    # Record the start time (perf_counter is a monotonic clock meant for
    # measuring elapsed durations).
    start_time = time.perf_counter()

    # Create the embedding for the query
    query_embedding = mvp_v2_model.encode(current_query)

    # Create embeddings for the sentences unless the caller already has them
    if sentence_embedding is None:
        sentence_embedding = mvp_v2_model.encode(parsed_sentences)

    # Compute similarity scores; row 0 holds the scores for the single query
    scores = util.cos_sim(query_embedding, sentence_embedding)[0]

    # Total time spent on encoding and scoring
    execution_time = time.perf_counter() - start_time
    print("Execution Time:", execution_time, "seconds")

    # One row per sentence, with 1-based sentence IDs
    df = pd.DataFrame(
        {
            "Sentence_ID": range(1, len(parsed_sentences) + 1),
            "Sentence": parsed_sentences,
            "Score": scores.tolist(),
        }
    )

    # Keep only the best-scoring sentences
    top_matches = df.sort_values(by="Score", ascending=False).head(maximum_lines)

    # Make sure the output directory exists; to_csv does not create it.
    os.makedirs("tmp", exist_ok=True)

    # A path separator inside the query would otherwise point the file at a
    # (possibly missing) sub-directory instead of tmp/ itself.
    safe_query = current_query
    for separator in (os.sep, os.altsep):
        if separator:
            safe_query = safe_query.replace(separator, "_")

    # Save the top-scoring sentences
    top_matches.to_csv(f"tmp/{filename}_{safe_query}.csv", index=False)



In [3]:
# Test queries; each one is scored against every report below.
queries = [
    # Sourcing and labour practices
    "Related to ethical sourcing",
    "Related to Modern Slavery",
    # Energy, packaging and waste
    "Investment in Renewable Energy",
    "sustainable packaging",
    "recycling",
    "reducing waste",
    # Emissions
    "carbon emissions",
    "reduction in ghg emissions",
    "net zero",
    "scope 1",
    "scope 2",
    "scope 3",
    "carbon offset",
    # Products
    "sustainable product",
]

# Baby Bunting

In [4]:
# Parse the Baby Bunting 2022 report once, then run every query in `queries`
# against its sentences.
pdf_sentences = parse("../SampleReports/2022_BBunting_Report.pdf")

for q in queries:
    execute_query(pdf_sentences, q, "Baby_bunt_")


Summary:
Total number of sentences: 1167
Execution Time: 17.35959482192993 seconds
Execution Time: 14.142327785491943 seconds
Execution Time: 13.053298950195312 seconds
Execution Time: 13.226266145706177 seconds
Execution Time: 13.159518003463745 seconds
Execution Time: 14.068059206008911 seconds
Execution Time: 14.22685194015503 seconds
Execution Time: 13.999032020568848 seconds
Execution Time: 14.16236686706543 seconds
Execution Time: 14.292062997817993 seconds
Execution Time: 14.421489953994751 seconds
Execution Time: 14.759880781173706 seconds
Execution Time: 14.57327914237976 seconds
Execution Time: 14.582238912582397 seconds


In [5]:
# Parse the Metcash 2022 sustainability report once, then run every query in
# `queries` against its sentences.
pdf_sentences = parse("../SampleReports/Metcash-Sustainability-Report-2022.pdf")

for q in queries:
    execute_query(pdf_sentences, q, "Metcash_")


Summary:
Total number of sentences: 375
Execution Time: 6.166338920593262 seconds
Execution Time: 4.947556972503662 seconds
Execution Time: 4.974768877029419 seconds
Execution Time: 4.896899938583374 seconds
Execution Time: 4.917685031890869 seconds
Execution Time: 5.11457896232605 seconds
Execution Time: 4.912309885025024 seconds
Execution Time: 4.8960840702056885 seconds
Execution Time: 5.169673919677734 seconds
Execution Time: 5.024552822113037 seconds
Execution Time: 5.502887964248657 seconds
Execution Time: 5.484130144119263 seconds
Execution Time: 5.218505859375 seconds
Execution Time: 5.477402210235596 seconds
