In [1]:
import sys
sys.path.append("../NotebookScripts")

import pandas as pd
from pdf_parser import sentence_parser 
import torch
from transformers import AutoTokenizer, T5EncoderModel
import time

import warnings
# warnings.filterwarnings("ignore")

# # Load T5EncoderModel - small
# tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small", model_max_length = 750, suppress_warnings=True)
# model = T5EncoderModel.from_pretrained("google-t5/t5-small")

# Load T5EncoderModel - large (>2.8 GB - longer load time)
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-large", model_max_length = 750, suppress_warnings = True)
model = T5EncoderModel.from_pretrained("google-t5/t5-large")

In [2]:
file_path = '../SampleReports/2023_Coles_Report.pdf'

In [3]:
pdf_sentences = sentence_parser(file_path, print_sentences=True ,num_sentences=10, parsing_method='pypdf')

Report located in specified directory

Summary:
Total number of sentences: 685

 2023 Sustainability ReportWorking towards a more sustainable future Coles Group Limited ABN 11 004 089 936Acknowledgement of Country Coles wishes to acknowledge the Traditional Custodians of Country throughout Australia.

 We recognise their strength and resilience and pay our respects to their Elders past and present.

 Coles extends that respect to all Aboriginal and Torres Strait Islander people, and recognises their rich cultures and their continuing connection to land and waters.

 Aboriginal and Torres Strait Islander people are advised that this report may contain names and images of people who are deceased.

 All references to Indigenous and First Nations people in this report are intended to include Aboriginal and/or Torres Strait Islander people.

 Feedback We welcome feedback on this report.

 This report also includes forward-looking statements regarding climate change and other environmental a

In [4]:
# Record the end time
start_time = time.time()

# Initialize an empty list to store dictionaries
sentence_data = []

# Iterate over the sentences
for idx, sentence in enumerate(pdf_sentences, start=1):
    # Tokenize Input sentence and query
    input_ids = tokenizer(sentence, return_tensors='pt').input_ids
    query_ids = tokenizer('This statement is strongly related to Sustainable practices', return_tensors='pt').input_ids

    # Generate Encodings
    outputs = model(input_ids=input_ids)
    query = model(input_ids=query_ids)

    # Retrieve encodings for sentence and query.
    last_hidden_states = outputs.last_hidden_state
    last_hidden_states_query = query.last_hidden_state

    # Apply mean pooling along the token dimension (dim=1)
    pooled_last_hidden_states = torch.mean(last_hidden_states, dim=1).unsqueeze(0)
    pooled_last_hidden_states_query = torch.mean(last_hidden_states_query, dim=1).unsqueeze(0)

    # Reshape pooled_last_hidden_states_query to match the shape of pooled_last_hidden_states
    pooled_last_hidden_states_query = pooled_last_hidden_states_query.view(1, -1)

    # Compute dot product between the pooled representations
    similarity = torch.matmul(pooled_last_hidden_states, pooled_last_hidden_states_query.T)

    # Append a dictionary with score and sentence to the list
    sentence_data.append({'Sentence_ID': idx, 'Score': similarity.item(), 'Sentence': sentence})

# Create DataFrame from the list of dictionaries
df = pd.DataFrame(sentence_data)

# Record the end time
end_time = time.time()

In [5]:
# Calculate the total time taken
execution_time = end_time - start_time

print("Execution Time:", execution_time, "seconds")

Execution Time: 172.2840449810028 seconds


In [6]:
# Sort the DataFrame by the "Score" column in descending order
df_sorted = df.sort_values(by='Score', ascending=False)

# Display the sorted DataFrame
print(df_sorted.head(100))

     Sentence_ID     Score                                           Sentence
513          514  4.953652                     Achieve Platinum in the AWEI .
509          510  4.738400                    Sustain 40/40/201 on the Board.
110          111  4.572626  We are doing this by: Building the resilience ...
674          675  4.564928  Coles reported performance of the following se...
151          152  4.564800  This Scope 3 target was validated by the Scien...
..           ...       ...                                                ...
560          561  3.783192  During the year our Safety Plan focused on bui...
253          254  3.782845  This includes a dedicated sustainable packagin...
6              7  3.775985  This report also includes forward-looking stat...
104          105  3.775572  In the area of product safety and quality , we...
301          302  3.774663  This joins a range of MSC-certified products w...

[100 rows x 3 columns]
