<a href="https://colab.research.google.com/github/JyotsanaShankar/Thesis/blob/main/thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installations

In [None]:
!pip install transformers torch torchvision farm-haystack[all]

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting farm-haystack[all]
  Downloading farm_haystack-1.21.0-py3-none-any.whl (816 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m816.9/816.9 kB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
 

In [None]:
!pip install tensorflow==2.13.*

In [None]:
!pip install tabula-py

In [None]:
!pip install PyPDF2 pycryptodome

In [None]:
!pip install nltk

## Imports

In [None]:
import os
import fitz
import tabula
import io
import PyPDF2
import pickle
import json

In [None]:
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import PreProcessor, FARMReader, DensePassageRetriever, DenseRetriever, PDFToTextConverter
from haystack.document_stores.faiss import FAISSDocumentStore

In [None]:
import nltk
from nltk import word_tokenize
from nltk.metrics.scores import f_measure
# nltk.download() # Run this if word_tokenize fails because the 'punkt' tokenizer is missing: select "download", type 'punkt' and hit Enter

# Implementation

### Pdf File processing

Crop the header and footer area of every page to get rid of repeated headers, footers and page numbers.

In [None]:
def remove_header_footer(header, footer, input_path, output_path):
  """Crop a fixed margin off the top and bottom of every page of a PDF.

  Each page's media box is shrunk by ``header`` at the top and by ``footer``
  at the bottom, and the resulting PDF is written to ``output_path``.

  Args:
    header: Amount subtracted from the top edge of each page's media box
      (in the PDF's page units; presumably points, confirm for your files).
    footer: Amount added to the bottom edge of each page's media box.
    input_path: Path of the PDF to read.
    output_path: Path the cropped PDF is written to.
  """
  with open(input_path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    pdf_writer = PyPDF2.PdfWriter()

    for page in pdf_reader.pages:
      page.mediabox.top = page.mediabox.top - header
      page.mediabox.bottom = page.mediabox.bottom + footer
      pdf_writer.add_page(page)

    # Serialise while the source file is still open: the writer may still need
    # to read page content from the reader's stream at write time. Going through
    # an in-memory buffer also keeps input_path == output_path safe.
    buffer = io.BytesIO()
    pdf_writer.write(buffer)

  with open(output_path, 'wb') as output_pdf:
    output_pdf.write(buffer.getvalue())


Functions to extract text data (using PyMuPDF) and tabular data (using tabula)

In [None]:
def get_text_data(pdf_path):
  """Return the plain text of every page of a PDF, concatenated in page order.

  Args:
    pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

  Returns:
    One string holding the text of all pages, in page order.
  """
  # Read from the path argument so the function does not depend on notebook globals.
  doc = fitz.open(pdf_path)
  try:
    print('reading', pdf_path, 'having', len(doc), 'pages' )
    page_texts = [doc[page_num].get_text("text") for page_num in range(len(doc))]
  finally:
    # Release the file handle even if text extraction fails.
    doc.close()
  return "".join(page_texts)

def get_tabular_data(pdf_path):
  """Extract the tables of a PDF with tabula-py as lists of row dictionaries.

  Args:
    pdf_path: Path of the PDF file handed to ``tabula.read_pdf``.

  Returns:
    A list with one entry per non-empty table; each entry is the table's rows
    as a list of ``{column: value}`` dictionaries. Returns an empty list when
    extraction fails or no table has any data.
  """
  try:
    # tabula is expected to return a list of pandas DataFrames, one per detected table.
    tabular_data = tabula.read_pdf(pdf_path, pages="all", format='JSON')
  except Exception as e:
    # Table extraction is best-effort, but report the reason instead of hiding it
    # (e.g. a missing Java runtime would otherwise look like "no tables").
    print('table extraction failed for', pdf_path, ':', e)
    tabular_data = []

  tabular_data_dicts = []
  for df in tabular_data:
    # Drop columns that contain no values at all before converting.
    cleaned = df.dropna(axis=1, how='all')
    if not cleaned.empty:
      tabular_data_dicts.append(cleaned.to_dict(orient='records'))
  return tabular_data_dicts

In [None]:
# Shared Haystack PreProcessor used by get_processed_documents:
# it cleans each document and splits it into overlapping word-based chunks.
preprocessor = PreProcessor(
    # cleaning options
    clean_whitespace=True,
    clean_header_footer=True,
    clean_empty_lines=True,
    # splitting options: chunks of 250 words with 50 words of overlap,
    # without cutting through sentences
    split_by='word',
    split_length=250,
    split_overlap=50,
    split_respect_sentence_boundary=True,
)

def get_processed_documents(all_documents):
  """Run each document through the shared ``preprocessor`` and gather the output.

  Args:
    all_documents: Iterable of documents; each one is processed on its own.

  Returns:
    A flat list with the processed pieces of every document, in input order.
  """
  processed_docs = []
  for document in all_documents:
    processed_docs.extend(preprocessor.process([document]))
  return processed_docs

def get_qa_pipeline(document_store, reader):
  """Build an extractive QA pipeline on top of ``document_store``.

  A new DensePassageRetriever is created for the store, the store's embeddings
  are updated with it, and the retriever is combined with ``reader``.

  Args:
    document_store: Store holding the documents to search.
    reader: Reader node that extracts answers from retrieved documents.

  Returns:
    The assembled ``ExtractiveQAPipeline``.
  """
  dpr_retriever = DensePassageRetriever(document_store=document_store, use_gpu=True)
  # The embeddings have to be refreshed before the retriever can be queried.
  document_store.update_embeddings(dpr_retriever)
  return ExtractiveQAPipeline(reader=reader, retriever=dpr_retriever)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Defining FAISSDocumentStore

In [None]:
# Remove a database file left over from a previous run before creating the store
# (presumably the default SQLite file FAISSDocumentStore creates; verify if sql_url is changed).
faiss_doc = 'faiss_document_store.db'
if os.path.exists(faiss_doc):
  os.remove(faiss_doc)

# Flat FAISS index; embeddings are returned together with the documents.
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)

Defining Reader

In [None]:
# Reader node that extracts answer spans; loads the "deepset/roberta-base-squad2" model.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

Defining Questions

In [None]:
# Questions asked of every report. Each entry must be a plain string: it is sent
# as one query to the QA pipeline and used as a lookup key when scoring.
# The wording is kept verbatim because ground-truth files are keyed by question text.
data = {
    "questions": [
        'How much is  greenhouse gas (GHG) emission is generated from production during the reporting period?',
        'How much is the scope 1 emissions for the reporting period?',
        'How much is the scope 2 emission for the reporting period?',
        'How much percentage of scope 1 emissions is reduced in the reporting period?',
        'By when company is targetting to become carbon neutral?',
        'How much water company consume in reporting year?',
        'How much waste company generate in reporting year?',
        'How much energy is consumed in reporting year?',
    ]
}

# Flat list of question strings (previously a list wrapping a single list).
questions = data["questions"]

In [None]:
# Load the reference answers from ground_truth_real.json.
# presumably keyed by PDF file name and then by question text; verify against the file
with open("ground_truth_real.json", "r") as json_file:
    ground_truth = json.load(json_file)

Generating answers for all the files present in the **pdf_files** folder

In [None]:
# Selects how PDF content is extracted in the processing loop:
#   True  -> Haystack's PDFToTextConverter
#   False -> PyMuPDF text (get_text_data) plus tabula tables (get_tabular_data)
use_pdf_to_text_converter = True

In [None]:
pdf_dir = "pdf_files"
temp_pdf_dir = "temp"
header = 40   # margin cropped from the top of every page
footer = 40   # margin cropped from the bottom of every page
all_pdf_answer = {}   # PDF file name -> list of pipeline results, one per question

# The cropped copies are written here; create the folder so the first write cannot fail.
os.makedirs(temp_pdf_dir, exist_ok=True)

# The converter settings are the same for every file, so build it once.
converter = None
if use_pdf_to_text_converter:
    converter = PDFToTextConverter(remove_numeric_tables=False, valid_languages=['en'])

# Iterate through PDF files in the directory (sorted for a reproducible order)
for pdf_file in sorted(os.listdir(pdf_dir)):
    if not pdf_file.endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, pdf_file)
    temp_file_path = os.path.join(temp_pdf_dir, pdf_file)

    try:
        # Remove header footer
        remove_header_footer(header,
                             footer,
                             pdf_path,
                             temp_file_path)

        if use_pdf_to_text_converter:
            all_documents = converter.convert(temp_file_path, meta = {"file_name": pdf_file})
        else:
            # Extract text from the PDF file
            pdf_text = get_text_data(temp_file_path)

            # Extract tabular data
            tabular_data_dicts = get_tabular_data(temp_file_path)

            # Wrap text and tables in a single document for this file
            all_documents = [{
                "content": pdf_text,
                "tables": tabular_data_dicts,
                "meta": {"file_name": pdf_file}
            }]
    finally:
        # Always drop the cropped copy, even when cropping or extraction failed.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    processed_docs = get_processed_documents(all_documents)

    # The store holds one PDF at a time, so answers can only come from the current file.
    document_store.delete_documents()
    document_store.write_documents(processed_docs)

    # NOTE: get_qa_pipeline creates a new retriever and updates the embeddings for every PDF.
    qa_pipeline = get_qa_pipeline(document_store, reader)

    answers = []
    #questions = list(ground_truth[pdf_file].keys())
    for question in questions:
        result = qa_pipeline.run(query=question, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 1}})
        print(result)
        answers.append(result)

    all_pdf_answer[pdf_file] = answers


Save the answers to a pickle file

In [None]:
# Persist the raw pipeline results (file name -> list of results) with pickle.
answer_file_name = 'answers.pkl'

with open(answer_file_name, 'wb') as fp:
    pickle.dump(all_pdf_answer, fp)


Read the answers back from the pickle file

In [None]:
# Reload the pickled results from answer_file_name.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
# This rebinds `answers` to the whole pickled object (the per-file mapping that was dumped).
with open(answer_file_name, 'rb') as fp:
    answers = pickle.load(fp)

In [None]:
# Peek at the answers for the first question of '1.pdf', reduced to answer text and score.
answer_list = [
    {'answer': candidate.answer, 'score' : candidate.score}
    for candidate in answers['1.pdf'][0]['answers']
]
answer_list

In [None]:
# Reduce the raw pipeline results to JSON-friendly data:
# file name -> list of {"query": ..., "answers": [{"answer": ..., "score": ...}]}.
min_ans = {}
# Iterate over the collected results themselves rather than the folder listing,
# so files in pdf_dir that were never processed (e.g. non-PDF files) cannot raise KeyError.
for pdf_file, file_results in all_pdf_answer.items():
  min_ans[pdf_file] = [
      {
          "query": result['query'],
          "answers": [
              {'answer': candidate.answer, 'score' : candidate.score}
              for candidate in result['answers']
          ]
      }
      for result in file_results
  ]

In [None]:
# Expose the reduced answers under the name the evaluation loop iterates over.
generated_answer = min_ans

In [None]:
# Write the reduced answers (query, answer text, score) to a JSON file.
# NOTE(review): the evaluation section reads "generated_answer.json", not this file.
# Confirm which file name is intended.
min_answer_file_name = 'min_answers.json'
json_data = json.dumps(min_ans)

with open(min_answer_file_name, 'w') as fp:
    fp.write(json_data)

In [None]:
# Display the raw pipeline results for inspection (the output can be very large).
all_pdf_answer

In [None]:
# Candidate reference answers per question, collected as plain strings.
# The keys are questions[0] .. questions[7], so `questions` must be a flat list of
# eight (hashable) question strings for this cell to run.
# NOTE(review): values are kept exactly as entered (including the empty string and
# entries such as "14,336,42777,3"); verify them against the source reports.
ground_truth_answers = {
    questions[0]: ["2,062 ", "10,068","25.922","13.9","11,048,504","7,114,667","13.9","11,509,756","145","106","1.5", "0.2","259,429","14,336,42777,3","1,237"],
    questions[1]: ["1,027", "326","742", "333","1,471","10,068","11,509,756", "106","46","0.4","54","5","1.5","0.2","304","138"],
    questions[2]: ["1,492", "1035","426", "954","11,245","147","9,631", "135,183","60","39","28","10","206","178","28","10","0.9","1.5","0.2","499",""],
    questions[3]: ["18", "20", "25","more than 20","between 30 and 50 percent","28 percent"],
    questions[4]: ["2025","2039","2040","2050","2030"],
    questions[5]: ["11,778","113.736", "60.8", "1.16","1.14","14"],
    questions[6]: ["1,025","1.183","0.633","25.3","1.14","2,334","126,736","12,702"],
    questions[7]: ["419.6 GWh","1,005 GWh","86.7 GWh","76.761","0.041","859","836","5","1,632","1,424","1,201","207","625","10","10,883,534","1,326,842","2,126,050","0.2","0.7","14,336,42777.,3"],

  # Unused example values left by the author:
  #"15,136,769 metric tons", "145000 metric tonnes "," 1.7 metric tones reduced", "259,429 metric tonnes "
    #"Implemented energy-efficient lighting", "Switched to electric vehicles", "Installed solar panels"
}

In [None]:
# `answers` was reloaded from the pickle file and maps each PDF file name to the
# list of pipeline results for that file, so walk the files first and then
# the per-question results.
for pdf_name, file_results in answers.items():
  print(50*"=")
  print("File : ", pdf_name)
  for answerdetail in file_results:
    print(50*"-")
    print("Query : ", answerdetail["query"])
    print(50*"-")
    for answer in answerdetail["answers"]:
      print("Answer : ", answer.answer)
      print("Score : ", answer.score)
      print("Filename : ", answer.meta["file_name"])
      print("\n")

# Performance Metrics

In [None]:
# Function to calculate F1 score between predicted and ground truth answers
def calculate_f1_score(predicted_answer, ground_truth_answer):
    """Token-level F1 between a predicted answer and a ground-truth answer.

    Both strings are lower-cased and tokenised with ``word_tokenize``; the score
    is computed on the sets of unique tokens.

    Args:
        predicted_answer: Answer text produced by the model.
        ground_truth_answer: Reference answer text.

    Returns:
        The F1 score in [0, 1]. Returns 0.0 when the two answers share no token,
        which includes the case where either answer has no tokens.
    """
    # Sets of unique tokens
    predicted_set = set(word_tokenize(predicted_answer.lower()))
    gt_set = set(word_tokenize(ground_truth_answer.lower()))

    overlap = len(predicted_set & gt_set)
    if overlap == 0:
        # Also covers empty token sets, where precision/recall are undefined.
        return 0.0

    precision = overlap / len(predicted_set)   # share of predicted tokens that are correct
    recall = overlap / len(gt_set)             # share of reference tokens that were found
    return (2 * precision * recall) / (precision + recall)

In [None]:
# Load the evaluation inputs from JSON files.
# NOTE(review): this rebinds `ground_truth` (loaded earlier from "ground_truth_real.json")
# and `generated_answer` (built in memory earlier). The answers were written to
# 'min_answers.json', so confirm that "generated_answer.json" is the intended file.
import json
with open("ground_truth.json", "r") as json_file:
    ground_truth = json.load(json_file)

with open("generated_answer.json", "r") as json_file:
    generated_answer = json.load(json_file)

In [None]:
# Display the loaded ground truth for inspection.
ground_truth

In [None]:
# Score the top answer of every question per file: exact-match accuracy and token F1.
# Questions whose ground truth is the empty string are left out of the scoring
# and out of the averages.
for each_file in generated_answer:
  print('File:', each_file)
  accuracy = 0
  f1_scores = []
  has_ans = 0   # number of questions that have a ground-truth answer
  qa_list = generated_answer[each_file]
  for qa in qa_list:
    question = qa['query']
    gt_answer = ground_truth[each_file][question]
    if gt_answer == "":
      continue

    # Use the top-ranked answer; treat a missing or null answer as an empty prediction.
    candidates = qa['answers']
    predicted_answer = (candidates[0]['answer'] if candidates else "") or ""

    if predicted_answer == gt_answer:
      accuracy += 1
    # An empty prediction scores 0 without being tokenised.
    f1_score = calculate_f1_score(predicted_answer, gt_answer) if predicted_answer else 0.0
    f1_scores.append(f1_score)
    has_ans = has_ans + 1

  total_questions = len(qa_list)
  # Average over the questions that were actually scored; guard against none being scored.
  accuracy = accuracy / has_ans if has_ans else 0.0
  average_f1_score = sum(f1_scores) / has_ans if has_ans else 0.0
  print(f"Scored questions: {has_ans}/{total_questions}")
  print(f"Accuracy: {accuracy:.2f}")
  print(f"Average F1 Score: {average_f1_score:.2f}")
  print('F1 scores:', f1_scores)
  print("\n")

File: 2.pdf
Accuracy: 0.00
Average F1 Score: 0.17
F1 scores: [0, 0, 0, 0, 0.6666666666666666, 0.6666666666666666]


File: 1.pdf
Accuracy: 0.12
Average F1 Score: 0.12
F1 scores: [0, 0, 0, 0, 1.0, 0, 0, 0]


File: 5.pdf
Accuracy: 0.12
Average F1 Score: 0.17
F1 scores: [0, 0, 0, 1.0, 0.3333333333333333]


File: 3.pdf
Accuracy: 0.12
Average F1 Score: 0.12
F1 scores: [0, 0, 0, 1.0, 0]


File: 6.pdf
Accuracy: 0.38
Average F1 Score: 0.38
F1 scores: [1.0, 0, 0, 1.0, 0, 0, 1.0]


File: 4.pdf
Accuracy: 0.12
Average F1 Score: 0.23
F1 scores: [0, 0, 0.16666666666666666, 1.0, 0.6666666666666666, 0, 0, 0]


