##Utils

### Installation and requirements

In [1]:
!pip install pdf2image
!pip install opencv-python
!pip install Pillow
!sudo apt install tesseract-ocr
!pip install pytesseract
!apt-get install poppler-utils
!pip install -U sentence-transformers
!pip install rank_bm25

Collecting pdf2image
  Downloading pdf2image-1.16.3-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.16.3
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 19 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (5,255 kB/s)
debconf: una

In [2]:
#Creating useful folders
!mkdir img
!mkdir texts
!mkdir -p texts/page_splitted
!mkdir -p texts/full_text

In [3]:
from sentence_transformers import SentenceTransformer, util
from pdf2image import convert_from_path
import re
import cv2
import numpy as np
import pandas as pd
import pytesseract
from pytesseract import Output
from matplotlib import pyplot as plt
from rank_bm25 import BM25L, BM25Plus
from sentence_transformers import CrossEncoder

### Functions definition

In [4]:
def phrases_from_txt(file_path, full_text = True, split_value = '.\n'):
  if full_text == False:
    with open(file_path, 'r') as f:
      txt = f.read()
    phrases = txt.split("\n\n\n")
    phrases = phrases[:-1] #removing last empty element of the list
  else:
    with open(file_path, 'r') as f:
      txt = f.read()
      phrases = txt.split(split_value)

  return phrases

In [5]:
def split_at_colon(file_path):

  with open(file_path, 'r') as f:
      txt = f.read()

  phrases  = txt.split(": ")

  for i in range(len(phrases )-1):
    elem = phrases [i].split("\n")[-1]
    phrases [i] = phrases [i].replace(elem, '')
    phrases [i+1] = elem + " " + phrases [i+1] + ":"

  return phrases

In [6]:
def remove_unwanted_text(text):
  new_text = str(text)
  new_text = re.sub('(\:(-)?\)|\:(-)?\(|<3|\:(-)?\/|\:-\/|\:(-)?\||\:(-)?[pP]|\s\:+(-)?([0-9])?\s|\^\^|\s\:+(-)?(\D)?\s)', '', new_text)  # removing smile with :
  new_text = new_text.replace('__', '')
  new_text = new_text.replace('“', '')
  new_text = new_text.replace('”', '')
  new_text = new_text.replace('"', '')
  new_text = new_text.replace('/', '')
  new_text = new_text.replace('\\','/')
  new_text = new_text.replace('\n\n',' ')
  return new_text


In [7]:
def combined_list(list1, list2, desired_length):
    common_elements = list(set(list1) & set(list2))  # Get common elements between the two lists
    list1 = [x for x in list1 if x not in common_elements]
    list2 = [x for x in list2 if x not in common_elements]
    combined_list = []
    combined_list.extend(common_elements)
    for i in range(min(len(list1), len(list2), desired_length)):
        combined_list.append(list1[i])
        combined_list.append(list2[i])
    return combined_list[:desired_length]

In [8]:
def semantic_search(query, docs, model, metric, top_n = 10, doc_emb_available = False, doc_emb = 0):
  #Load the model
  model = SentenceTransformer(model)

  #Encode query and documents
  query_emb = model.encode(query)

  if doc_emb_available == True:
    doc_emb = doc_emb
  else:
    doc_emb = model.encode(docs)

  #Compute the selected metric between query and all document embeddings
  scores = metric(query_emb, doc_emb)[0].cpu().tolist()

  #Combine docs & scores
  doc_score_pairs = list(zip(docs, scores))

  #doc_score_pairs = [(index + 1, doc, score) for index, (doc, score) in enumerate(doc_score_pairs)]

  #Sort by decreasing score
  doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

  #Select only the top_n documents
  doc_score_pairs = doc_score_pairs[:top_n]

  best_document_match = doc_score_pairs[0][0] #text of the best match (page of best match in case of page_splitting)

  #Output passages & scores
  for doc, score in doc_score_pairs:
      print(score, '\n', doc, '\n\n')
  return doc_emb

In [9]:
def semantic_search_top_n(query, docs, model, metric, top_n = 3, doc_emb_available = False, doc_emb = 0):
  #Load the model
  model = SentenceTransformer(model)

  #Encode query and documents
  query_emb = model.encode(query)

  if doc_emb_available == True:
    doc_emb = doc_emb
  else:
    doc_emb = model.encode(docs)

  #Compute the selected metric between query and all document embeddings
  scores = metric(query_emb, doc_emb)[0].cpu().tolist()

  #Combine docs & scores
  doc_score_pairs = list(zip(docs, scores))

  #doc_score_pairs = [(index + 1, doc, score) for index, (doc, score) in enumerate(doc_score_pairs)]

  #Sort by decreasing score
  doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

  #Choosing only the top_n documents
  doc_score_pairs = doc_score_pairs[:top_n]

  pages = [text[0] for text in doc_score_pairs]

  concatenated_pages = ' '.join(pages)

  print(concatenated_pages)
  # Try opening the file in append mode
  try:
      with open("Text for LLM.txt", "a") as file:
          file.write('\nDocument name: ' + pdf_name+ '\n\n' + concatenated_pages + "\n\n")
  except FileNotFoundError:
      # File doesn't exist, create it and append the text
      with open("Text for LLM.txt", "w") as file:
          file.write(concatenated_pages + "\n")

  return doc_emb

In [10]:
def semantic_search_top_n_hybrid(query, docs, model, metric, top_n = 3, bm25model = BM25Plus, doc_emb_available = False, doc_emb = 0):

  tokenized_corpus = [doc.split(" ") for doc in docs]

  bm25 = bm25model(tokenized_corpus)

  tokenized_query = query.split(" ")

  scores_bm25 = bm25.get_scores(tokenized_query)

  sorted_indices = np.argsort(scores_bm25)[::-1]

  # Return the indices of the top n elements
  top_n_indices_bm25 = sorted_indices[:top_n]
  top_n_indices_bm25 = top_n_indices_bm25.tolist()

  #Load the model
  model = SentenceTransformer(model)

  #Encode query and documents
  query_emb = model.encode(query)

  if doc_emb_available == True:
    doc_emb = doc_emb
  else:
    doc_emb = model.encode(docs)

  #Compute the selected metric between query and all document embeddings
  scores_semantic = metric(query_emb, doc_emb)[0].cpu().tolist()

  sorted_indices = np.argsort(scores_semantic)[::-1]

  # Return the indices of the top n elements
  top_n_indices_semantic = sorted_indices[:top_n]
  top_n_indices_semantic = top_n_indices_semantic.tolist()

  indices_top_n_documents = combined_list(top_n_indices_bm25, top_n_indices_semantic, top_n)

  print("Top3 best results with semantic search: ", top_n_indices_semantic, '\n')
  print("Top3 best results with bm25 search: ", top_n_indices_bm25, '\n')
  print("Top3 best results with hybrid search: ", indices_top_n_documents, '\n')

  text = ""
  for i in indices_top_n_documents:
    text = text + phrases[i]

  return doc_emb, text

In [27]:
def semantic_search_top_n_re_ranking(query, docs, model, metric, top_n = 3, num_docs = 20, model_cross = 'cross-encoder/ms-marco-MiniLM-L-6-v2', doc_emb_available = False, doc_emb = 0):
  #Semantic search
  #Load the model
  model = SentenceTransformer(model)

  #Encode query and documents
  query_emb = model.encode(query)

  if doc_emb_available == True:
    doc_emb = doc_emb
  else:
    doc_emb = model.encode(docs)

  #Compute the selected metric between query and all document embeddings
  scores_semantic = metric(query_emb, doc_emb)[0].cpu().tolist()

  sorted_indices = np.argsort(scores_semantic)[::-1]

  # Return the indices of the top n elements
  top_n_indices_semantic = sorted_indices[:num_docs]
  top_n_indices_semantic = top_n_indices_semantic.tolist()

  #Re-ranking
  scores = []
  model_cross = CrossEncoder(model_cross, max_length=512)
  for i in top_n_indices_semantic:
    scores.append((i,(model_cross.predict((query, docs[i])))))

  sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)[:top_n]

  print("Top results with semantic and re-ranking search: ", sorted_scores)
  indices = [t[0] for t in sorted_scores]

  text = ""
  for i in indices:
    text = text + phrases[i]

  return doc_emb, text

In [12]:
# Semantic search that outputs the best page according to the semantic search and both the previous and the following one concatenating all text in one

def semantic_search_three_pages(query, docs, model, metric, doc_emb_available=False, doc_emb=0):
    # Load the model
    model = SentenceTransformer(model)

    # Encode query and documents
    query_emb = model.encode(query)

    if doc_emb_available:
        doc_emb = doc_emb
    else:
        doc_emb = model.encode(docs)

    # Compute the selected metric between query and all document embeddings
    scores = metric(query_emb, doc_emb)[0].cpu().tolist()

    # Combine docs, scores, and indices
    doc_score_index_triplets = list(zip(docs, scores, range(len(docs))))

    # Find the index of the best match
    best_match_index = max(doc_score_index_triplets, key=lambda x: x[1])[2]

    # Find the index of the page before the best match
    page_before_index = best_match_index - 1 if best_match_index > 0 else None

    # Find the index of the page after the best match
    page_after_index = best_match_index + 1 if best_match_index < len(docs) - 1 else None

    # Sort by decreasing score
    doc_score_index_triplets = sorted(doc_score_index_triplets, key=lambda x: x[1], reverse=True)

    # Extract the text of the best match
    best_document_match = doc_score_index_triplets[0][0]

    # Concatenate the text of the page before the best match, best match, and the page after the best match
    if page_before_index is not None:
        best_document_match = docs[page_before_index] + best_document_match
    if page_after_index is not None:
        best_document_match += docs[page_after_index]

    print(best_document_match)

    return doc_emb

##Variables

In [13]:
pdf_name = "prova.pdf"
pdf_path = r"/content/" + pdf_name
IMG_DIR = '/content/img/'
full_text_file_path = '/content/texts/full_text/'+ "full text " + pdf_name.replace(".pdf", ".txt")
page_splitted_file_path = '/content/texts/page_splitted/'+ "page splitted " + pdf_name.replace(".pdf", ".txt")

##Pytesseract

###First time converting pdf

In [14]:
pages = convert_from_path(pdf_path, 400)

i = 1
for page in pages:
    image_name = IMG_DIR + "Page_" + str(i) + ".jpg"
    page.save(image_name, "JPEG")
    i = i+1

In [15]:
stringTotal=[]
custom_config = r'--oem 3 --psm 6'
i=1
for page in pages:
  image = cv2.imread(IMG_DIR + 'Page_' + str(i) + '.jpg')
  writePdf=(pytesseract.image_to_string(image, config=custom_config))
  page_name = 'Page number: ' + str(i)
  #writePdf = '\n' + page_name + '\n' + writePdf #useful only for page splitting
  stringTotal.append(writePdf)
  i=i+1

pdfTotal=' '.join(stringTotal)


In [16]:
#Sanity check
len(pages)

13

In [17]:
with open(full_text_file_path, 'w') as f:
  f.write(pdfTotal)

In [18]:
#Splitting at each page
phrases = stringTotal

with open(page_splitted_file_path, 'w') as f:
  for phrase in phrases:
        f.write(f"{phrase}\n\n\n")


###Starting from an already converted pdf

In [19]:
#split_at_colon(full_text_file_path)

In [20]:
phrases = phrases_from_txt(page_splitted_file_path,full_text = False) #For page splitting
#phrases = phrases_from_txt(full_text_file_path, split_value = ".\n") #For line splitting at specific character (split_value)

In [21]:
#Sanity check
len(phrases)

13

##Text processing

In [22]:
phrases=[remove_unwanted_text(text) for text in phrases]

## Semantic search


In [28]:
doc_emb, text = semantic_search_top_n_re_ranking("Death benefit increase", phrases, 'sentence-transformers/all-MiniLM-L6-v2',util.cos_sim, top_n=2, num_docs = 20)

Top results with semantic and re-ranking search:  [(5, 3.7563944), (6, 0.23575786)]


In [24]:
text

'C Benefits provided by this Policy\nC1 Death Benefit\nUpon the death of the Designated Life Insured, the Death Benefit specified in the Policy Schedule is payable to the\nBeneficiary subject to the terms and provisions of this Policy.\nIf your Coverage Option is Joint First-to-die, and a second Life Insured dies within sixty (60) days of the Designated\nLife Insured, we will pay a supplementary Death Benefit to the Beneficiary provided that the surviving Life Insured\nhas not exercised the Survivor Privilege described in provision E 1. The supplementary Death Benefit payable will\nbe equal to the Death Benefit under this Policy the day the Designated Life Insured died, excluding any Riders. We\nwill not pay a Death Benefit for any subsequent deaths.\nIf two or more Lives Insured die at the same time, or under circumstances that make it uncertain who died first, we\nwill deem a younger insured to have survived an older insured and the oldest such Life Insured will be deemed the\nDesign

## LLM

In [None]:
%pip install "accelerate>=0.16.0,<1" "transformers[torch]>=4.28.1,<5" "torch>=1.13.1,<2"

In [None]:
import torch
from transformers import pipeline

generate_text = pipeline(model="databricks/dolly-v2-3b", torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")

In [30]:
prompt = "Given the following text, tell me which is the MINIMUM increase allowed in death benefit: " + text
res = generate_text(prompt)
print(res[0]["generated_text"])

The minimum increase allowed is $10,000.


## Test stage

### Tables


In [None]:
!pip install tabula-py

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tabula-py
  Downloading tabula_py-2.7.0-py3-none-any.whl (12.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m79.5 MB/s[0m eta [36m0:00:00[0m
Collecting distro (from tabula-py)
  Downloading distro-1.8.0-py3-none-any.whl (20 kB)
Installing collected packages: distro, tabula-py
Successfully installed distro-1.8.0 tabula-py-2.7.0


In [None]:
import tabula
#tabula.convert_into("SLIP RC GRUPO SUMCAB 2022.pdf","prova.csv",pages = "all")
df = tabula.read_pdf("/content/prova", pages="all")

Jun 14, 2023 1:42:02 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Jun 14, 2023 1:42:04 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>



In [None]:
df[1]

Unnamed: 0.1,EXPLOTACIÓN Y PATRONAL (PARTE A),Unnamed: 0
0,"DAÑOS POR INCENDIO, EXPLOSIÓN, AGUAS",INCLUIDO
1,LOCATIVA,INCLUIDO
2,PATRONAL (SUBLÍMITE POR VÍCTIMA),600.000 €
3,R.C. DAÑOS A BIENES DE EMPLEADOS,30.000 € STRO/AÑO Y 3.000 POR\rOBJETO
4,R.C. CRUZADA,INCLUIDO
5,R.C. SUBSIDIARIA,INCLUIDO
6,R.C. CONTAMINACIÓN ACCIDENTAL,INCLUIDO
7,R.C. TECNICOS EN PLANTILLA,INCLUIDO
8,R.C. SUBSIDIARIA DE VEHICULOS,INCLUIDO
9,R.C. CARGA Y DESCARGA,INCLUIDO


In [None]:
import json
diction = df[1].to_dict()
dic_json = json.dumps(diction)
dic_json

'{"EXPLOTACI\\u00d3N Y PATRONAL (PARTE A)": {"0": "DA\\u00d1OS POR INCENDIO, EXPLOSI\\u00d3N, AGUAS", "1": "LOCATIVA", "2": "PATRONAL (SUBL\\u00cdMITE POR V\\u00cdCTIMA)", "3": "R.C. DA\\u00d1OS A BIENES DE EMPLEADOS", "4": "R.C. CRUZADA", "5": "R.C. SUBSIDIARIA", "6": "R.C. CONTAMINACI\\u00d3N ACCIDENTAL", "7": "R.C. TECNICOS EN PLANTILLA", "8": "R.C. SUBSIDIARIA DE VEHICULOS", "9": "R.C. CARGA Y DESCARGA", "10": "R.C. TRANSPORTE DE MERCANCIAS", "11": "PRODUCTOS (PARTE B)", "12": "R.C. PRODUCTOS", "13": "R.C. UNION Y MEZCLA", "14": "GASTOS DE MONTAJE/DESMONTAJE/SUSTITUCION (Incluido\\rproducto de Terceros)", "15": "R.C. PATRIMONIALES PRIMARIOS", "16": "OTRAS CLAUSULAS INCLUIDAS", "17": "CLAUSULA DE RECOMPRA DE S.A.", "18": "CLAUSULA DE PROGRAMA INTERNACIONAL L.P.S.", "19": "LIBERACION DE GASTOS", "20": "R.C. DEFENSA Y FIANZAS"}, "Unnamed: 0": {"0": "INCLUIDO", "1": "INCLUIDO", "2": "600.000 \\u20ac", "3": "30.000 \\u20ac STRO/A\\u00d1O Y 3.000 POR\\rOBJETO", "4": "INCLUIDO", "5": "INC

In [None]:
import re
txt = ""
for elem in range(len(df)):
  txt = txt + (str(df[elem]).replace('\n', '').replace('\t', ''))

txt = re.sub(r'\s+', ' ', txt)

In [None]:
print(txt)

 COBERTURA LIMITE0 Límite por siniestro/año 3.000.000 € EXPLOTACIÓN Y PATRONAL (PARTE A) \0 DAÑOS POR INCENDIO, EXPLOSIÓN, AGUAS 1 LOCATIVA 2 PATRONAL (SUBLÍMITE POR VÍCTIMA) 3 R.C. DAÑOS A BIENES DE EMPLEADOS 4 R.C. CRUZADA 5 R.C. SUBSIDIARIA 6 R.C. CONTAMINACIÓN ACCIDENTAL 7 R.C. TECNICOS EN PLANTILLA 8 R.C. SUBSIDIARIA DE VEHICULOS 9 R.C. CARGA Y DESCARGA 10 R.C. TRANSPORTE DE MERCANCIAS 11 PRODUCTOS (PARTE B) 12 R.C. PRODUCTOS 13 R.C. UNION Y MEZCLA 14 GASTOS DE MONTAJE/DESMONTAJE/SUSTITUCION (Incl... 15 R.C. PATRIMONIALES PRIMARIOS 16 OTRAS CLAUSULAS INCLUIDAS 17 CLAUSULA DE RECOMPRA DE S.A. 18 CLAUSULA DE PROGRAMA INTERNACIONAL L.P.S. 19 LIBERACION DE GASTOS 20 R.C. DEFENSA Y FIANZAS Unnamed: 0 0 INCLUIDO 1 INCLUIDO 2 600.000 € 3 30.000 € STRO/AÑO Y 3.000 POR\rOBJETO 4 INCLUIDO 5 INCLUIDO 6 INCLUIDO 7 INCLUIDO 8 INCLUIDO 9 INCLUIDO 10 INCLUIDO 11 NaN 12 INCLUIDO 13 INCLUIDO 14 400.000 € 15 150.000 € 16 NaN 17 INCLUIDO 18 INCLUIDO 19 INCLUIDO 20 INCLUIDO Empty DataFrameColumns: [R

In [None]:
for elem in range(len(df)):
  df[elem].to_csv('output.txt', sep='\t', index=False, mode = 'a')

In [None]:
with open('output.txt') as file:
  txt = file.read()

In [None]:
text = txt.replace('\n', ' ').replace('\t', ' ')
text

'Country TSI PD TSI BI Premium PD Premium BI UK 969,970 250,985 0,673 0,327 DE 731,555 141,304 0,340 0,147 PL 519,970 84,895 0,256 0,071 IT 483,032 73,430 0,239 0,070 ES 492,525 46,307 0,236 0,037 NL 452,292 66,468 0,217 0,053 Various Countries 152,436 13,189 0,073 0,016 Total 3.801,780 676,578 2,033 0,722 Country TSI PD TSI BI Premium PD Premium BI DE 825,526 200,196 0,369 0,216 UK 316,164 59,183 0,196 0,118 Various Countries 111,809 29,269 0,074 0,047 Total 1.253,499 288,648 0,639 0,381 '