In [30]:
# general impors
import os
import re
from termcolor import colored

# ocr imports
import pytesseract
import cv2 as cv
import pymupdf 
from fitz import open as fitz_open

# db management imports
import pymongo
import requests

# dotenv imports
from dotenv import load_dotenv
load_dotenv()

True

![image.png](attachment:image.png)

-----------------------------------

#### <span style="color:yellow"> all-MiniLM-L6-v2 </span> is a sentence-transformers model. It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.

------------------------------------

#### Set APIs Keys and HuggingFace Tokens

In [31]:
## MongoDB Setup
my_client = os.getenv('MONGO_URI')
client = pymongo.MongoClient(my_client)
db = client.Loaded_Text
collection = db.filesText


## Huggingface API Setup
huggingface_tk = os.getenv('huggingface_tk')
embedding_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"

hf_token = huggingface_tk

In [32]:
directory = "static"
files = [file for file in os.listdir(directory) if file.endswith(".pdf")]


for i, file in enumerate(files):
    print(f"File: {i}")
    print(f"--> {file}\n-------")

File: 0
--> A PlusCal User's Manual - C-Syntax Version 1.8 (31 Aug 2018).pdf
-------
File: 1
--> AEWC Follow up 2019 - Requests to the Management.pdf
-------
File: 2
--> fors_policybrief_genderequality_2010.pdf
-------
File: 3
--> Investigation of Hardware Transactional Memory - 2015 (Andrew-Nguyen-Thesis).pdf
-------
File: 4
--> ssrn-1014233.pdf
-------
File: 5
--> TIE_W05_ReuterTruman.pdf
-------


In [34]:
full_json = dict()

def parse_file(filepath, fileName):
  """ 
  Parse a PDF file and return a dictionary with the text of each page.
  """

  blocks = ""

  pdf = fitz_open(filepath)

  file_json = dict()
  file_json[f"FILE: {fileName}"] = [] 

  for i, page in enumerate(pdf): 
    blocks = page.get_text(sort=True).replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').strip()

    replacements = [("\.+", ""), ("\s+", " ")]
    for pat, repl in replacements:
      blocks = re.sub(pattern=pat, repl=repl, string= blocks)
    
    blocks += blocks

    file_json[f"FILE: {fileName}"] = blocks

  pdf.close()

  return file_json

In [35]:
for file in files:
    filepath = os.path.join(directory, file)
    fileName = file.split(".")[0]

    file_json = parse_file(filepath, fileName)
    full_json.update(file_json)

In [36]:
for parsed in full_json:
    print(parsed)
    print("------------------------")

FILE: A PlusCal User's Manual - C-Syntax Version 1
------------------------
FILE: AEWC Follow up 2019 - Requests to the Management
------------------------
FILE: fors_policybrief_genderequality_2010
------------------------
FILE: Investigation of Hardware Transactional Memory - 2015 (Andrew-Nguyen-Thesis)
------------------------
FILE: ssrn-1014233
------------------------
FILE: TIE_W05_ReuterTruman
------------------------


In [37]:
def generate_embedding(text: str) -> list[float]:

    """ 
    Set up the embedding creation function
    """

    response = requests.post(
        embedding_url, 
        headers={
            "Authorization": f"Bearer {hf_token}"},
            json={"inputs": text, 
            "options":{"wait_for_model":True}}
            )

    if response.status_code != 200:
        raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")

    return response.json()

## Write data onn database

In [32]:
for k,v in full_json.items():
    collection.insert_one({"documentName": k, "text": v})

## Write embedded data on database

In [34]:
# replace data in-place

for doc in collection.find({'text': {"$exists": True}}).limit(50):
    doc['hf_text_embedding'] = generate_embedding(doc['text'])
    collection.replace_one({'_id': doc['_id']}, doc)

-------------------------------

#### check if exists

In [40]:
print(collection.find_one({}, {"pages.hf_text_embedding": 1}))

{'_id': ObjectId('67cf2f5f67455d88f14b1f88')}


------------------------

In [25]:
# query = "___your_keywords_here___"

example_query = "TLC module"


####################################
##  vector search code structure  ##
####################################

# {
#   "$vectorSearch": {
#     "exact": true | false,
#     "filter": {<filter-specification>},
#     "index": "<index-name>",
#     "limit": <number-of-results>,
#     "numCandidates": <number-of-candidates>,
#     "path": "<field-to-search>",
#     "queryVector": [<array-of-numbers>]
#   }
# }

# !!! queryvector is the generated vector from the specific query used. 
# !!! index is the search index created inside MongoDB by the admin.


####################################
####################################

pipeline = [
    {
        "$vectorSearch": {
            "index": "parsedFiles_index",
            "path": "hf_text_embedding",  # e.g., "0.hf_text_embedding"
            "queryVector": generate_embedding(example_query),
            "limit": 4,
            "numCandidates": 100
        }
    }
    
]

results = collection.aggregate(pipeline)

## Where was the word found?

In [26]:
for i, doc in enumerate(results, 1):
    print(f"Documento {i}")
    print(doc['documentName'])
    print("------")

Documento 1
FILE: A PlusCal User's Manual - C-Syntax Version 1
------
Documento 2
FILE: TIE_W05_ReuterTruman
------
Documento 3
FILE: AEWC Follow up 2019 - Requests to the Management
------
Documento 4
FILE: ssrn-1014233
------
