# <span style="color: yellow"> OCR, Embedding, Index Creation, and Testing </span>

In [None]:
# general imports
import os
import re
import pandas as pd
import io
from io import BytesIO
import numpy as np
from termcolor import colored
from PIL import Image, ImageDraw, ImageFont
import tempfile


# ocr imports
import pytesseract
# import pymupdf
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import fitz
from fitz import open as fitz_open
import win32com.client

# API & Embeddings
from openai import AzureOpenAI
import tiktoken

# db management imports
import pymongo
import requests

# dotenv imports
from dotenv import load_dotenv
load_dotenv()

############################################

# Windows account name, interpolated into the Tesseract install path below.
user = '___insert_user_here___'

# Tell pytesseract where the Tesseract executable lives (per-user AppData install).
pytesseract.pytesseract.tesseract_cmd = fr"c:\Users\{user}\AppData\Local\Tesseract-OCR\tesseract.exe"

In [None]:
def clean_text(document):
    """
    Normalize raw extracted text.

    Carriage returns and BEL characters are dropped, underscores become
    spaces, every run of whitespace is collapsed to a single space and the
    result is trimmed on both ends.
    """
    # The first three substitutions are plain literal replacements.
    without_noise = document.replace("\r", "").replace("\x07", "").replace("_", " ")
    collapsed = re.sub(r"\s+", " ", without_noise)
    return collapsed.strip()

###########################################
###########################################

def extract_text_from_doc(temp_path):
    """
    Parse a doc/docx file through Word COM automation.

    Opens ``temp_path`` in a hidden Word instance and reads the text of every
    paragraph. Each paragraph is stripped and terminated with a newline, and
    the paragraphs are joined with a single space.

    The document is closed and Word is asked to quit even when opening or
    reading fails, so an error does not leave the Word instance open.

    Returns:
        The whole document text as one string.
    """
    word = win32com.client.Dispatch("Word.Application")
    doc = None
    try:
        word.Visible = False
        doc = word.Documents.Open(temp_path)

        # Read all the text of the document, one paragraph at a time.
        paragraphs = [para.Range.Text.strip() + "\n" for para in doc.Paragraphs]
    finally:
        try:
            if doc is not None:
                doc.Close()
        finally:
            # Always release the Word instance, even if closing the document failed.
            word.Quit()

    return " ".join(paragraphs)

###########################################
###########################################

def parse_file(temp_path, fileName):
    """
    Parse a PDF or a doc/docx file and return ``{fileName: {'text': <text>}}``.

    1. DOC/DOCX parsing: extract_text_from_doc(temp_path), followed by clean_text.
    2. PDF parsing, page by page:
        -- First try (page.extract_text): get the text layer of the pdf page. If more
           than 20 characters are retrieved, the cleaned text is appended to the output.
        -- Second try (pytesseract.image_to_string): otherwise the page is rendered at
           300 dpi and parsed with OCR, and the OCR result is appended to the output.
           An OCR failure is printed and the page is skipped.

    Args:
        temp_path: path of the local file to read.
        fileName: original file name; selects the parser (by extension) and is
            used as the key of the returned dictionary.
    """
    file_json = {fileName: dict()}

    if fileName.lower().endswith((".doc", ".docx")):
        # The bytes live at temp_path; fileName is only the display name.
        document = extract_text_from_doc(temp_path)
        file_json[fileName]['text'] = clean_text(document)
        return file_json

    document = PdfReader(temp_path)
    page_texts = []

    for i, page in enumerate(document.pages):
        raw_text = page.extract_text() or ""
        page_text = raw_text.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').strip()

        if len(page_text) > 20:
            page_texts.append(clean_text(page_text) + "\n")
            continue

        try:
            # Render only the page that actually needs OCR (pages are 1-based
            # for pdf2image) instead of rasterizing the whole document upfront.
            page_image = convert_from_path(temp_path, dpi=300, first_page=i + 1, last_page=i + 1)[0]
            ocr_text = pytesseract.image_to_string(page_image, lang='ita+eng')
            page_texts.append(ocr_text + "\n")

        except Exception as e:
            # Best effort: report the failing page and keep going.
            print(f"[OCR error {i+1}] {str(e)}")

    file_json[fileName]['text'] = "".join(page_texts)

    return file_json

## Microsoft Graph file selection & Token refresh

In [None]:
def refresh_token():
    """
    Request a new access token with the client-credentials grant and return
    the Authorization header to send with Microsoft Graph requests.

    Returns:
        dict: ``{'Authorization': 'Bearer <access_token>'}``.

    Raises:
        requests.HTTPError: the token endpoint answered with an error status.
        RuntimeError: the response did not contain an ``access_token``.
    """
    token_url = "___token_url___"
    resource_url = "https://graph.microsoft.com/.default"

    response = requests.post(
        token_url,
        data={
            "grant_type": "client_credentials",
            "client_id": "___clientid___",
            "client_secret": "____clientsecret____",
            "scope": resource_url
        },
        timeout=30,  # requests waits forever by default
    )
    response.raise_for_status()

    access_token = response.json().get("access_token")
    if not access_token:
        # Fail here instead of sending 'Bearer None' on every later request.
        raise RuntimeError("Token endpoint response did not contain an access_token")

    headers = {'Authorization': f'Bearer {access_token}'}

    return headers

###########################################
###########################################

def retrieve_id(id, folder_ids, headers):

    """
    Recursive function to find all the files at any depth inside MS Graph.

    Lists the children of drive item ``id``; sub-folders are visited
    recursively and every file found is appended to ``folder_ids`` as
    ``{'id': ..., 'name': ..., 'webUrl': ...}``.

    Args:
        id: drive item id of the folder to scan (the name shadows the builtin
            but is kept for backward compatibility with existing callers).
        folder_ids: list that is extended in place with the files found.
        headers: request headers carrying the Authorization bearer token.

    Raises:
        requests.HTTPError: a listing request answered with an error status.
    """

    driveId = '____drive_id____'

    url = f'https://graph.microsoft.com/v1.0/drives/{driveId}/items/{id}/children/'

    # Graph returns children in pages; keep following @odata.nextLink so that
    # large folders are not silently truncated to the first page.
    while url:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        page = response.json()

        for item in page.get('value', []):

            if 'folder' in item:
                retrieve_id(item['id'], folder_ids, headers)

            elif 'file' in item:
                folder_ids.append({'id': item['id'], 'name': item['name'], 'webUrl': item['webUrl']})

        url = page.get('@odata.nextLink')

In [None]:
# Get an Authorization header for the Graph calls in this cell.
headers = refresh_token()

# Look up the SharePoint site by host name and path and read its Graph site id.
sharepoint_site = "inoffice.sharepoint.com"
site_path = "____sitepath____"
site_info_url = f"https://graph.microsoft.com/v1.0/sites/{sharepoint_site}:{site_path}"
site_info = requests.get(site_info_url, headers=headers).json()
site_id = site_info["id"]

# List the drives of the site. The response is not read anywhere else in the
# visible notebook.
drives_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives"
drives = requests.get(drives_url, headers=headers).json()

# Collect every file found below the parent folder into folder_ids.
folderId = '____folder_id____' # PARENT FOLDER 
folder_ids = list()
retrieve_id(folderId, folder_ids, headers)

### Keep only .pdf, .doc, .docx files

In [30]:
# Map item id -> {'name', 'webUrl'} for the files that can be parsed downstream.
valid = dict()

for dic in folder_ids:
    # Match real extensions: the leading dot matters ('doc' alone would also
    # accept any name that merely ends with those letters).
    if dic['name'].endswith(('.pdf', '.doc', '.docx')):
        valid[dic['id']] = {'name': dic['name'], 'webUrl': dic['webUrl']}

### Retrieve text

In [None]:
# Download every selected file to a temporary file, parse it and collect the
# parsed text (plus the original link) in full_json.
headers = refresh_token()

drive_id = '___driveid___'
full_json = list()

# Number of files to process; use range(len(valid)) to parse all of them.
iterator = [x for x in range(0,4)]

for i, (k,v) in zip(iterator, valid.items()):

    fileName = v['name']
    # splitext returns the real extension including the leading dot (e.g. ".pdf"),
    # which is what NamedTemporaryFile expects as a suffix.
    extension = os.path.splitext(fileName)[1]
    filepath = v['webUrl']

    print(fileName)

    download_file_reference = filepath.replace("https://inoffice.sharepoint.com/sites/AAAA/BBBB/", "")

    download_url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{download_file_reference}:/content"
    response = requests.get(download_url, headers=headers)

    if response.status_code == 200:
        temp_path = None

        try:
            # Write and close the temporary file BEFORE parsing it, so the parsers
            # see the complete, flushed content.
            with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as temp_file:
                temp_path = temp_file.name
                temp_file.write(response.content)

            file_json = parse_file(temp_path, fileName)

            file_json[fileName]['reference_link'] = filepath

            full_json.append(file_json)

        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print("ERRORE:", e)

        finally:
            # delete=False means the file must be removed explicitly.
            if temp_path is not None:
                os.unlink(temp_path)

    else:
        print("Response:", response, response.status_code)

    print("="*25)

### Store the file for testing purposes

In [None]:
import json

# Location of the JSON snapshot of the parsed files.
path = r'___insert_path_here___'

# Persist the parsed files so the OCR/parsing step does not have to be repeated.
with open(path, 'w', encoding='utf-8') as file:
    json.dump(full_json, file)

##

# Reload the snapshot; the following cells work on temp_full_json_import.
with open(path, 'r', encoding='utf-8') as file:
    temp_full_json_import = json.load(file)

In [None]:
from openai import AzureOpenAI
import unicodedata

###

def normalize_unicode(text):
    """Return *text* converted to the NFKC Unicode normalization form."""
    normalized = unicodedata.normalize("NFKC", text)
    return normalized

###

def split_text_smart(text, max_chars=3000):
    """
    Split *text* into chunks of at most ``max_chars`` characters.

    The text is first split on blank lines (paragraphs), and paragraphs are
    packed together while they fit. A paragraph longer than ``max_chars`` is
    split on sentence boundaries (``.``, ``!`` or ``?`` followed by
    whitespace). A single sentence that is still too long is cut into
    fixed-size pieces, so no returned chunk exceeds ``max_chars``.

    Empty or whitespace-only chunks are never returned.

    Args:
        text: the text to split.
        max_chars: maximum length of each chunk; must be positive.

    Returns:
        list[str]: the stripped chunks, in document order.

    Raises:
        ValueError: ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    chunks = []
    current_chunk = ""

    def flush():
        # Emit the pending chunk unless it is empty or whitespace-only.
        pending = current_chunk.strip()
        if pending:
            chunks.append(pending)

    # The capturing group keeps the blank-line separators as parts of their own.
    for part in re.split(r'(\n\n+)', text):
        if len(current_chunk) + len(part) <= max_chars:
            current_chunk += part
            continue

        if len(part) <= max_chars:
            flush()
            current_chunk = part
            continue

        # Oversized paragraph: fall back to sentence granularity.
        for sentence in re.split(r'(?<=[.!?])\s+', part):
            if len(current_chunk) + len(sentence) <= max_chars:
                current_chunk += sentence + " "
                continue

            flush()

            # A sentence with no usable boundary (common in OCR output) is
            # hard-split so the size limit is still honoured.
            while len(sentence) > max_chars:
                piece = sentence[:max_chars].strip()
                if piece:
                    chunks.append(piece)
                sentence = sentence[max_chars:]

            current_chunk = sentence + " "

    flush()

    return chunks


###

# Azure OpenAI client used by process_chunks for the OCR clean-up calls.
client = AzureOpenAI(
    api_key='___apikey___',
    api_version='___apiversion___',
    azure_endpoint='___azureendpoint___'
    )

# Passed as the `model` argument of the chat completion request in process_chunks.
deployment_name = 'gpt-4o'

# User prompt for the OCR clean-up request; {OCR_TEXT} is filled in with
# str.format. The string literal must start on the same line as the
# assignment, otherwise the statement is a syntax error.
PROMPT_TEMPLATE = """
    You will receive text generated by an OCR system. The text may contain symbols, meaningless characters, unreadable fragments, or incomplete sentences.

    Your task is to:

    Remove all text that doesn't make sense or is clearly the result of an OCR error.

    Keep only the readable, coherent, and useful text.

    ** Do not add, complete, or invent anything **. If a sentence is incomplete or incomprehensible, remove it.

    Return only the cleaned text, without comments or explanations.

    Here is the text to clean:
    ---
    {OCR_TEXT}
"""

def clean_output(text):
    """Collapse every run of whitespace in *text* to one space and trim the ends."""
    # Splitting on whitespace and re-joining with single spaces both collapses
    # the runs and drops leading/trailing whitespace.
    return " ".join(text.split())

def process_chunks(chunks, error_lst):
    """
    Send every chunk to the chat model for OCR clean-up and return the
    cleaned pieces joined by single spaces.

    When the request (or the post-processing of its answer) fails for a
    chunk, the original chunk is kept in the output, appended to
    ``error_lst`` and the error is printed.
    """
    system_message = {"role": "system", "content": "You are an assistant that cleans OCR texts by removing unnecessary content. Do not add anything."}
    pieces = []

    for position, chunk in enumerate(chunks, start=1):
        prompt = PROMPT_TEMPLATE.format(OCR_TEXT=chunk)
        try:
            response = client.chat.completions.create(
                model=deployment_name,
                temperature=0,
                max_tokens=4000,
                messages=[system_message, {"role": "user", "content": prompt}],
            )
            pieces.append(clean_output(response.choices[0].message.content))

        except Exception as e:
            # Fall back to the raw chunk so no text is lost.
            error_lst.append(chunk)
            pieces.append(chunk)
            print(f"Errore nel chunk {position}: {e}")

    return " ".join(pieces).strip()

### Text formatting and chunks creation

In [None]:
# Normalize the text of every parsed file, split it into chunks and have the
# chat model clean each chunk. Results are stored back on the file's entry.
for index, nested_dic in enumerate(temp_full_json_import):
    for k, v in nested_dic.items():
        error_lst = list()

        print(k)

        text = v['text']

        v['text'] = normalize_unicode(text)

        # Drop control characters but keep tab, line feed and carriage return:
        # removing line breaks would glue the words around them together and
        # leave split_text_smart without any paragraph boundary to split on.
        v['text'] = re.sub(pattern=r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]', repl='', string=v['text'])

        chunks = split_text_smart(v['text'])

        v['openAI text'] = process_chunks(chunks, error_lst)

        # Chunks whose clean-up request failed (kept unmodified in 'openAI text').
        v['errors'] = error_lst

        print("="*25)

---------------------

## RAG Build

### Refresh client

In [None]:
# Read from the environment; not used anywhere else in the visible notebook.
my_client = os.getenv('myclient')

# Rebind the global `client` to an Azure OpenAI client whose key and endpoint
# come from environment variables.
client = AzureOpenAI(
  api_key = os.getenv("AZURE_OPENAI_API_KEY"),  
  api_version = "apiversion",
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Tokenizer for the embedding model; generate_embeddings uses it to cut the
# text into token windows.
tk_encoding = tiktoken.encoding_for_model("text-embedding-ada-002")

### Generate embeddings

In [92]:
# model = "deployment_name"
def generate_embeddings(tk_encoding, text, n_tokens=8191, token_overlap=30, model="text-embedding-ada-002"):
    """
    Split *text* into overlapping token windows and embed each window.

    Args:
        tk_encoding: tokenizer exposing ``encode`` and ``decode``.
        text: the text to embed.
        n_tokens: maximum number of tokens per window.
        token_overlap: number of tokens shared by two consecutive windows.
        model: embedding model (deployment) name passed to the client.

    Returns:
        tuple[list[str], list]: the decoded text of every window and, at the
        same positions, the embedding returned for it. Both lists are empty
        when *text* has no tokens.

    Raises:
        ValueError: ``n_tokens`` is not positive, or ``token_overlap`` is
            negative or not smaller than ``n_tokens`` (the windows could not
            advance, or would leave gaps).
    """
    if n_tokens <= 0:
        raise ValueError("n_tokens must be a positive integer")
    if not 0 <= token_overlap < n_tokens:
        raise ValueError("token_overlap must satisfy 0 <= token_overlap < n_tokens")

    text_tokens = tk_encoding.encode(text)  # list containing text tokens
    step = n_tokens - token_overlap

    chunked_text = list()
    for start in range(0, len(text_tokens), step):
        chunked_text.append(tk_encoding.decode(text_tokens[start:start + n_tokens]))
        # Once a window reaches the end of the text, a further window would
        # contain only tokens that are already covered by this one.
        if start + n_tokens >= len(text_tokens):
            break

    embeddings = [
        client.embeddings.create(input=chunk, model=model).data[0].embedding
        for chunk in chunked_text
    ]

    return chunked_text, embeddings

### Generate embeddings for every file

In [93]:
# For every parsed file, embed its LLM-cleaned text and store the chunk texts
# and their embedding vectors back on the file's entry.
for i, v in enumerate(temp_full_json_import):
    fileName = list(v.keys())[0]  # the file name is the first key of the entry
    openAI_text = v[fileName]['openAI text']
    chunked_text, embeddings = generate_embeddings(tk_encoding, openAI_text)
    v[fileName]['chunked_text'] = chunked_text
    v[fileName]['embedded_text'] = embeddings

## <span style="color: yellow"> Azure embedded data ingestion </span>

In [137]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient

from azure.search.documents.indexes.models import (
    ComplexField,
    CorsOptions,
    SearchIndex,
    SearchField,
    ScoringProfile,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
)

from azure.search.documents.indexes.models import (
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    VectorSearchProfile,
    HnswAlgorithmConfiguration
)

In [None]:
from azure.core.pipeline.transport import RequestsTransport

# Azure AI Search endpoint and admin key, read from the environment.
azure_service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
azure_key = os.getenv("AZURE_SEARCH_API_KEY")

# Disable SSL certificate verification in the transport.
# NOTE(review): this turns off TLS certificate checks for every call made by
# both clients below — confirm it is really required before using this
# outside a test environment.
transport = RequestsTransport(connection_verify=False)

# Client for index management (create / get index).
azure_client = SearchIndexClient(azure_service_endpoint, AzureKeyCredential(azure_key), transport=transport)

# Client for document upload and queries against "complianceindex".
search_client = SearchClient(
    endpoint=azure_service_endpoint,
    index_name="complianceindex",
    credential=AzureKeyCredential(azure_key),
    # override here
    transport=transport  
)

In [241]:
def index_creation():
    """
    Create the "complianceindex" search index and return the created index.

    Fields:
        FileId        -- document key (one per uploaded chunk).
        webUrl        -- link to the source file (plain field, not a key).
        FileName      -- searchable and filterable file name.
        text          -- searchable and filterable chunk text.
        embedded_text -- 1536-dimension vector searched through the
                         "simple-vector-config" profile (HNSW, cosine metric).
    """
    name = "complianceindex"
    fields = [
        SimpleField(name="FileId", type=SearchFieldDataType.String, key=True),
        # An index must have exactly one key field, and the same webUrl is
        # shared by every chunk of a file, so webUrl cannot be a key.
        SimpleField(name="webUrl", type=SearchFieldDataType.String),
        SearchableField(name="FileName", type=SearchFieldDataType.String, searchable=True, filterable=True),
        SearchableField(name="text", type=SearchFieldDataType.String, searchable=True, filterable=True),
        SearchField(name="embedded_text", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), 
                        searchable=True, 
                        vector_search_dimensions=1536,
                        vector_search_profile_name="simple-vector-config"
                        )
    ]

    vector_search = VectorSearch(
        profiles=[
            VectorSearchProfile(
                name="simple-vector-config",
                algorithm_configuration_name="simple-algorithms-config",
            )
        ],
        algorithms=[
            HnswAlgorithmConfiguration(
                name="simple-algorithms-config",
                kind="hnsw",
                parameters={
                    "m": 4,
                    "efConstruction": 400,
                    "efSearch": 500,
                    "metric": "cosine",
                    },
                )
            ],
        )

    index = SearchIndex(
        name=name,
        fields=fields,
        vector_search=vector_search,
    )

    result = azure_client.create_index(index)

    return result

###

def get_index():
    """Fetch and return the definition of the "complianceindex" index."""
    return azure_client.get_index("complianceindex")

###

def upload_document(document):
    """
    Upload a list of documents to the search index and report the outcome.

    Args:
        document: list of documents (dicts) to upload.

    Returns:
        The per-document results returned by ``search_client.upload_documents``,
        or an empty list when there is nothing to upload (no request is sent).
    """
    if not document:
        print("No documents to upload.")
        return []

    result = search_client.upload_documents(documents=document)

    # Report on every document, not just the first one of the batch.
    failed = [item for item in result if not item.succeeded]
    print("Upload of new document succeeded: {}".format(not failed))
    if failed:
        print(f"{len(failed)} of {len(result)} documents were not indexed.")

    return result

###

def create_structure(accum: int, fileName, webUrl, chunked_text, embedded_text):
    """
    Build the index document for one chunk.

    ``accum`` becomes the string ``FileId``; the other arguments are stored
    unchanged under ``FileName``, ``webUrl``, ``text`` and ``embedded_text``.
    """
    return dict(
        FileId=str(accum),
        FileName=fileName,
        webUrl=webUrl,
        text=chunked_text,
        embedded_text=embedded_text,
    )

### Build the list of documents to upload to the Azure index

In [None]:
# Build one index document per chunk; accum is the running document id
# across all files.
accum = 0
lst_to_upload = list()

for i, v in enumerate(temp_full_json_import):
    fileName = list(v.keys())[0]
    # splitext removes only the final extension, so names that contain other
    # dots (e.g. "report.v2.pdf") are not cut at the first one.
    fileName_noExtension = os.path.splitext(fileName)[0]

    webUrl = v[fileName]['reference_link']
    chunked_text = v[fileName]['chunked_text']

    # A separate index name keeps the outer loop variable `i` intact.
    for j, chunk in enumerate(chunked_text):

        embedded_text = v[fileName]['embedded_text'][j]
        lst_elem = create_structure(accum, fileName_noExtension, webUrl, chunk, embedded_text)
        lst_to_upload.append(lst_elem)
        ##
        print(colored(f"▶ Chunk {accum} created for {fileName}\n***", color="yellow", attrs=["bold"]))
        ##
        accum += 1

### Index creation and document upload

In [None]:
# Create the index, then upload every prepared chunk document to it.
index_creation()
result = upload_document(lst_to_upload)

### Test

In [None]:
def search_documents(query, top_k=3):
    """
    Query the Azure index with the user text and return the ``text`` field
    of the first ``top_k`` results.
    """
    hits = search_client.search(query, top=top_k)

    texts = []
    for hit in hits:
        texts.append(hit['text'])

    return texts

############

# Conversation so far (user and assistant turns), shared across calls.
chat_history = []

def chatbot_with_context(user_input):
    """
    Answer *user_input* using documents retrieved from the search index.

    For testing purposes only; the actual function is inside the python backend.
    The user turn and the model reply are both appended to ``chat_history``.
    """
    # Retrieve the supporting documents first, then record the user turn.
    context = "\n\n".join(search_documents(user_input))

    chat_history.append({"role": "user", "content": user_input})

    system_prompt = f"""
                    You are an AI assistant. Respond only based on the documents below.
                    If an answer is not contained in the documents, say that you are not sure.
                    Documents:
                    {context}
                    """

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": system_prompt}, *chat_history],
        temperature=0.3,
        max_tokens=800
    )

    answer = completion.choices[0].message.content
    chat_history.append({"role": "assistant", "content": answer})
    return answer