In [1]:
import requests
import os
import json
from pprint import pprint
from dotenv import load_dotenv
import azure.storage.blob
from azure.storage.blob import BlobServiceClient, BlobSasPermissions, generate_blob_sas,generate_container_sas,ContainerClient
from azure.storage.blob import ResourceTypes, AccountSasPermissions, generate_account_sas
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, AnalyzeResult
from openai import AzureOpenAI
import markdown
import tiktoken
from langchain.text_splitter import TextSplitter, MarkdownTextSplitter, RecursiveCharacterTextSplitter, PythonCodeTextSplitter

import os
import sys
import datetime
from datetime import datetime, timedelta
import pyodbc


# SharePoint site URL
# site_url = "https://your-sharepoint-site-url"


# The SharePoint folder whose files you want to list
folder_url = "/Shared Documents"
client_id = ""
tenant_id = ""
client_secret = ""

def read_env_file(file_path):
    # Load the .env file
    load_dotenv(dotenv_path=file_path)

    # Get all environment variables
    env_vars = os.environ

    # Return the environment variables
    return env_vars

read_env_file("secrets.env")

AFR_ENDPOINT = os.environ.get("AFR_ENDPOINT")
AFR_API_KEY = os.environ.get("AFR_API_KEY")
AZURE_ACC_NAME = os.environ.get("AZURE_ACC_NAME")

AZURE_PRIMARY_KEY = os.environ.get("AZURE_PRIMARY_KEY")
STORAGE_ACCOUNT_CONTAINER = os.environ.get("STORAGE_ACCOUNT_CONTAINER")
DESTINATION_ACCOUNT_CONTAINER = os.environ.get("DESTINATION_ACCOUNT_CONTAINER")

OPENAI_ENDPOINT = os.environ.get("OPENAI_ENDPOINT")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_EMBEDDING_MODEL = os.environ.get("OPENAI_EMBEDDING_MODEL")


SQL_PASSWORD = os.environ.get("SQL_SECRET")
SQL_SERVER = os.environ.get("SQL_SERVER")
SQL_DB = os.environ.get("SQL_DB")
SQL_USERNAME = os.environ.get("SQL_USERNAME")

TEXT_ANALYTICS_KEY = os.environ.get("TEXT_ANALYTICS_KEY")
TEXT_ANALYTICS_ENDPOINT = os.environ.get("TEXT_ANALYTICS_ENDPOINT")

driver = '{ODBC Driver 17 for SQL Server}'

site_id = ''  # replace with your actual site id
drive_id = ''  # replace with your actual drive id
SENTENCE_ENDINGS = [".", "!", "?"]
WORDS_BREAKS = list(reversed([",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]))



In [3]:
#read files from Azure storage account
def read_container_files():     
     files_to_read = []
    
     storage_account_connection_string = "DefaultEndpointsProtocol=https;AccountName="+AZURE_ACC_NAME+";AccountKey="+AZURE_PRIMARY_KEY+";EndpointSuffix=core.windows.net"
    
     try:
        blob_service_client   = BlobServiceClient.from_connection_string(storage_account_connection_string)   
        
        sas_token = generate_container_sas(
            blob_service_client.account_name,
            account_key=blob_service_client.credential.account_key,
            container_name=STORAGE_ACCOUNT_CONTAINER,
            resource_types=ResourceTypes(object=True),
            permission=AccountSasPermissions(read=True,list=True),
            expiry=datetime.utcnow() + timedelta(hours=2)
        )
       
        this_container_url = "https://"+AZURE_ACC_NAME+".blob.core.windows.net/"+STORAGE_ACCOUNT_CONTAINER+"?"+sas_token
        #print(this_container_url)
               
       
        container = ContainerClient.from_container_url(this_container_url)
        #print(container)
        blob_list = container.list_blobs()   
        #print(blob_list)
     
        #generate SAS token for each file
        for blob in blob_list:                          
            doit = True            
            
            if doit == True:
                sas_token = generate_blob_sas(
                        account_name=AZURE_ACC_NAME,
                        container_name=STORAGE_ACCOUNT_CONTAINER,
                        blob_name=blob.name,
                        account_key=AZURE_PRIMARY_KEY,
                        permission=BlobSasPermissions(read=True),
                        expiry=datetime.utcnow() + timedelta(hours=2)
                    )
                
                filewithsas=  "https://"+AZURE_ACC_NAME+".blob.core.windows.net/"+STORAGE_ACCOUNT_CONTAINER+"/"+blob.name+"?"+sas_token  

                filewithsas = '{"file":"' + filewithsas + '", "file_name":"' + blob.name + '"}'            
                
                # if counter == number_of_blobs:
                #     fileswithsas = fileswithsas[:-1]
            
                files_to_read.append(filewithsas)            
      
        result = '{"files":['+ ','.join(files_to_read) +']}' 
        return result
          
     except:
        exc_tuple = sys.exc_info()        
        errors = [ { "message": "Failure during read_storage_account_files e: " + str(exc_tuple)}]
        print(errors)


In [4]:
def logging(text):
    print(text)

In [5]:
from typing import Callable, List, Dict, Optional, Generator, Tuple, Union


class TokenEstimator(object):
    GPT2_TOKENIZER = tiktoken.get_encoding("gpt2")

    def estimate_tokens(self, text: Union[str, List]) -> int:

        return len(self.GPT2_TOKENIZER.encode(text, allowed_special="all"))

    def construct_tokens_with_size(self, tokens: str, numofTokens: int) -> str:
        newTokens = self.GPT2_TOKENIZER.decode(
            self.GPT2_TOKENIZER.encode(tokens, allowed_special="all")[:numofTokens]
        )
        return newTokens

TOKEN_ESTIMATOR = TokenEstimator()


In [6]:
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
def key_phrase_extraction(text):

    try:

        ta_credential = AzureKeyCredential(TEXT_ANALYTICS_KEY)
        text_analytics_client = TextAnalyticsClient(
             endpoint=TEXT_ANALYTICS_ENDPOINT, 
             credential=ta_credential)   


        documents = [text]

        response = text_analytics_client.extract_key_phrases(documents = documents)[0]
        return ', '.join(response.key_phrases) 

    except Exception as err:
        print("Encountered exception. {}".format(err))
        
#x = key_phrase_extraction("this is sample text for restaurants and dinners as well as chicken and beef")
#print (x)

In [7]:
import pyodbc
import sqlite3
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
#chunk documents and save it to database

def process_file(file_URL,filename,chunck_size=1024):       

    endpoint = AFR_ENDPOINT
    key = AFR_API_KEY

    document_intelligence_client = DocumentIntelligenceClient(endpoint=endpoint, credential=AzureKeyCredential(key))

    if(file_URL != ""):
      print(f"Analyzing form from URL {file_URL}...")
    
      poller = document_intelligence_client.begin_analyze_document(
         "prebuilt-layout", AnalyzeDocumentRequest(url_source=file_URL)
      )    
    
    result = poller.result()     
    process_afr_result(result, filename, file_URL)   



def process_afr_result(result, filename,URL, content_chunk_overlap=100):     
    #print(f"Processing {filename } with {len(result.pages)} pages into database...this might take a few minutes depending on number of pages...")
    chunk_id = 1 
    for page_idx in range(len(result.pages)):
        docs = []
        #pageinfo = result.pages[page_idx]
        #print(f"Processing page {page_idx} of {len(result.pages)}...")
        content_chunk = ""       
        
        for line_idx, line in enumerate(result.pages[page_idx].lines):            
            #print("...Line # {} has text content '{}'".format(line_idx,line.content.encode("utf-8")))
            # Assuming `line.content` is your text
            encoded_content = line.content.encode("utf-8")  # This will give you bytes
            decoded_content = encoded_content.decode("utf-8")  # This will give you string
            # Now you can add it to your content_chunk
            content_chunk += decoded_content + "\n"

            
             
        #now split the chunk        
        content_chunk_size=TOKEN_ESTIMATOR.estimate_tokens(content_chunk)
        content_chunk_size = 1024;
        if content_chunk_size > content_chunk_overlap:
           chunk_list = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                                separators=SENTENCE_ENDINGS + WORDS_BREAKS,
                                chunk_size=content_chunk_size, chunk_overlap=content_chunk_overlap).split_text(content_chunk)
        else:
            chunk_list = [content_chunk]
        
        for chunked_content in chunk_list:
            chunk_size = TOKEN_ESTIMATOR.estimate_tokens(chunked_content)
            #print(f"content {chunked_content} with size {chunk_size}")            
            embeddings = create_embeddings(chunked_content)
            key_phrases = key_phrase_extraction(chunked_content)
            clean_content = chunked_content.replace("'", "''")
            #sql_statement = f"exec dbo.InsertIntoDocuments '{filename}', '{URL}', {chunk_id}, '{clean_content}', 'title...', '{datetime.now()}', '{page_idx}', '{line_idx}', '{embeddings}'"
            #print(sql_statement)
            add_document_to_table(filename, URL, chunk_id, chunked_content, datetime.now(),  page_idx, line_idx, embeddings,key_phrases)
            chunk_id += 1
            # if chunk_id != 1:
            #    break

        # if chunk_id != 1:
        #        break    
        
  
def create_embeddings(text): 

    try:

        if text == "":
            return None
        
        client = AzureOpenAI(
        azure_endpoint = OPENAI_ENDPOINT, 
        api_key=OPENAI_API_KEY,  
        api_version="2024-02-15-preview"
        )

        return client.embeddings.create(input = [text], model=OPENAI_EMBEDDING_MODEL).data[0].embedding
    
    except Exception as e:
        print(f"Error Saving Records for: {e}")
 

def add_document_to_table(document_name, document_url, chunk_id, content,  last_updated,  page_number, line_number, embeddings,key_phrases):
    
    try:

        # Define the connection string                
        connection_string = f"DRIVER={driver};SERVER={SQL_SERVER};DATABASE={SQL_DB};UID={SQL_USERNAME};PWD={SQL_PASSWORD}"
        #print(connection_string)
       
        # Connect to the database
        conn = pyodbc.connect(connection_string)
        #conn = sqlite3.connect(connection_string)

        # Create a new cursor from the connection
        cursor = conn.cursor()
        clean_content = content.replace("'", "''")        
        sql_statement = f"execute dbo.InsertIntoDocuments '{document_name}', '{document_url}', {chunk_id}, '{clean_content}', 'title...', '{last_updated}', '{page_number}', '{line_number}', '{embeddings}', '{key_phrases}'"
        #print(sql_statement)
        cursor.execute(sql_statement)       
        # Execute the stored procedure        
       # cursor.execute("{execute dbo.InsertIntoDocuments (?, ?, ?, ?, ?, ?, ?, ?, ?)}",
       #             document_name.replace("'", " "), document_url, chunk_id, clean_content, 'title...', last_updated,  page_number, line_number, embeddings)

        # Commit the transaction
        conn.commit()

        # Close the connection
        conn.close()

        print("Stored procedure executed successfully.")
    
    except Exception as e:
        print(f"Error Saving Records for: {e}")





URL = "https://ustspdevpocdatalake.blob.core.windows.net/nasa-files/NEJMoa2404881.pdf?sp=racw&st=2024-07-16T17:25:40Z&se=2024-08-11T01:25:40Z&sv=2022-11-02&sr=b&sig=yltXC3evyeG9M6m%2Bh%2Fdp4AyKNlAGwqUnKIOubDwQdsk%3D"
file_name = "NEJMoa2404881.pdf"
process_file(URL,file_name)


Analyzing form from URL https://ustspdevpocdatalake.blob.core.windows.net/nasa-files/NEJMoa2404881.pdf?sp=racw&st=2024-07-16T17:25:40Z&se=2024-08-11T01:25:40Z&sv=2022-11-02&sr=b&sig=yltXC3evyeG9M6m%2Bh%2Fdp4AyKNlAGwqUnKIOubDwQdsk%3D...
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stored procedure executed successfully.
Stor

In [13]:
#now let's read all the darn files
files = read_container_files()
print(files)

data = json.loads(files)

for file_info in data['files']:
    print('File URL:', file_info['file'])
    print('File Name:', file_info['file_name'])
    process_file(file_info['file'],file_info['file_name'])



{"files":[{"file":"https://useducsapocdatalake.blob.core.windows.net/nasa-files/19710011851.pdf?se=2024-05-09T21%3A16%3A27Z&sp=r&sv=2023-11-03&sr=b&sig=3R2HZ7a8jK97Eh6ggn0tEwVHY1/bbe0jjrGAyXBzb1c%3D", "file_name":"19710011851.pdf"},{"file":"https://useducsapocdatalake.blob.core.windows.net/nasa-files/19720008279.pdf?se=2024-05-09T21%3A16%3A27Z&sp=r&sv=2023-11-03&sr=b&sig=whLtTxGgwWYn8DdLSOFDoKqAD9zHMBM8s9oLQEk6nvU%3D", "file_name":"19720008279.pdf"},{"file":"https://useducsapocdatalake.blob.core.windows.net/nasa-files/19730024009.pdf?se=2024-05-09T21%3A16%3A27Z&sp=r&sv=2023-11-03&sr=b&sig=1fnJYNNW7DHxLS04TaR4UwFXRBnQZU9FpC%2BU7ZyUnZc%3D", "file_name":"19730024009.pdf"},{"file":"https://useducsapocdatalake.blob.core.windows.net/nasa-files/19750009792.pdf?se=2024-05-09T21%3A16%3A27Z&sp=r&sv=2023-11-03&sr=b&sig=hxGgg22SasoxKDgYQZiS4gJUuVNRmAoWRpaQBZYn0M4%3D", "file_name":"19750009792.pdf"},{"file":"https://useducsapocdatalake.blob.core.windows.net/nasa-files/19780006038.pdf?se=2024-05-09T