# In [None]:
import pdfminer.high_level
import docx
import re
import os
import tiktoken
from openai import AzureOpenAI    
import ast
import pandas as pd
import math
from io import StringIO
import json

#### Functions 

# Extract text from PDF documents
def extract_text_from_pdf(pdf_path):
    """Return the text that pdfminer extracts from the file at ``pdf_path``."""
    return pdfminer.high_level.extract_text(pdf_path)

# Extract text from Word documents
def extract_text_from_docx(pdf_path):
    """Return the paragraph text of a Word document as one string.

    ``pdf_path`` (the name is kept for compatibility) is handed to
    ``docx.Document``; the ``text`` of every entry in ``paragraphs`` is then
    joined with newline characters.
    """
    document = docx.Document(pdf_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

def extract_text(pdf_path):
    """Extract the text of a document, choosing the reader by file extension.

    Args:
        pdf_path: Path of the input file. Despite the name, both ``.pdf`` and
            ``.docx`` files are accepted.

    Returns:
        The extracted text, or the literal string
        ``'Document type not supported'`` for any other extension. Callers
        compare against that exact string, so it must not change.
    """
    # Compare case-insensitively so that files such as "ORDER.PDF" or
    # "menu.Docx" are not rejected as unsupported.
    extension = os.path.splitext(pdf_path)[1].lower()
    if extension == '.pdf':
        return extract_text_from_pdf(pdf_path)
    if extension == '.docx':
        return extract_text_from_docx(pdf_path)
    return 'Document type not supported'

#Function to preprocess the extracted text of the last version imported:
def preprocess_text(text):
    """Collapse every run of whitespace in ``text`` into one space.

    Line breaks are whitespace too, so they are flattened in the same pass.
    Leading and trailing whitespace is removed from the result.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()

#Analyse text with LLM
def LLM_extractor(client, text, model="Team_Augmentation_4o_mini"):
    """Ask the chat model to extract the order details contained in ``text``.

    One chat-completion request is sent: a fixed system message holding the
    extraction instructions, followed by ``text`` as the user message. Nothing
    is trained or indexed here; it is a single inference call.

    Args:
        client: Object exposing ``chat.completions.create`` (the script passes
            an ``AzureOpenAI`` client).
        text: Document text to analyse.
        model: Model/deployment name forwarded to the API. Defaults to the
            name that used to be hard-coded, so existing calls are unchanged.

    Returns:
        The ``content`` of the first choice of the response.
    """
    response = client.chat.completions.create(
        # Model/deployment that answers the request
        model=model,
        # System message = instructions, user message = document text
        messages=[
            {
                "role": "system",
                "content": """
Intructions:
1. Based on the given restauran order, extract the most relevant information of the order. 
2. Format the response as a Python dictionary.
3. Ensure that all response is entirely in English, even if the input is in another language.
4. Organice all keys of the dictionary as text with single quotes and avoid doing it as python lists.
5. Adhere to the following structure:
{   'Restaurant': 'Name of the restaurant',
    'Order ID': 'All numerical and number sequence corresponfing to the ID of the order',
    'Food item names': 'ordered item 1, ordered item 2, ordered item 3, ordered item 4',
    'Actions taken': 'action 1, action 2, action 3, actions 4',
    'Time of action': 'date and time the order was made',
    'Total Price': 'total price of the order'
},


"""
            },
            {
                "role": "user",
                "content": text
            }
        ],
        # Sampling parameters of the request
        temperature=0,
        max_tokens=1000,
        frequency_penalty=0,
        presence_penalty=0,
        # NOTE(review): "n/n" looks like a typo for "\n\n"; kept as-is because
        # changing it would change where replies are cut off - confirm intent.
        stop=["n/n"]
    )
    return response.choices[0].message.content

# Preprocess the response from GPT to delete unwanted characters
def clean_response(text):
    """Normalise the model's reply and parse it into a Python object.

    The steps run in a fixed order because later ones rely on earlier ones:
    code fences are removed, every quote-like character becomes a double
    quote, non-ASCII runs become spaces, possessive ``'s`` is restored, and
    the text between the first ``{`` and the next ``}`` is evaluated with
    ``ast.literal_eval``.

    Args:
        text: Raw reply of the model.

    Returns:
        The literal described by the reply (a dictionary for replies that
        follow the requested structure).

    Raises:
        ValueError: If the reply contains no ``{``, or ``literal_eval``
            rejects the extracted text.
        SyntaxError: If the extracted text is not a valid Python literal.
    """
    # Drop Markdown code fences around the answer
    text = text.replace('```python', '').replace('```', '').strip()
    # Unify curly quotes, apostrophes and backticks as double quotes
    text = text.replace("‘", '"').replace("’", '"')
    text = text.replace("'", '"')
    text = text.replace("`", '"')
    # Replace any run of non-ASCII characters with a single space
    text = re.sub(r'[^\x00-\x7f]+', ' ', text)
    # Add an opening quote after these two keys when the value starts with a
    # letter. NOTE(review): neither key is part of the structure requested in
    # this file; presumably inherited from another prompt.
    text = re.sub(r'("High_Level_Impression": )([A-Za-z])', r'\1"\2', text)
    text = re.sub(r'("Location": )([A-Za-z])', r'\1"\2', text)
    # The quote unification above turned possessives (Joe's) into Joe"s
    text = re.sub(r'"s\b', "'s", text)

    pieces = text.split('{')
    if len(pieces) < 2:
        # Previously surfaced as "IndexError: list index out of range"
        raise ValueError('No dictionary found in the model response')
    # Keep only what lies between the first "{" and the following "}"
    body = pieces[1].split('}')[0]
    return ast.literal_eval('{' + body + '}')


# Estimate number of tokens
def count_tokens(prompt: str, model: str = "gpt-3.5-turbo") -> int:
    """Return the number of tokens ``prompt`` encodes to.

    The tokenizer is the one tiktoken associates with ``model``.
    """
    tokenizer = tiktoken.encoding_for_model(model)
    return len(tokenizer.encode(prompt))

# Average calculation
def calculate_mean(data):
    """Return the arithmetic mean of ``data``.

    ``data`` must support ``len``; an empty one raises ``ZeroDivisionError``.
    """
    total = sum(data)
    count = len(data)
    return total / count

# Standard deviation calculation
def calculate_standard_deviation(data):
    """Return the standard deviation of ``data``.

    The variance is the sum of squared deviations from the mean divided by
    ``len(data)`` (not ``len(data) - 1``).
    """
    centre = calculate_mean(data)
    squared_deviations = [(value - centre) ** 2 for value in data]
    return math.sqrt(sum(squared_deviations) / len(data))


# Azure OpenAI connection
# The credentials file is read line by line, in this order: endpoint,
# API key, API version.
with open("Azure_credentials.txt", "r") as file:
    azure_endpoint, api_key, api_version = (
        file.readline().strip() for _ in range(3)
    )

# Client used for the requests to Azure OpenAI
client = AzureOpenAI(
    azure_endpoint=azure_endpoint,
    api_key=api_key,
    api_version=api_version,
)

# Instruction text mirroring the system message sent by LLM_extractor (that
# copy has a trailing comma after the closing brace). It is not sent to the
# model from here: it is only prepended to each document's text to estimate
# the number of input tokens.
prompt = """
Intructions:
1. Based on the given restauran order, extract the most relevant information of the order. 
2. Format the response as a Python dictionary.
3. Ensure that all response is entirely in English, even if the input is in another language.
4. Organice all keys of the dictionary as text with single quotes and avoid doing it as python lists.
5. Adhere to the following structure:
{   'Restaurant': 'Name of the restaurant',
    'Order ID': 'All numerical and number sequence corresponfing to the ID of the order',
    'Food item names': 'ordered item 1, ordered item 2, ordered item 3, ordered item 4',
    'Actions taken': 'action 1, action 2, action 3, actions 4',
    'Time of action': 'date and time the order was made',
    'Total Price': 'total price of the order'
}


"""
# Containers filled while the files are processed
results = []                # parsed model answers, one entry per processed file
not_proccessed = {}         # filename -> reason (message or exception)
tokens_candidate_in = []    # estimated input tokens per document
tokens_candidate_out = []   # estimated output tokens per document

# Folder that holds the input documents
folder_path = 'Input_files'
for filename in os.listdir(folder_path):
    pdf_file_path = os.path.join(folder_path, filename)
    try:
        print(f"Processing file: {filename}")
        cv_text = extract_text(pdf_file_path)

        # Unsupported files are recorded and skipped
        if cv_text == 'Document type not supported':
            print('Document type not supported')
            not_proccessed[filename] = 'Document type not supported'
            continue

        # Flatten whitespace before sending the text to the model
        cv_text_1 = preprocess_text(cv_text)

        # Input estimate: instructions plus document text
        prompt_text = prompt + cv_text_1
        tokens_candidate_in.append(count_tokens(prompt_text, model="gpt-3.5-turbo"))

        # Query the model and estimate the size of its answer
        cv_text_2 = LLM_extractor(client, cv_text_1)
        tokens_candidate_out.append(count_tokens(cv_text_2, model="gpt-3.5-turbo"))

        # Parse the answer and keep it
        text2 = clean_response(cv_text_2)
        results.append(text2)
    except Exception as e:
        # Any failure is reported and stored; the loop moves on to the next file
        print(e)
        not_proccessed[filename] = e

# Estimate mean and standard deviation of the token counts.
# A list is empty when no file reached the corresponding count_tokens call
# (empty folder, unsupported files only, or every request failed). The helpers
# divide by len(data), so guard against ZeroDivisionError and report NaN; this
# keeps the script running through to the export of the results.
if tokens_candidate_in:
    mean_tokens_candidate_in = calculate_mean(tokens_candidate_in)
    std_dev_tokens_candidate_in = calculate_standard_deviation(tokens_candidate_in)
else:
    mean_tokens_candidate_in = std_dev_tokens_candidate_in = math.nan

if tokens_candidate_out:
    mean_tokens_candidate_out = calculate_mean(tokens_candidate_out)
    std_dev_tokens_candidate_out = calculate_standard_deviation(tokens_candidate_out)
else:
    mean_tokens_candidate_out = std_dev_tokens_candidate_out = math.nan

print('Mean number of tokens per document in: {} with a standard deviation of: {}'.format(mean_tokens_candidate_in, std_dev_tokens_candidate_in))
print('Mean number of tokens per document out: {} with a standard deviation of: {}'.format(mean_tokens_candidate_out, std_dev_tokens_candidate_out))

# One row per parsed answer
df = pd.DataFrame(results)
# JSON rendering of the table, kept in memory as a string
json_data = df.to_json(orient="records", indent=4)
# Write the same JSON to disk (the output file carries a .txt extension)
df.to_json("extractor_results.txt", orient="records", indent=4)
# Bare expression: a notebook shows the table when it ends the cell
df