# Importing the libraries

In [1]:
import os
import openai
import fitz
import re
from time import time,sleep
import math
import ast
import numpy as np
import pandas as pd
import tiktoken
import nltk
from openai.embeddings_utils import get_embedding, cosine_similarity

# OpenAI Key

In [2]:
# The key is kept in a separate text file and only its location is recorded here,
# so the secret itself never appears in the notebook source.
API_KEY_FILE = "New API Key.txt"
openai.api_key_path = API_KEY_FILE

# Reading the document

In [3]:
# Open the employee handbook PDF; the summarization cell iterates over its pages.
# NOTE(review): this is a machine-specific absolute path — adjust it before running elsewhere.
HANDBOOK_PATH = r'C:\Users\greashk.USEREADYTECH\Downloads\OpenAI\Question and Answer using OpenAI\USEReady Employee Handbook.pdf'
document = fitz.open(HANDBOOK_PATH)

# Summarization

In [4]:
# Summarizing the entire corpus, one page per completion request

summary_list = []
for page in document:
    text = page.get_text("text")

    # A page with no extractable text would send a content-free prompt and the
    # model would invent a summary for it, so such pages are skipped.
    if not text.strip():
        continue

    # The "Tl;dr:" suffix cues the completion model to produce a short summary.
    prompt = text + "\n Tl;dr:"
    response = openai.Completion.create(
        model = "text-davinci-003",
        prompt = prompt,
        temperature = 0.7,
        max_tokens = 120,
        top_p = 0.9,
        frequency_penalty = 0.0,
        presence_penalty = 1
    )
    summary_list.append(response["choices"][0]["text"].strip())

In [5]:
# Stitch the per-page summaries into one space-separated overview and show it.
summary_text = " ".join(summary_list)
print(summary_text)

The UserReady Employee Handbook contains important information on the company's policies, procedures, and benefits. It provides details on employee rights and responsibilities, workplace safety, compensation and benefits, and other topics that employees should be familiar with. The handbook is designed to help employees understand their roles and obligations within the company. The USEReady Employee Handbook is an important document that outlines the policies and procedures for employees of the company. It covers topics such as employment, immigration law compliance, equal opportunity employment, employee grievances, internal communication, anti-retaliation and whistleblower policies, employment classifications, personnel data changes, expense reimbursement, and termination of employment. It also outlines the company's hours and working conditions. The USEReady Employee Handbook outlines emergency closing procedures, parking rules, safety and security policies, facility access guidelin

# Pre-processing the data

In [6]:
def pre_process(path, max_tokens = 300):
    """Read a PDF, clean its text and split it into sentence-aligned chunks.

    Parameters
    ----------
    path : str
        Location of the PDF file to read.
    max_tokens : int, optional
        Budget used when packing sentences into a chunk, measured with the
        ``cl100k_base`` tokenizer. Defaults to 300.

    Returns
    -------
    list of str
        Non-empty chunks of consecutive sentences, in document order. Chunks
        do not overlap. A single sentence longer than ``max_tokens`` is kept
        whole as its own chunk.
    """
    doc = fitz.open(path)
    try:
        corpus = "".join(page.get_text("text") for page in doc)
    finally:
        # Release the file handle even if text extraction fails.
        doc.close()

    # NOTE(review): newlines are removed without inserting a space, which can fuse
    # the last word of one line with the first word of the next — confirm intended.
    corpus = corpus.replace("\n", "")
    corpus = corpus.replace("\xa0\xa0in\xa0\xa0", " ")
    corpus = corpus.replace("\xa0\xa0\xa09", "")
    corpus = corpus.replace("\xa0\xa0\xa0SEE PROFILE", ", ")
    corpus = corpus.replace('..', '')
    corpus = corpus.replace('USEREADY EMPLOYEE HANDBOOK', '')
    corpus = re.sub(' +', ' ', corpus)
    # Drop the copyright footer together with the characters that follow it
    # (presumably the page number — confirm); two characters first, then one.
    corpus = re.sub(" ©2018 USEReady, All Rights reserved. .{2}", "", corpus)
    corpus = re.sub(" ©2018 USEReady, All Rights reserved. .{1}", "", corpus)

    sentences = nltk.sent_tokenize(corpus)
    tokenizer = tiktoken.get_encoding("cl100k_base")

    return _pack_sentences(sentences, lambda piece: len(tokenizer.encode(piece)), max_tokens)


def _pack_sentences(sentences, count_tokens, max_tokens):
    """Greedily group consecutive sentences, starting a new chunk when the budget would be exceeded."""
    chunks = []
    current = ""

    for sentence in sentences:
        # Close the running chunk when this sentence would push it past the budget.
        # The `current` guard prevents emitting an empty chunk when the very first
        # sentence is already longer than the budget.
        if current and count_tokens(current) + count_tokens(sentence) > max_tokens:
            chunks.append(current.strip())
            current = ""
        current += sentence + " "

    # Flush whatever is left after the last sentence.
    if current.strip():
        chunks.append(current.strip())

    return chunks

# Converting the data to a Dataframe

In [7]:
# In-memory store of already-embedded documents; `dataframe` reads from and writes to it.
embeddings_cache = dict()

In [8]:
def dataframe(path):
    """Return the chunked and embedded representation of a PDF, computing it at most once.

    Parameters
    ----------
    path : str
        Location of the PDF file.

    Returns
    -------
    pandas.DataFrame
        One row per chunk produced by ``pre_process`` with the columns
        ``Context`` (chunk text), ``n_tokens`` (``cl100k_base`` token count)
        and ``embeddings`` (``text-embedding-ada-002`` vector).

    Notes
    -----
    Results are stored in the module-level ``embeddings_cache``. The same
    DataFrame object is returned on every cache hit.
    """
    # Key on the normalised absolute path rather than the bare file name, so two
    # different documents that share a file name cannot be served each other's
    # embeddings, and so the lookup does not depend on the path separator.
    cache_key = os.path.normcase(os.path.abspath(path))
    if cache_key in embeddings_cache:
        return embeddings_cache[cache_key]

    tokenizer = tiktoken.get_encoding("cl100k_base")

    # Generate embeddings for the new file
    df = pd.DataFrame(pre_process(path), columns = ['Context'])
    df['n_tokens'] = df.Context.apply(lambda x: len(tokenizer.encode(x)))
    # Same helper and engine as the query side (search_products), so document and
    # query vectors are produced the same way.
    df['embeddings'] = df.Context.apply(lambda x: get_embedding(x, engine = 'text-embedding-ada-002'))

    # Cache the embeddings for future use
    embeddings_cache[cache_key] = df
    return df

# Text Completion Function

In [9]:
# Creating a function that completes the text for our answer

def gpt3_completion(prompt, engine = 'text-davinci-003', temp = 0.5, top_p = 1.0, tokens = 200, freq_pen = 0.25, pres_pen = 0.0, stop = None):
    """Request a completion for ``prompt`` and return it as a single line of text.

    Parameters
    ----------
    prompt : str
        Text to complete. Characters outside ASCII are dropped before sending.
    engine : str
        Completion engine name.
    temp, top_p, freq_pen, pres_pen : float
        Sampling parameters forwarded as ``temperature``, ``top_p``,
        ``frequency_penalty`` and ``presence_penalty``.
    tokens : int
        Forwarded as ``max_tokens``.
    stop : str or list of str or None
        Forwarded as ``stop``.

    Returns
    -------
    str
        The first choice's text, stripped, with every run of whitespace
        collapsed to one space. If all five attempts fail, a string of the
        form ``"GPT3 error: <exception>"`` is returned instead of raising.
    """
    max_retry = 5
    # NOTE(review): this silently removes every non-ASCII character from the prompt.
    prompt = prompt.encode(encoding = 'ASCII',errors = 'ignore').decode()

    for attempt in range(1, max_retry + 1):
        try:
            response = openai.Completion.create(
                engine = engine,
                prompt = prompt,
                temperature = temp,
                max_tokens = tokens,
                top_p = top_p,
                frequency_penalty = freq_pen,
                presence_penalty = pres_pen,
                stop = stop)
            text = response['choices'][0]['text'].strip()
            # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
            return re.sub(r'\s+', ' ', text)
        except Exception as oops:
            # Callers expect a string in every case, so the final failure is
            # reported in-band rather than raised.
            if attempt >= max_retry:
                return "GPT3 error: %s" % oops
            print('Error communicating with OpenAI:', oops)
            sleep(1)

# Similarity Matching Function between Context and Input

In [10]:
# Creating a function that matches the input query with the given context

def search_products(df2, search_query, n = 5):
    """Rank stored chunks by cosine similarity to a query.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Must contain an ``embeddings`` column holding one vector per row.
        It is not modified.
    search_query : str
        The user's question; embedded with ``text-embedding-ada-002``.
    n : int, optional
        Number of best-matching rows to return. Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``n`` highest-scoring rows in descending order
        of the added ``similarities`` column, re-indexed from 0.
    """
    query_embedding = get_embedding(
        search_query,
        engine = "text-embedding-ada-002"
    )
    scores = df2['embeddings'].apply(lambda vector: cosine_similarity(vector, query_embedding))

    # `assign` works on a copy. The incoming frame may be the object held in the
    # embeddings cache, so it must not pick up scores from one particular query.
    ranked = df2.assign(similarities = scores).sort_values("similarities", ascending = False)

    return ranked.head(n).reset_index(drop = True)

# Question / Answer Function

In [11]:
# Creating a function that answers the question

def answer_question(search_query, path, n_context = 2):
    """Answer a question using the most relevant chunks of a PDF as context.

    Parameters
    ----------
    search_query : str
        The question to answer.
    path : str
        Location of the PDF that should be consulted.
    n_context : int, optional
        How many of the best-matching chunks are placed in the prompt.
        Defaults to 2.

    Returns
    -------
    str
        The completion returned by ``gpt3_completion``, or ``"I don't know"``
        when the document produced no chunks at all.
    """
    ranked = search_products(dataframe(path), search_query, n = max(5, n_context))

    # `head` tolerates documents that yield fewer chunks than requested, where
    # positional lookups such as ranked['Context'][1] would raise KeyError.
    contexts = ranked['Context'].head(n_context).tolist()
    if not contexts:
        # Nothing to ground an answer on; reply the way the prompt asks the model
        # to reply when the context is insufficient, without spending a request.
        return "I don't know"

    # Chunks are stored stripped, so join with a space to avoid fusing the last
    # word of one chunk with the first word of the next.
    content = " ".join(contexts)
    prompt = f"""Answer the question based on the context below. If the question cannot be answered using the information provided answer with "I don't know".\n\nContext: {content}\n\nQuestion: {search_query}\n\nAnswer: """
    return gpt3_completion(prompt)

In [12]:
# Handbook query: personal devices at the office.
answer_question(
    search_query="Can i bring my personal laptop to office?",
    path=r"C:\Users\greashk.USEREADYTECH\Downloads\OpenAI\Question and Answer using OpenAI\USEReady Employee Handbook.pdf",
)

'No, personal property should be used with discretion and USEReady assumes no risk for any loss or damage to personal property.'

In [13]:
# Query against a second document, which goes through the same pipeline.
answer_question(
    search_query='What does screenshot 4 tells us?',
    path=r"C:\Users\greashk.USEREADYTECH\Downloads\OpenAI\Test.pdf",
)

'Screenshot 4 tells us about the Keywords Database used to increase the effectiveness of the text messaging campaigns.'

In [14]:
# Handbook query: sick-leave allowance (the question text contains a typo on purpose or by accident; kept verbatim).
answer_question(
    search_query="How many sick leaves are there per yea?",
    path=r"C:\Users\greashk.USEREADYTECH\Downloads\OpenAI\Question and Answer using OpenAI\USEReady Employee Handbook.pdf",
)

'6 days'

In [15]:
# Handbook query: expense reimbursement for food.
answer_question(
    search_query="Can i reimburse the amount invested on food?",
    path=r"C:\Users\greashk.USEREADYTECH\Downloads\OpenAI\Question and Answer using OpenAI\USEReady Employee Handbook.pdf",
)

'It depends on the purpose of the food. If it was for carrying out company business, then you may be eligible for reimbursement.'

In [16]:
# Handbook query: dress code.
answer_question(
    search_query="How should i be dressed in office?",
    path=r"C:\Users\greashk.USEREADYTECH\Downloads\OpenAI\Question and Answer using OpenAI\USEReady Employee Handbook.pdf",
)

'Generally, employees should wear appropriate clothing, observe high standards of personal hygiene, and dress and groom themselves according to the requirements of their positions. Examples of appropriate workplace attire include slacks, blouses, button-down shirts, khaki pants, and polo shirts.'

In [17]:
# Handbook query: daily attendance.
answer_question(
    search_query="Should I mark attendance daily?",
    path=r"C:\Users\greashk.USEREADYTECH\Downloads\OpenAI\Question and Answer using OpenAI\USEReady Employee Handbook.pdf",
)

'Yes, employees should notify their manager each day of their absence.'

In [18]:
# Handbook query: harassment complaints.
answer_question(
    search_query="What should I do if i was harrassed by my colleague?",
    path=r"C:\Users\greashk.USEREADYTECH\Downloads\OpenAI\Question and Answer using OpenAI\USEReady Employee Handbook.pdf",
)

'You should raise the matter informally with Human Resources. Following initial discussions with Human Resources, you will be asked to choose one of the following options: 1. Decide that no further action is necessary; 2. Discuss the complaint with the alleged harasser; 3. Ask human resources to help resolve the matter through informal and discreet approaches to the individual; or 4. Make a formal complaint using the procedure as detailed above.'

In [19]:
# Handbook query: lost company equipment.
answer_question(
    search_query="What will they do if I lost my company laptop?",
    path=r"C:\Users\greashk.USEREADYTECH\Downloads\OpenAI\Question and Answer using OpenAI\USEReady Employee Handbook.pdf",
)

'They may charge you for the laptop to the extent permitted by Central, state, and local law.'

In [20]:
# Handbook query: cell-phone use during working hours.
answer_question(
    search_query="Can I use my cell phone during office hours?",
    path=r"C:\Users\greashk.USEREADYTECH\Downloads\OpenAI\Question and Answer using OpenAI\USEReady Employee Handbook.pdf",
)

'No, using cell phones during office hours is not allowed.'

In [21]:
#answer_question(search_query = "how many planets are there in our solar system?")

In [22]:
#answer_question(search_query = "what type of car is jeep compass?")

In [23]:
#answer_question(search_query = "who is the ceo of google?")

In [None]:
# Load embeddings from the CSV and parse the `embeddings` column with
# ast.literal_eval (presumably each cell holds the text form of a list of floats — confirm).
df = pd.read_csv('Q-A embeddings.csv').assign(
    embeddings = lambda frame: frame['embeddings'].apply(ast.literal_eval)
)

def extract_text_from_pdfs(pdf_files):
    """Extract the full text of each PDF into a two-column DataFrame.

    Parameters
    ----------
    pdf_files : iterable
        Objects exposing ``read()`` (returning the PDF bytes) and a ``name``
        attribute.

    Returns
    -------
    pandas.DataFrame
        Columns ``file`` (the object's ``name``) and ``text`` (the text of
        all pages concatenated in page order), one row per input, in input
        order. An empty input yields an empty frame with the same columns.
    """
    # Imported here because the function used both names without any import in
    # the notebook. Keeping them local leaves PyPDF2 an optional dependency of
    # this helper only.
    from io import BytesIO
    import PyPDF2

    # Collect plain records and build the frame once: DataFrame.append was removed
    # in pandas 2.0, and growing a frame row by row copies it on every iteration.
    records = []
    for pdf_file in pdf_files:
        with BytesIO(pdf_file.read()) as buffer:
            reader = PyPDF2.PdfReader(buffer)
            # `or ""` guards against a page for which no text is returned.
            text = "".join(page.extract_text() or "" for page in reader.pages)
        records.append({"file": pdf_file.name, "text": text})

    return pd.DataFrame(records, columns = ["file", "text"])

def preprocess_text(text_list, min_words = 10):
    """Drop texts that are too short to be useful.

    Parameters
    ----------
    text_list : iterable of str
        Candidate texts.
    min_words : int, optional
        A text is kept only if it has more than this many words.
        Defaults to 10.

    Returns
    -------
    list of str
        The texts that passed the filter, in their original order.
    """
    # split() with no argument splits on any whitespace run, so repeated spaces,
    # tabs and newlines do not inflate the word count the way split(" ") does.
    return [text for text in text_list if len(text.split()) > min_words]

def remove_short_sentences(df):
    """Filter every entry of the ``sentences`` column through ``preprocess_text``.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    filtered = df["sentences"].apply(preprocess_text)
    df["sentences"] = filtered
    return df