In [9]:
import pandas
from textwrap import wrap
from dotenv import load_dotenv, find_dotenv

# Load the .env file located by find_dotenv() so the os.getenv lookups in the
# configuration cell can see its variables. The displayed value is load_dotenv's result.
load_dotenv(find_dotenv())

True

In [10]:
import os
import sys
import json
from langchain.schema import Document
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import LLMChain

In [11]:
# Runtime configuration read from the process environment.
# Each value is None when the corresponding variable is not set.
_env = os.environ
OPENAI_API_KEY = _env.get("OPENAI_API_KEY")
WATCH_DIRECTORY = _env.get("WATCH_DIRECTORY")
OPENAI_ENGINE = _env.get("OPENAI_ENGINE")

In [6]:
def get_pdf_paths(directory):
    """
    Recursively walk `directory` and collect the path of every file whose
    name ends in '.pdf' (case-insensitive).

    Returns a list of paths built with os.path.join(folder, file_name), in
    the order os.walk yields them. The list is empty when nothing matches.
    """
    return [
        os.path.join(folder, file_name)
        for folder, _subfolders, file_names in os.walk(directory)
        for file_name in file_names
        if file_name.lower().endswith('.pdf')
    ]

# Usage: collect every PDF found under the directory named by WATCH_DIRECTORY.
# NOTE(review): the printed list contains absolute local paths (including a
# user name); consider clearing this output before sharing the notebook.
directory = WATCH_DIRECTORY
pdf_paths = get_pdf_paths(directory)
print(pdf_paths)

['C:/Users/koush/Synthia_Anaconda/src/synthia/notebooks/data\\Article 39 - Exempt Officers and Sergeants Modified Duty Program.pdf', 'C:/Users/koush/Synthia_Anaconda/src/synthia/notebooks/data\\Association of Building Mechanical and Electrical Inspectors (ABMEI) MOA.pdf', 'C:/Users/koush/Synthia_Anaconda/src/synthia/notebooks/data\\Association of Engineers and Architects IFPTE Local 21 Unit 43 MOA.pdf', 'C:/Users/koush/Synthia_Anaconda/src/synthia/notebooks/data\\Association of Engineers and Architects IFTPE Local 21 Units 4142 MOA.pdf', 'C:/Users/koush/Synthia_Anaconda/src/synthia/notebooks/data\\Association of Legal Professionals of San Jose (ALP).pdf', 'C:/Users/koush/Synthia_Anaconda/src/synthia/notebooks/data\\Association of Maintenance Supervisory Personnel IFPTE Local 21 (AMSP) MOA.pdf', 'C:/Users/koush/Synthia_Anaconda/src/synthia/notebooks/data\\City Association of Management Personnel IFPTE Local 21 (CAMP) MOA.pdf', 'C:/Users/koush/Synthia_Anaconda/src/synthia/notebooks/data\

In [7]:
# Module-level accumulator that fetch_pages() appends to.
pages = []
def fetch_pages(pdf_paths):
    """
    Load every PDF in `pdf_paths` with PyPDFLoader and append one Document
    per loaded page to the module-level `pages` list.

    Only the page text is copied into the new Document; any other fields of
    the loaded page are not carried over. Returns None.

    NOTE(review): because results accumulate in the global `pages`, calling
    this twice without resetting `pages` stores every page twice.
    """
    for pdf_path in pdf_paths:
        loaded_pages = PyPDFLoader(pdf_path).load()
        pages.extend(
            Document(page_content=loaded.page_content) for loaded in loaded_pages
        )

In [8]:
# Populates the module-level `pages` list. Re-running this cell without first
# re-running the cell that resets `pages = []` appends every page a second time.
fetch_pages(pdf_paths)

In [26]:
# Sanity check: number of page-level Documents accumulated in `pages`.
len(pages)

676

In [27]:
# Spot-check the extracted text of one page; index 100 is an arbitrary sample
# and requires at least 101 loaded pages.
pages[100].page_content

'AEA (Units 41 & 42) MOA  July 1, 2021 – June 30, 2023     Page 8 6.2.7  The parties agree that they have a mu tual i nterest in well -trained Representatives.  \nToward this end, up to four (4) designated  Representative s shall be granted a \nmaximum of eight (8) hours paid release time during each year of this agreement \nto participate in training sessions related to the  provi sions of this agreeme nt, \njointly conducted by the Union and the Office of Employee R elations, accor ding \nto an outline of such training activities to be submitted by the Union and approved \nby the Office of Employee Relations prior to conducting any su ch tra ining \nsessions.  \n \n6.3 Release Time  \n \n6.3.1  Release time from regular City duties shall  be provided to  designated Union \nrepresentatives in accordance with the following provisions.  \n \n6.3.2  Designated Union Representatives .  The following designated Union  \nRepre sentatives shall be e ligible for release time to attend meeting

In [28]:
import tiktoken

# The previous `tiktoken.encoding_for_model("gpt-4")` call discarded its result,
# so it had no effect on `tokenizer`; the encoding is selected explicitly here.
# NOTE(review): confirm cl100k_base matches the model named by OPENAI_ENGINE.
tokenizer = tiktoken.get_encoding("cl100k_base")


def tiktoken_len(text):
    """
    Return the number of tokens `text` encodes to with the module-level
    `tokenizer`.

    `disallowed_special=()` is passed so that text which happens to contain
    special-token markers is encoded rather than rejected.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [29]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Sizes are measured by `tiktoken_len`, i.e. in tokens rather than characters.
CHUNK_SIZE = 400
CHUNK_OVERLAP = 20
# Separator candidates handed to the splitter, coarsest first.
SPLIT_SEPARATORS = ["\n\n", "\n", " ", ""]

text_splitter = RecursiveCharacterTextSplitter(
    separators=SPLIT_SEPARATORS,
    length_function=tiktoken_len,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

In [30]:
# Split every page Document into smaller chunk Documents with `text_splitter`.
pages_chunks = text_splitter.split_documents(pages)

In [31]:
# Sanity check: number of chunks produced by the splitter.
len(pages_chunks)

1579

In [32]:
# Spot-check one chunk. This reads `.page_content`, so it only works while
# `pages_chunks` still holds Document objects (i.e. before it is rebound to strings).
pages_chunks[100].page_content

'ABMEI MOA July 1, 2018 – June 30, 2023      Page 44 Exhibit I  \n \nABMEI SALARY RANGES  \n \nSalary Ranges Effective Fiscal Year 2018 -2019 \n \nJob \nCode  Classification  Steps  Min/Hour  Max/Hour  Min/Annual  Max/Annual  \n3917  Bldg Inspect Combo  FT  9 $39.96  $48.59  $83,116.80  $101,067.20  \n3918  Bldg Inspect Combo  PT  9 $39.96  $48.59  $83,116.80  $101,067.20  \n3915  Bldg Inspect Combo Cert I FT  9 $41.13  $49.97  $85,550.40  $103,937.60  \n3916  Bldg Inspect Combo Cert I PT  9 $41.13  $49.97  $85,550.40  $103,937.60  \n3902  Bldg Inspect Combo Cert II FT  9 $42.23  $51.47  $87,838.40  $107,057.60  \n3903  Bldg Inspect Combo Cert III FT  9 $43.92  $53.52  $91,353.60  $111,321.60  \n3904  Bldg Inspect Combo Cert, Sr FT  9 $46.11  $56.20  $95,908.80  $116,896.00'

In [33]:
def extract_page_contents(object_list):
    """
    Return the text of every item in `object_list` as a list of strings.

    Items exposing a `page_content` attribute contribute that attribute.
    Items that are already plain strings are passed through unchanged, so
    applying the function to its own output is a no-op. This matters in the
    notebook because the result is assigned back to the same variable, and
    re-running that cell used to raise AttributeError on the strings.
    """
    return [
        obj if isinstance(obj, str) else obj.page_content
        for obj in object_list
    ]

# NOTE: this rebinds `pages_chunks` from a list of Document objects to a list of
# plain strings. Earlier cells that read `.page_content` from `pages_chunks`
# will fail if they are re-run after this cell.
pages_chunks = extract_page_contents(pages_chunks)

In [34]:
# Preview only the first few chunk texts; displaying the whole list floods the
# notebook output with every chunk.
pages_chunks[:5]

['Office of Employee Relations  \nExempt Officers’ and Sergeants’ Modified Duty Program  \n                                                                                           \n   Original Effective Date: June 21, 2005  \nRevised Effective Date:  May 11, 2020  \nPage 1 of 6 \n BACKGROUND  \n1. The City and San Jose Police Officers ’ Association (SJPOA) recognize that, despite best \nefforts to promote safety, police officers and sergeants are injured in the line of duty. Such \ninjuries are unfortunate but can be a consequence of police work. The Exempt Officers ’ \nand Sergeants ’ Modified Duty Program (“Program ”) is available to any police officer or \nsergeant that has work -related or non -work related injuries or illnesses which preclude \nhim or her from performing the f ull scope of his or her duties without accommodation.  \n2. The City and SJPOA recognize that police officers and sergeants exist to enforce the law \nand protect public safety. Some essential job duties 

In [37]:
def _reply_means_true(reply):
    """Interpret the model's free-text reply as a boolean verdict.

    The verdict is whichever of the words "true" / "false" appears first in
    the reply (case-insensitive). A reply containing neither counts as False.
    """
    if isinstance(reply, bool):
        return reply
    lowered = str(reply).lower()
    true_at = lowered.find("true")
    false_at = lowered.find("false")
    if true_at == -1:
        return False
    return false_at == -1 or true_at < false_at


def is_useful_chunk(text,llm_object):
    """
    Ask the LLM whether `text` holds enough meaningful content to build a
    question/answer pair from it.

    Parameters:
        text: the chunk text inserted into the prompt.
        llm_object: the LLM passed to LLMChain.

    Returns:
        bool: True when the model's reply says a QA pair can be generated,
        otherwise False. The reply text itself must not be returned: a
        non-empty string such as "False" is truthy, so callers testing
        `if flag:` would keep every chunk.
    """
    llm = llm_object
    determining_template = """
    You need to determine if a piece of text is informative or non-informative elements like the headers, footers etc. and can be used to 
    generate a question and a relevant answer for the question.

    {text}

    Here are some instructions which you should follow in the process of determining if piece of text is informative and useful:
    1. Check if ample meaningful content is available for framing a question and suggesting an ideal answer

    Based on your finding, return a boolean value of True or False. If Question & Answer Pairs can be generated, return true else false.
    """
    determining_prompt = PromptTemplate(input_variables=["text"], template=determining_template)
    # The chain formats the prompt itself, so no separate .format() call is needed.
    determining_chain = LLMChain(llm=llm, prompt=determining_prompt, verbose=True)
    response = determining_chain({"text" : text})
    return _reply_means_true(response["text"])

In [41]:
useful_text_chunks = []
llm = ChatOpenAI(temperature=0,model_name = OPENAI_ENGINE)
for chunk in pages_chunks:
    flag = is_useful_chunk(chunk,llm)
    # Normalise the verdict before testing it. This works whether the helper
    # hands back a bool or the model's reply text; testing reply text directly
    # would keep every chunk, because any non-empty string (even "False") is truthy.
    if str(flag).strip().lower().startswith("true"):
        useful_text_chunks.append(chunk)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You need to determine if a piece of text is informative or non-informative elements like the headers, footers etc. and can be used to 
    generate a question and a relevant answer for the question.

    Office of Employee Relations  
Exempt Officers’ and Sergeants’ Modified Duty Program  
                                                                                           
   Original Effective Date: June 21, 2005  
Revised Effective Date:  May 11, 2020  
Page 1 of 6 
 BACKGROUND  
1. The City and San Jose Police Officers ’ Association (SJPOA) recognize that, despite best 
efforts to promote safety, police officers and sergeants are injured in the line of duty. Such 
injuries are unfortunate but can be a consequence of police work. The Exempt Officers ’ 
and Sergeants ’ Modified Duty Program (“Program ”) is available to any police officer or 
sergeant that has work -related or non -work related

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).



[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You need to determine if a piece of text is informative or non-informative elements like the headers, footers etc. and can be used to 
    generate a question and a relevant answer for the question.

    beyond the control of the employee. Each employee who is granted a leave pursuant 
to the provisions of this Article shall, upon return f rom leave, be entitled to the 
position within the classification held by the employee at the time the leave 
commenced.  
 
13.3  If the position to which an employee would otherwise be entitled pursuant to the 
above has been deleted from the department's budg et during the term of the 
employee's leave of absence, the employee shall, upon return from leave, be entitled 
to a position within the classification held by the employee at the time the leave 
commenced, provided there is either a vacancy in such classifi cation or an employee 

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).



[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You need to determine if a piece of text is informative or non-informative elements like the headers, footers etc. and can be used to 
    generate a question and a relevant answer for the question.

    1547 Buyer I  28.23  34.33  2,258.40  2,746.40   29.08  35.36  2,326.40  2,828.80  
1542 Buyer II  33.98  41.33  2,718.40  3,306.40   35.00  42.57  2,800.00  3,405.60  
1544 Buyer III  39.35  47.93  3,148.00  3,834.40   40.53  49.37  3,242.40  3,949.60  
5116 Chemist  37.87  46.01  3,029.60  3,680.80   39.01  47.39  3,120.80  3,791.20  
5110 Chemist PT  37.87  46.01  3,029.60  3,680.80   39.01  47.39  3,120.80  3,791.20  
8057 Class Instructor PT  16.07  31.81  1,285.60  2,544.80   16.55  32.76  1,324.00  2,620.80  
3938 Code Enforcement Insp I  33.17  40.29  2,653.60  3,223.20   34.17  41.50  2,733.60  3,320.00

    Here are some instructions which you should follow in the 

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).



[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You need to determine if a piece of text is informative or non-informative elements like the headers, footers etc. and can be used to 
    generate a question and a relevant answer for the question.

    POPRA Side Letter — Hiring Incentive and Referral Bonus 
May 24, 2023 
An individual who is newly hired shall be allowed to disclose a maximum of one (1) referral. The 
applicant must provide the referrer’s name on his or her application prior to submitting the 
application for consideration and certify that the referrer referred and/or assisted in recruiting 
them, and that the applicant will not receive any portion of the POPRA Referral Bonus provided 
to them. 
Employees involved in the recruiting and/or or hiring of new Park Rangers (2423) are not 
eligible to receive the POPRA Referral Bonus. 
The POPRA Referral Bonus shall be in lieu of the existing Citywide referral b

In [42]:
# Preview only the first few retained chunks instead of dumping the whole list.
useful_text_chunks[:5]

['Office of Employee Relations  \nExempt Officers’ and Sergeants’ Modified Duty Program  \n                                                                                           \n   Original Effective Date: June 21, 2005  \nRevised Effective Date:  May 11, 2020  \nPage 1 of 6 \n BACKGROUND  \n1. The City and San Jose Police Officers ’ Association (SJPOA) recognize that, despite best \nefforts to promote safety, police officers and sergeants are injured in the line of duty. Such \ninjuries are unfortunate but can be a consequence of police work. The Exempt Officers ’ \nand Sergeants ’ Modified Duty Program (“Program ”) is available to any police officer or \nsergeant that has work -related or non -work related injuries or illnesses which preclude \nhim or her from performing the f ull scope of his or her duties without accommodation.  \n2. The City and SJPOA recognize that police officers and sergeants exist to enforce the law \nand protect public safety. Some essential job duties 

In [43]:
# Number of chunks kept by the usefulness filter. Compare with len(pages_chunks):
# identical counts mean the filter rejected nothing.
len(useful_text_chunks)

1579

In [45]:
def write_json_lines(file_path, qa_pairs):
    """
    Append QA records to `file_path` in JSON Lines form (one JSON value per line).

    Parameters:
        file_path: destination file; it is opened in append mode, so existing
            content is kept.
        qa_pairs: iterable whose entries are either a list/tuple of records
            or a single record (e.g. one dict). A single record is written
            as one line rather than being iterated key by key.

    Every record is terminated by a newline, including the last one, so a
    later append starts on a fresh line instead of being glued to the previous
    record. The file is written as UTF-8 because `ensure_ascii=False` emits
    non-ASCII characters verbatim.
    """
    with open(file_path, 'a', encoding='utf-8') as f:
        for entry in qa_pairs:
            records = entry if isinstance(entry, (list, tuple)) else [entry]
            for obj in records:
                f.write(json.dumps(obj, ensure_ascii=False))
                f.write('\n')

In [50]:
def _parse_qa_response(response_text):
    """Turn one LLM reply into a list of QA groups, each group a list of records.

    The whole reply is parsed as a single JSON document first, which handles
    arrays spread over several lines. If that fails, every non-blank line is
    parsed on its own and lines that are not valid JSON are reported and
    skipped. A parsed value that is not a list is wrapped in a one-element
    list so each group has the same shape.
    """
    stripped = response_text.strip()
    if not stripped:
        return []
    try:
        parsed_values = [json.loads(stripped)]
    except json.JSONDecodeError:
        parsed_values = []
        for line in stripped.split('\n'):
            if not line.strip():
                continue
            try:
                parsed_values.append(json.loads(line))
            except json.JSONDecodeError as e:
                print("Error decoding JSON:", e)
    return [
        value if isinstance(value, list) else [value]
        for value in parsed_values
    ]


def generate_questions_answers(text_chunks,llm_object):
    """
    Ask the LLM to produce question/answer records for every text chunk.

    Parameters:
        text_chunks: iterable of chunk texts; each is inserted into the prompt.
        llm_object: the LLM passed to LLMChain.

    Returns:
        list: one entry per successfully parsed JSON value in the replies;
        each entry is a list of records, i.e. the nested list-of-lists shape
        that write_json_lines iterates.

    The prompt and chain do not depend on the chunk, so they are built once
    rather than once per chunk. The accumulated result is no longer printed
    after every chunk.
    """
    llm = llm_object
    qa_template = """

            Here's the sample content from the MOU signed between

            City of San Jose and one of the bargaining units aavailable in the City:

            {chunk}

            Follow the below instructions very seriouslly to generate each data point (Don't mess up with Array of JSON Objects Structure mentioned below, make no mistakes):

            1. Each Data point should be a JSON Object, with keys <<chat_history>>, <<question>> and <<ideal>>.

            2. Value of the <<chat_history>> should be []

            3. Value of the <<question>> should be

            the question you will be generating from the above piece of information

            provided to you.

            4. The value of the <<ideal>> key should be the answer of the question you'll be adding to

            <<content>> key in the user's object. Value should be detailed and very relevant.

            5. The value of text key you return should be should be a list of JSON Objects


            Generate a couple of JSON Lines Data Points in the following format (Don't return with new line characters.):

            [{{"chat_history":"[]","question": "The first US president was ","ideal": "George Washington"}}]

            """
    qa_prompt = PromptTemplate(input_variables=["chunk"], template=qa_template)
    # The chain formats the prompt itself, so no separate .format() call is needed.
    qa_chain = LLMChain(llm=llm, prompt=qa_prompt, verbose=True)

    qa_pairs = []
    for chunk in text_chunks:
        response = qa_chain({"chunk" : chunk})
        qa_pairs.extend(_parse_qa_response(response['text']))
    return qa_pairs

In [1]:
# Requires the earlier cells to have been run in this kernel: `useful_text_chunks`,
# `generate_questions_answers` and `write_json_lines` are defined above.
text_chunks = useful_text_chunks
llm = ChatOpenAI(temperature=0,model_name = OPENAI_ENGINE)

# Output JSON Lines file that receives the generated QA records.
val_test_file_path = 'data/val_test_dataset.jsonl'
# Create the target folder before the slow generation run, so a missing
# directory cannot make the final write fail after all the LLM calls are done.
os.makedirs(os.path.dirname(val_test_file_path), exist_ok=True)

qa_pairs = generate_questions_answers(text_chunks,llm)
write_json_lines(val_test_file_path, qa_pairs)

NameError: name 'useful_text_chunks' is not defined