### Notebook for extracting the summaries and topics from the attacments available on the website [sam.gov](https://sam.gov/:).
### Files needed to run the notebook:
    -- 'AI_contracts.csv'
    -- Downloaded attachments (pdf and word files)
### Files generated from the notebook:
    -- 'text_attachments.json'
    -- 'ai_use_case_topics_attachments.csv' 

In [None]:
# Department dictionary to create deprtment codes
dept_dict={'VETERANS AFFAIRS, DEPARTMENT OF': 'VA',
 'ENERGY, DEPARTMENT OF': 'DOE',
 'NATIONAL AERONAUTICS AND SPACE ADMINISTRATION': 'NASA',
 'HEALTH AND HUMAN SERVICES, DEPARTMENT OF': 'HHS',
 'ENVIRONMENTAL PROTECTION AGENCY': 'EPA',
 'COMMERCE, DEPARTMENT OF': 'DOC',
 'INTERIOR, DEPARTMENT OF THE': 'DOI',
 'TREASURY, DEPARTMENT OF THE': 'TREAS',
 'JUSTICE, DEPARTMENT OF': 'DOJ',
 'GENERAL SERVICES ADMINISTRATION': 'GSA',
 'AGENCY FOR INTERNATIONAL DEVELOPMENT':'USAID',
 'THE LEGISLATIVE BRANCH': 'TLB',
 'TRANSPORTATION, DEPARTMENT OF': 'DOT',
 'EDUCATION, DEPARTMENT OF': 'DOE',
 'AGRICULTURE, DEPARTMENT OF': 'USDA',
 'NUCLEAR REGULATORY COMMISSION': 'NRC'}

In [2]:
# Read the AI_contracts file  
import pandas as pd
df_AI_contracts=pd.read_csv('AI_contracts.csv', index_col='Unnamed: 0')
df_AI_contracts=df_AI_contracts[['Title', 'Sol#', 'Department/Ind.Agency', 'AwardNumber', 'AwardDate', 'PostedDate', 'Award$', 'Awardee', 'Link', 'Description']]
df_AI_contracts[ 'Department_code']=df_AI_contracts[ 'Department/Ind.Agency'].map(dept_dict)
df_AI_contracts.head()

In [3]:
import os
from PyPDF2 import PdfReader
import json
from docx import Document
import pandas as pd

# Function to extract text from the pdf file
def extract_text_from_pdf(pdf_path):
    text_list = []
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        num_pages = len(reader.pages)
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text = page.extract_text()
            text_list.append(text)
    return text_list

# Function to extract text from the word file
def extract_text_from_word(word_path):
    text_list = []
    doc = Document(word_path)
    for paragraph in doc.paragraphs:
        text_list.append(paragraph.text)
    return text_list

# Function to iterate over pdf files to extract text and save to a dict
def read_pdfs_in_folder(pdf_folder_path, data):
    for filename in os.listdir(pdf_folder_path):
        if filename.endswith('.pdf'):
            file_path = os.path.join(pdf_folder_path, filename)
            text_list = extract_text_from_pdf(file_path)
            data[filename] = text_list
# Function to iterate over word files to extract text and save to a dict
def read_words_in_folder(word_folder_path, data):
    for filename in os.listdir(word_folder_path):
        if filename.endswith('.docx'):
            file_path = os.path.join(word_folder_path, filename)
            text_list = extract_text_from_word(file_path)
            data[filename] = text_list

def read_files_in_folders(pdf_folder_path, word_folder_path, output_json_path):
    data = {}
    read_pdfs_in_folder(pdf_folder_path, data)
    read_words_in_folder(word_folder_path, data)
    
    with open(output_json_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)


current_working_directory = os.getcwd()
pdf_folder_path = os.path.join(current_working_directory, "docs", "pdfs")
word_folder_path = os.path.join(current_working_directory, "docs", "word")
output_json_path = 'text_attachments.json'

# Read files and save the extracted text to the JSON file
read_files_in_folders(pdf_folder_path, word_folder_path, output_json_path)
print(f"Extracted text from files saved to {output_json_path}")

In [4]:
import json
import pandas as pd

# create a datframe from .json file
def create_dataframe_from_json(json_path):
    with open(json_path, 'r') as json_file:
        data = json.load(json_file)
    df = pd.DataFrame(list(data.items()), columns=['Filename', 'Text'])
    return df

json_path='text_attachments.json'
df_AI_contracts_attach = create_dataframe_from_json('text_attachments.json')
df_AI_contracts_attach['no_pages']=df_AI_contracts_attach['Text'].apply(len)
df_AI_contracts_attach.head()

Unnamed: 0,Filename,Text,no_pages
0,1.JL-2672-110923Combined_1.29.24-compressed-v1...,[ \nJet Propulsion Laboratory \nCalifornia In...,496
1,1.JL-2672-110923RFP_noMsnDes-1.29.24-compresse...,[ \nJet Propulsion Laboratory \nCalifornia In...,47
2,10.1MarsHandbook2022-2040-compressed.pdf,[© 2023 California Institute of Technology \n...,449
3,2022.10.28RFI-2022-10-07-MartinQandA.pdf,[ \n \n \nTel: (509) 372- 4071 \nLaurie.Mar...,1
4,36C25724Q0141A00002-CANCELATIONOFREQUIREMENT.pdf,[ \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n...,2


In [5]:
# Preprocessing the extracted text from attachments using NLTK library
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

df_AI_contracts_attach['Text_proc_lst']=df_AI_contracts_attach['Text'].apply(lambda lst:  [preprocess_text(ele) for ele in lst])

In [6]:
pd.set_option('display.max_colwidth', 100)
df_AI_contracts_attach.head()

Unnamed: 0,Filename,Text,no_pages,Text_proc_lst
0,1.JL-2672-110923Combined_1.29.24-compressed-v10-and-later.pdf,[ \nJet Propulsion Laboratory \nCalifornia Institute of Technology \n4800 Oak Grove Drive \nP...,496,[jet propulsion laboratory california institute technology oak grove drive pasadena ca date janu...
1,1.JL-2672-110923RFP_noMsnDes-1.29.24-compressed.pdf,[ \nJet Propulsion Laboratory \nCalifornia Institute of Technology \n4800 Oak Grove Drive \nP...,47,[jet propulsion laboratory california institute technology oak grove drive pasadena ca date janu...
2,10.1MarsHandbook2022-2040-compressed.pdf,[© 2023 California Institute of Technology \nGovernment Support Acknowledged MARS MISSION \nD...,449,[california institute technology government support acknowledged mars mission design handbook je...
3,2022.10.28RFI-2022-10-07-MartinQandA.pdf,"[ \n \n \nTel: (509) 372- 4071 \nLaurie.Martin@p nnl.gov \nOctober 28, 2022 RFI-2022-10- 07...",1,[tel p october artin design build integrate deliver automated platform soil extractions analyses...
4,36C25724Q0141A00002-CANCELATIONOFREQUIREMENT.pdf,[ \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n...,2,[project number applicable code administered number code issued name address contractor req numb...


In [7]:
# df_AI_contracts_attach.Text_proc_lst.iloc[0]
#df_AI_contracts_attach.Text.iloc[100]

In [8]:
# keywords to check in the extracted text
elements_to_check = ['Scope of Work', 'SOW', 'Statement of Objectives', 'Performance Work Statement', 'PWS', 'Scope Work', 'Statement Objectives', 'Statement of Objective','STATEMENT OF WORK', 'STATEMENT WORK']

In [9]:
# Function to check keywords in the extracted text
def check_presence(lst,elements_to_check):
    idxs=[]
    elements_to_check = [ele.lower() for ele in elements_to_check]
    for idx, txt_strng in enumerate(lst):
        for element in elements_to_check:
            if element in txt_strng:
                # print(idx, txt_strng)
                idxs.append(idx)
    return idxs

In [10]:
# Applying the function to extract indices of the pdf pages
pd.set_option('display.max_rows', 100)
df_AI_contracts_attach['Text_proc_lst_index']=df_AI_contracts_attach['Text_proc_lst'].apply(lambda x: check_presence(x, elements_to_check))

In [11]:
import itertools
def extract_elements(row):
    final_text=[]
    elements = row['Text_proc_lst']
    indices = row['Text_proc_lst_index']
    for i in indices:
        final_text.append(elements[i:i+5])
    flattened_list = list(itertools.chain.from_iterable(final_text))
    unique_elements = set(flattened_list)
    unique_elements_list = list(unique_elements)
    return ' '.join(unique_elements_list)

# Apply the function to each row
df_AI_contracts_attach['relevant_text_pages_preproc'] = df_AI_contracts_attach.apply(extract_elements, axis=1)


In [12]:
import itertools
def extract_elements(row):
    final_text=[]
    elements = row['Text']
    indices = row['Text_proc_lst_index']
    for i in indices:
        final_text.append(elements[i:i+5])
    flattened_list = list(itertools.chain.from_iterable(final_text))
    unique_elements = set(flattened_list)
    unique_elements_list = list(unique_elements)
    return ' '.join(unique_elements_list)

# Apply the function to each row
# df_AI_contracts_attach['extracted_elements'] = df_AI_contracts_attach.apply(extract_elements, axis=1)
df_AI_contracts_attach['relevant_text_pages_No_preproc'] = df_AI_contracts_attach.apply(extract_elements, axis=1)

In [13]:
df_AI_contracts_attach_final = df_AI_contracts_attach[df_AI_contracts_attach['relevant_text_pages_preproc']!=''].copy()

In [14]:
df_AI_contracts_attach_final.drop(['Text_proc_lst', 'Text_proc_lst_index'], axis=1, inplace=True)

In [15]:
df_AI_contracts_attach_final

Unnamed: 0,Filename,Text,no_pages,relevant_text_pages_preproc,relevant_text_pages_No_preproc
0,1.JL-2672-110923Combined_1.29.24-compressed-v10-and-later.pdf,[ \nJet Propulsion Laboratory \nCalifornia Institute of Technology \n4800 Oak Grove Drive \nP...,496,mars program office mars mission design handbook novembe r jpl release signature page prepared ...,"RFP No.: JL -2672-110923 \nJanuary 2 9,2024 \n2 1.3 Programmatic Considerations \n \n1.3.1 Di..."
1,1.JL-2672-110923RFP_noMsnDes-1.29.24-compressed.pdf,[ \nJet Propulsion Laboratory \nCalifornia Institute of Technology \n4800 Oak Grove Drive \nP...,47,rfp jl january programmatic considerations discuss whether proposed service sufficiently mature ...,"RFP No. JL -2672 -110923 \nExhibit V, XV \nFlight Software: Software specifically design ed t..."
2,10.1MarsHandbook2022-2040-compressed.pdf,[© 2023 California Institute of Technology \nGovernment Support Acknowledged MARS MISSION \nD...,449,mars program office mars mission design handbook novembe r jpl release page figure minimum assoc...,"Mars Program Office Mars Mission Design Handbook (2022 -2040) \nNovembe r 6, 202 3 JPL D -106..."
5,6.1_BattellePNNLRFP746072.pdf,"[ \nPage 1 of 26 \n \nBattelle Memorial Institute, Pacific Northwest Division \nActing Under P...",26,request proposal number page attachment rfp following detailed sections provided informational p...,Request for Proposal Number: 746072 \nPage 14 of 26 o Personnel actions resulting in a change ...
6,6.2_BattellePNNLRFP746072_SOW.pdf,[Battelle Pacific Northwest Division \nas operator of Pacific Northwest National Laboratory (PN...,19,battelle pnnl rfp statement work april page reserved references attachments scheduling workflow ...,"Battelle PNNL RFP 746072 \nStatement of Work \nApril 11, 2023 \nPage 8 of 19 rotating or gentl..."
8,6.4_BattellePNNLRFP746072_Hazard_Form.pdf,[Battelle PNNL RFP 746072 \nHazard Form \nPage 1 of 1 \n \nAttention Bidder! Anticipated hazar...,1,battelle pnnl rfp hazard form page attention bidder anticipated hazards identified work based st...,Battelle PNNL RFP 746072 \nHazard Form \nPage 1 of 1 \n \nAttention Bidder! Anticipated hazard...
9,80JSC022R0004NOIS2RFPforOn-Ramping.pdf,[NASA Open Innovation Services 2 (NOIS2) On -Ramping \n80JSC022R0004 \nPage 1 of 74 \n \n \n...,74,nasa open innovation services page accordance far part organizational consultant conflicts inter...,NASA Open Innovation Services 2 (NOIS2) On -Ramping \n80JSC022R0004 \nPage 29 of 74 \n \n SEC...
20,ATTACHMENTB-CONTRACTADMINISTRATION-BNOE-NXEQORTHOPEDICSURGICALROBOTICSSYSTEMS.pdf,[Page 1 of 18 \n SECTION B – CONTRACT ADMINISTRATION / INVOICE INSTRUCTIONS \nB.1 CONTRACT ADM...,18,page comprom ised information considers relevant respect unsecured protected health information ...,"Page 16 of 18 \n \n (2) Description of the event, including: \n \n (a) date of occurrence; \n ..."
23,AttachmentC-StatementofWork.pdf,"[STATEMENT OF WORK \n3D Microscopy, Artificial Intelligence -based Quantification, and Modeling...",12,contract develop conduct three correlative imaging quantitative image analysis cqas image silico...,\n 8. Statement of Work \nThis statement of work (SOW) is to conduct three dimensional (3D) co...
32,BattellePNNLRFP746072Am2_Questions.pdf,"[Batelle PNNL RFP 746072 Amendment 2 \nQues�ons/Answers /Clariﬁca�ons \nJune 26, 202 3 \n \nPa...",3,batelle pnnl rfp amendment june page pnnl response scope limited producing microsolv vials ready...,"Batelle PNNL RFP 746072 Amendment 2 \nQues�ons/Answers /Clariﬁca�ons \nJune 26, 202 3 \n \nPage..."


In [16]:
import ollama
def gen_summaries_llama3(text: str):
    response = ollama.chat(model='llama3', messages=[
        {"role" : "system", "content" : "You are a Summary generator. Generate a summary from the provided text."},
        {"role" : "user", "content" : "Convert the following text into a summary of the form 'Summary: <summary>':"},
        {"role" : "user",  "content" : text}
    ])
    title = response['message']['content']
    return title

In [17]:
# Apply the LLAMA summarizer funtion on the preprocessed text
df_AI_contracts_attach_final['Summary_No_preproc'] = df_AI_contracts_attach_final['relevant_text_pages_No_preproc'].apply(lambda x: gen_summaries_llama3(x))

In [18]:
# Apply the LLAMA summarizer funtion on the text without preprocessing
df_AI_contracts_attach_final['Summary_preproc'] = df_AI_contracts_attach_final['relevant_text_pages_preproc'].apply(lambda x: gen_summaries_llama3(x))

In [19]:
import ollama

def gen_topic_llama3(text: str):
    response = ollama.chat(model='llama3', messages=[
        {"role" : "system", "content" : "You are a topic labeller, I want you to identiy the topic for given text from the list of topics given below?, note just give me the topic name and limit your answer to two tokens"},
        {"role" : "user", "content" : """
                                        Topics:
                                    	1.	Accessibility: using AI for translation / interpretation, section 508 compliance, plain language, or other activities to increase accessibility of documents and interactions with the government
	                                    2.	Policy-making and public engagement: use of AI in any stage of developing regulations or gathering input
                                        3.	Asset management: use of AI to manage both physical and digital assets
                                        4.	Hotlines and service desks: use of AI to triage, respond, and refer to calls, texts, emails
                                        5.	Service / benefits access: use of AI to support determining eligibility for services, streamlining applications, etc.
                                        6.	Program integrity: use of AI to detect potential fraud or other wrong-doing in use of public benefits and services
                                        7.	Case management: use of AI to document and summarize interactions, suggest and enable referrals
                                        8.	Service delivery: use of AI to provide direct services either to the public or to state/local/tribal/territorial governments 
                                        9.	People operations: use of AI for purposes related to recruiting, retaining, and off-boarding employees
                                        10.	Internal operations: administrative use cases for AI, e.g. notetaking, virtual assistants
                                        11.	Other
                                    Identify the topic the text belongs to '<topic_name>':"""},
        {"role" : "user",  "content" : text}
    ])
    Topic = response['message']['content']
    return Topic

In [20]:
# Apply the LLAMA topic labelling funtion on the preprocessed text
df_AI_contracts_attach_final['topics_preproc']=df_AI_contracts_attach_final['relevant_text_pages_preproc'].apply(lambda x:  gen_topic_llama3(x))

In [21]:
# Apply the LLAMA topic labelling funtion on the text without preprocessing
df_AI_contracts_attach_final['topics_no_preproc']=df_AI_contracts_attach_final['relevant_text_pages_No_preproc'].apply(lambda x:  gen_topic_llama3(x))

In [23]:
# save the summaries and topics to csv file
df_AI_contracts_attach_final.to_csv("ai_use_case_topics_attachments.csv")