In [1]:
import google.generativeai as palm
from qdrant_client import QdrantClient
import os
import chromadb, pickle
from chromadb.api.types import Documents, Embeddings
from google.generativeai.types import HarmCategory
from google.generativeai.types import HarmBlockThreshold


In [2]:
# Take the API key from the environment rather than writing it into the
# notebook, so the secret cannot leak through version control or shared copies.
# A missing variable raises KeyError here instead of failing later on the
# first API call.
palm.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [3]:
# Keep only the models that list 'embedText' among their generation methods.
models = list(filter(
    lambda m: 'embedText' in m.supported_generation_methods,
    palm.list_models(),
))
# The first match is the embedding model used by the rest of the notebook.
model = models[0]
models

[Model(name='models/embedding-gecko-001', base_model_id='', version='001', display_name='Embedding Gecko', description='Obtain a distributed representation of a text.', input_token_limit=1024, output_token_limit=1, supported_generation_methods=['embedText', 'countTextTokens'], temperature=None, top_p=None, top_k=None)]

In [4]:
def split_string_into_chunks_bytes(string, chunk_size=9500):
  """
  Splits a string into chunks whose UTF-8 encoding is at most chunk_size bytes.

  The split is computed on the encoded bytes but always lands on a character
  boundary, so no multi-byte character is cut in half and joining the chunks
  gives back the original string. A single character whose encoding is wider
  than chunk_size is emitted as its own (oversized) chunk.

  Args:
      string: The string to split.
      chunk_size: The maximum size of each chunk in bytes (must be >= 1).

  Returns:
      A list of strings, where each string is a chunk of the original string.
      An empty input yields an empty list.

  Raises:
      ValueError: If chunk_size is smaller than 1.
  """
  if chunk_size < 1:
    raise ValueError("chunk_size must be at least 1 byte")

  data = string.encode('utf-8')
  total = len(data)
  chunks = []
  start = 0
  while start < total:
    end = min(start + chunk_size, total)
    # UTF-8 continuation bytes have the form 0b10xxxxxx; if the cut would land
    # on one, back up to the first byte of that character.
    while start < end < total and (data[end] & 0xC0) == 0x80:
      end -= 1
    if end == start:
      # The character at `start` is wider than chunk_size: emit it whole
      # rather than looping forever or producing invalid UTF-8.
      end = start + 1
      while end < total and (data[end] & 0xC0) == 0x80:
        end += 1
    chunks.append(data[start:end].decode('utf-8'))
    start = end
  return chunks

# Read every file in parsedData and collect embedding-sized pieces in `text`.
text = []
files = os.listdir('parsedData')
for filename in files:
    path = os.path.join('parsedData', filename)
    # The context manager closes the handle as soon as the file is read,
    # instead of leaving one open descriptor per document.
    with open(path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    # 9500 bytes is also the default chunk_size of split_string_into_chunks_bytes.
    if len(contents.encode('utf-8')) > 9500:
        text.extend(split_string_into_chunks_bytes(contents))
    else:
        text.append(contents)

print(len(text))

661


In [12]:
# Sanity check: report any chunk whose UTF-8 encoding is larger than 10000 bytes.
for piece in text:
    encoded_length = len(piece.encode('utf-8'))
    if encoded_length > 10000:
        print("here")
# Peek at the first two chunks.
print(text[:2])
# print(len(split_string_into_chunks_bytes(text[132])[9].encode("utf-8")))


["title\n1 Dead, 1 Seriously Injured in Accident at Bridge Construction Site\ndescription\n1 Dead, 1 Seriously Injured in Accident at Bridge Construction Site\nHaverhill, MA\n- Officials from the federal Occupational Safety and Health Administration (OSHA) are investigating an accident at a bridge construction site on Interstate 495 in Haverhill, on Wednesday.\nAccording to the\nBoston Globe\n,\xa0 one worker died and a second was seriously injured after they fell from an aerial platform onto a barge in the Merrimack River, in Haverhill. Shortly before 10 a.m., the two men fell 40 to 50 feet from a telescopic boom. According to the Essex district attorney’s office, the first worker, a 44-year old man from Manchester was pronounced dead at the scene. The second worker, a 46-year old man, was transported via a medical helicopter to Lahey Hospital in Burlington.\nInvestigators from OSHA responded to the scene, and have yet to determine whether mechanical problems or operator error were to

In [7]:
# Number of times embed_function has been called so far.
count = 0


def embed_function(texts: Documents) -> Embeddings:
  """
  Embeds each text with the PaLM embedding model held in the global `model`.

  Prints the current value of the global `count` and then increments it, once
  per call.

  Args:
      texts: The documents to embed.

  Returns:
      A list with one embedding per input text, in input order.
  """
  global count
  all_embeddings = [
    palm.generate_embeddings(model, doc)['embedding']
    for doc in texts
  ]
  print(count)
  count += 1
  return all_embeddings


def create_chroma_db(documents, name):
  """
  Builds (or reopens) a persistent Chroma collection and stores the documents in it.

  Args:
      documents: Iterable of documents; each one is stored under its position
          in the iterable, converted to a string id.
      name: Name of the Chroma collection.

  Returns:
      The Chroma collection holding the documents.
  """
  chroma_client = chromadb.PersistentClient(path="./embeddings1/")
  # The client persists to disk, so on a re-run the collection already exists.
  # get_or_create_collection reopens it, where create_collection would raise.
  db = chroma_client.get_or_create_collection(name=name, embedding_function=embed_function)
  for i, d in enumerate(documents):
    print(str(i) + " " + str(d))
    # upsert replaces an id left over from an earlier run instead of
    # colliding with it.
    db.upsert(
      documents=d,
      ids=str(i)
    )
  return db



In [9]:
# Build the collection from the first ten chunks only.
db = create_chroma_db(documents=text[:10], name="palm_hazwoper_test1")
# with open("embeddings/hazwoperpalm(chromadb).pkl","wb") as f:
#     pickle.dump(db,f)

0 title
1 Dead, 1 Seriously Injured in Accident at Bridge Construction Site
description
1 Dead, 1 Seriously Injured in Accident at Bridge Construction Site
Haverhill, MA
- Officials from the federal Occupational Safety and Health Administration (OSHA) are investigating an accident at a bridge construction site on Interstate 495 in Haverhill, on Wednesday.
According to the
Boston Globe
,  one worker died and a second was seriously injured after they fell from an aerial platform onto a barge in the Merrimack River, in Haverhill. Shortly before 10 a.m., the two men fell 40 to 50 feet from a telescopic boom. According to the Essex district attorney’s office, the first worker, a 44-year old man from Manchester was pronounced dead at the scene. The second worker, a 46-year old man, was transported via a medical helicopter to Lahey Hospital in Burlington.
Investigators from OSHA responded to the scene, and have yet to determine whether mechanical problems or operator error were to blame for t

In [13]:
def get_relevant_passage(query, db):
  """
  Returns the text of the single best match for `query` in `db`.

  Args:
      query: The query text.
      db: Object exposing `query(query_texts=..., n_results=...)`.

  Returns:
      The first document of the first (and only) query's result list.
  """
  results = db.query(query_texts=[query], n_results=1)
  # Indexed first by query (only one was sent), then by rank within that query.
  hits_for_query = results['documents'][0]
  return hits_for_query[0]

In [16]:
# Run a similarity search against the collection and show the best passage.
ans = get_relevant_passage(query="Tell me about Toxicology?", db=db)
ans

663


'title\nWhat is Toxicology?\ndescription\nWhat is Toxicology?\nMost people will relate the word\ntoxicology\nto popular medical dramas and crime shows on television. Dr. Richard Weber’s case of cobalt poisoning in the season 16 finale of\nGrey’s Anatomy\nis a good example of medical toxicology. While shows such as\nCSI, NCIS,\nand\nBones\ndepend on\nforensic toxicology\nto solve their crime-scene cases. Some of us may have also heard the word used in science laboratories while in high school. But what exactly does\ntoxicology\nmean? The Merriam-Webster dictionary defines\ntoxicology\nas “a science that deals with poisons and their effect and with the problems involved (such as clinical, industrial, or legal problems)”. The National Institute of Environmental Sciences (\nNIEHS\n) provides a more comprehensive understanding of toxicology:\nToxicology is a field of science that helps us understand the harmful effects that chemicals, substances, or situations, can have on people, animals, 

In [17]:
def make_prompt(query, relevant):
  """
  Builds the support-agent prompt sent to the text model.

  Args:
      query: The user's question; inserted as given.
      relevant: Context passage. ASCII single and double quotes are removed
          and newlines become spaces before it is inserted.

  Returns:
      The formatted prompt string.
  """
  # Single-pass equivalent of dropping ' and " and turning newlines into spaces.
  cleanup = str.maketrans({"'": None, '"': None, "\n": " "})
  template = """You are a customer support agent for the company "Hazwoper-osha", \
    do not play any other role. Use the following pieces of context to answer the question at the end. \
    If you don't know the answer, just say "I can only answer Hazwoper-OSHA related questions, please contact support \
    if you need further assistance", don't try to make up an answer. Use five sentences maximum.
  
  Context: {relevant}
Question: {query}
Kindly Answer Question:"""
  return template.format(query=query, relevant=relevant.translate(cleanup))

In [18]:
# Keep only the models that list 'generateText' among their generation methods.
text_models = list(filter(
    lambda m: 'generateText' in m.supported_generation_methods,
    palm.list_models(),
))
# The first match is the text-generation model used for answering.
text_model = text_models[0]
text_model

Model(name='models/text-bison-001', base_model_id='', version='001', display_name='Text Bison', description='Model targeted for text generation.', input_token_limit=8196, output_token_limit=1024, supported_generation_methods=['generateText', 'countTextTokens', 'createTunedTextModel'], temperature=0.7, top_p=0.95, top_k=40)

In [19]:
def answer(model, query, db, temperature=0.01):
  """
  Answers `query` from the best-matching passage in `db`.

  Looks up the most relevant passage, builds the prompt from it, prints both
  the passage and the prompt, and asks the text model for three candidates.

  Args:
      model: Text-generation model passed to palm.generate_text.
      query: The user's question.
      db: Collection searched by get_relevant_passage.
      temperature: Sampling temperature forwarded to palm.generate_text.

  Returns:
      The object returned by palm.generate_text.
  """
  passage = get_relevant_passage(query, db)
  print("Passage: ", passage)
  prompt = make_prompt(query, passage)
  print("Prompt: ", prompt)

  # Every listed harm category gets the same BLOCK_NONE threshold.
  harm_categories = (
    HarmCategory.HARM_CATEGORY_DEROGATORY,
    HarmCategory.HARM_CATEGORY_TOXICITY,
    HarmCategory.HARM_CATEGORY_SEXUAL,
    HarmCategory.HARM_CATEGORY_MEDICAL,
    HarmCategory.HARM_CATEGORY_DANGEROUS,
    HarmCategory.HARM_CATEGORY_VIOLENCE,
  )
  safety_settings = [
    {"category": category, "threshold": HarmBlockThreshold.BLOCK_NONE}
    for category in harm_categories
  ]

  completion = palm.generate_text(
    prompt=prompt,
    model=model,
    candidate_count=3,
    temperature=temperature,
    max_output_tokens=1000,
    safety_settings=safety_settings,
  )
  return completion

In [20]:
temperature = 0.65
query = "Tell me about Toxicology"
# Pass the temperature explicitly: without it answer() falls back to its
# 0.01 default and the value assigned above is never used.
ans = answer(text_model, query, db, temperature=temperature)

664
Passage:  title
What is Toxicology?
description
What is Toxicology?
Most people will relate the word
toxicology
to popular medical dramas and crime shows on television. Dr. Richard Weber’s case of cobalt poisoning in the season 16 finale of
Grey’s Anatomy
is a good example of medical toxicology. While shows such as
CSI, NCIS,
and
Bones
depend on
forensic toxicology
to solve their crime-scene cases. Some of us may have also heard the word used in science laboratories while in high school. But what exactly does
toxicology
mean? The Merriam-Webster dictionary defines
toxicology
as “a science that deals with poisons and their effect and with the problems involved (such as clinical, industrial, or legal problems)”. The National Institute of Environmental Sciences (
NIEHS
) provides a more comprehensive understanding of toxicology:
Toxicology is a field of science that helps us understand the harmful effects that chemicals, substances, or situations, can have on people, animals, and the 

In [21]:
# Show the 'output' field of the first candidate returned by answer().
ans.candidates[0]['output']

'Tell me about Toxicology\n\nToxicology is the study of the harmful effects that chemicals, substances, or situations, can have on people, animals, and the environment.\n\nToxicology is intrinsically related to occupational diseases. Many workers develop occupational diseases due to long-term exposure to chemicals, radioactive materials, and other harmful substances.\n\nOSHA has identified the importance of toxicology in the workplace and enforces safety standards to be followed by employers. These requirements range from the number of chemicals and other hazardous substances that workers can be exposed to; toxic substance safe limits at a worksite; processes and practices that should be implemented; protective measures and equipment that must be used; the need for periodic toxicological testing and medical surveillance for workers; and the recording of workplace injuries, occupational diseases, and accidents due to the use of hazardous substances.'

In [15]:
# Scratch check: number of bytes in a short ASCII string once UTF-8 encoded.
print(len(bytes("Hello World!\n", 'utf-8')))

13
