In [3]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Input: XML file containing the <topic> entries (id + query) to be answered.
file_path = "data/topics/topics.txt"
# Output: CSV with one row per topic (qid, query, answer); filled in incrementally.
out_path = "data/all_answers_medalpaca13b_prompt.csv"


def load_topics(path):
    """Parse the topics XML file at ``path`` into a DataFrame.

    The root element's direct ``<topic>`` children are read; each one is
    expected to carry an ``<id>`` and a ``<query>`` sub-element. Topics where
    either is missing or has empty text are skipped. If the same id occurs
    more than once, the last query wins.

    Parameters
    ----------
    path : str or path-like
        Location of the XML topics file.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``qid`` (the id text as found in the file) and ``query``
        (the query text with surrounding whitespace stripped).

    Raises
    ------
    OSError
        If the file cannot be opened.
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    """
    # Hand the path to the XML parser instead of decoding the file ourselves:
    # the parser honours the encoding given in the XML declaration / BOM,
    # whereas a text-mode open() depends on the platform's default encoding.
    root = ET.parse(path).getroot()

    topic_dict = {}
    for topic in root.findall("topic"):
        topic_id = topic.findtext("id")
        topic_query = topic.findtext("query")
        if topic_id and topic_query:
            topic_dict[topic_id] = topic_query.strip()

    return pd.DataFrame(list(topic_dict.items()), columns=["qid", "query"])
# Start every topic with an empty answer; the generation cell fills them in later.
topics = load_topics(file_path).assign(answer="")

# Only create the CSV on the first run so answers already collected are never overwritten.
if os.path.exists(out_path):
    print("File already exists")
else:
    topics.to_csv(out_path, index=False)

In [4]:
import time
import pandas as pd
import requests, json
from dotenv import load_dotenv

# Load environment variables (e.g. HUGGINGFACE_API_KEY) from a local .env file, if one exists.
load_dotenv()

# Hugging Face endpoint that receives the text-generation requests.
API_URL = "https://zo0j5p4kmndmm9km.us-east-1.aws.endpoints.huggingface.cloud"

API_KEY = os.getenv("HUGGINGFACE_API_KEY")
if not API_KEY:
    # Without a key the header below reads "Bearer None"; surface that now
    # instead of discovering it through repeated failed requests later on.
    print("Warning: HUGGINGFACE_API_KEY is not set; requests will be sent without a valid token.")

# Headers sent with every request: bearer-token auth and a JSON body.
headers = {
  "Authorization": f"Bearer {API_KEY}",
  "Content-Type": "application/json"
}


def generate_huggingface_answer(topics, max_retries=3, max_rows=100, retry_delay=30, timeout=300):
    """Fill the empty ``answer`` cells of ``topics`` with generated text.

    Rows whose ``answer`` is already a non-empty string are skipped. For each
    remaining row a prompt is built from the row's ``query`` and POSTed to the
    module-level ``API_URL`` using the module-level ``headers``. The
    ``generated_text`` of the first element of the JSON response is written
    back into the row.

    Parameters
    ----------
    topics : pandas.DataFrame
        Must contain ``query`` and ``answer`` columns. Modified in place.
    max_retries : int
        Maximum number of request attempts per row.
    max_rows : int
        Maximum number of rows to attempt in this call. A row counts towards
        this budget whether or not its request eventually succeeds.
    retry_delay : float
        Seconds to wait between two attempts for the same row.
    timeout : float or None
        Per-request timeout in seconds passed to ``requests``; ``None``
        waits indefinitely.

    Returns
    -------
    pandas.DataFrame
        The same ``topics`` object that was passed in. Rows whose attempts
        all failed keep their empty answer so a later call can retry them.
    """
    # Sampling settings are identical for every request, so build them once.
    generation_parameters = {
        "max_new_tokens": 512,
        "temperature": 0.75,
        "top_k": 50,
        "top_p": 0.95,
        "repetition_penalty": 1.2
    }

    generated_answers = 0
    for index, row in topics.iterrows():
        if row['answer'] != "":
            continue
        if generated_answers >= max_rows:
            break
        generated_answers += 1

        prompt = row['query']
        # Alternative prompt formats that were tried:
        # prompt = "Q: " + prompt + "\n A:"
        # medalpaca_input = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\nAnswer the following Question.\n{prompt}\n### Response:\n"""
        prompt = f"You are a helpful medical knowledge assistant. Provide useful, complete, and scientifically-grounded answers to common consumer search queries about health.\nQuestion: {prompt}\nComplete Answer:"
        payload = json.dumps({"inputs": prompt, "parameters": generation_parameters})

        for attempt in range(1, max_retries + 1):
            try:
                print(payload)
                response = requests.post(API_URL, headers=headers, data=payload, timeout=timeout)
                print(response.status_code)
                if response.status_code >= 400:
                    # Report the server's own error body instead of failing
                    # later with an opaque KeyError on the parsed JSON.
                    raise RuntimeError(f"HTTP {response.status_code}: {response.text[:500]}")
                answer = response.json()
                print(answer)
                topics.at[index, 'answer'] = answer[0]['generated_text']
                break
            except Exception as e:
                # Best effort: any failure (network, HTTP error, unexpected
                # response shape) is logged and retried; the row stays empty
                # if every attempt fails.
                if attempt < max_retries:
                    print(f"Error on topic: {prompt}. Retrying ({attempt}/{max_retries})...")
                    print(f"Error message: {e}")
                    time.sleep(retry_delay)
                else:
                    print(f"Error on topic: {prompt}. Giving up after {max_retries} attempts.")
                    print(f"Error message: {e}")
    return topics
# Reload the answers collected so far; empty cells come back from the CSV as
# NaN, so turn them back into "" (the marker for "not answered yet").
all_answers = pd.read_csv(out_path)
all_answers = all_answers.fillna("")
try:
    questions_with_answers = generate_huggingface_answer(all_answers, max_rows= 100)
finally:
    # generate_huggingface_answer writes each answer into all_answers in
    # place, so saving here keeps the rows completed so far even when the
    # run is interrupted or fails part-way through.
    all_answers.to_csv(out_path, index=False)



{"inputs": "You are a helpful medical knowledge assistant. Provide useful, complete, and scientifically-grounded answers to common consumer search queries about health.\nQuestion: What are the most common chronic diseases? What effects do chronic diseases have for the society and the individual?\nComplete Answer:", "parameters": {"max_new_tokens": 512, "temperature": 0.75, "top_k": 50, "top_p": 0.95, "repetition_penalty": 1.2}}
200
[{'generated_text': 'The person is now in good shape! (This means that it\'s OK.) It is best to visit a doctor. (This word can mean different things depending on context; one usage might be "the doctor".) A doctor is a human who practices medicine. Doctors practice medicine because they make house calls.'}]
{"inputs": "You are a helpful medical knowledge assistant. Provide useful, complete, and scientifically-grounded answers to common consumer search queries about health.\nQuestion: best apps daily activity exercise diabetes\nComplete Answer:", "parameters"

KeyboardInterrupt: 