# In [2]:
import json
# !python3 -m pip install openai
import openai
import os
import pandas as pd
import sys
import glob
from tqdm import tqdm
import argparse
from pathlib import Path
from tenacity import retry, stop_after_attempt, wait_random_exponential
import multiprocessing
import re

# Credential assignments are left commented out.
# openai.organization = ''
# openai.api_key = ''

# Load the sample table, discard rows with no Chinese name, and write the
# cleaned table back over the same file.
df = pd.read_csv('sample.csv').dropna(subset=["Chinese_Name"])
df.to_csv('sample.csv', encoding='utf-8', index=False)

# In [None]:

def is_answer_in_valid_form(answer):
    """Check whether the model's answer contains a ``<label>: <0 or 1>`` pair.

    This is the format we want:
        Readability: 1

    The check is a substring search, not a full-string match: the answer is
    accepted as soon as a colon, optionally followed by one whitespace
    character, is followed by the digit 0 or 1. Nothing is extracted or
    rounded, and an answer such as ``"Readability: 10"`` is also accepted
    because ``": 1"`` occurs inside it.

    Args:
        answer (str): Raw text returned by the model.

    Returns:
        bool: True if the pattern occurs anywhere in ``answer``.
    """
    # Raw string: "\w" and "\s" in a normal string literal are invalid escape
    # sequences and trigger a warning on current Python versions.
    return re.search(r"\w*:\s?[0-1]", answer.strip()) is not None

def run_gpt4_query(filled_prompt, model="gpt-3.5-turbo-1106"):
    """Send one prompt to the OpenAI chat-completion endpoint.

    Args:
        filled_prompt (str): Fully rendered user prompt.
        model (str): Chat model identifier. It is an explicit parameter
            because the parsed command-line ``args`` object is local to
            ``main()`` and is not visible from this function. The default
            matches the default of the ``--model`` command-line option.

    Returns:
        The response object returned by ``openai.ChatCompletion.create``;
        the reply text is at ``response["choices"][0]["message"]["content"]``.
    """
    # NOTE(review): ``openai.ChatCompletion`` is the pre-1.0 client interface;
    # confirm the installed ``openai`` package version still provides it.
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful data science research assistant who speaks both Chinese and English."},
            {"role": "user", "content": filled_prompt},
        ],
        # temperature=0 requests the least random sampling.
        temperature=0,
    )
    return response

# Retry failed calls with randomized exponential backoff (1-60 s between
# attempts) and give up after 15 attempts.
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(15))
def generate_categorization(restaurant_cn, prompt_file, model=None):
    """Fill the prompt template for one restaurant and return the model's reply.

    Args:
        restaurant_cn (str): Chinese restaurant name substituted for the
            ``{CN_NAME}`` placeholder in the template.
        prompt_file (str): Path to the prompt template file.
        model (str | None): Chat model identifier forwarded to
            ``run_gpt4_query``. When ``None`` the query helper is called with
            the prompt only and picks the model itself.

    Returns:
        str: Content of the first choice's message with leading and trailing
        newlines removed.

    Raises:
        tenacity.RetryError: If every retry attempt fails.
    """
    # Read the template with a context manager so the handle is always closed;
    # the rest of the script reads and writes UTF-8, so be explicit here too.
    with open(prompt_file, encoding="utf-8") as template_handle:
        prompt_template = template_handle.read()

    # Normalize surrounding whitespace and end the prompt with one newline.
    prompt = prompt_template.replace("{CN_NAME}", restaurant_cn).strip() + "\n"

    if model is None:
        response = run_gpt4_query(prompt)
    else:
        response = run_gpt4_query(prompt, model=model)
    return response["choices"][0]["message"]["content"].strip("\n")


def main():
    """Categorize every restaurant in the input table and save the replies.

    Command-line options:
        --input_file: Tab-delimited table with ``English_Name``,
            ``Chinese_Name``, ``sample_id`` and ``national_id`` columns.
        --prompt_file_path: Prompt template containing ``{CN_NAME}``.
        --output_folder: Directory for the results (created if missing).
        --model: Chat model identifier passed to the query helper.
        --output_file: Name of the JSON file written inside the output folder.

    The output file holds one JSON array with one object per restaurant; the
    model's reply is stored under the ``"Positivity"`` key.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, default="./boston_sample.csv")
    parser.add_argument("--prompt_file_path", default="./prompts/prompt.txt", type=str)
    parser.add_argument("--output_folder", type=str, default="./outputs")
    parser.add_argument("--model", default="gpt-3.5-turbo-1106", type=str)
    parser.add_argument("--output_file", type=str, default="positivity.json")
    args = parser.parse_args()

    # TODO(review): the original author asked whether this input is the
    # training file; that question is still open.
    # NOTE(review): the file is parsed as tab-delimited although the default
    # name ends in .csv; confirm the delimiter against the real data.
    df_text = pd.read_csv(args.input_file, encoding="utf-8", delimiter="\t")
    # A missing Chinese name cannot be substituted into the prompt (it is read
    # as a float NaN), so such rows are skipped instead of failing every retry.
    df_text = df_text.dropna(subset=["Chinese_Name"])
    print(df_text.shape)
    print(df_text)

    output_folder = Path(args.output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    rows = zip(
        df_text.English_Name.to_list(),
        df_text.Chinese_Name.to_list(),
        df_text.sample_id.to_list(),
        df_text.national_id.to_list(),
    )

    records = []
    with multiprocessing.Pool() as pool:
        # Submit every request first so the workers run concurrently.
        pending = []
        for restaurant_en, restaurant_cn, sample_id, national_id in tqdm(rows, total=len(df_text)):
            async_result = pool.apply_async(
                generate_categorization,
                args=(restaurant_cn, args.prompt_file_path, args.model),
            )
            pending.append((sample_id, national_id, restaurant_en, restaurant_cn, async_result))

        # Collect the replies in submission order; get() blocks until the
        # worker has finished and re-raises any exception it hit.
        for sample_id, national_id, restaurant_en, restaurant_cn, async_result in tqdm(pending):
            records.append(
                {
                    "sample_id": sample_id,
                    "national_id": national_id,
                    "English_Name": restaurant_en,
                    "Chinese_Name": restaurant_cn,
                    "Positivity": async_result.get(),
                }
            )

    # Write all records in a single pass: reopening the file in "w" mode for
    # each record would leave only the last one on disk.
    with open(output_folder / args.output_file, "w", encoding="UTF-8") as out:
        # ensure_ascii=False keeps the Chinese names readable in the file.
        json.dump(records, out, indent=4, ensure_ascii=False)

# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()