In [1]:
import os
import openai
from dotenv import load_dotenv
import time
import re
import random
import pandas as pd

In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()
# API key read from the "open_ai_key" environment variable; os.getenv returns None when it is unset.
key = os.getenv("open_ai_key")

In [3]:
# Hand the key read from the environment to the openai module.
openai.api_key = key
# Completion handle; answer() issues its requests through completion.create().
completion = openai.Completion()

In [4]:
# Request settings forwarded unchanged to completion.create() by answer().
max_tokens = 64  # presumably the cap on generated tokens per request — confirm against the API docs
temperature = 0.9
top_p = 1.0
best_of = 1

# Passed as `stop=`; presumably generation ends at the first newline, so a reply is a single line — confirm.
stop = ["\n"]

In [5]:
# Project root = two directory levels above the current working directory.
# os.path.dirname is used instead of splitting the path on '/' so the lookup
# also works with Windows separators and never collapses to an empty string
# (which would silently turn the CSV path into a relative one) when the
# working directory is close to the filesystem root.
root_dir = os.path.dirname(os.path.dirname(os.getcwd()))
# Dummy articles to classify; the classification loop reads the `Category` and `Content` columns.
dummy_text = pd.read_csv(os.path.join(root_dir, 'src', 'data', 'dummy_articles', 'articles.csv'))

In [6]:
# Prompt fragments. generate_input_text() picks one entry at random from each
# list and joins them, together with the article wrapped in ''' ... ''', into
# the final prompt.

# Persona the model is asked to adopt.
role_prompts = [
    "You are a lawyer working for an automotive company. In this role, you are responsible for reviewing articles related to the automotive industry.",
    "You are a legal consultant specialized in the automotive sector. Your job includes evaluating the relevance of articles pertaining to the car industry.",
    "You serve as in-house counsel for a company in the automobile manufacturing sector. Part of your duties includes reviewing written content that might be related to automotive matters.",
    "As an attorney working for a corporation involved in automotive technologies, you are tasked with determining the applicability of various articles to the automotive field."
]

# Step-by-step instructions. The delimiter named here must match the one the
# prompt really uses: generate_input_text() and the examples below wrap the
# article in three single quotes, not backticks.
step_prompts = [
    "Step 1 - Understand the text separated by three single quotes. Step 2 - As a lawyer, determine whether the text is related to the automotive domain. Step 3 - If the text is related to the automotive domain, respond with 'True'; otherwise, respond with 'False.'"
]

# Few-shot examples; every example must use the same "<Lawyer>:" answer tag
# that the closing prompts end with.
example_prompts = [
    "For example: <Article>: '''Article about the latest advancements in electric vehicle technology''' <Lawyer>: True",
    # NOTE(review): this article concerns automobile manufacturers yet is labelled False — confirm the intended label.
    "For example: <Article>: '''Legal analysis of environmental regulations impacting automobile manufacturers''' <Lawyer>: False",
    "For example: <Article>: '''Recipe for a delicious cake''' <Lawyer>: False",
    "For example: <Article>: '''Comparing different types of smartphone processors''' <Lawyer>: False"
]

# Closing question; each entry ends with the "<Lawyer>: " tag so the model's
# completion is the answer itself.
last_prompts = [
    "Please determine whether this text is related to the automotive domain or not. <Lawyer>: ",
    "Review the text and determine if it is related to the automotive domain and respond True or False. <Lawyer>: ",
    "Please review the text and determine and respond True if it is related to the automotive domain. if not related respond False. <Lawyer>: ",
    "As a legal expert specializing in the automotive sector, your task is to review the text enclosed by three single quotes and determine its relevance to the automotive industry. If the text is relevant, please respond with 'True.' If it is not, respond with 'False.' <Lawyer>: "
]

In [7]:
def cleaning_text(text):
    """Reduce a model reply to letters and inner whitespace only.

    Every character that is not an ASCII letter or whitespace is removed,
    then leading/trailing whitespace is trimmed. Trimming happens *after*
    the removal so that whitespace exposed by deleted punctuation or digits
    (e.g. ". True!" or "False, 100%") does not survive at the edges.

    Parameters:
    text (str): Raw text to clean

    Returns:
    str: The cleaned text
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)  # drop everything except letters and whitespace
    return letters_only.strip()  # trim the edges last

In [8]:
def generate_input_text(role_prompts, step_prompts, example_prompts, last_prompts, article):
    """
    Assemble one prompt from a random pick out of each prompt category.

    The pieces are joined with newlines in this order: role, steps, example,
    the article wrapped in triple single quotes, closing prompt.

    Parameters:
    role_prompts (list): Candidate role prompts
    step_prompts (list): Candidate step prompts
    example_prompts (list): Candidate example prompts
    last_prompts (list): Candidate closing prompts
    article (str): The article text

    Returns:
    str: The assembled prompt
    """
    # Draw strictly in the order role -> step -> example -> last.
    role, steps, example, closing = (
        random.choice(pool)
        for pool in (role_prompts, step_prompts, example_prompts, last_prompts)
    )

    quoted_article = "'''{}'''".format(article)

    return '\n'.join((role, steps, example, quoted_article, closing))

In [13]:
def answer(input_text, model='text-davinci-003'):
    """
    Send a prompt to the completion endpoint and return the raw response.

    The sampling settings (max_tokens, stop, temperature, top_p, best_of) are
    taken from the module-level configuration variables.

    Parameters:
    input_text (str): The full prompt to complete
    model (str): Name of the completion model to query. Defaults to
        'text-davinci-003', the value that used to be hard-coded, so existing
        callers behave exactly as before.
        NOTE(review): OpenAI has announced the retirement of text-davinci-003 —
        confirm and pass a currently available completion model if so.

    Returns:
    The response returned by completion.create(); callers read
    response['choices'][0]['text'] and response['usage']['total_tokens'].
    """
    response = completion.create(
        prompt=input_text,
        model=model,
        max_tokens=max_tokens,
        stop=stop,
        temperature=temperature,
        top_p=top_p,
        best_of=best_of
    )

    return response

In [26]:
result_dict = {}          # category -> True / False, or None when no clear verdict was obtained
cleaned_output_dict = {}  # category -> cleaned text of the last model reply for that category
time_limit = 60*1         # seconds allowed per article for retrying until the reply is unambiguous
used_token = 0            # running sum of response['usage']['total_tokens'] over all requests

for _, row in dummy_text.iterrows():
    if not isinstance(row.Content, str):
        # An empty CSV cell is not a string (pandas yields NaN) and would make
        # re.sub raise; record "no verdict" without spending an API call.
        result_dict[row.Category] = None
        cleaned_output_dict[row.Category] = ''
        continue

    # Strip punctuation so the article cannot contain the ''' delimiter used in the prompt.
    article = re.sub(r'[^\w\s]', '', row.Content).strip()
    start_time = time.time()

    output = None        # stays None if the time limit expires without a clear True/False
    cleaned_output = ''  # defined even if no request is made (e.g. time_limit <= 0)

    # Re-ask with a freshly randomised prompt until the reply contains exactly
    # one of "True" / "False", or until time_limit seconds have elapsed.
    elapsed_time = 0
    while elapsed_time < time_limit:
        input_text = generate_input_text(role_prompts, step_prompts, [example_prompts[0]], last_prompts, article)
        response = answer(input_text)

        raw_text = response['choices'][0]['text']
        used_token += int(response['usage']['total_tokens'])

        cleaned_output = cleaning_text(raw_text)
        says_true = "True" in cleaned_output
        says_false = "False" in cleaned_output

        if says_true and not says_false:
            output = True
            break
        if says_false and not says_true:
            output = False
            break

        elapsed_time = time.time() - start_time

    result_dict[row.Category] = output
    cleaned_output_dict[row.Category] = cleaned_output

In [27]:
# Show the True/False/None label recorded for each article category.
result_dict

{'Travel': False,
 'Technology': False,
 'Science': False,
 'Food and Cooking': False,
 'Literature': False,
 'Personal Development': False,
 'Entertainment': False,
 'Environmental Issues': False,
 'Art and Creativity': False,
 'Automotive': True}

In [28]:
# Show the cleaned text of the last model reply recorded for each category.
cleaned_output_dict

{'Travel': 'False',
 'Technology': 'False',
 'Science': 'False',
 'Food and Cooking': 'False',
 'Literature': 'False',
 'Personal Development': 'False',
 'Entertainment': 'False',
 'Environmental Issues': 'False',
 'Art and Creativity': 'False',
 'Automotive': 'True'}

In [29]:
# Total of response['usage']['total_tokens'] accumulated over every request in the run.
used_token

5897