In [1]:
from nltk.corpus import wordnet
from groq import Groq
from openai import OpenAI
from dotenv import load_dotenv  # dotenv 모듈 import
import nltk
import json
import re
import time
import os
import tiktoken
import openai

load_dotenv()  # load the .env file

## Initialize the OpenAI client
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),  # read the API key from the environment
)

# Download the WordNet corpus that nltk.corpus.wordnet reads from.
nltk.download('wordnet')

class Extract_entity_type:
    """Ask a chat model for a broad entity type for WordNet synset entities.

    The class loads entity names (WordNet synset names such as
    ``diagonal.n.04``) from a file, builds a prompt for one entity, sends it
    to the module-level OpenAI ``client`` and pulls the ``entity_type`` field
    out of the JSON in the reply.
    """

    # WordNet part-of-speech tag -> word inserted into the prompt.
    _POS_LABELS = {
        'n': 'noun',
        'v': 'verb',
        'a': 'adjective',
        's': 'adjective',  # satellite adjective
        'r': 'adverb',
    }

    def __init__(self):
        self.start_index = 0    # first entity index to process (inclusive)
        self.end_index = 0      # last entity index to process (exclusive)
        self.error_list = []    # filled by callers; not written by this class
        self.entity_list = []   # entity names loaded by list_up()
        self.type_list = []     # not used by the code visible here
        self.entity_name = ""   # not used by the code visible here

    def index_input(self):
        """Read ``start_index`` and ``end_index`` interactively from stdin.

        Raises:
            ValueError: if either entry cannot be parsed as an integer.
        """
        self.start_index = int(input("start_index: "))
        self.end_index = int(input("end_index: "))

    def list_up(self, path='entity2id.txt'):
        """Load entity names from ``path`` into ``self.entity_list``.

        The first line (the entity count) is skipped. Every following line
        that splits on a single space into exactly two fields contributes its
        first field; other lines are ignored.

        The list is rebuilt on every call, so calling this method again does
        not append duplicate entries.

        Args:
            path: file to read; defaults to ``entity2id.txt`` in the working
                directory.
        """
        # Explicit encoding so the result does not depend on the platform default.
        with open(path, 'r', encoding='utf-8') as file:
            lines = file.readlines()

        entities = []
        for line in lines[1:]:
            parts = line.strip().split(' ')  # space-separated
            if len(parts) == 2:  # "entity id" rows only
                entities.append(parts[0])  # keep the entity, drop its id
        self.entity_list = entities

    def create_prompt(self, text):
        """Build the entity-type prompt for the WordNet synset named ``text``.

        Args:
            text: a synset name accepted by ``wordnet.synset``,
                e.g. ``"cope.v.01"``.

        Returns:
            The prompt string, containing the part of speech, the synset name
            and its WordNet definition.

        Errors raised by ``wordnet.synset`` for an unknown name propagate.
        """
        syn = wordnet.synset(text)

        # Take the part of speech from the synset itself instead of splitting
        # the name on '.', which breaks for lemma names that contain dots and
        # used to label every non-noun (adjectives, adverbs) as a verb.
        # Unknown tags keep the previous fallback label.
        pos_label = self._POS_LABELS.get(syn.pos(), 'verb')

        prompt = f"""I am trying to enhance a triple dataset by adding entity types 
            to the head and tail of the triple. 
            Instead of specific categories, provide a **broad and general** entity type 
            for a {pos_label} word '{text}' which has a definition of "{syn.definition()}".
            * The entity type should be **high-level and generic**, rather than overly specific.
            * The entity type must not be a POS (part-of-speech).
            * Generate a single JSON consisting of the entity type as follows:
            - {{ "entity_type": "the broad entity type of the word" }}"""

        return prompt

    def prompt_enter(self, prompt, model="gpt-4o-mini"):
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: the user message content.
            model: chat model name; defaults to the previously hard-coded
                ``"gpt-4o-mini"``.

        Returns:
            The content of the first choice, or ``""`` when the reply has no
            text content, so callers can always treat the result as a string.
        """
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
        )

        chat_response = chat_completion.choices[0].message.content
        return chat_response or ""

    @staticmethod
    def ext_type(response):
        """Extract the ``entity_type`` string from the JSON object in ``response``.

        The span from the first ``{`` to the last ``}`` is parsed as JSON.

        Returns:
            The ``entity_type`` value when it is a non-blank string; ``None``
            when ``response`` is not a string, holds no JSON object, fails to
            parse, lacks the key, or carries a non-string or blank value.
        """
        if not isinstance(response, str):
            return None

        json_match = re.search(r'\{.*\}', response, re.DOTALL)
        if not json_match:
            return None

        try:
            data = json.loads(json_match.group())
        except json.JSONDecodeError:
            return None

        if not isinstance(data, dict):
            return None

        # .get() instead of indexing: a reply without the key means
        # "not found", not a KeyError that aborts the caller's loop.
        value = data.get('entity_type')
        if isinstance(value, str) and value.strip():
            return value
        return None

    def output_process(self, ind, res):
        """Return the entity type extracted from the raw model reply ``res``.

        Args:
            ind: index of the entity being processed; accepted for interface
                compatibility and not used here.
            res: raw reply text from the model.

        Returns:
            The extracted entity type, or ``None`` when none was found.
        """
        return self.ext_type(res)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kodonghwan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
entity_list = []

# Read every line of the entity file up front.
with open('entity2id.txt', 'r') as file:
    lines = file.readlines()

# The first line holds the entity count, so processing starts at the second line.
for row in lines[1:]:
    fields = row.strip().split(' ')  # space-separated
    if len(fields) != 2:  # only "entity id" rows are kept
        continue
    entity_list.append(fields[0])  # store the entity, drop its id

In [3]:
# Tokenizer looked up by encoding name. count_tokens() falls back to it when
# tiktoken cannot map the requested model name to an encoding.
encoder = tiktoken.get_encoding("cl100k_base")

def count_tokens(text, model="gpt-4o-mini"):
    """Return the number of tokens in ``text`` for ``model``'s tokenizer.

    Args:
        text: the string to tokenize; ``None`` or ``""`` counts as 0 tokens.
        model: model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        The token count as an int. When the model name is unknown to
        tiktoken, the count comes from the module-level ``encoder`` and is
        therefore only an approximation for that model.
    """
    if not text:
        return 0

    try:
        model_encoder = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken has no mapping for this model name.
        model_encoder = encoder

    # disallowed_special=() makes special-token markers in the text count as
    # ordinary text instead of raising ValueError.
    return len(model_encoder.encode(text, disallowed_special=()))

In [5]:
# Build the extractor, load the entity names and ask for the index range.
a = Extract_entity_type()
a.list_up()
a.index_input()

Extracted_type_dict = {}  # entity -> normalized entity type

model = "gpt-4o-mini"
input_all_tokens = 0
output_all_tokens = 0

# Upper bound on requests per entity, so a reply that never contains a usable
# entity type cannot keep calling the API forever.
MAX_ATTEMPTS = 5

# Clamp the upper bound so an end_index past the list does not raise IndexError.
last_index = min(a.end_index, len(a.entity_list))

# The output path does not change inside the loop; create its directory once,
# before any request is sent.
filename = f"./text/ChatGPT4o_extracted_entities_{a.start_index}_{a.end_index}.txt"
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Extract and process the entities.
for i in range(a.start_index, last_index):
    entity = a.entity_list[i]
    prompt = a.create_prompt(entity)
    input_token = count_tokens(prompt, model)

    entity_type = None
    output_token = 0
    for attempt in range(MAX_ATTEMPTS):
        # The prompt is resent on every attempt, so it is counted every time.
        input_all_tokens += input_token

        response = a.prompt_enter(prompt) or ""  # guard against a missing reply
        output_token = count_tokens(response, model)
        output_all_tokens += output_token

        entity_type = a.output_process(i, response)
        print(entity_type)

        if entity_type is not None:
            break

        print(f"{i} -> {entity} → No entity type found, retrying...")
        time.sleep(2)  # wait 2 seconds before retrying

    if entity_type is None:
        # Record the failure and move on instead of blocking the whole run.
        a.error_list.append(entity)
        print(f"{i} -> {entity} → giving up after {MAX_ATTEMPTS} attempts")
        continue

    entity_type = entity_type.lower().replace(" ", "").replace("_", "")
    Extracted_type_dict[entity] = entity_type
    print(f"{i} -> {entity} → {entity_type} -> {input_token + output_token} -> {input_all_tokens} -> {output_all_tokens}")
    time.sleep(2)  # wait 2 seconds between requests to limit API load

    # Checkpoint: rewrite the whole result file after every entity so progress
    # survives an interruption. Separate loop names keep `entity` and
    # `entity_type` from being overwritten.
    with open(filename, "w", encoding="utf-8") as f:
        for saved_entity, saved_type in Extracted_type_dict.items():
            f.write(f"{saved_entity} {saved_type}\n")

mathematical concept
0 -> diagonal.n.04 → mathematicalconcept -> 175 -> 159 -> 16
Government Body
1 -> legislative_branch.n.01 → governmentbody -> 155 -> 300 -> 30
information
2 -> confirmation.n.02 → information -> 142 -> 429 -> 43
chemical compound
3 -> tetrachlorethylene.n.01 → chemicalcompound -> 157 -> 572 -> 57
marine organism
4 -> acanthurus.n.01 → marineorganism -> 152 -> 710 -> 71
plant
5 -> genus_gaillardia.n.01 → plant -> 152 -> 849 -> 84
historical region
6 -> north_vietnam.n.01 → historicalregion -> 181 -> 1015 -> 99
clothing
7 -> macintosh.n.02 → clothing -> 148 -> 1149 -> 113
human habitation
8 -> cliff_dwelling.n.01 → humanhabitation -> 157 -> 1292 -> 127
communication professional
9 -> telegrapher.n.01 → communicationprofessional -> 149 -> 1427 -> 141
cognitive process
10 -> cope.v.01 → cognitiveprocess -> 143 -> 1555 -> 156
city
11 -> los_angeles.n.01 → city -> 164 -> 1706 -> 169
individual
12 -> subscriber.n.02 → individual -> 156 -> 1849 -> 182
action
13 -> repel.v.