In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANG_SMITH")
os.environ["ANTHROPIC_API_KEY"] = os.getenv("ANTHROPIC_API_KEY")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["TAVILY_API_KEY"] = os.getenv("TAVILY_API_KEY")

In [3]:
# 필요 모듈 임포트

from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator
)
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import ChatOpenAI

In [14]:
# 스키마 만들기

class MedicalBilling(BaseModel):
    patient_id: int
    patient_name: str
    diagnosis_code: str
    procedure_code: str
    total_charge: float
    insurance_claim_amount: float

examples = [
    {
        "example": """Patient ID: 123456, Patient Name: John Doe, Diagnosis Code: 
        J20.9, Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"""
    },
    {
        "example": """Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis 
        Code: M54.5, Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"""
    },
    {
        "example": """Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code: 
        E11.9, Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"""
    },
]

In [15]:
# 프롬프트 템플릿 만들기

OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)

In [20]:
# 데이터 생성하기

synthetic_data_generator = create_openai_data_generator(
    output_schema=MedicalBilling,
    llm=ChatOpenAI(temperature=1),
    prompt=prompt_template,
)

In [22]:
# 데이터 함성 드디어 시작

synthetic_results = synthetic_data_generator.generate(
    subject="medical_billing",
    extra="the name must be chosen at random. Make it something you wouldn't normally choose.",
    runs=10
)

[MedicalBilling(patient_id=123456, patient_name='Alice Johnson', diagnosis_code='R07.0', procedure_code='99204', total_charge=600.0, insurance_claim_amount=400.0),
 MedicalBilling(patient_id=987654, patient_name='Eleanor Williams', diagnosis_code='I10', procedure_code='99215', total_charge=400.0, insurance_claim_amount=350.0),
 MedicalBilling(patient_id=987654, patient_name='Jacob Smith', diagnosis_code='E78.5', procedure_code='99213', total_charge=550.0, insurance_claim_amount=450.0),
 MedicalBilling(patient_id=654321, patient_name='Harper Thompson', diagnosis_code='M17.11', procedure_code='99203', total_charge=500.0, insurance_claim_amount=300.0),
 MedicalBilling(patient_id=123456, patient_name='Xavier Montgomery', diagnosis_code='F32.9', procedure_code='99214', total_charge=600.0, insurance_claim_amount=500.0),
 MedicalBilling(patient_id=246810, patient_name='Evangeline Patel', diagnosis_code='G56.0', procedure_code='99212', total_charge=450.0, insurance_claim_amount=350.0),
 Medica

In [23]:
# 다른 방식으로. Dataset generate 위에는 table(표) 형태

from langchain_experimental.synthetic_data import (
    DatasetGenerator,
    create_data_generation_chain,
)
from langchain_openai import ChatOpenAI

In [25]:
model = ChatOpenAI(temperature=0.7)
chain = create_data_generation_chain(model)
chain({"fields": ["blue", "yellow"], "preferences": {"style": "Make it in a style of a weather forecast."}})

{'fields': ['blue', 'yellow'],
 'preferences': {'style': 'Make it in a style of a weather forecast.'},
 'text': "In today's weather forecast, expect a vibrant display of colors with the sky painted in shades of blue and yellow, creating a picturesque scene that is sure to brighten your day."}

In [28]:
chain(
    {
        "fields": [
            {"actor": "Tom Hanks", "movies": ["Forrest Gump", "Green Mile"]},
            {"actor": "Mads Mikkelsen", "movies": ["Hannibal", "Another round"]},
        ],
        "preferences": {"minimum_length": 200, "style": "gossip"},
    }
)

{'fields': [{'actor': 'Tom Hanks', 'movies': ['Forrest Gump', 'Green Mile']},
  {'actor': 'Mads Mikkelsen', 'movies': ['Hannibal', 'Another round']}],
 'preferences': {'minimum_length': 200, 'style': 'gossip'},
 'text': 'Tom Hanks, known for his iconic roles in "Forrest Gump" and "Green Mile", and Mads Mikkelsen, famous for his chilling performances in "Hannibal" and "Another round", are two incredibly talented actors who have captivated audiences worldwide with their diverse range and captivating on-screen presence.'}

In [29]:
inp = [
    {
        "Actor": "Tom Hanks",
        "Film": [
            "Forrest Gump",
            "Saving Private Ryan",
            "The Green Mile",
            "Toy Story",
            "Catch Me If You Can",
        ],
    },
    {
        "Actor": "Tom Hardy",
        "Film": [
            "Inception",
            "The Dark Knight Rises",
            "Mad Max: Fury Road",
            "The Revenant",
            "Dunkirk",
        ],
    },
]

generator = DatasetGenerator(model, {"style": "informal", "minimal length": 500})
dataset = generator(inp)

In [30]:
dataset

[{'fields': {'Actor': 'Tom Hanks',
   'Film': ['Forrest Gump',
    'Saving Private Ryan',
    'The Green Mile',
    'Toy Story',
    'Catch Me If You Can']},
  'preferences': {'style': 'informal', 'minimal length': 500},
  'text': 'Tom Hanks, known for his incredible performances in iconic films such as "Forrest Gump," "Saving Private Ryan," "The Green Mile," "Toy Story," and "Catch Me If You Can," continues to captivate audiences with his versatile acting skills and undeniable charm.'},
 {'fields': {'Actor': 'Tom Hardy',
   'Film': ['Inception',
    'The Dark Knight Rises',
    'Mad Max: Fury Road',
    'The Revenant',
    'Dunkirk']},
  'preferences': {'style': 'informal', 'minimal length': 500},
  'text': 'Tom Hardy is known for his diverse acting roles, from the mind-bending thriller "Inception" to the action-packed "Mad Max: Fury Road," showcasing his versatility and talent in the film industry. His captivating performances in "The Dark Knight Rises," "The Revenant," and "Dunkirk"

In [31]:
# 스키마 생성

from typing import List

from langchain.chains import create_extraction_chain_pydantic
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from pydantic import BaseModel, Field

class Actor(BaseModel):
    Actor: str = Field(description="name of an actor")
    Film: List[str] = Field(description="list of names of films they starred in")

In [36]:
llm = OpenAI()
parser = PydanticOutputParser(pydantic_object=Actor)

prompt = PromptTemplate(
    template="Extract fields from a given text.\n{format_instructions}\n{text}\n",
    input_variables=["text"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# _input = prompt.format_prompt(text=dataset[0]["text"])
# output = llm(_input.to_string())
_input = prompt.format(text=dataset[0]["text"])
output = llm(_input)

parsed = parser.parse(output)
parsed

Actor(Actor='Tom Hanks', Film=['Forrest Gump', 'Saving Private Ryan', 'The Green Mile', 'Toy Story', 'Catch Me If You Can'])

In [37]:
(parsed.Actor == inp[0]["Actor"]) & (parsed.Film == inp[0]["Film"])

True

In [40]:
extractor = create_extraction_chain_pydantic(pydantic_schema=Actor, llm=model)
extracted = extractor.run(dataset[1]["text"])
extracted

RuntimeError: no validator found for <class '__main__.Actor'>, see `arbitrary_types_allowed` in Config