In [None]:
from functools import lru_cache

import instructor
from openai import AzureOpenAI
from datetime import datetime


# Build (once) the instructor-wrapped Azure OpenAI client
@lru_cache(maxsize=None)
def get_instructor_openai_azure_client():
    """Create the native Azure OpenAI client and wrap it with instructor.

    The function takes no arguments and is decorated with ``lru_cache``, so
    the client is constructed on the first call and the same object is
    returned on every later call.

    Returns:
        The object returned by ``instructor.from_openai`` for the newly
        created ``AzureOpenAI`` client.
    """
    # NOTE(review): endpoint, key and API version are empty strings here --
    # presumably placeholders that must be filled in before use; confirm.
    return instructor.from_openai(
        AzureOpenAI(
            azure_endpoint="",
            api_key="",
            api_version="",
        )
    )


In [12]:
from typing import List, Optional, Type

from pydantic import BaseModel, Field

class Guarantor(BaseModel):
    # One guarantor entry; CustomerRawData holds a list of these.
    # Every field is an optional string that defaults to None.
    guarantor_name: Optional[str] = Field(
        default=None,
        description="Name of the guarantor",
    )
    guarantor_relationship: Optional[str] = Field(
        default=None,
        description="Relationship with the guarantor",
    )
    guarantor_contact_number: Optional[str] = Field(
        default=None,
        description="Contact number of the guarantor",
    )
    guarantor_company_name: Optional[str] = Field(
        default=None,
        description="Company name of the guarantor",
    )

# This class is the format of the raw data sent by the business system.
class CustomerRawData(BaseModel):
    # Personal details of the customer.
    name: Optional[str] = Field(None, description="Name of the customer")
    nric: Optional[str] = Field(None, description="Last 4 digits of NRIC number including the last alphabet")
    date_of_birth: Optional[str] = Field(None, description="Date of birth, in YYYY-mm-dd format")
    registered_address: Optional[str] = Field(None, description="Registered address")
    postal_code: Optional[str] = Field(None, description="Postal code")

    # Employment details.
    employer: Optional[str] = Field(None, description="Current employer")
    length_of_employment: Optional[str] = Field(None, description="Length of employment at the current company")

    # Car and loan details (amounts are carried as strings).
    car_model: Optional[str] = Field(None, description="car brand and model.")
    loan_granted_amount: Optional[str] = Field(None, description="Loan granted amount for the car")
    monthly_payment: Optional[str] = Field(None, description="Monthly payment for the car")
    main_driver: Optional[str] = Field(None, description="Who will be the main driver for the car.")

    # Guarantor details.
    num_guarantors: Optional[int] = Field(None, description="How many guarantors are provided for this loan?")
    # The default is None, so the annotation must be Optional: the previous
    # plain List[Guarantor] contradicted the default and rejected an explicit
    # guarantors=None. Consumers must therefore expect None as well as a list.
    guarantors: Optional[List[Guarantor]] = Field(None, description="List of guarantors")
    
    
# User-prompt template rendered by CustomerInfo.compare_with via str.format.
# Placeholders: {common_fields_info}, {extracted_guarantor_info} and
# {database_guarantor_info}.
# NOTE(review): the identifier is spelled "PROPMT" (sic). It is referenced by
# name from the compare_with methods, so a rename has to touch every user.
PROPMT_TEMPLATE = """
## Common Fields Comparison
Here is the information of data fields(both from database and extracted from the audio)
{common_fields_info}

## Guarantor Recognition and Comparison
Here is the information of the extracted guarantor:
{extracted_guarantor_info}

Here is the information of the database guarantors(Among those guarantors, is any one of them matches the extracted guarantor?):
You will compare the extracted guarantor with the most similar database guarantor.
{database_guarantor_info}

## Comparison Instruction
For all comparisons, you need to consider the effect of ASR transcription error, any errors due to ASR are considered as acceptable(a success match)
When comparing nric, you should only consider the last 4 digits(if the extracted nric is 1234A, you should only consider 234A)
When comparing address, either postal code or house number match is considered as a success match.
When comparing person names, company names, if any keyword matches(ignoring the asr error), it is a considered as a success match.
When considering relationship, pay attention that synonyms are also acceptable, such as "father" and "dad", "mother" and "mum", "brother" and "bro", spouse and "husband", etc.
When comparing length of employment, you should only consider the db value can be approximately equal to the extracted value.(less than 1 year)
"""

# This is the output format of the LLM entity extraction.
class CustomerInfo(BaseModel):
    """
    CustomerInfo model for storing customer details related to car purchase and employment.
    
    Note: if there are multiple guarantors, just randomly pick one of them.
    """
    # NOTE(review): the class docstring above is kept verbatim -- it is
    # presumably surfaced to the LLM as the schema description when this model
    # is used as an extraction target; confirm before rewording it.

    # Personal details of the customer.
    name: Optional[str] = Field(None, description="Name of the customer")
    nric: Optional[str] = Field(None, description="Last 4 digits of NRIC number including the last alphabet")
    date_of_birth: Optional[str] = Field(None, description="Date of birth, in YYYY-mm-dd format")
    registered_address: Optional[str] = Field(None, description="Registered address")
    postal_code: Optional[str] = Field(None, description="Postal code")

    # Employment details.
    employer: Optional[str] = Field(None, description="Current employer")
    length_of_employment: Optional[str] = Field(None, description="Length of employment at the current company")

    # Car and loan details.
    car_model: Optional[str] = Field(None, description="car brand and model.")
    loan_granted_amount: Optional[str] = Field(None, description="Loan granted amount for the car")
    monthly_payment: Optional[str] = Field(None, description="Monthly payment for the car")
    main_driver: Optional[str] = Field(None, description="Who will be the main driver for the car.")

    # Guarantor details: a single guarantor, flattened into the model.
    num_guarantors: Optional[int] = Field(None, description="How many guarantors are provided for this loan?")
    guarantor_name: Optional[str] = Field(None, description="Name of the guarantor")
    guarantor_relationship: Optional[str] = Field(None, description="Relationship with the guarantor")
    guarantor_contact_number: Optional[str] = Field(None, description="Contact number of the guarantor")
    guarantor_company_name: Optional[str] = Field(None, description="Company name of the guarantor")

    def compare_with(self, db_info: dict) -> str:
        """Generate the information-verification prompt.

        Renders ``PROPMT_TEMPLATE`` with three sections: a table of the
        non-guarantor fields (database value next to extracted value), the
        extracted guarantor, and the numbered list of database guarantors.

        Args:
            db_info: Database record as a plain dict, e.g. the result of
                ``CustomerRawData.model_dump()``. Fields missing from the
                dict are rendered as ``None``. The optional ``guarantors``
                entry is a list of dicts keyed by the ``guarantor_*`` field
                names; a missing or ``None`` entry means no guarantors.

        Returns:
            The formatted prompt text.
        """
        # Guarantor fields get their own prompt section, so they are left out
        # of the generic field table. A set is the documented type for exclude.
        guarantor_fields = {
            'guarantor_name',
            'guarantor_relationship',
            'guarantor_contact_number',
            'guarantor_company_name',
        }
        common_fields = self.model_dump(exclude=guarantor_fields)

        flatten_fields_info = "\n".join(
            f"||Field name: {k:20s} \t DatabaseValue: {str(db_info.get(k)):20s} \t ExtractedValue: {str(v):20s}||"
            for k, v in common_fields.items()
        )

        guarantor_info = (
            f"extracted_guarantor_name: {self.guarantor_name}\n"
            f"extracted_guarantor_relationship: {self.guarantor_relationship}\n"
            f"extracted_guarantor_contact_number: {self.guarantor_contact_number}\n"
            f"extracted_guarantor_company_name: {self.guarantor_company_name}")

        # `or []` also covers a record whose 'guarantors' key is present but
        # None (the default of CustomerRawData.guarantors); a plain
        # .get(key, []) would hand None to enumerate() and raise TypeError.
        db_guarantors = db_info.get('guarantors') or []

        # .get() renders an absent guarantor attribute as None instead of
        # raising KeyError, matching how the common fields are handled above.
        db_guarantor_info = "\n".join(
            f"Guarantor{i} -- GuarantorName: {g.get('guarantor_name')}, GuarantorRelationship: {g.get('guarantor_relationship')}, GuarantorContactNumber: {g.get('guarantor_contact_number')}, GuarantorCompanyName: {g.get('guarantor_company_name')}"
            for i, g in enumerate(db_guarantors, start=1)
        )

        return PROPMT_TEMPLATE.format(
            common_fields_info=flatten_fields_info,
            extracted_guarantor_info=guarantor_info,
            database_guarantor_info=db_guarantor_info,
        )
        
# Demo data: a database-side record with two guarantors.
# NOTE(review): nric holds a full-looking value here although the field
# description says "last 4 digits" -- presumably just sample data.
customer_request = CustomerRawData(
    name="John Doe",
    nric="S1234567A",
    date_of_birth="1985-06-15",
    registered_address="10 Orchard Road, #05-01, Singapore 238824",
    postal_code="238824",
    employer="Tech Solutions Pte Ltd",
    length_of_employment="Five years",
    car_model="Toyota Camry",
    loan_granted_amount="10000",
    monthly_payment="500",
    main_driver="I will be",
    num_guarantors=1,
    guarantors=[
        Guarantor(
            guarantor_name="Mr. David Lee",
            guarantor_relationship="father",
            guarantor_contact_number="91234567",
            guarantor_company_name="Tech Solutions Pte Ltd"
        ),
        Guarantor(
            guarantor_name="Ms. Jane Doe",
            guarantor_relationship="mother",
            guarantor_contact_number="98765432",
            guarantor_company_name="Tech Solutions Pte Ltd"
        )
    ]
)

# Demo data: an extraction result with misspelled values ("John to",
# "Toyota Kamery", "Dvid Li") and several fields left unset.
customer_extracted = CustomerInfo(
    name = "John to",
    nric = "S1234567A",
    date_of_birth = "1985-06-15",
    registered_address = "10 Orchard Road, #05-01, Singapore 238824",
    postal_code = "238824",
    employer = "Tech Solutions Pte Ltd",
    length_of_employment = "Five years",
    car_model = "Toyota Kamery",
    guarantor_name = "Dvid Li",
    guarantor_company_name="Tech solution company",
)


# Render the comparison prompt from the extraction and the dumped database
# record, then show it.
prompt = customer_extracted.compare_with(customer_request.model_dump())
print(prompt)


## Common Fields Comparison
Here is the information of data fields(both from database and extracted from the audio)
||Field name: name                 	 DatabaseValue: John Doe             	 ExtractedValue: John to             ||
||Field name: nric                 	 DatabaseValue: S1234567A            	 ExtractedValue: S1234567A           ||
||Field name: date_of_birth        	 DatabaseValue: 1985-06-15           	 ExtractedValue: 1985-06-15          ||
||Field name: registered_address   	 DatabaseValue: 10 Orchard Road, #05-01, Singapore 238824 	 ExtractedValue: 10 Orchard Road, #05-01, Singapore 238824||
||Field name: postal_code          	 DatabaseValue: 238824               	 ExtractedValue: 238824              ||
||Field name: employer             	 DatabaseValue: Tech Solutions Pte Ltd 	 ExtractedValue: Tech Solutions Pte Ltd||
||Field name: length_of_employment 	 DatabaseValue: Five years           	 ExtractedValue: Five years          ||
||Field name: car_model            	 Da

In [None]:

# System-prompt template for the customer check. It is passed to
# InformationChecker as system_prompt, and InformationChecker.run fills the
# {today_datetime} placeholder with the current date (YYYY-MM-DD).
MatcherCustomerPrompt = """
You will be given 2 pieces of information, one is extracted from a conversation, one is extracted from a database system.
Your task is to compare the 2 piece of information and determine if they match or not.

Note: The data extracted from database may contain multiple guarantors, however, we only need to compare one of them.
This means you need to analyze the conversation and find the correct guarantor to compare with the database.
Today's datetime is: {today_datetime}

Return structured result that indicate whether the 2 pieces of information match or not.
"""

class InformationChecker:
    """Run one structured chat completion that checks a comparison prompt.

    The checker holds the instructor client, the response model class, the
    model name and a system-prompt template, and sends a system message plus
    a user message on every ``run`` call.
    """

    def __init__(self,
                 pydantic_model: Type[BaseModel],
                 system_prompt: str,
                 model: str = "gpt-4o"):
        """Store the configuration and obtain the shared client.

        Args:
            pydantic_model: The model *class* (not an instance) that is
                forwarded as ``response_model`` to the completion call.
            system_prompt: Template for the system message; it is formatted
                with a ``today_datetime`` keyword on every ``run`` call.
            model: Value passed as ``model`` to the completion call.
        """
        self.client = get_instructor_openai_azure_client()
        self.pydantic_model = pydantic_model
        self.model = model
        self.system_prompt = system_prompt

    def run(self, prompt: str) -> BaseModel:
        """Send ``prompt`` as the user message and return the structured result.

        Args:
            prompt: Text sent as the user message.

        Returns:
            Whatever the client's ``chat.completions.create`` call returns
            when given ``response_model=self.pydantic_model``.
        """
        # Only the date part is injected, formatted as YYYY-MM-DD.
        today_datetime = datetime.today().strftime("%Y-%m-%d")

        system_prompt = self.system_prompt.format(today_datetime=today_datetime)

        return self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}],
            response_model=self.pydantic_model)
    

class CustomerCheckResult(BaseModel):
    """
    Compare data, one is extracted from the conversation, one is extracted from the database system.
    Check Against the database system, if the extract information is correct or not.
    Since there can be errors in speech recognized texts, we need to ignore errors arises from ASR
    for example, 123A and 1238 can be a success match, because the A pronounced like 8, the error is due to ASR and is acceptable.
    
    If the value is not appear in either database or Audio, the check result should be None.
    
    Since Guarantor is hard to match, do include reasoning for guarantors.
    """
    # Each boolean field is the match verdict for the same-named CustomerInfo
    # field; None is the default when no verdict is given.
    # The class docstring and descriptions are runtime text and kept verbatim.
    # NOTE(review): the docstring's NRIC example (123A vs 1238) tolerates ASR
    # errors, while the nric description asks for an exact match -- confirm
    # which rule is intended.
    reasoning: Optional[List[dict]] = Field(None, description="list of dicts, each dict key is the field name, value is the reasoning of the check resultfor the key")
    name: Optional[bool] = Field(None, description="A keyword(first/last/middle name) match is enough")
    nric: Optional[bool] = Field(None, description="Only match the last 4 digits(exact match, don't consider ASR effect)")
    date_of_birth: Optional[bool] = Field(None, description="Exact match, don't consider asr effect")
    registered_address: Optional[bool] = Field(None, description="either postal address match or house number match is enough")
    postal_code: Optional[bool] = Field(None, description="")

    employer: Optional[bool] = Field(None, description="Keyword partial match is enough")
    length_of_employment: Optional[bool] = Field(None, description="difference is less than 1 year")

    # The fields below previously passed `desc=`, which is not a Field
    # parameter, so their descriptions never reached the schema's
    # "description" entry. They now use `description=` like the fields above.
    car_model: Optional[bool] = Field(None, description="Keyword partial match is enough")
    loan_granted_amount: Optional[bool] = Field(None, description="Exact match")
    monthly_payment: Optional[bool] = Field(None, description="Exact match")
    main_driver: Optional[bool] = Field(None, description="A keyword(first/last/middle name) match is enough")

    num_guarantors: Optional[bool] = Field(None, description="Exact match")
    guarantor_name: Optional[bool] = Field(None, description="A keyword(first/last/middle name) match is enough")
    guarantor_relationship: Optional[bool] = Field(None, description="Consider synonyms")
    guarantor_contact_number: Optional[bool] = Field(None, description="Exact match")
    guarantor_company_name: Optional[bool] = Field(None, description="Keyword partial match is enough")


# Build a checker that returns CustomerCheckResult objects, using the
# customer system prompt.
checker = InformationChecker(pydantic_model=CustomerCheckResult,
                             system_prompt=MatcherCustomerPrompt,
                             model="gpt-4o")

# Send the most recently built comparison prompt to the model.
result = checker.run(prompt)

In [17]:
# Show the structured check result as a plain dict.
result.model_dump()

{'reasoning': [{'name': "The extracted name has 'John' which matches the first name in the database. 'to' might be an ASR error or not relevant, thus matching is acceptable."},
  {'nric': 'Both values match entirely for the last 4 digits, so this is a success match.'},
  {'date_of_birth': 'The extracted date matches exactly with the database value, so this is a success match.'},
  {'registered_address': 'Both values match entirely, indicating success.'},
  {'postal_code': 'Both postal codes match, thus a success.'},
  {'employer': "The extracted value 'Tech Solutions Pte Ltd' matches entirely with the database, hence acceptable."},
  {'length_of_employment': "Both indicate 'Five years,' which is an exact match. Acceptable."},
  {'car_model': "This could be an ASR error as 'Kamery' phonetically resembles 'Camry,' suggesting match is acceptable."},
  {'loan_granted_amount': 'Data missing in the extracted information; unable to compare.'},
  {'monthly_payment': 'Data missing in the extrac

In [27]:
# User-prompt template rendered by GuarantorInfo.compare_with via str.format;
# the only placeholder is {common_fields_info}.
# NOTE(review): this rebinds the module-level name PROPMT_TEMPLATE that
# CustomerInfo.compare_with also reads. Once this assignment has run,
# CustomerInfo.compare_with renders this template instead: str.format ignores
# its two extra guarantor keyword arguments, so the guarantor sections are
# dropped without an error. Consider a distinct name.
PROPMT_TEMPLATE = """
## Fields to compare
Here is the information of data fields(both from database and extracted from the audio)
{common_fields_info}

## Comparison Instruction
For all comparisons, you need to consider the effect of ASR transcription error, any errors due to ASR are considered as acceptable(a success match)
When comparing nric, you should only consider the last 4 digits(if the extracted nric is 1234A, you should only consider 234A)
When comparing address, either postal code or house number match is considered as a success match.
When comparing person names, company names, if any keyword matches(ignoring the asr error), it is a considered as a success match.
When considering relationship, pay attention that synonyms are also acceptable, such as "father" and "dad", "mother" and "mum", "brother" and "bro", spouse and "husband", etc.
When comparing length of employment, you should only consider the db value can be approximately equal to the extracted value.(less than 1 year)
"""

class GuarantorInfo(BaseModel):
    """
    A guarantor is a person who guarantees another person's loan (loaner). 
    A customer service representative needs to collection his/her information and the relationship with the loaner.
    """
    # NOTE(review): the class docstring and the field descriptions are kept
    # verbatim -- presumably they reach the LLM through the model schema.

    # Details of the guarantor.
    name: Optional[str] = Field(default=None, description="Name of the guarantor")
    nric: Optional[str] = Field(default=None, description="Last 4 digits of NRIC number including the last alphabet")
    date_of_birth: Optional[str] = Field(default=None, description="Date of birth")
    registered_address: Optional[str] = Field(default=None, description="Registered address")
    postal_code: Optional[str] = Field(default=None, description="Postal code")

    # Employment details of the guarantor.
    employer: Optional[str] = Field(default=None, description="Current employer of the guarantor")
    length_of_employment: Optional[str] = Field(default=None, description="Length of employment at the current company")

    # Car and loan details.
    car_model: Optional[str] = Field(default=None, description="The guarantee's car brand and model.")
    loan_granted_amount: Optional[str] = Field(default=None, description="Loan granted amount for the car")
    monthly_payment: Optional[str] = Field(default=None, description="Monthly payment for the car")
    main_driver: Optional[str] = Field(default=None, description="Who will be the main driver for the car.")

    # Details of the loaner, as described by each field's description.
    guarantee_relationship: Optional[str] = Field(default=None, description="Relationship with the loaner")
    has_other_guarantors: Optional[bool] = Field(default=None, description="Does the loaner have other guarantors?")
    guarantee_full_name: Optional[str] = Field(default=None, description="Full name of the loaner")
    guarantee_contact_number: Optional[str] = Field(default=None, description="Contact number of the loaner")
    guarantee_company_name: Optional[str] = Field(default=None, description="Company name of the loaner")

    def compare_with(self, db_info:dict) -> str:
        """Generate the information-verification prompt.

        Emits one table row per model field, pairing the database value
        (looked up in ``db_info`` by field name, ``None`` when absent) with
        the extracted value, and substitutes the rows into
        ``PROPMT_TEMPLATE``.
        """
        rows = []
        for field_name, extracted_value in self.model_dump().items():
            db_value = str(db_info.get(field_name))
            extracted_text = str(extracted_value)
            rows.append(
                f"||Field name: {field_name:20s} \t DatabaseValue: {db_value:20s} \t ExtractedValue: {extracted_text:20s}||"
            )

        return PROPMT_TEMPLATE.format(common_fields_info="\n".join(rows))
        


In [28]:
# System-prompt template for the guarantor check. It is passed to
# InformationChecker as system_prompt, and InformationChecker.run fills the
# {today_datetime} placeholder with the current date (YYYY-MM-DD).
MatcherGuarantorPrompt = """
You will be given 2 pieces of information, one is extracted from a conversation, one is extracted from a database system.
Your task is to compare the 2 piece of information and determine if they match or not.
Today's datetime is: {today_datetime}

Return structured result that indicate whether the 2 pieces of information match or not.
"""

In [29]:
class GuarantorCheckResult(BaseModel):
    """
    Compare data, one is extracted from the conversation, one is extracted from the database system.
    Check Against the database system, if the extract information is correct or not.
    Since there can be errors in speech recognized texts, we need to ignore errors arises from ASR
    for example, 123A and 1238 can be a success match, because the A pronounced like 8, the error is due to ASR and is acceptable.
    
    If the value is not appear in either database or Audio, the check result should be None.
    """
    # Each boolean field is the match verdict for the same-named GuarantorInfo
    # field; None is the default when no verdict is given.
    # The class docstring and descriptions are runtime text and kept verbatim.
    reasoning: Optional[List[dict]] = Field(None, description="list of dicts, each dict key is the field name, value is the reasoning of the check resultfor the key")
    name: Optional[bool] = Field(None, description="A keyword(first/last/middle name) match is enough")
    nric: Optional[bool] = Field(None, description="Only match the last 4 digits(exact match, don't consider ASR effect)")
    date_of_birth: Optional[bool] = Field(None, description="Exact match, don't consider asr effect")
    registered_address: Optional[bool] = Field(None, description="either postal address match or house number match is enough")
    postal_code: Optional[bool] = Field(None, description="")

    employer: Optional[bool] = Field(None, description="Keyword partial match is enough")
    length_of_employment: Optional[bool] = Field(None, description="difference is less than 1 year")

    # These four fields previously passed `desc=`, which is not a Field
    # parameter, so their descriptions never reached the schema's
    # "description" entry. They now use `description=` like the other fields.
    car_model: Optional[bool] = Field(None, description="Keyword partial match is enough")
    loan_granted_amount: Optional[bool] = Field(None, description="Exact match")
    monthly_payment: Optional[bool] = Field(None, description="Exact match")
    main_driver: Optional[bool] = Field(None, description="A keyword(first/last/middle name) match is enough")

    guarantee_relationship: Optional[bool] = Field(None, description="Consider synonyms")
    has_other_guarantors: Optional[bool] = Field(None, description="Exact match")
    guarantee_full_name: Optional[bool] = Field(None, description="A keyword(first/last/middle name) match is enough")
    guarantee_contact_number: Optional[bool] = Field(None, description="Exact match")
    guarantee_company_name: Optional[bool] = Field(None, description="Keyword partial match is enough")

In [30]:
# Demo data: extraction result; the name carries extra trailing characters
# ("John toeeee") compared with the database-side value below.
guarantor_extracted = GuarantorInfo(
    name = "John toeeee",
    nric = "S1234567A",
    date_of_birth = "1985-06-15",
    registered_address = "10 Orchard Road, #05-01, Singapore 238824",
    postal_code = "238824",
    employer = "Tech Solutions Pte Ltd",
    length_of_employment = "Five years",
    car_model = "Toyota Kamery",
    loan_granted_amount = "10000",
    monthly_payment = "500",
    main_driver = "I will be",
    
    guarantee_relationship="father",
    has_other_guarantors=True,
    guarantee_full_name="Dvid Li",
    guarantee_contact_number="91234567",
)

# Demo data: database-side record, built with the same GuarantorInfo model so
# that its model_dump() keys line up with the extracted fields.
guarantor_db = GuarantorInfo(
    name = "John to",
    nric = "S1234567A",
    date_of_birth = "1985-06-15",
    registered_address = "10 Orchard Road, #05-01, Singapore 238824",
    postal_code = "238824",
    employer = "Tech Solutions Pte Ltd",
    length_of_employment = "Five years",
    car_model = "Toyota Kamery",
    loan_granted_amount = "10000",
    monthly_payment = "500",
    main_driver = "I will be",
    
    guarantee_relationship="father",
    has_other_guarantors=True,
    guarantee_full_name="Dvid Li",
    guarantee_contact_number="91234567",
)

# Render the guarantor comparison prompt; this rebinds `prompt`.
prompt = guarantor_extracted.compare_with(guarantor_db.model_dump())

In [31]:
# Rebind `checker` to one that returns GuarantorCheckResult objects, using
# the guarantor system prompt.
checker = InformationChecker(pydantic_model=GuarantorCheckResult,system_prompt=MatcherGuarantorPrompt,model="gpt-4o")


In [32]:
# Send the guarantor comparison prompt to the model.
result = checker.run(prompt)

In [35]:
# Show the structured guarantor check result as a plain dict.
result.model_dump()

{'reasoning': [{'name': "The extracted name has an ASR transcription error with added characters 'eeee'. Ignoring this, the name 'John to' still matches successfully."},
  {'nric': 'Both NRIC values match exactly, confirming correctness.'},
  {'date_of_birth': 'The date of birth matches exactly, confirming correctness.'},
  {'registered_address': 'Both address values match exactly, including postal code and house number.'},
  {'postal_code': 'Postal codes match exactly.'},
  {'employer': 'Employer names match exactly.'},
  {'length_of_employment': "Both lengths of employment are stated as 'Five years', so they match."},
  {'car_model': 'Car models are stated identically.'},
  {'loan_granted_amount': 'Loan amounts are stated identically.'},
  {'monthly_payment': 'Monthly payments match exactly.'},
  {'main_driver': "Both data sources state 'I will be' as the main driver, confirming a match."},
  {'guarantee_relationship': "The relationship 'father' matches exactly; no synonyms need cons