<a href="https://colab.research.google.com/github/JSJeong-me/Retriever/blob/main/11-Tax-Form-Tuples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Read the uploaded XML file as text to get a feel for its structure.
file_path = './tax-form.xml'

# The document's prolog declares UTF-8 and the invoice contains Korean text, so
# decode explicitly instead of relying on the platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    xml_content = file.read()

xml_content[:500]  # Preview only the first 500 characters of the document


'<?xml version="1.0" encoding="UTF-8"?>\n<TaxInvoice xmlns="urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0 http://www.kec.or.kr/standard/Tax/TaxInvoiceSchemaModule_1.0.xsd">\n\t<ExchangedDocument>\n\t\t<ID>202403050957000025946488</ID>\n\t\t<IssueDateTime>20240305134131</IssueDateTime>\n\t</ExchangedDocument>\n\t'

In [5]:
import xml.etree.ElementTree as ET
import re

# Parse the XML text loaded in the previous cell into an element tree.
tree = ET.ElementTree(ET.fromstring(xml_content))
root = tree.getroot()

# Prefix -> URI map for namespace-qualified lookups, e.g.
# root.find('ns:ExchangedDocument', namespaces).
namespaces = {
    'ns': 'urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0'
}

# Search for the total amount: take the text of the first element (in document
# order) whose tag contains 'TotalAmount' or '세금합계', ignoring case.
# If several tags match, only the first hit is kept; None means nothing matched.
total_amount = None
for elem in root.iter():
    if re.search(r'(totalamount|세금합계)', elem.tag, re.IGNORECASE):
        total_amount = elem.text
        break

total_amount


'9082'

In [7]:
import pandas as pd
import xml.etree.ElementTree as ET

# Location of the invoice XML on disk
file_path = './tax-form.xml'

# Parse directly from the file and take the document root
tree = ET.parse(file_path)
root = tree.getroot()

# One record per top-level element: each direct child's tag maps to its text.
# Only one level is flattened; deeper descendants are not visited here.
data = [
    {subelem.tag: subelem.text for subelem in elem}
    for elem in root
]

# Tabulate the records (tags missing from a record show up as NaN)
df = pd.DataFrame(data)

df.head()


Unnamed: 0,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}ID,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}IssueDateTime,{http://www.w3.org/2000/09/xmldsig#}SignedInfo,{http://www.w3.org/2000/09/xmldsig#}SignatureValue,{http://www.w3.org/2000/09/xmldsig#}KeyInfo,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}IssueID,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}TypeCode,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}DescriptionText,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}PurposeCode,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}InvoicerParty,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}InvoiceeParty,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}SpecifiedMonetarySummation,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}SequenceNumeric,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}InvoiceAmount,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}InformationText,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}NameText,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}PurchaseExpiryDateTime,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}TotalTax
0,202403050957000025946488,20240305134131.0,,,,,,,,,,,,,,,,
1,,,\n,\nH6tagj4mPRnTvNsQWPDAYd/eL8Q4J8SqvNJJSNZASd4X...,\n,,,,,,,,,,,,,
2,,20240229.0,,,,2024022942000172amai7a9c,101.0,,1.0,,,,,,,,,
3,,,,,,,,,,\n\t\t\t,\n\t\t\t,\n\t\t\t,,,,,,
4,,,,,,,,,,,,,1.0,9082.0,,판매수수료,20240229.0,\n\t\t\t


In [8]:
# Function to clean up the column headers by removing the namespaces
def clean_column_name(col_name):
    """Return ``col_name`` without its leading ``{namespace}`` qualifier.

    ElementTree spells qualified tags as ``{uri}LocalName``; everything up to
    and including the last ``}`` is dropped. A name without ``}`` is returned
    as-is.

    Args:
        col_name: Column label, normally a string tag name.

    Returns:
        The text after the last ``}`` for string labels. Non-string labels
        (for example integer column positions or ``None``) are returned
        unchanged.
    """
    # Explicit type check instead of a bare ``except:``, which would also hide
    # KeyboardInterrupt/SystemExit and any unrelated programming error.
    if not isinstance(col_name, str):
        return col_name
    return col_name.rsplit('}', 1)[-1]

# Strip the namespace qualifier from every column label of the DataFrame
df.columns = list(map(clean_column_name, df.columns))

df.head()


Unnamed: 0,ID,IssueDateTime,SignedInfo,SignatureValue,KeyInfo,IssueID,TypeCode,DescriptionText,PurposeCode,InvoicerParty,InvoiceeParty,SpecifiedMonetarySummation,SequenceNumeric,InvoiceAmount,InformationText,NameText,PurchaseExpiryDateTime,TotalTax
0,202403050957000025946488,20240305134131.0,,,,,,,,,,,,,,,,
1,,,\n,\nH6tagj4mPRnTvNsQWPDAYd/eL8Q4J8SqvNJJSNZASd4X...,\n,,,,,,,,,,,,,
2,,20240229.0,,,,2024022942000172amai7a9c,101.0,,1.0,,,,,,,,,
3,,,,,,,,,,\n\t\t\t,\n\t\t\t,\n\t\t\t,,,,,,
4,,,,,,,,,,,,,1.0,9082.0,,판매수수료,20240229.0,\n\t\t\t


In [None]:
# Flatten the JSON data into a list of tuples; each tuple pairs the path to a leaf with that leaf's value.

def json_to_tuples(json_obj, parent_key=''):
    """Flatten nested dicts/lists into ``(path, value)`` tuples.

    Dict keys are joined to the parent path with ``.``, list positions are
    appended as ``[index]``, and any other value becomes a leaf tuple.
    Empty dicts and empty lists contribute no tuples.
    """
    def _walk(node, path):
        # Depth-first traversal, yielding leaves in their original order.
        if isinstance(node, dict):
            for name, child in node.items():
                child_path = f"{path}.{name}" if path else name
                yield from _walk(child, child_path)
        elif isinstance(node, list):
            for position, child in enumerate(node):
                yield from _walk(child, f"{path}[{position}]")
        else:
            yield (path, node)

    return list(_walk(json_obj, parent_key))

# Convert the JSON data to tuples
# NOTE(review): `json_data` is not assigned at notebook level in any visible cell
# (the later helper xml_file_to_json_tuples only creates it as a local), so this
# cell presumably relies on leftover kernel state. Confirm before running it
# from a fresh kernel.
tuples_data = json_to_tuples(json_data)

# Displaying a portion of the tuples list
tuples_data[:10]  # Displaying the first 10 tuples for a glimpse.


In [1]:
import xml.etree.ElementTree as ET
import json

def xml_to_json(node):
    """
    Build a JSON-like dict from an XML element and all of its descendants.

    Layout of the result:
      * ``"text"``  - the element's stripped text, present only when non-blank;
      * ``<tag>``   - one entry per distinct child tag: a dict when the tag
                      occurs once, a list of dicts when it repeats;
      * ``"@name"`` - one entry per attribute.
    """
    converted = {}

    stripped = (node.text or "").strip()
    if stripped:
        converted["text"] = stripped

    # Group the converted children by tag, keeping first-seen tag order.
    grouped = {}
    for child in node:
        grouped.setdefault(child.tag, []).append(xml_to_json(child))

    # A tag seen once stays a plain dict; a repeated tag becomes a list.
    for tag, members in grouped.items():
        converted[tag] = members[0] if len(members) == 1 else members

    converted.update({f"@{name}": value for name, value in node.attrib.items()})

    return converted

def json_to_tuples(json_obj, parent_key=''):
    """
    Flatten a JSON-like structure into a list of ``(path, value)`` tuples.

    Dict keys are appended to the path with ``.``, list positions as
    ``[index]``; every non-dict, non-list value yields one tuple. Leaves are
    emitted in depth-first, original order.
    """
    flattened = []
    # Explicit stack of (path, node) pairs; children are pushed in reverse so
    # that they are popped, and therefore emitted, in their original order.
    pending = [(parent_key, json_obj)]

    while pending:
        path, node = pending.pop()

        if isinstance(node, dict):
            children = [
                (f"{path}.{name}" if path else name, child)
                for name, child in node.items()
            ]
        elif isinstance(node, list):
            children = [
                (f"{path}[{position}]", child)
                for position, child in enumerate(node)
            ]
        else:
            flattened.append((path, node))
            continue

        pending.extend(reversed(children))

    return flattened

def xml_file_to_json_tuples(file_path):
    """
    Parse the XML file at ``file_path`` and return its content as a flat list
    of ``(path, value)`` tuples.

    The document root is first converted with ``xml_to_json`` and the
    resulting dict is then flattened with ``json_to_tuples``.
    """
    document_root = ET.parse(file_path).getroot()
    return json_to_tuples(xml_to_json(document_root))

# Example usage: flatten the sample invoice into (path, value) tuples.
file_path = 'tax-form.xml'  # Replace with the path to your XML file
tuples_data = xml_file_to_json_tuples(file_path)

# Print only the first ten tuples for demonstration.
print(tuples_data[:10])


[('{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}ExchangedDocument.{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}ID.text', '202403050957000025946488'), ('{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}ExchangedDocument.{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}IssueDateTime.text', '20240305134131'), ('{http://www.w3.org/2000/09/xmldsig#}Signature.{http://www.w3.org/2000/09/xmldsig#}SignedInfo.{http://www.w3.org/2000/09/xmldsig#}CanonicalizationMethod.@Algorithm', 'http://www.w3.org/TR/2001/REC-xml-c14n-20010315'), ('{http://www.w3.org/2000/09/xmldsig#}Signature.{http://www.w3.org/2000/09/xmldsig#}SignedInfo.{http://www.w3.org/2000/09/xmldsig#}SignatureMethod.@Algorithm', 'http://www.w3.org/2001/04/xmldsig-more#rsa-sha256'), ('{http://www.w3.org/2000/09/xmldsig#}Signature.{http://www.w3.org/2000/09/xmldsig#}SignedInfo.{htt

In [2]:
# Display the complete list of (path, value) tuples built in the previous cell.
tuples_data

[('{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}ExchangedDocument.{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}ID.text',
  '202403050957000025946488'),
 ('{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}ExchangedDocument.{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}IssueDateTime.text',
  '20240305134131'),
 ('{http://www.w3.org/2000/09/xmldsig#}Signature.{http://www.w3.org/2000/09/xmldsig#}SignedInfo.{http://www.w3.org/2000/09/xmldsig#}CanonicalizationMethod.@Algorithm',
  'http://www.w3.org/TR/2001/REC-xml-c14n-20010315'),
 ('{http://www.w3.org/2000/09/xmldsig#}Signature.{http://www.w3.org/2000/09/xmldsig#}SignedInfo.{http://www.w3.org/2000/09/xmldsig#}SignatureMethod.@Algorithm',
  'http://www.w3.org/2001/04/xmldsig-more#rsa-sha256'),
 ('{http://www.w3.org/2000/09/xmldsig#}Signature.{http://www.w3.org/2000/09/xmldsig#}Sig

Prompt: 위 tuple data에서 세금합계금액은?



위의 튜플 데이터에서 '세금합계금액'은 'SpecifiedMonetarySummation' 태그 아래에 위치한 'TaxTotalAmount' 항목과 관련이 있습니다. 이에 따라, '세금합계금액'은 '908'로 나타나 있습니다. 이는 데이터 리스트 중에서 '{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}TaxInvoiceTradeSettlement.{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}SpecifiedMonetarySummation.{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}TaxTotalAmount.text' 항목에 해당합니다.

Prompt: 위 tuple data에서 공급받는자 사업자 등록번호는?

공급받는자의 사업자 등록번호를 찾기 위해, 주어진 튜플 데이터를 살펴보겠습니다. 이 데이터에서 관련 정보는 `"TaxInvoiceTradeSettlement"`과 `"InvoiceeParty"` 관련 항목에서 찾을 수 있습니다.

특히, `"TaxInvoiceTradeSettlement.{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}InvoiceeParty.{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}ID.text"` 항목을 보면 공급받는자의 사업자 등록번호를 알 수 있습니다.

해당 튜플에서, 공급받는자의 사업자 등록번호는 `'1233297178'`입니다.