<a href="https://colab.research.google.com/github/JSJeong-me/Retriever/blob/main/10-Tax-Form.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Read the uploaded XML file as text so its structure can be inspected
file_path = './tax-form.xml'

# The document declares encoding="UTF-8" in its XML prolog and contains Korean
# text, so decode it explicitly instead of relying on the platform's default
# encoding (which is not UTF-8 on every system).
with open(file_path, 'r', encoding='utf-8') as file:
    xml_content = file.read()

xml_content[:500]  # Preview the first 500 characters to get a sense of the file's content and structure


'<?xml version="1.0" encoding="UTF-8"?>\n<TaxInvoice xmlns="urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0 http://www.kec.or.kr/standard/Tax/TaxInvoiceSchemaModule_1.0.xsd">\n\t<ExchangedDocument>\n\t\t<ID>202403050957000025946488</ID>\n\t\t<IssueDateTime>20240305134131</IssueDateTime>\n\t</ExchangedDocument>\n\t'

In [5]:
import xml.etree.ElementTree as ET
import re

# Parse the XML text loaded in the previous cell (`xml_content`)
tree = ET.ElementTree(ET.fromstring(xml_content))
root = tree.getroot()

# Namespace map for prefixed lookups such as root.find('ns:...', namespaces).
# It is not used by the tag scan below, which inspects every element directly.
namespaces = {
    'ns': 'urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0'
}

# Case-insensitive pattern for tags that look like a total amount
# ('TotalAmount', '세금합계', or similar terms).
total_tag_pattern = re.compile(r'(totalamount|세금합계)', re.IGNORECASE)

# Take the text of the first element, in document order, whose tag matches;
# stays None when nothing matches.
# NOTE(review): if the document has several *TotalAmount-style elements, only the
# first one is returned — confirm it is the amount you actually want.
total_amount = next(
    (node.text for node in root.iter() if total_tag_pattern.search(node.tag)),
    None,
)

total_amount


'9082'

In [7]:
import pandas as pd
import xml.etree.ElementTree as ET

# Source document: the tax-invoice XML sitting next to the notebook
file_path = './tax-form.xml'

# Read the file straight into an element tree and grab its root node
tree = ET.parse(file_path)
root = tree.getroot()

# One record per direct child of the root. Every grandchild contributes one
# entry keyed by its namespace-qualified tag, with its text as the value
# (container elements therefore contribute only their whitespace text).
data = [
    {grandchild.tag: grandchild.text for grandchild in child}
    for child in root
]

# Tabulate the records
df = pd.DataFrame(data)

df.head()


Unnamed: 0,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}ID,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}IssueDateTime,{http://www.w3.org/2000/09/xmldsig#}SignedInfo,{http://www.w3.org/2000/09/xmldsig#}SignatureValue,{http://www.w3.org/2000/09/xmldsig#}KeyInfo,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}IssueID,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}TypeCode,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}DescriptionText,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}PurposeCode,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}InvoicerParty,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}InvoiceeParty,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}SpecifiedMonetarySummation,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}SequenceNumeric,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}InvoiceAmount,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}InformationText,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}NameText,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}PurchaseExpiryDateTime,{urn:kr:or:kec:standard:Tax:ReusableAggregateBusinessInformationEntitySchemaModule:1:0}TotalTax
0,202403050957000025946488,20240305134131.0,,,,,,,,,,,,,,,,
1,,,\n,\nH6tagj4mPRnTvNsQWPDAYd/eL8Q4J8SqvNJJSNZASd4X...,\n,,,,,,,,,,,,,
2,,20240229.0,,,,2024022942000172amai7a9c,101.0,,1.0,,,,,,,,,
3,,,,,,,,,,\n\t\t\t,\n\t\t\t,\n\t\t\t,,,,,,
4,,,,,,,,,,,,,1.0,9082.0,,판매수수료,20240229.0,\n\t\t\t


In [8]:
# Function to clean up the column headers by removing the namespaces
def clean_column_name(col_name):
    """Return ``col_name`` without its leading ``{namespace}`` qualifier.

    ElementTree reports tags as ``'{namespace-uri}LocalName'``; this keeps only
    the text after the last ``'}'``.

    Parameters
    ----------
    col_name : object
        A column label. Strings are cleaned; any other label type is returned
        unchanged.

    Returns
    -------
    object
        The local name for string labels (the string itself when it contains
        no ``'}'``), otherwise ``col_name`` as given.
    """
    # Explicit type guard instead of a bare `except:`, which would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated errors.
    if not isinstance(col_name, str):
        return col_name
    return col_name.rsplit('}', 1)[-1]

# Strip the namespace qualifier from every column header of the frame
df.columns = list(map(clean_column_name, df.columns))

df.head()


Unnamed: 0,ID,IssueDateTime,SignedInfo,SignatureValue,KeyInfo,IssueID,TypeCode,DescriptionText,PurposeCode,InvoicerParty,InvoiceeParty,SpecifiedMonetarySummation,SequenceNumeric,InvoiceAmount,InformationText,NameText,PurchaseExpiryDateTime,TotalTax
0,202403050957000025946488,20240305134131.0,,,,,,,,,,,,,,,,
1,,,\n,\nH6tagj4mPRnTvNsQWPDAYd/eL8Q4J8SqvNJJSNZASd4X...,\n,,,,,,,,,,,,,
2,,20240229.0,,,,2024022942000172amai7a9c,101.0,,1.0,,,,,,,,,
3,,,,,,,,,,\n\t\t\t,\n\t\t\t,\n\t\t\t,,,,,,
4,,,,,,,,,,,,,1.0,9082.0,,판매수수료,20240229.0,\n\t\t\t
