In [None]:
!pip install pandas openpyxl

#libraries
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Map each XBRL context id to its reporting year
def extract_context_year_mapping(root, nsmap):
    """Map each XBRL context id to the year of its reporting period.

    For every ``xbrli:context`` element under *root*, the period's
    ``xbrli:instant`` is used when present, otherwise its ``xbrli:endDate``.
    The year is the first four characters of that date text once surrounding
    whitespace has been removed.

    Args:
        root: Parsed XML element to search.
        nsmap: Prefix-to-URI mapping; must define the ``xbrli`` prefix.

    Returns:
        A dict mapping context id to a year string. Contexts that have
        neither date element, or whose date element is empty, are omitted.
    """
    context_mapping = {}
    for context in root.findall('.//xbrli:context', nsmap):
        period = context.find('.//xbrli:period/xbrli:instant', nsmap)
        if period is None:
            period = context.find('.//xbrli:period/xbrli:endDate', nsmap)
        if period is None:
            continue
        # An empty element has text None, and pretty-printed XML may pad the
        # date with whitespace; normalise both before slicing out the year.
        date_text = (period.text or '').strip()
        if date_text:
            context_mapping[context.get('id')] = date_text[:4]
    return context_mapping

#Extracting FSA features  
def extract_fsa_features(file_content, nsmap, file_path):
    """Collect one record per ``fsa``-namespace element in an XBRL document.

    Args:
        file_content: The XML document as a string.
        nsmap: Prefix-to-URI mapping; must define the ``fsa`` prefix (and
            whatever ``extract_context_year_mapping`` requires).
        file_path: Path of the source file; only its base name is recorded.

    Returns:
        A list of dicts with the keys ``Element``, ``Value``, ``Year``,
        ``Unit``, ``Decimals`` and ``File``, in document search order.
    """
    tree_root = ET.fromstring(file_content.encode('utf-8'))
    year_by_context = extract_context_year_mapping(tree_root, nsmap)

    def build_row(node):
        # Tags look like "{namespace-uri}LocalName"; keep the local part only.
        _, _, local_name = node.tag.partition('}')
        raw_text = node.text
        return {
            'Element': local_name,
            'Value': raw_text.strip() if raw_text else '',
            # Elements whose contextRef is missing or unmapped get 'Unknown'.
            'Year': year_by_context.get(node.get('contextRef'), 'Unknown'),
            'Unit': node.get('unitRef', 'No unit'),
            'Decimals': node.get('decimals', 'N/A'),
            'File': os.path.basename(file_path),
        }

    return [build_row(node) for node in tree_root.findall('.//fsa:*', nsmap)]


# Preprocessing: repair the namespace URI that contains spaces
def preprocess_xml_content(file_path):
    """Return the UTF-8 text of *file_path* with the space-containing URI repaired.

    Every occurrence of the ``Regnskab 2.0 Basis`` URI is rewritten to the
    same URI without spaces; the rest of the text is returned unchanged.
    """
    uri_with_spaces = 'http://xbrl.dcca.dk/Regnskab 2.0 Basis'
    uri_without_spaces = 'http://xbrl.dcca.dk/Regnskab2.0Basis'
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return raw_text.replace(uri_with_spaces, uri_without_spaces)

# File location 
directory_path = "INSERT HERE"

# Namespace prefixes used by the XPath queries in the extraction helpers.
nsmap = {'xbrli': 'http://www.xbrl.org/2003/instance', 'fsa': 'http://xbrl.dcca.dk/fsa'}

# Rows from this many files are gathered into one workbook before it is
# written, so no single DataFrame has to hold the whole directory.
BATCH_SIZE = 500
file_counter = 0
batch_number = 1
all_data = []

# os.listdir returns names in arbitrary order; sorting makes the contents of
# each batch reproducible between runs.
all_filenames = sorted(f for f in os.listdir(directory_path) if f.endswith(".xml"))
last_index = len(all_filenames) - 1

for index, filename in enumerate(all_filenames):
    file_path = os.path.join(directory_path, filename)
    file_content = preprocess_xml_content(file_path)
    file_data = extract_fsa_features(file_content, nsmap, file_path)
    all_data.extend(file_data)
    file_counter += 1

    # Flush when the batch is full, and once more for the trailing partial
    # batch after the final file.
    if file_counter == BATCH_SIZE or index == last_index:
        df = pd.DataFrame(all_data)
        # to_excel selects its writer engine from the file extension, so the
        # path needs one; without it pandas raises ValueError.
        excel_path = f"#{batch_number}.xlsx"
        df.to_excel(excel_path, index=False)
        print(f"Data for batch {batch_number} exported successfully to {excel_path}")

        batch_number += 1
        all_data = []
        file_counter = 0
