In [58]:
import xml.etree.ElementTree as ET
import pandas as pd
import time

from difflib import SequenceMatcher

## Parse tag meanings and extract XSD elements

In [39]:
def parse_tag_meaning(file_path):
    """Read a ``tag ==> meaning`` mapping from a UTF-8 text file.

    Each line that contains the ``==>`` separator contributes one entry:
    the text before the first separator is the key and everything after it
    is the value, both stripped of surrounding whitespace. Lines without
    the separator are ignored. If a key occurs more than once, the last
    occurrence wins.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict mapping each tag (str) to its meaning (str).
    """
    separator = "==>"
    tag_meanings = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # partition() splits on the FIRST separator only, so a meaning
            # that itself contains "==>" stays intact, and a line that only
            # contains a look-alike such as "=>" is skipped instead of
            # raising ValueError on tuple unpacking.
            key, found, value = line.partition(separator)
            if not found:
                continue
            tag_meanings[key.strip()] = value.strip()
    return tag_meanings

def extract_xsd_elements(xsd_file):
    """Collect every ``name`` attribute found in an XSD (XML) document.

    The whole tree is walked starting at the root, and any node carrying a
    ``name`` attribute contributes its value, whatever the node's tag is.
    Duplicates are kept.

    Args:
        xsd_file: Path or file object accepted by ``ET.parse``.

    Returns:
        list of str: the ``name`` values in traversal order.
    """
    schema_root = ET.parse(xsd_file).getroot()
    return [
        node.attrib['name']
        for node in schema_root.iter()
        if 'name' in node.attrib
    ]

## Find 1 to 1 matches

In [42]:
def find_best_matches(tag_meanings, xsd_elements):
    """Pair every tag with the XSD name most similar to it.

    Similarity is ``SequenceMatcher(None, tag, name).ratio()``, compared on
    the raw strings (case-sensitive). When several names share the top
    score, the one that appears first in ``xsd_elements`` is chosen. If no
    name scores above zero (or ``xsd_elements`` is empty), the match is
    ``None`` and the similarity is ``0``.

    Args:
        tag_meanings: dict mapping tag -> meaning.
        xsd_elements: iterable of candidate name strings.

    Returns:
        list of dict, one per tag in ``tag_meanings`` order, with the keys
        "Tag", "Meaning", "Match (XSD Element)" and "Similarity (%)"
        (the ratio times 100, rounded to two decimals).
    """
    rows = []
    for tag, meaning in tag_meanings.items():
        scored = (
            (SequenceMatcher(None, tag, candidate).ratio(), candidate)
            for candidate in xsd_elements
        )
        # max() returns the first item among equals, which keeps the
        # earliest candidate on ties.
        top_score, top_name = max(
            scored, key=lambda pair: pair[0], default=(0, None)
        )
        if top_score <= 0:
            # A zero score does not count as a match.
            top_score, top_name = 0, None
        rows.append({
            "Tag": tag,
            "Meaning": meaning,
            "Match (XSD Element)": top_name,
            "Similarity (%)": round(top_score * 100, 2)
        })
    return rows

## Save results

In [45]:
def save_best_matches_to_csv(best_matches, output_file):
    """Write the match records to a CSV file, best similarity first.

    The records are loaded into a DataFrame, ordered by the
    "Similarity (%)" column in descending order and written without the
    index column. A confirmation line is printed afterwards.

    Args:
        best_matches: list of dicts, each containing a "Similarity (%)" key.
        output_file: Destination path of the CSV file.
    """
    ranked = pd.DataFrame(best_matches).sort_values(
        by="Similarity (%)", ascending=False
    )
    ranked.to_csv(output_file, index=False)
    print(f"Result saved to {output_file} file.")

In [61]:
# Input files; relative paths are resolved against the current working directory.
tags_meaning_file = 'TagsMeaning.txt'
xsd_file = 'IPC-2581C.xsd'

tag_meanings = parse_tag_meaning(tags_meaning_file)
xsd_elements = extract_xsd_elements(xsd_file)

# Time only the matching step (file parsing above is excluded).
# perf_counter() is used instead of time() because it is a monotonic,
# high-resolution clock intended for measuring short intervals; time() is
# wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()

best_matches = find_best_matches(tag_meanings, xsd_elements)

end_time = time.perf_counter()
print(f"Execution time 1 to 1: {end_time - start_time} seconds")

output_file = 'best_matches_output.csv'
save_best_matches_to_csv(best_matches, output_file)


Execution time 1 to 1: 1.6905243396759033 seconds
Result saved to best_matches_output.csv file.
