In [3]:
import re
import time
import string
import pandas as pd

import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt

from difflib import get_close_matches
from difflib import SequenceMatcher, get_close_matches

## Preprocessing and XSD element extraction

In [5]:
# File names used as inputs by this notebook.
# NOTE(review): the data-loading cell further down spells the schema and tag-file
# paths out again as string literals instead of reading these variables, and
# xml_file2 is not referenced in the cells visible here -- confirm whether they
# are still needed.
xml_file1, xml_file2, tags_meaning_file = (
    "IPC-2581C.xsd",
    "test_1.xml",
    "TagsMeaning.txt",
)

In [6]:
def preprocess_text(text):
    """Turn a string into a list of lowercase, punctuation-free tokens.

    Every character in ``string.punctuation`` is deleted (not replaced by a
    space), the remainder is lowercased, and the result is split on
    whitespace.

    Args:
        text: The string to normalise.

    Returns:
        A list of tokens; empty when nothing but punctuation/whitespace remains.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    without_punctuation = text.translate(drop_punctuation)
    lowered = without_punctuation.lower()
    return lowered.split()

def extract_xsd_elements(root):
    """Collect the tokenised names of all named ``simpleType`` nodes under ``root``.

    Args:
        root: Root element of a parsed XSD document.

    Returns:
        A list with one entry per named ``xs:simpleType`` found anywhere below
        ``root``; each entry is the token list produced by ``preprocess_text``
        for that node's ``name`` attribute.
    """
    simple_type_tag = '{http://www.w3.org/2001/XMLSchema}simpleType'

    xsd_elements = []
    for element in root.iter(simple_type_tag):
        name = element.get('name')
        if name is None:
            # iter() also visits simpleType nodes without a ``name`` attribute;
            # indexing attrib['name'] on those raised KeyError.
            continue
        xsd_elements.append(preprocess_text(name))
    return xsd_elements

In [7]:
def match_tags_with_xsd(tag_meaning_data, xsd_elements, match_type="exact"):
    """Run the selected matcher for every (tag, meaning) pair.

    Args:
        tag_meaning_data: Iterable of ``(tag, meaning)`` pairs.
        xsd_elements: Candidate XSD names, handed unchanged to the matcher.
        match_type: ``"exact"`` or ``"approximate"``.

    Returns:
        A list with one record per input pair.
        For ``"exact"``: ``(tag, meaning, match, score)`` where ``score`` is
        100 when a match was found and 0 when the matcher returned
        ``"No Match"``.
        For ``"approximate"``: ``(tag, meaning, close_matches)``.

    Raises:
        ValueError: If ``match_type`` is not one of the supported modes.
    """
    # Reject unknown modes up front; previously a misspelled mode fell through
    # both branches and silently produced an empty result.
    if match_type not in ("exact", "approximate"):
        raise ValueError(
            f"Unsupported match_type {match_type!r}; expected 'exact' or 'approximate'."
        )

    matches = []
    if match_type == "exact":
        for tag, meaning in tag_meaning_data:
            match = find_exact_matches(tag, meaning, xsd_elements)
            # Exact matching is all-or-nothing: 100% on a hit, 0% otherwise.
            matches.append((tag, meaning, match, 100 if match != "No Match" else 0))
    else:
        for tag, meaning in tag_meaning_data:
            close_matches = find_approximate_matches(tag, meaning, xsd_elements)
            matches.append((tag, meaning, close_matches))
    return matches

## Exact match method

In [9]:
def find_exact_matches(tag, meaning, xsd_elements):
    """Return the XSD name that is identical to the normalised tag + meaning text.

    The tag and meaning are each tokenised with ``preprocess_text`` and the
    tokens are joined with single spaces; each entry of ``xsd_elements`` is
    joined the same way before comparison.

    Args:
        tag: Tag string.
        meaning: Text describing the tag.
        xsd_elements: Iterable of token lists, one per candidate XSD name.

    Returns:
        The matching space-joined name, or the string ``"No Match"``.
    """
    tag_meaning_combined = " ".join(preprocess_text(tag) + preprocess_text(meaning))

    # A similarity cutoff of 1 can only be reached by identical strings, so a
    # plain equality test replaces the fuzzy matcher that was used here and
    # avoids scoring every candidate.
    for elem in xsd_elements:
        if " ".join(elem) == tag_meaning_combined:
            return tag_meaning_combined
    return "No Match"

In [10]:
def save_exact_match_to_csv(matches, output_file):
    """Write exact-match records to a CSV file, highest similarity first.

    Args:
        matches: Iterable of ``(tag, meaning, match, score)`` records. Records
            that are not four items long are unpacked as
            ``(tag, meaning, match)`` and given a score of 0.0.
        output_file: Destination path for the CSV file.

    Raises:
        ValueError: If a record has neither three nor four items (raised by
            the tuple unpacking).
    """
    columns = ["Tag", "Meaning", "Best Match (XSD Element)", "Similarity (%)"]

    rows = []
    for record in matches:
        if len(record) == 4:
            tag, meaning, match, score = record
        else:
            tag, meaning, match = record
            score = 0.0
        rows.append(dict(zip(columns, (tag, meaning, match, score))))

    # Declaring the columns explicitly keeps them present when there are no
    # rows; without this, sort_values() raised KeyError on empty input.
    df = pd.DataFrame(rows, columns=columns)
    sorted_df = df.sort_values(by="Similarity (%)", ascending=False)
    sorted_df.to_csv(output_file, index=False)

    print(f"Exact matches saved to '{output_file}' successfully.")

## Approximate match method

In [12]:
def find_approximate_matches(tag, meaning, xsd_elements, threshold=40):
    """Score every XSD name against the normalised tag + meaning text.

    The tag and meaning are tokenised with ``preprocess_text`` and joined with
    spaces; each entry of ``xsd_elements`` is joined the same way. Similarity
    is ``SequenceMatcher.ratio()`` expressed as a percentage.

    Args:
        tag: Tag string.
        meaning: Text describing the tag.
        xsd_elements: Iterable of token lists, one per candidate XSD name.
        threshold: Minimum similarity percentage a candidate must reach.

    Returns:
        A list of ``(name, similarity_percentage)`` pairs for the candidates
        at or above ``threshold``, ordered from most to least similar. When no
        candidate qualifies, ``[("No Close Match", 0)]``.
    """
    tag_meaning_combined = " ".join(preprocess_text(tag) + preprocess_text(meaning))

    scores = []
    for elem_tokens in xsd_elements:
        elem = " ".join(elem_tokens)
        similarity_percentage = SequenceMatcher(None, tag_meaning_combined, elem).ratio() * 100
        if similarity_percentage >= threshold:
            scores.append((elem, similarity_percentage))

    if not scores:
        return [("No Close Match", 0)]

    # Put the strongest candidate first: the CSV writer in this notebook
    # reports item [0] as the "best match". The sort is stable, so equally
    # scored candidates keep their original relative order.
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return scores

In [20]:
def save_approximate_matches_to_csv(matches, output_file):
    """Write the best approximate match for each tag to a CSV file.

    Args:
        matches: Iterable of ``(tag, meaning, match_list)`` records, where
            ``match_list`` is a list of ``(name, similarity)`` pairs.
        output_file: Destination path for the CSV file.

    For each record the pair with the highest similarity is written. A record
    whose ``match_list`` is empty is written as ``"No Match"`` with a
    similarity of 0.0. Rows are ordered by similarity, highest first.
    """
    columns = ["Tag", "Meaning", "Best Match (XSD Element)", "Similarity (%)"]

    rows = []
    for tag, meaning, match_list in matches:
        if match_list:
            # Select by score rather than by position, so the reported match
            # is the best one regardless of how the candidate list is ordered.
            best = max(match_list, key=lambda pair: pair[1])
            best_name, best_score = best[0], best[1]
        else:
            # Keep the fallback numeric: a "0.00%" string in this column made
            # sort_values() fail when mixed with float scores.
            best_name, best_score = "No Match", 0.0
        rows.append({
            "Tag": tag,
            "Meaning": meaning,
            "Best Match (XSD Element)": best_name,
            "Similarity (%)": best_score,
        })

    # Explicit columns keep the sort key present even when there are no rows.
    df = pd.DataFrame(rows, columns=columns)
    sorted_df = df.sort_values(by="Similarity (%)", ascending=False)
    sorted_df.to_csv(output_file, index=False)

    print(f"Approximate matches saved to '{output_file}' successfully.")

## Plot result

In [23]:
# Parse the schema and collect the candidate names to match against.
# NOTE(review): both paths below repeat values already held in xml_file1 and
# tags_meaning_file from the configuration cell -- consider reading those instead.
tree = ET.parse('IPC-2581C.xsd')
root = tree.getroot()
xsd_elements = extract_xsd_elements(root)

# Build the list of (tag, meaning) pairs. Only lines containing the "==>"
# separator are used; everything else in the file is skipped.
tag_meaning_data = []
with open('TagsMeaning.txt', 'r', encoding='utf-8') as file:
    for line in file:
        if '==>' in line:
            # Split on the first separator only: a meaning that itself
            # contains "==>" previously produced more than two parts and the
            # two-name unpacking raised ValueError.
            tag, meaning = line.strip().split('==>', 1)
            tag_meaning_data.append((tag.strip(), meaning.strip()))

In [29]:
# Time the exact-matching pass (matching plus CSV export).
# perf_counter() is used for the interval because it is a monotonic,
# high-resolution clock; time.time() follows the system wall clock, which can
# be adjusted while the cell is running.
start_time = time.perf_counter()

exact_matches = match_tags_with_xsd(tag_meaning_data, xsd_elements, match_type="exact")
output_file_1 = 'exact_matches_output.csv'
save_exact_match_to_csv(exact_matches, output_file_1)

end_time = time.perf_counter()
print(f"Execution time exact matches: {end_time - start_time} seconds")

# Time the approximate-matching pass the same way.
start_time = time.perf_counter()

approximate_matches = match_tags_with_xsd(tag_meaning_data, xsd_elements, match_type="approximate")
output_file_2 = 'approximate_matches_output.csv'
save_approximate_matches_to_csv(approximate_matches, output_file_2)

end_time = time.perf_counter()
print(f"Execution time approximate matches: {end_time - start_time} seconds")

Exact matches saved to 'exact_matches_output_noprep.csv' successfully.
Execution time exact matches: 0.024883508682250977 seconds
Approximate matches saved to 'approximate_matches_output_noprep.csv' successfully.
Execution time approximate matches: 0.6329030990600586 seconds
