In [1]:
import xml.etree.ElementTree as ET
import os
import random
import sqlite3
import numpy as np
import pandas as pd
import shutil
import math

In [2]:
def indent(elem, level=0):
    """Pretty-print an ElementTree element in place and return it.

    Whitespace-only (or missing) ``text``/``tail`` values are rewritten so
    that, when serialized, every element starts on its own line indented by
    two spaces per nesting level. Text or tails that contain non-whitespace
    characters are left untouched.

    Parameters
    ----------
    elem : xml.etree.ElementTree.Element
        Element whose subtree is re-indented (mutated in place).
    level : int, optional
        Nesting depth of ``elem``; 0 for the document root.

    Returns
    -------
    xml.etree.ElementTree.Element
        The same ``elem`` object, to allow ``ET.ElementTree(indent(root))``.
    """
    own_indent = "\n" + level * "  "

    def _is_blank(value):
        # None, "" and whitespace-only strings may all be overwritten safely.
        return not value or not value.strip()

    if len(elem):
        # Text before the first child: newline + indentation of the children.
        if _is_blank(elem.text):
            elem.text = own_indent + "  "
        for subelem in elem:
            indent(subelem, level + 1)
        # Each child set its tail to the children's indentation (correct when
        # another sibling follows). The last child is followed by this
        # element's closing tag, so it must fall back to this element's level.
        last_child = elem[-1]
        if _is_blank(last_child.tail):
            last_child.tail = own_indent
        # For the root (level 0) this is a single trailing newline.
        if _is_blank(elem.tail):
            elem.tail = own_indent
    elif level and _is_blank(elem.tail):
        # Leaf element: the tail positions whatever follows at the same level.
        elem.tail = own_indent
    return elem

In [3]:
def changeXMLValue(root_el, demo_data, min_int_val, max_int_val):
    """Replace the text of one randomly selected leaf under ``root_el``.

    Starting at ``root_el``, a random child is followed at every level until
    an element without children is reached; that element's text is then set
    to a fresh value from ``insertData``. If ``root_el`` itself has no
    children, its own text is replaced.

    Returns ``root_el`` (the tree is modified in place).
    """
    node = root_el
    child_count = len(node)
    # Random walk downwards: one uniformly chosen child per level.
    while child_count:
        node = node[random.randint(0, child_count - 1)]
        child_count = len(node)
    node.text = insertData(demo_data, min_int_val, max_int_val)
    return root_el

In [4]:
def updateXML(el, num_of_changes, demo_data, xml_tag_arr, xml_attr_arr, attr_chance, min_int_val, max_int_val):
    """Apply ``num_of_changes`` random edits to the tree rooted at ``el``.

    Each edit is chosen by a coin flip: either exactly one new leaf element
    is appended directly under ``el`` (via ``createChild`` with depth and
    element counts pinned to 1), or the text of a random existing leaf is
    replaced (via ``changeXMLValue``).

    Returns the (in-place modified) root element.
    """
    for _ in range(num_of_changes):
        if random.randint(0, 1):
            # 1 -> overwrite the value of an existing leaf
            el = changeXMLValue(el, demo_data, min_int_val, max_int_val)
            continue
        # 0 -> append one new leaf element below the root
        el = createChild(
            el, 1, 1, demo_data, xml_tag_arr, xml_attr_arr,
            1, 1, attr_chance, min_int_val, max_int_val,
        )
    return el

In [5]:
def insertData(demo_data, min_int, max_int):
    """Return a random string to use as XML text or as an attribute value.

    With equal probability the result is either

    * one cell of ``demo_data`` taken from a random row and a random column
      other than the first one, or
    * a random integer from the inclusive range ``[min_int, max_int]``.

    The result is always a ``str``: ElementTree can only serialize string
    text/attribute values, so non-string cells (numbers, numpy scalars, ...)
    are converted here.

    Parameters
    ----------
    demo_data : pandas.DataFrame
        Pool of sample values; needs at least two columns and one row.
    min_int, max_int : int
        Inclusive bounds for the random-integer alternative.

    Raises
    ------
    ValueError
        From ``random.randint`` when ``demo_data`` has fewer than two
        columns or no rows, or when ``min_int > max_int``.
    """
    use_demo_value = random.randint(0, 1) == 0

    if use_demo_value:
        # Column 0 is never sampled (presumably a row id - TODO confirm).
        col_pos = random.randint(1, len(demo_data.columns) - 1)
        row_pos = random.randint(0, len(demo_data.index) - 1)
        # Purely positional lookup; indexing the row Series with a bare
        # integer relies on a deprecated positional fallback in pandas.
        data = str(demo_data.iat[row_pos, col_pos])
    else:
        data = str(random.randint(min_int, max_int))
    return data

In [6]:
def createChild(root, depth, max_depth, demo_data, xml_tag_arr, xml_attr_arr, min_num_of_els, max_num_of_els, attr_chance, min_int_val, max_int_val):
    """Append a random number of randomly built child elements to ``root``.

    Between ``min_num_of_els`` and ``max_num_of_els`` (inclusive) children
    are created. Each child gets a tag drawn from ``xml_tag_arr``, zero to
    two attributes drawn from ``xml_attr_arr`` and either a text value from
    ``insertData`` or, by coin flip while ``depth`` is below ``max_depth``,
    a recursively generated set of children of its own.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Parent element; modified in place.
    depth, max_depth : int
        Current nesting depth and the depth at (or beyond) which only leaf
        elements are produced.
    demo_data : pandas.DataFrame
        Passed through to ``insertData``.
    xml_tag_arr, xml_attr_arr : sequence of str
        Candidate tag and attribute names; spaces are replaced by ``-``.
    min_num_of_els, max_num_of_els : int
        Inclusive bounds for the number of children per element.
    attr_chance : int
        Each of two draws from ``[0, attr_chance]`` that is non-zero adds one
        attribute, so 0 means "never" and larger values mean "more often".
    min_int_val, max_int_val : int
        Passed through to ``insertData``.

    Returns
    -------
    xml.etree.ElementTree.Element
        ``root`` itself.
    """
    num_of_els = random.randint(min_num_of_els, max_num_of_els)

    for _ in range(num_of_els):

        attr_1 = random.randint(0, attr_chance)
        attr_2 = random.randint(0, attr_chance)
        # One attribute per non-zero draw (0, 1 or 2 in total).
        num_of_attr = (attr_1 != 0) + (attr_2 != 0)

        attr = {}
        for _ in range(num_of_attr):
            rand_attr = xml_attr_arr[random.randint(0, len(xml_attr_arr) - 1)]
            # Same sanitising as for tag names: a space inside an attribute
            # name would make the written document malformed. Drawing the
            # same name twice simply overwrites the earlier value.
            rand_attr = rand_attr.replace(' ', '-')
            attr[rand_attr] = insertData(demo_data, min_int_val, max_int_val)
        tag_name = xml_tag_arr[random.randint(0, len(xml_tag_arr) - 1)].replace(' ', '-')
        el = ET.SubElement(root, tag_name, attr)

        # ">=" rather than "==": a call that starts with depth > max_depth
        # must still terminate instead of recursing without a depth limit.
        if depth >= max_depth:
            el.text = insertData(demo_data, min_int_val, max_int_val)
        else:
            has_child = random.randint(0, 1)
            if has_child:
                el = createChild(el, depth + 1, max_depth, demo_data, xml_tag_arr, xml_attr_arr, min_num_of_els, max_num_of_els, attr_chance, min_int_val, max_int_val)
            else:
                el.text = insertData(demo_data, min_int_val, max_int_val)
    return root

In [None]:
num_of_files = 100
num_of_versions = 4

# Directory that receives the generated XML files.
path = "demo_xml/"

# Not read anywhere in this cell; kept so the name stays defined.
child_elements = 10

# Create the output directory up front so writing works on a fresh checkout.
if path:
    os.makedirs(path, exist_ok=True)

# load demo database
conn = sqlite3.connect("Demo_Data_DB.sqlite")
try:
    query = "SELECT * FROM DEMO_DATA;"
    demo_data = pd.read_sql_query(query, conn)
finally:
    # Close explicitly; rebinding `conn` below would otherwise leak the handle.
    conn.close()

# load xml tag and attribute database
conn = sqlite3.connect("Demo_XML_1.sqlite")
try:
    query = "SELECT tag_name, attr FROM Demo_XML;"
    xml_tag_attr = pd.read_sql_query(query, conn)
finally:
    conn.close()

for file_num in range(num_of_files):

    xml_tag_arr = []
    xml_attr_arr = []

    # The integer value range widens with the file number.
    min_int_val = -1 * (file_num + 1) * 1000
    max_int_val = (file_num + 1) * 1000

    # select possible tags for file (file N draws N + 1 tag/attribute pairs,
    # with replacement)
    for i in range(file_num + 1):
        index = random.randint(0, len(xml_tag_attr.index) - 1)
        xml_attr_arr.append(xml_tag_attr['attr'].iloc[index])
        xml_tag_arr.append(xml_tag_attr['tag_name'].iloc[index])

    # Structural parameters are derived from the file number: depth cycles
    # through 0..4, fan-out grows every five files, and attributes become
    # less likely for later files.
    max_depth = file_num % 5
    min_num_of_els = 1
    max_num_of_els = math.floor(file_num / 5) + 1
    attr_chance = math.floor((num_of_files - file_num) / 10)

    root = ET.Element('root')
    el = createChild(root, 0, max_depth, demo_data, xml_tag_arr, xml_attr_arr,
                     min_num_of_els, max_num_of_els, attr_chance, min_int_val, max_int_val)

    # Version 0 is the freshly generated tree and is always written; every
    # later version applies one more random change to the previous one.
    for version in range(max(num_of_versions, 1)):
        if version > 0:
            el = updateXML(el, 1, demo_data, xml_tag_arr, xml_attr_arr, attr_chance, min_int_val, max_int_val)
        tree = ET.ElementTree(indent(el))
        xml_name = 'xml_f' + str(file_num) + '_v' + str(version) + '.xml'
        # Write straight into the output directory instead of writing to the
        # working directory and moving the file afterwards.
        tree.write(os.path.join(path, xml_name), xml_declaration=True, encoding='utf-8')