In [None]:
import xml.etree.ElementTree as et
import glob
import re
import pandas as pd
from collections import OrderedDict
import sqlalchemy as sa
    
# One entry per parsed XML file: the list of (tag, text) tuples built by gather_level.
res = []

def compute_xml_data(lst, elem, func, level=0):
    """Walk ``elem`` and all of its descendants depth-first, calling ``func`` on each.

    Args:
        lst: Accumulator passed through unchanged to every ``func`` call.
        elem: Element to start from. Iterating it must yield its direct
            children, as ``xml.etree.ElementTree.Element`` does.
        func: Callable invoked as ``func(lst, elem, level)``; a parent is
            always visited before its children.
        level: Nesting depth of ``elem`` relative to the starting element.
    """
    func(lst, elem, level)
    # Element.getchildren() was removed in Python 3.9; iterating the element
    # yields the same direct children on every Python 3 version.
    for child in elem:
        compute_xml_data(lst, child, func, level + 1)

def gather_level(lst,elem,level):
    """Append ``elem`` to ``lst`` as a ``(tag, text)`` pair.

    The tag is prefixed with one ``'-'`` per nesting ``level`` so the depth of
    the element can be recovered from the flattened list.
    """
    depth_marker = '-' * level
    row = (depth_marker + elem.tag, elem.text)
    lst.append(row)

# glob returns paths in arbitrary, filesystem-dependent order. Sorting keeps
# res[0] (used for table-name detection) and the row order of the exported
# CSVs reproducible between runs.
xml_files = sorted(glob.glob('./*.xml'))

for xml_file in xml_files:
    lst = []
    root = et.parse(xml_file)  # NOTE: this is the ElementTree, not the root element
    compute_xml_data(lst,root.getroot(), gather_level)
    # One flattened (tag, text) list per file.
    res.append(lst)

def get_table_names(xml_data_list):
    """Return the names of the first-level elements in a flattened XML listing.

    Args:
        xml_data_list: Iterable of rows whose first item is a tag string made
            of one ``'-'`` per nesting level followed by
            ``'{http://webservices.irb.com/}Name'``. Rows with an empty or
            ``None`` tag are ignored.

    Returns:
        list of str: ``Name`` for every row that sits exactly one level deep,
        in input order.
    """
    prefix = '-{http://webservices.irb.com/}'
    # re.escape makes '.', '{' and '}' match literally; unescaped, '.' would
    # accept any character in the namespace. re.match anchors at the start,
    # so deeper rows ('--{...', '---{...') never match.
    first_level = re.compile(re.escape(prefix) + r'\w', re.IGNORECASE)

    table_names = []
    for row in xml_data_list:
        tag = row[0]
        if tag and first_level.match(tag):
            # Slice by the prefix length instead of a hard-coded offset.
            table_names.append(tag[len(prefix):].strip())
    return table_names
                
# With no XML files in the directory res is empty; avoid indexing into it.
tables = get_table_names(res[0]) if res else []

# The files used to be parsed a second time here into a list that was never
# used; res already holds the flattened rows for every file.

_IRB_NAMESPACE = '{http://webservices.irb.com/}'


def _collect_records(entries, record_tag, person_id, name_offset, stop_on_blank=False):
    """Split one section's flattened rows into per-record ``(name, value)`` lists.

    A row whose tag equals ``record_tag`` opens a new record; the rows that
    follow become its fields, with ``name_offset`` leading characters (depth
    dashes plus namespace) removed from the tag.
    """
    records = []
    current = None
    for entry in entries:
        tag, text = entry[0], entry[1]
        if tag == record_tag:
            # Each record starts with the owner's id so the tables can be joined.
            current = [('PersonID', person_id)]
            records.append(current)
        elif stop_on_blank and not tag:
            break
        elif current is not None:
            current.append((tag[name_offset:].strip(), text))
        # A field row seen before the first record marker belongs to no
        # record and is skipped. Previously it was appended to the last
        # record of the *preceding* section, or raised UnboundLocalError.
    return records


def _records_to_frame(records):
    """Turn ``[(name, value), ...]`` into a one-row DataFrame with the names as columns."""
    if not records:
        return pd.DataFrame()
    frame = pd.DataFrame.from_records(records).transpose()
    # After the transpose row 0 holds the names: promote it to the header and drop it.
    return frame.rename(columns=frame.iloc[0]).drop(frame.index[0])


def _concat_or_empty(frames):
    """Concatenate ``frames``, or return an empty DataFrame when there are none."""
    return pd.concat(frames) if frames else pd.DataFrame()


# Build all the DataFrames for one flattened XML file.
def build_xml_dfs(xmls):
    """Build the four player-profile tables from one flattened XML listing.

    Args:
        xmls: List of ``(tag, text)`` rows as produced by ``gather_level``:
            each tag is the element tag prefixed with one ``'-'`` per nesting
            level.

    Returns:
        dict: DataFrames under the keys ``'personnel_profile'``,
        ``'teamsRepresented'``, ``'testBreakdown'`` and ``'eventAppearances'``.
        A table with no records is an empty DataFrame. If there is no
        ``PersonnelProfile`` row at all, all four tables are empty.

    Raises:
        ValueError: If the ``PersonID`` row is missing, or a
            ``PersonnelProfile`` row is present but one of the three section
            marker rows is not.
    """
    ns = _IRB_NAMESPACE
    personID_tag = '-' + ns + 'PersonID'
    personnel_profile_tag = ns + 'PersonnelProfile'
    teamsRepresented_tag = '-' + ns + 'TeamsRepresented'
    team_tag = '--' + ns + 'Team'
    testBreakdown_tag = '-' + ns + 'TestBreakdown'
    eventAppearances_tag = '-' + ns + 'EventAppearances'
    appearance_tag = '--' + ns + 'Appearance'

    # Number of leading characters to strip from a tag to get the bare name.
    level1_offset = len('-' + ns)    # profile fields sit one level deep
    level3_offset = len('---' + ns)  # record fields sit three levels deep

    # Built once; every section boundary is looked up in this list.
    tags = [entry[0] for entry in xmls]
    personID = xmls[tags.index(personID_tag)][1]

    personnel_profile_list = []
    teamsRepresented_list = []
    testBreakdown_list = []
    eventAppearances_list = []

    if personnel_profile_tag in tags:
        profile_start = tags.index(personnel_profile_tag)
        teams_start = tags.index(teamsRepresented_tag)
        tests_start = tags.index(testBreakdown_tag)
        events_start = tags.index(eventAppearances_tag)

        # Each slice stops just before the next section marker, so the marker
        # rows themselves never appear inside a section.
        personnel_profile_list = [
            (entry[0][level1_offset:].strip(), entry[1])
            for entry in xmls[profile_start + 1:teams_start]
        ]
        teamsRepresented_list = _collect_records(
            xmls[teams_start + 1:tests_start], team_tag, personID, level3_offset)
        testBreakdown_list = _collect_records(
            xmls[tests_start + 1:events_start], team_tag, personID, level3_offset)
        eventAppearances_list = _collect_records(
            xmls[events_start + 1:], appearance_tag, personID, level3_offset,
            stop_on_blank=True)

    personnel_profile_df = _records_to_frame(personnel_profile_list)
    teamsRepresented_df = [_records_to_frame(rec) for rec in teamsRepresented_list if rec]
    testBreakdown_df = [_records_to_frame(rec) for rec in testBreakdown_list if rec]
    eventAppearances_df = [_records_to_frame(rec) for rec in eventAppearances_list if rec]

    return {'personnel_profile': personnel_profile_df,
            'teamsRepresented': _concat_or_empty(teamsRepresented_df),
            'testBreakdown': _concat_or_empty(testBreakdown_df),
            'eventAppearances': _concat_or_empty(eventAppearances_df)}
            
# One dict of four DataFrames per parsed XML file.
all_xml_dfs = [build_xml_dfs(xml_data) for xml_data in res]

# Every dict returned by build_xml_dfs carries the same four keys, so each
# table is pulled out by key instead of scanning items() with an if-chain.
personnel_profile_dfs_list = [dfs['personnel_profile'] for dfs in all_xml_dfs]
teamsRepresented_dfs_list = [dfs['teamsRepresented'] for dfs in all_xml_dfs]
testBreakdown_dfs_list = [dfs['testBreakdown'] for dfs in all_xml_dfs]
eventAppearances_dfs_list = [dfs['eventAppearances'] for dfs in all_xml_dfs]

# Stack the per-file tables; fall back to an empty DataFrame when no file
# was parsed and there is nothing to concatenate.
personnel_profile_dfs_merged_all = (
    pd.concat(personnel_profile_dfs_list) if personnel_profile_dfs_list else pd.DataFrame()
)
teamsRepresented_dfs_merged_all = (
    pd.concat(teamsRepresented_dfs_list) if teamsRepresented_dfs_list else pd.DataFrame()
)
testBreakdown_dfs_merged_all = (
    pd.concat(testBreakdown_dfs_list) if testBreakdown_dfs_list else pd.DataFrame()
)
eventAppearances_dfs_merged_all = (
    pd.concat(eventAppearances_dfs_list) if eventAppearances_dfs_list else pd.DataFrame()
)

# load dataframe into Postgres
def write_data_to_sql(df, table_name):
    """Append ``df`` to the SQL table ``table_name``; empty frames are skipped.

    The engine is created from the module-level ``connection_string``.
    NOTE(review): ``connection_string`` is not defined in the visible part of
    this file, so it must be set before this function is called.

    Args:
        df: DataFrame to write.
        table_name: Name of the destination table; rows are appended if it
            already exists.

    Any exception is printed rather than raised, so a failed load does not
    abort the rest of the script.
    """
    try:
        # Nothing to write: return before opening a database connection.
        if df.empty:
            return
        engine = sa.create_engine(connection_string, echo=True)
        # TODO: add a table id column?
        # Fixed: this used the undefined name ``tablename``, so every write
        # raised NameError, which the handler below swallowed.
        df.to_sql(table_name, engine, if_exists='append', index=False)
    except Exception as e:
        print(e)

# Write each merged table to its CSV file (without the index column), in a fixed order.
csv_outputs = (
    (personnel_profile_dfs_merged_all, '../_3_data_clean/playerprofile_personnel_profile.csv'),
    (teamsRepresented_dfs_merged_all, '../_3_data_clean/playerprofile_teamsRepresented.csv'),
    (testBreakdown_dfs_merged_all, '../_3_data_clean/playerprofile_testBreakdown.csv'),
    (eventAppearances_dfs_merged_all, '../_3_data_clean/playerprofile_eventAppearances.csv'),
)
for merged_df, csv_path in csv_outputs:
    merged_df.to_csv(csv_path, index=False)
print('Completed!')


In [None]:
# Run the script below to inspect the XML format; it only needs to be run when the format changes.
import xml.etree.ElementTree as et
import glob
import re
import pandas as pd
from collections import OrderedDict
import sqlalchemy as sa
    
# Accumulator of rows that is passed to get_table_names at the end of this cell.
res = []

def compute_xml_data(elem, func, level=0):
    """Walk ``elem`` and all of its descendants depth-first, calling ``func`` on each.

    Args:
        elem: Element to start from. Iterating it must yield its direct
            children, as ``xml.etree.ElementTree.Element`` does.
        func: Callable invoked as ``func(elem, level)``; a parent is always
            visited before its children.
        level: Nesting depth of ``elem`` relative to the starting element.

    Returns:
        None. All output happens through ``func``.
    """
    func(elem, level)
    # Element.getchildren() was removed in Python 3.9; iterating the element
    # yields the same direct children on every Python 3 version.
    for child in elem:
        compute_xml_data(child, func, level + 1)

def gather_level(elem,level):
    """Print ``elem`` as its tag, prefixed with one ``'-'`` per ``level``, followed by its text."""
    depth_marker = '-' * level
    # To collect the rows instead of printing them, append the same
    # (tag, text) pair to ``res`` here.
    print(depth_marker + elem.tag, elem.text)

# glob order is arbitrary; sort so the same file is inspected on every run.
xml_files = sorted(glob.glob('./*.xml'))
if not xml_files:
    # Fail with a clear message instead of a bare IndexError from xml_files[0].
    raise FileNotFoundError("No XML files found matching './*.xml'")
xml_file = xml_files[0]
root = et.parse(xml_file)  # NOTE: this is the ElementTree, not the root element
# compute_xml_data returns None; the tree is printed as a side effect of gather_level.
xml_data = compute_xml_data(root.getroot(), gather_level)

def get_table_names(xml_data_list):
    """Return the names of the first-level elements in a flattened XML listing.

    Args:
        xml_data_list: Iterable of rows whose first item is a tag string made
            of one ``'-'`` per nesting level followed by
            ``'{http://webservices.irb.com/}Name'``. Rows with an empty or
            ``None`` tag are ignored.

    Returns:
        list of str: ``Name`` for every row that sits exactly one level deep,
        in input order.
    """
    prefix = '-{http://webservices.irb.com/}'
    # re.escape makes '.', '{' and '}' match literally; unescaped, '.' would
    # accept any character in the namespace. re.match anchors at the start,
    # so deeper rows ('--{...', '---{...') never match.
    first_level = re.compile(re.escape(prefix) + r'\w', re.IGNORECASE)

    table_names = []
    for row in xml_data_list:
        tag = row[0]
        if tag and first_level.match(tag):
            # Slice by the prefix length instead of a hard-coded offset.
            table_names.append(tag[len(prefix):].strip())
    return table_names

# gather_level in this cell only prints, so res is still empty at this point
# and get_table_names(res) would always print []. Collect the flattened
# (tag, text) rows first, using the same dash-per-level tag format.
compute_xml_data(root.getroot(), lambda elem, level: res.append(('-'*level+elem.tag, elem.text)))

print(get_table_names(res))
