In [None]:
import xml.etree.ElementTree as et
import glob
import re
import pandas as pd
from collections import OrderedDict
import sqlalchemy as sa
    
# Accumulator for the flattened rows of every parsed XML file.
res = list()
# Empty mapping; nothing in the visible code reads or writes it.
xml_data_dict = dict()

def compute_xml_data(lst, elem, func, level=0):
    """Walk an element tree depth-first, calling ``func`` on every element.

    Args:
        lst: accumulator passed through unchanged to every ``func`` call.
        elem: element to start from; it is visited before its children.
        func: callable invoked as ``func(lst, elem, level)``.
        level: nesting depth of ``elem`` (0 for the starting element).
    """
    func(lst, elem, level)
    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9, and iteration yields the same direct children in order.
    for child in elem:
        compute_xml_data(lst, child, func, level + 1)

def gather_level(lst,elem,level):
    """Append ``(tag, text)`` for ``elem`` to ``lst``.

    The tag is prefixed with one '-' per nesting level so the depth of the
    element can be recovered from the flattened list.
    """
    indented_tag = '-' * level + elem.tag
    lst.append((indented_tag, elem.text))

# Every XML file in the current working directory.
xml_files = glob.glob('./*.xml')

# Flatten each document into its own list of (tag, text) rows and keep one
# such list per file in `res`.
for xml_file in xml_files:
    lst = []
    root = et.parse(xml_file)
    compute_xml_data(lst=lst, elem=root.getroot(), func=gather_level)
    res.append(lst)

def get_table_names(xml_data_list):
    """Return the names of the level-1 elements in a flattened document.

    An entry qualifies when its tag starts with a single '-' followed by the
    IRB namespace; the first 30 characters (dash plus namespace) are dropped
    and the remainder is stripped of surrounding whitespace.
    """
    level_one_pattern = r'(-{http://webservices.irb.com/}\w)'
    return [
        entry[0][30:].strip()
        for entry in xml_data_list
        if entry[0] and re.match(level_one_pattern, entry[0], re.IGNORECASE)
    ]
                
# Level-1 section names, taken from the first parsed document. Guarded so
# that a directory without XML files yields an empty list instead of an
# IndexError on res[0].
# The files are not parsed a second time here: `res` already holds the
# flattened rows of every file, and `root`/`lst` keep the values left by
# the parsing loop above.
tables = get_table_names(res[0]) if res else []

# Build all the DataFrames from the XML; see the next script for the DataFrame reference.
def _records_to_frame(records):
    """Turn a list of ``(column, value)`` pairs into a one-row DataFrame."""
    if not records:
        return pd.DataFrame()
    frame = pd.DataFrame.from_records(records).transpose()
    # After the transpose, row 0 holds the column names and row 1 the values:
    # promote row 0 to the header, then drop it from the data.
    return frame.rename(columns=frame.iloc[0]).drop(frame.index[0])


def _group_entries(rows, entry_tag, match_id, name_offset):
    """Group the rows of one section into one record list per entry.

    A row whose tag equals ``entry_tag`` opens a new entry, seeded with the
    match id; each following row becomes a ``(field, text)`` pair on it.
    """
    entries = []
    current = None
    for tag, text in rows:
        if tag == entry_tag:
            current = [('MatchID', match_id)]
            entries.append(current)
        elif current is not None:
            current.append((tag[name_offset:].strip(), text))
        # Rows seen before the first entry marker belong to no entry and are
        # skipped rather than attached to an unrelated record.
    return entries


def _concat_frames(frames):
    """Concatenate frames, returning an empty DataFrame when there are none."""
    return pd.concat(frames) if frames else pd.DataFrame()


def build_xml_dfs(xmls):
    """Split one flattened XML document into per-section DataFrames.

    Args:
        xmls: list of ``(tag, text)`` tuples for one document, each tag
            prefixed with one '-' per nesting level.

    Returns:
        dict with the keys 'match_info', 'officials', 'team1TeamSheet',
        'team2TeamSheet' and 'timeline'. 'match_info' is a single-row
        DataFrame; every other value has one row per Official / Player /
        TimelineEntry, each carrying the document's MatchID. A section
        without entries yields an empty DataFrame.

    Raises:
        ValueError: if the MatchID tag or any of the five section tags is
            missing from ``xmls``.
    """
    MatchID_tag = '--{http://webservices.irb.com/}MatchID'
    match_info_tag = '-{http://webservices.irb.com/}MatchInformation'
    officials_tag = '-{http://webservices.irb.com/}Officials'
    official_tag = '--{http://webservices.irb.com/}Official'
    team1TeamSheet_tag = '-{http://webservices.irb.com/}Team1TeamSheet'
    team2TeamSheet_tag = '-{http://webservices.irb.com/}Team2TeamSheet'
    player_tag = '--{http://webservices.irb.com/}Player'
    timeline_tag = '-{http://webservices.irb.com/}Timeline'
    timelineEntry_tag = '--{http://webservices.irb.com/}TimelineEntry'

    # Number of leading characters (dashes + namespace) to strip from a tag
    # to get the bare element name: 31 at level 2, 32 at level 3.
    level2_offset = len(MatchID_tag) - len('MatchID')
    level3_offset = level2_offset + 1

    # Build the tag list once; every section boundary is looked up in it.
    tags = [row[0] for row in xmls]
    MatchID = xmls[tags.index(MatchID_tag)][1]

    match_info_start = tags.index(match_info_tag)
    officials_start = tags.index(officials_tag)
    team1_start = tags.index(team1TeamSheet_tag)
    team2_start = tags.index(team2TeamSheet_tag)
    timeline_start = tags.index(timeline_tag)

    # Each section spans from just after its own tag up to the next section
    # tag, so the boundary tags themselves never appear in the slices.
    match_info_list = [
        (tag[level2_offset:].strip(), text)
        for tag, text in xmls[match_info_start + 1:officials_start]
    ]
    officials_list = _group_entries(
        xmls[officials_start + 1:team1_start], official_tag, MatchID, level3_offset)
    team1TeamSheet_list = _group_entries(
        xmls[team1_start + 1:team2_start], player_tag, MatchID, level3_offset)
    team2TeamSheet_list = _group_entries(
        xmls[team2_start + 1:timeline_start], player_tag, MatchID, level3_offset)
    # The timeline is the last section handled, so it runs to the end.
    timeline_list = _group_entries(
        xmls[timeline_start + 1:], timelineEntry_tag, MatchID, level3_offset)

    return {
        'match_info': _records_to_frame(match_info_list),
        'officials': _concat_frames(
            [_records_to_frame(entry) for entry in officials_list]),
        'team1TeamSheet': _concat_frames(
            [_records_to_frame(entry) for entry in team1TeamSheet_list]),
        'team2TeamSheet': _concat_frames(
            [_records_to_frame(entry) for entry in team2TeamSheet_list]),
        'timeline': _concat_frames(
            [_records_to_frame(entry) for entry in timeline_list]),
    }

# One dict of section DataFrames per parsed document, plus one bucket per
# section that collects that section's DataFrame from every document.
all_xml_dfs = []
match_info_dfs_list = []
officials_dfs_list = []
team1TeamSheet_dfs_list = []
team2TeamSheet_dfs_list = []
timeline_dfs_list = []

for xml_data in res:
    all_xml_dfs.append(build_xml_dfs(xml_data))

# Route each section DataFrame to its bucket by key; unknown keys are ignored.
section_buckets = {
    'match_info': match_info_dfs_list,
    'officials': officials_dfs_list,
    'team1TeamSheet': team1TeamSheet_dfs_list,
    'team2TeamSheet': team2TeamSheet_dfs_list,
    'timeline': timeline_dfs_list,
}
for df_dict in all_xml_dfs:
    for k, v in df_dict.items():
        if k in section_buckets:
            section_buckets[k].append(v)

# Stack the per-document frames into one DataFrame per section.
match_info_dfs_merged_all = pd.concat(match_info_dfs_list)
officials_dfs_merged_all = pd.concat(officials_dfs_list)
team1TeamSheet_dfs_merged_all = pd.concat(team1TeamSheet_dfs_list)
team2TeamSheet_dfs_merged_all = pd.concat(team2TeamSheet_dfs_list)
timeline_dfs_merged_all = pd.concat(timeline_dfs_list)

# load dataframe into Postgres
def write_data_to_sql(df, table_name):
    """Append a DataFrame to a SQL table, printing (not raising) any error.

    Args:
        df: DataFrame to write; nothing is written when it is empty.
        table_name: destination table; rows are appended if it already
            exists, and the DataFrame index is not written.

    NOTE(review): relies on a module-level ``connection_string`` that is not
    defined in the visible part of this file; confirm it is set before
    this function is called.
    """
    try:
        engine = sa.create_engine(connection_string, echo=True)
        # add a table id?
        # df["TableId"] = tableId;
        if not df.empty:
            # Use the `table_name` parameter: the previous `tablename` was
            # an undefined name, so every call failed with a NameError that
            # the handler below swallowed.
            df.to_sql(table_name, engine, if_exists='append', index=False)
    except Exception as e:
        # Best-effort load: report the problem and carry on.
        print(e)

# Write each merged section to its own CSV file, without the index column.
csv_exports = [
    (match_info_dfs_merged_all, '../_3_data_clean/matchwithtimeline_matchinformation.csv'),
    (officials_dfs_merged_all, '../_3_data_clean/matchwithtimeline_officials.csv'),
    (team1TeamSheet_dfs_merged_all, '../_3_data_clean/matchwithtimeline_team1teamsheet.csv'),
    (team2TeamSheet_dfs_merged_all, '../_3_data_clean/matchwithtimeline_team2teamsheet.csv'),
    (timeline_dfs_merged_all, '../_3_data_clean/matchwithtimeline_timeline.csv'),
]
for merged_df, csv_path in csv_exports:
    merged_df.to_csv(csv_path, index=False)
print('Completed!')


In [None]:
# Run the script below to inspect the XML format; only needed if the format is due to change.
import xml.etree.ElementTree as et
import glob
import re
import pandas as pd
from collections import OrderedDict
import sqlalchemy as sa
    
# Rows handed to get_table_names further down; no active code in this cell
# appends to it.
res = list()

def compute_xml_data(elem, func, level=0):
    """Walk an element tree depth-first, calling ``func`` on every element.

    Args:
        elem: element to start from; it is visited before its children.
        func: callable invoked as ``func(elem, level)``.
        level: nesting depth of ``elem`` (0 for the starting element).

    Returns:
        None; all results come from the side effects of ``func``.
    """
    func(elem, level)
    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9, and iteration yields the same direct children in order.
    for child in elem:
        compute_xml_data(child, func, level + 1)

def gather_level(elem,level):
    """Print the element's tag, prefixed with one '-' per level, and its text."""
    indented_tag = '-' * level + elem.tag
    print(indented_tag, elem.text)
#     res.append(('-'*level+elem.tag, elem.text))

# Every XML file in the current working directory.
xml_files = glob.glob('./*.xml')


# NOTE(review): `root` is not assigned anywhere in this cell, so it must
# already exist (e.g. left over from the previous cell) - confirm.
# The traversal prints each element as it goes.
xml_data = compute_xml_data(elem=root.getroot(), func=gather_level)

def get_table_names(xml_data_list):
    """Collect the names of the level-1 elements from flattened XML rows.

    Only entries whose tag starts with a single '-' followed by the IRB
    namespace are kept; the 30-character dash-plus-namespace prefix is cut
    off and the remaining name is stripped of surrounding whitespace.
    """
    found = []
    for entry in xml_data_list:
        tag = entry[0]
        if not tag:
            continue
        if not re.match(r'(-{http://webservices.irb.com/}\w)', tag, re.IGNORECASE):
            continue
        found.append(tag[30:].strip())
    return found

# print(res)

# Show the level-1 section names found in `res`.
found_table_names = get_table_names(res)
print(found_table_names)

# compute_xml_data(root.getroot(), gather_level)