In [1]:
import pandas as pd
import json
import os
import re

### Part 1 Wrangling the XML data

In [2]:
# Load the whole thesaurus XML document into a single string
filepath = "australian-sport-thesaurus-student.xml"
with open(filepath, 'r') as xml_file:
    xmlString = xml_file.read()

In [3]:
# Take a glance at the first 2000 characters of the raw XML string
xmlString[:2000]

"<?xml version='1.0' encoding='UTF-8'?>\n<Terms>\n  <Term>\n    <Title>.177 (4.5mm) Airgun</Title>\n    <Description>The standard airgun calibre for international target shooting.</Description>\n    <RelatedTerms>\n      <Term>\n        <Title>Shooting sport equipment</Title>\n        <Relationship>Narrower Term</Relationship>\n      </Term>\n    </RelatedTerms>\n  </Term>\n  <Term>\n    <Title>.22</Title>\n    <Description>A rimfire calibre, much used in target shooting and often synonymous with the term smallbore.</Description>\n    <RelatedTerms>\n      <Term>\n        <Title>Shooting sport equipment</Title>\n        <Relationship>Narrower Term</Relationship>\n      </Term>\n    </RelatedTerms>\n  </Term>\n  <Term>\n    <Title>.22 Long Rifle</Title>\n    <Description>The standard .22 rimfire cartridge for target rifle and pistol use.</Description>\n    <RelatedTerms>\n      <Term>\n        <Title>Shooting sport equipment</Title>\n        <Relationship>Narrower Term</Relationship>\n 

In [4]:
## Print the start and the end of the string to see how the XML is structured.
## The single-argument print(...) form produces the same output under Python 2
## and is also valid on a Python 3 kernel.
print("from the head \n\n")
print(xmlString[:500])
print("\n\n")
print("from the tail \n\n")
print(xmlString[-500:])

from the head 


<?xml version='1.0' encoding='UTF-8'?>
<Terms>
  <Term>
    <Title>.177 (4.5mm) Airgun</Title>
    <Description>The standard airgun calibre for international target shooting.</Description>
    <RelatedTerms>
      <Term>
        <Title>Shooting sport equipment</Title>
        <Relationship>Narrower Term</Relationship>
      </Term>
    </RelatedTerms>
  </Term>
  <Term>
    <Title>.22</Title>
    <Description>A rimfire calibre, much used in target shooting and often synonymous with the term smal



from the tail 


tball terms and techniques</Title>
        <Relationship>Narrower Term</Relationship>
      </Term>
    </RelatedTerms>
  </Term>
  <Term>
    <Title>Zudnick </Title>
    <Description>A skier leans their upper body forward toward the tips of their skis. The skis are kept close together. </Description>
    <RelatedTerms>
      <Term>
        <Title>Freestyle Skiing terms and techniques</Title>
        <Relationship>Narrower Term</Relationship>
      </Term>
 

### Here we can find 
- This XML file begins with the declaration `<?xml version='1.0' encoding='UTF-8'?>`. It is not useful for building the JSON file, so it will be removed in a later step
- All of the content is enclosed in the __"Terms"__ tag
- Each observation is enclosed in a __"Term"__ tag
- For each observation, there are three pieces of information to be extracted:
    - Title
    - Description
    - Related Terms
- The related terms contain the title and relationship information 


In [5]:
## Cut the document into chunks, one per observation.
## The closing tag of a top-level record is used as the delimiter; split() drops it,
## so it is put back onto every chunk (including the remainder after the last record).
record_end = "\n  </Term>\n"
splitxml = xmlString.split(record_end)
splitxml_new = [chunk + record_end for chunk in splitxml]

In [6]:
# Check the split result: the first two and the last two chunks.
# The single-argument print(...) form produces the same output under Python 2
# and is also valid on a Python 3 kernel.
print(splitxml_new[0])
print("\n")
print(splitxml_new[1])
print("\n")
print(splitxml_new[-2])
print("\n")
print(splitxml_new[-1])

<?xml version='1.0' encoding='UTF-8'?>
<Terms>
  <Term>
    <Title>.177 (4.5mm) Airgun</Title>
    <Description>The standard airgun calibre for international target shooting.</Description>
    <RelatedTerms>
      <Term>
        <Title>Shooting sport equipment</Title>
        <Relationship>Narrower Term</Relationship>
      </Term>
    </RelatedTerms>
  </Term>



  <Term>
    <Title>.22</Title>
    <Description>A rimfire calibre, much used in target shooting and often synonymous with the term smallbore.</Description>
    <RelatedTerms>
      <Term>
        <Title>Shooting sport equipment</Title>
        <Relationship>Narrower Term</Relationship>
      </Term>
    </RelatedTerms>
  </Term>



  <Term>
    <Title>Zudnick </Title>
    <Description>A skier leans their upper body forward toward the tips of their skis. The skis are kept close together. </Description>
    <RelatedTerms>
      <Term>
        <Title>Freestyle Skiing terms and techniques</Title>
        <Relationship>Narrower T

In [7]:
# Strip the XML declaration and the opening <Terms> tag from the first chunk.
# str.replace leaves the chunk unchanged when the header is already gone.
splitxml_new[0] = splitxml_new[0].replace("<?xml version='1.0' encoding='UTF-8'?>\n<Terms>\n","")
# The final chunk is only the closing </Terms> remainder and holds no record.
# Drop it only while it is still there (it has no opening <Term> tag), so that
# re-running this cell does not throw away a real observation.
trailer = splitxml_new.pop() if splitxml_new and "<Term>" not in splitxml_new[-1] else None
# Show what was removed (nothing is displayed when there was nothing to remove)
trailer

'</Terms>\n\n  </Term>\n'

#### The wrangling of splitxml is now complete

### Part 2 Extracting the information

As we can see from Part 1, the strategy for extracting the content we need can be summarised in three steps
- Extract the "Description" information first, as this part can be extracted directly 
- The "Title" information appears both at the top level of each record and inside the related terms, so we first extract all the content under "Related Terms", then remove the related terms from the original XML, after which the record's own title can easily be obtained
- For the related terms, we can easily find the title and relationship information by using regular expressions


In [8]:
# Regex for the description text; the lookbehind/lookahead keep the
# surrounding <Description> tags out of the match
description_pattern = re.compile(r'(?<=<Description>)(.*?)(?=</Description>)')
# One entry per observation: the list of matches found in that chunk
# (an empty list when the chunk has no match)
description_list = [description_pattern.findall(record) for record in splitxml_new]

In [9]:
# Regex that captures a whole <RelatedTerms>...</RelatedTerms> block;
# [\s\S] lets the match run across line breaks
relationterm_pattern = re.compile(r'<RelatedTerms>[\s\S]*?<\/RelatedTerms>')
# One entry per observation: the matched blocks, or an empty list when
# the observation has no related terms
relatedterms_list = [relationterm_pattern.findall(record) for record in splitxml_new]

##### As we can see, some observations have more than one term under the related terms, so we go through all of them and collect them into a list

In [10]:
# Pull every nested <Term>...</Term> fragment out of each related-terms block
term_pattern = re.compile(r'<Term>[\s\S]*?<\/Term>')
# Only the first matched block is searched; observations without a
# related-terms block get an empty list
rterm_list = [
    term_pattern.findall(blocks[0]) if blocks else []
    for blocks in relatedterms_list
]

In [11]:
# Turn every related-term fragment into a dict holding its title and relationship
# pattern for extracting the title information
r_title_pattern = re.compile(r'(?<=<Title>)(.*?)(?=</Title>)')
# pattern for extracting the relationship information
relationship_pattern = re.compile(r'(?<=<Relationship>)(.*?)(?=</Relationship>)')
# Fields to extract, in the order they are written into each dict
field_patterns = (
    ('Title', r_title_pattern),
    ('Relationship', relationship_pattern),
)

target = []
for term_fragments in rterm_list:
    # An observation without related terms simply yields an empty list
    parsed_terms = []
    for fragment in term_fragments:
        entry = {}
        for key, pattern in field_patterns:
            found = pattern.findall(fragment)
            # Keep only the first match; leave the key out when there is none
            if found:
                entry[key] = found[0]
        parsed_terms.append(entry)
    target.append(parsed_terms)
    

In [12]:
# Spot-check the first five parsed observations
target[:5]

[[{'Relationship': 'Narrower Term', 'Title': 'Shooting sport equipment'}],
 [{'Relationship': 'Narrower Term', 'Title': 'Shooting sport equipment'}],
 [{'Relationship': 'Narrower Term', 'Title': 'Shooting sport equipment'}],
 [{'Relationship': 'Narrower Term', 'Title': 'Shooting sport equipment'}],
 [{'Relationship': 'Used For', 'Title': '1 Kilometre TT'},
  {'Relationship': 'Used For', 'Title': '1km Time Trial'},
  {'Relationship': 'Used For', 'Title': '1km Time Trial'},
  {'Relationship': 'Used For', 'Title': '1km TT'},
  {'Relationship': 'Used For', 'Title': 'One km Time Trial'}]]

#### Now we can remove the related terms from the xml and prepare for the next step

In [13]:
## Strip the related-terms block out of every observation so that the only
## <Title> left in a chunk is the observation's own title
remove_related_list = [relationterm_pattern.sub('', record) for record in splitxml_new]

In [14]:
# Collect the title matches from each chunk that had its related terms removed
title_pattern = re.compile(r'(?<=<Title>)(.*?)(?=</Title>)')
title_list = [title_pattern.findall(stripped) for stripped in remove_related_list]

In [15]:
### Check the results: the three lists must have one entry per observation.
### The single-argument print(...) form produces the same output under Python 2
### and is also valid on a Python 3 kernel.
print(len(title_list))
print(len(description_list))
print(len(target))
# The next step combines the three lists position by position, so fail loudly
# here if they ever get out of step
assert len(title_list) == len(description_list) == len(target), "extracted lists are misaligned"

7863
7863
7863


In [16]:
## Assemble one dict per observation and wrap the list in the final structure
final_list = []
# The three lists are aligned by position, so they are walked in parallel
for descriptions, related, titles in zip(description_list, target, title_list):
    record = {}
    # A field is written only when something was extracted for it
    if descriptions:
        record['Description'] = descriptions[0]
    if related:
        record['RelatedTerms'] = related
    if titles:
        record['Title'] = titles[0]
    final_list.append(record)
# Final dict for this task
final_dict = {"thesaurus": final_list}

In [17]:
# Write out the result as JSON text
writepath = 'sport.dat'
# Keep the serialized text under its own name: binding it to `json` would shadow
# the json module and make this cell fail when it is run a second time
json_text = json.dumps(final_dict)
# The context manager closes the file even if the write raises
with open(writepath, "w") as f:
    f.write(json_text)