In [4]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import datetime
from nltk import tokenize

In [5]:
# Folder where the xml files are located
FILE_DIR = "Input_files/CIESIN_XML_FIles"

In [6]:
# Creates a list of files and prints out the number of files found in the FILE_DIR folder
filename_list = []
for filename in os.listdir(FILE_DIR):
    if filename.endswith('.xml'):
        filename_list.append(filename)
print(len(filename_list))

8


In [7]:
filename_list

['ipcc-gridded-emissions.xml',
 'ipcc-ar4-observed-climate-impacts.xml',
 'ipcc-ar5-observed-climate-impacts-v2-01.xml',
 'ipcc-synthetic-vulnerability-climate-2005-2050-2100.xml',
 'ipcc-is92-emissions-scenarios-v1-1.xml',
 'ipcc-fluor-gases-emissions.xml',
 'ipcc-socio-economic-baseline.xml',
 'ipcc-emissions-v1-1.xml']

In [8]:
# Reads in all the XML files and maps them to the MDW schema (as per the mapping agreed with CEISIN)

import_list = []
for xml_file in filename_list:
    import_dict = {}
    print("FILENAME: {}".format(xml_file))
    etree = ET.parse(os.path.join("Input_files/CIESIN_XML_FIles", xml_file))
    root = etree.getroot()

    for citation in root.iter('citation'):
        
        for citeinfo in citation.findall('citeinfo'):    
            for child in citeinfo:
                #TITLE
                if child.tag == "title":
                    import_dict["Title"] = child.text
                #DOI
                if child.tag == "onlink":
                    import_dict["DOI Name"] = child.text
                    import_dict['Access URL'] = child.text
                #PUBLICATION DATE
                if child.tag == 'pubdate':
                    import_dict["Publication Date"] = datetime.strptime(child.text, '%Y%m%d').strftime('%d/%m/%Y')
                #FORMAT
                if child.tag == 'geoform':
                    import_dict['Format'] = child.text
    
    
    for description in root.iter('descript'):
        #DESCRIPTION
        for desc in description.findall('abstract'):
            import_dict["Description"] = desc.text.replace('\n',"").replace('                ',"")
        
            #ABSTRACT
            import_dict['Abstract'] = tokenize.sent_tokenize(
                desc.text.replace('\n',"").replace('                ',""))[0]
        
        
        #PURPOSE
        for purpose in description.findall('purpose'):
            import_dict['Purpose'] = purpose.text
    
    #Associated Media
    ass_media_list = []
    for distinfo in root.iter('distinfo'):
        for stdorder in distinfo.findall('stdorder'):
            for digform in stdorder.findall('digform'):
                for digtopt in digform.findall('digtopt'):
                    for onlinopt in digtopt.findall('onlinopt'):
                        for computer in onlinopt.findall('computer'):
                            for networka in computer.findall('networka'):
                                for networkr in networka.findall('networkr'):
                                    if networkr.text != None:
                                        ass_media_list.append(networkr.text.strip())
    import_dict['Associated Media'] = ass_media_list
    
    
    #Access Service
    import_dict['Access Service'] = 'Users must register with NASA\'s User Registration System'
    
    #KEYWORDS
    for keywords in root.iter('keywords'):
        keywords_list = []
        for theme in keywords.iter('theme'):
            for themekey in theme.iter('themekey'):
                keywords_list.append(themekey.text)
        import_dict['Keywords'] = keywords_list
    
    for distribution_info in root.iter('distrib'):
        for contact_info in distribution_info.findall('cntinfo'):
            #PUBLISHER - Name
            for contact_org_primary in contact_info.findall('cntorgp'):
                for contact_org in contact_org_primary.findall('cntorg'):
                    import_dict['Pub_Name'] = contact_org.text
            #PUBLISHER - Email
            for contact_email in contact_info.findall('cntemail'):
                import_dict['Pub_Contact Point'] = contact_email.text
                import_dict['Contact Point'] = contact_email.text
    #PUBLISHER - Logo
    for browser in root.iter('browse'):
        for browsen in browser.findall('browsen'):
            import_dict['Pub_Logo'] = browsen.text.strip()
    #PUBLISHER - Description
    import_dict['Pub_Description'] = 'https://sedac.ciesin.columbia.edu/about'
    
    for spatial_data_org in root.iter('spdom'):
        for bbox in spatial_data_org.findall('bounding'):
            for westbc in bbox.findall('westbc'):
                import_dict['Lower Left Longitude'] = westbc.text
            for eastbc in bbox.findall('eastbc'):
                import_dict['Upper Right Longitude'] = eastbc.text
            for northbc in bbox.findall('northbc'):
                import_dict['Upper Right Latitude'] = northbc.text
            for southbc in bbox.findall('southbc'):
                import_dict['Lower Left Latitude'] = southbc.text
    
    # Spatial Coverage
    import_dict['Spatial Coverage'] = ''
    
    # spatial aggregation
    spa_agg_string = ''
    for idinfo in root.iter('idinfo'):
        for keywords in idinfo.findall('keywords'):
            for theme in keywords.findall('theme'):
                for themekt in theme.findall('themekt'):
                    if themekt.text == "Data Granularity":
                        for themekey in theme.findall('themekey'):
                            spa_agg_string += themekey.text + ", "
            import_dict['Spatial Aggregation'] = spa_agg_string[:-2]
    
    # spatial resolution
    for spref in root.iter('spref'):
        for horizsys in spref.findall('horizsys'):
            for geograph in horizsys.findall('geograph'):
                for latres in geograph.findall('latres'):
                    lat_res = latres.text
                for longres in geograph.findall('longres'):
                    long_res = longres.text
                    if (lat_res != None) & (long_res != None):
                        import_dict['Spatial Resolution'] = lat_res + " x " + long_res
                    else:
                        import_dict['Spatial Resolution'] = ''
    
    # start date & end date
    for timeperd in root.iter('timeperd'):
        for timeinfo in timeperd.findall('timeinfo'):
            for rngdates in timeinfo.findall('rngdates'):
                for begdate in rngdates.findall('begdate'):
                    import_dict['Start Date'] = datetime.strptime(begdate.text, "%Y%m%d")
                for enddate in rngdates.findall('enddate'):
                    import_dict['End Date'] = datetime.strptime(enddate.text, "%Y%m%d")
    
    # Resource creator
    for citation in root.iter('citation'):
        for citeinfo in citation.findall('citeinfo'):
            for origin in citeinfo.findall('origin'):
                import_dict['Resource Creator'] = origin.text
    
        #Is referenced by
        '''
        is_referenced_by = []
        for lworkcit in citation.iter('lworkcit'):
            for citeinfo in lworkcit.findall('citeinfo'):
                for origin in citeinfo.findall('origin'):
                    for onlink in origin.findall('onlink'):
                        is_referenced_by.append(onlink.text)
        '''
    import_dict['Is Referenced By'] = ''
    
    # references
    references_list = []
    for idinfo in root.iter('idinfo'):
        for crossref in idinfo.findall('crossref'):
            for citeinfo in crossref.findall('citeinfo'):
                for onlink in citeinfo.findall('onlink'):
                    references_list.append(onlink.text)
    import_dict['References'] = references_list
    
    #For fields with no match:
    import_dict['Is Part Of'] = ''
    import_dict['Tools'] = ''
    import_dict['Temporal Resolution']= '' 
    import_dict['Language'] = 'en'
    import_dict['Alternate Identifier'] = ''
    import_dict['Source'] = ''
    import_dict['License'] = ''
    import_dict['Investigations'] = '' 
    import_dict['Pub_Identifier'] = ''
    import_dict['Pub_Description'] = 'https://sedac.ciesin.columbia.edu/about'
    import_dict['Is Referenced By'] = ''  
    import_dict['Qualified Relation'] = ''
    import_dict['Jurisdiction'] = ''
    
    import_list.append(import_dict)

FILENAME: ipcc-gridded-emissions.xml
FILENAME: ipcc-ar4-observed-climate-impacts.xml
FILENAME: ipcc-ar5-observed-climate-impacts-v2-01.xml
FILENAME: ipcc-synthetic-vulnerability-climate-2005-2050-2100.xml
FILENAME: ipcc-is92-emissions-scenarios-v1-1.xml
FILENAME: ipcc-fluor-gases-emissions.xml
FILENAME: ipcc-socio-economic-baseline.xml
FILENAME: ipcc-emissions-v1-1.xml


In [9]:
# Reorders the columns to the same order as the input spreadsheet
df = pd.DataFrame(import_list)
column_order = ['Title', 'Abstract', 'Contact Point', 'Keywords', 'DOI Name', 'Alternate Identifier',
                'Publication Date', 'Pub_Identifier', 'Pub_Name', 'Pub_Logo', 'Pub_Description', 'Pub_Contact Point', 
                'Description', 'Associated Media', 'Is Part Of', 'Spatial Coverage', 'Spatial Aggregation',
                'Spatial Resolution', 'Start Date', 'End Date', 'Temporal Resolution', 
                'Lower Left Latitude', 'Lower Left Longitude', 'Upper Right Latitude', 'Upper Right Longitude',
                'Purpose', 'Source', 'License', 'Resource Creator', 'Investigations',
                'Is Referenced By', 'References', 'Access URL', 'Access Service',
                'Jurisdiction', 'Language', 'Format', 'Qualified Relation', 'Tools']
df = df[column_order]
df

Unnamed: 0,Title,Abstract,Contact Point,Keywords,DOI Name,Alternate Identifier,Publication Date,Pub_Identifier,Pub_Name,Pub_Logo,...,Investigations,Is Referenced By,References,Access URL,Access Service,Jurisdiction,Language,Format,Qualified Relation,Tools
0,IPCC Special Report on Emissions Scenarios (SR...,The Intergovernmental Panel on Climate Change ...,ciesin.info@ciesin.columbia.edu,"[Global, Continent, Environment, Climate, EART...",https://doi.org/10.7927/H4RV0KMH,,31/07/2000,,NASA Socioeconomic Data and Applications Cente...,https://sedac.ciesin.columbia.edu/downloads/ma...,...,,,"[https://doi.org/10.7927/H4N29TWJ, https://doi...",https://doi.org/10.7927/H4RV0KMH,Users must register with NASA's User Registrat...,,en,"raster, tabular",,
1,IPCC Fourth Assessment Report (AR4) Observed C...,The Intergovernmental Panel on Climate Change ...,ciesin.info@ciesin.columbia.edu,"[Continent, Global, Climatology/Meteorology/At...",https://doi.org/10.7927/H4542KJV,,31/12/2008,,NASA Socioeconomic Data and Applications Cente...,https://sedac.ciesin.columbia.edu/downloads/ma...,...,,,"[https://doi.org/10.7927/H4N29TWJ, https://doi...",https://doi.org/10.7927/H4542KJV,Users must register with NASA's User Registrat...,,en,"tabular, map service",,
2,IPCC Fifth Assessment Report (AR5) Observed Cl...,The Intergovernmental Panel on Climate Change ...,ciesin.info@ciesin.columbia.edu,[EARTH SCIENCE > HUMAN DIMENSIONS > ENVIRONMEN...,https://doi.org/10.7927/H4FT8J0X,,28/06/2017,,NASA Socioeconomic Data and Applications Cente...,https://sedac.ciesin.columbia.edu/downloads/ma...,...,,,"[https://doi.org/10.7927/H4N29TWJ, https://doi...",https://doi.org/10.7927/H4FT8J0X,Users must register with NASA's User Registrat...,,en,tabular,,
3,Synthetic Assessment of Global Distribution of...,The Synthetic Assessment of Global Distributio...,ciesin.info@ciesin.columbia.edu,"[Country, Climatology/Meteorology/Atmosphere, ...",https://doi.org/10.7927/H4XG9P2R,,31/12/2006,,NASA Socioeconomic Data and Applications Cente...,https://sedac.ciesin.columbia.edu/downloads/ma...,...,,,"[https://doi.org/10.7927/H4N29TWJ, https://doi...",https://doi.org/10.7927/H4XG9P2R,Users must register with NASA's User Registrat...,,en,"document, map, map service",,
4,"IPCC IS92 Emissions Scenarios (A, B, C, D, E, ...",The Intergovernmental Panel on Climate Change ...,ciesin.info@ciesin.columbia.edu,"[Global, Continent, Country, Environment, Clim...",https://doi.org/10.7927/H41C1TT4,,31/07/2000,,NASA Socioeconomic Data and Applications Cente...,https://sedac.ciesin.columbia.edu/downloads/ma...,...,,,"[https://doi.org/10.7927/H4N29TWJ, https://doi...",https://doi.org/10.7927/H41C1TT4,Users must register with NASA's User Registrat...,,en,tabular,,
5,IPCC Special Report on Emissions Scenarios (SR...,The Intergovernmental Panel on Climate Change ...,ciesin.info@ciesin.columbia.edu,"[Global, Continent, Environment, Climate, EART...",https://doi.org/10.7927/H4HD7SKJ,,31/07/2000,,NASA Socioeconomic Data and Applications Cente...,https://sedac.ciesin.columbia.edu/downloads/ma...,...,,,"[https://doi.org/10.7927/H4N29TWJ, https://doi...",https://doi.org/10.7927/H4HD7SKJ,Users must register with NASA's User Registrat...,,en,tabular,,
6,IPCC Socio-Economic Baseline Dataset,The Intergovernmental Panel on Climate Change ...,ciesin.info@ciesin.columbia.edu,"[Global, Continent, Country, Environment, Soci...",https://doi.org/10.7927/H4WM1BB7,,31/12/1998,,NASA Socioeconomic Data and Applications Cente...,https://sedac.ciesin.columbia.edu/downloads/ma...,...,,,"[https://doi.org/10.7927/H4N29TWJ, https://doi...",https://doi.org/10.7927/H4WM1BB7,Users must register with NASA's User Registrat...,,en,tabular,,
7,IPCC Special Report on Emissions Scenarios (SR...,The Intergovernmental Panel on Climate Change ...,ciesin.info@ciesin.columbia.edu,"[Global, Continent, Environment, Climate, EART...",https://doi.org/10.7927/H4N29TWJ,,31/07/2000,,NASA Socioeconomic Data and Applications Cente...,https://sedac.ciesin.columbia.edu/downloads/ma...,...,,,"[https://doi.org/10.7927/H4RV0KMH, https://doi...",https://doi.org/10.7927/H4N29TWJ,Users must register with NASA's User Registrat...,,en,tabular,,


In [10]:
# Write the output to an excel spreadsheet
date = datetime.date(datetime.now())
with pd.ExcelWriter('output_files/CEISIN_metadata_{}.xlsx'.format(date)) as writer:  
    df.to_excel(writer, sheet_name='metadata', index=False)