In [54]:
import os
import xml.etree.ElementTree as ET
from os import path, makedirs
import csv
import pandas as pd

In [55]:
# Project root and the dated GEOSCAN snapshot stored beneath it.
root_dir = "/nrcan_p2"
data_dir = path.join(root_dir, "data")
snapshot_parts = ("01_raw", "20201006", "geoscan")
geoscan_files_dir = path.join(data_dir, *snapshot_parts)

# Raw downloads, split into one sub-directory per file type.
raw_dir = path.join(geoscan_files_dir, "raw")
zip_dir, pdf_dir = (path.join(raw_dir, kind) for kind in ("zip", "pdf"))

In [56]:
# Parse the XML metadata file that sits next to the raw GEOSCAN files.
eaidown_xml_path = path.join(geoscan_files_dir, 'EAIDown.xml')
tree = ET.parse(eaidown_xml_path)

In [57]:
# Top-level element of the parsed document; its children are iterated
# as individual articles by the cells below.
root = tree.getroot()

In [58]:
# Inspect the root element: its text, qualified tag and attributes.
for root_property in (root.text, root.tag, root.attrib):
    print(root_property)



{https://geoscan.nrcan.gc.ca/schema/osdp_feed/1.0/}dataroot
{'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'https://geoscan.nrcan.gc.ca/schema/osdp_feed/1.0/ https://geoscan.nrcan.gc.ca/schema/osdp_feed/1.0/geoscan.xsd'}


In [59]:
# Look at every field (tag, attributes, text) of the first article under the root.
# Index the root element directly instead of copying all of its children into a list.
first_article = root[0]
for child in first_article:
    print(child.tag, child.attrib, child.text)
    print('\n')

{http://purl.org/dc/elements/1.1/}contributor {} GEOSCAN


{http://purl.org/dc/elements/1.1/}title {'{http://www.w3.org/XML/1998/namespace}lang': 'en'} Rock-Eval/TOC data for ten southwest Alberta wells (townships 16 to 30, ranges 2 to 10W5)


{http://purl.org/dc/elements/1.1/}creator {} Watson, C


{http://purl.org/dc/elements/1.1/}creator {} Jayachandran, P T


{http://purl.org/dc/elements/1.1/}creator {} Spanswick, E


{http://purl.org/dc/elements/1.1/}creator {} Donovan, E F


{http://purl.org/dc/elements/1.1/}creator {} Danskin, D W


{http://purl.org/dc/elements/1.1/}subject {'{http://www.w3.org/XML/1998/namespace}lang': 'en'} wells


{http://purl.org/dc/elements/1.1/}subject {'{http://www.w3.org/XML/1998/namespace}lang': 'en'} lithology


{http://purl.org/dc/elements/1.1/}subject {'{http://www.w3.org/XML/1998/namespace}lang': 'en'} thermal maturation


{http://purl.org/dc/elements/1.1/}subject {'{http://www.w3.org/XML/1998/namespace}lang': 'en'} hydrocarbon migration


{http://p

In [137]:
# List only the qualified tag names of the first article's fields.
for field in root[0]:
    print(field.tag)

{http://purl.org/dc/elements/1.1/}contributor
{http://purl.org/dc/elements/1.1/}title
{http://purl.org/dc/elements/1.1/}creator
{http://purl.org/dc/elements/1.1/}creator
{http://purl.org/dc/elements/1.1/}creator
{http://purl.org/dc/elements/1.1/}creator
{http://purl.org/dc/elements/1.1/}creator
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1/}subject
{http://purl.org/dc/elements/1.1

In [75]:
def parse_keywords(xml_article):
    """
    Return the GeoScan ID of an article together with its subject keywords.

    Only subjects whose ``xml:lang`` attribute matches the article's language
    element are kept. The match is a substring test because the language
    element uses three-letter codes ("eng", "fre") while subjects use
    two-letter codes ("en", "fr").

    Parameters
    ----------
    xml_article : xml.etree.ElementTree.Element
        One article element whose children are namespaced metadata fields.

    Returns
    -------
    tuple
        ``(geo_id, keywords)``. ``geo_id`` is the text following
        ``"geoscanid:"`` in the last identifier that contains it, or ``None``
        when no identifier does. ``keywords`` is a set of lower-cased subject
        strings; it is empty when the language element is missing or empty,
        or when no subject matches.
    """
    # TODO: split keywords into individual words to find in the title
    # TODO: few articles have lang not english or french
    dc = "{http://purl.org/dc/elements/1.1/}"
    xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"
    id_marker = "geoscanid:"

    # A missing or empty language element yields "", which no subject code matches.
    lang = xml_article.findtext(dc + "language") or ""

    geo_id = None
    for identifier in xml_article.findall(dc + "identifier"):
        text = identifier.text
        # Empty identifier elements have text None and are skipped.
        if text and id_marker in text:
            # Take what follows the marker, up to the next colon (if any).
            geo_id = text.partition(id_marker)[2].split(":")[0]

    keywords = set()
    for subject in xml_article.findall(dc + "subject"):
        # English is eng in language field but en in subject field
        # French is fre in language field but fr in subject field
        subject_lang = subject.get(xml_lang)
        if subject_lang is None or subject_lang not in lang:
            continue
        # Subjects without text (e.g. empty elements) contribute nothing.
        if subject.text is not None:
            keywords.add(subject.text.lower())

    return geo_id, keywords


In [81]:
def create_keyword_dict(root):
    """
    Map each article's geoscan ID to the keywords parsed from that article.

    Every child of ``root`` is handed to ``parse_keywords``. If two articles
    yield the same ID, the later article's keywords replace the earlier ones.
    """
    return {
        geo_id: keywords
        for geo_id, keywords in map(parse_keywords, root)
    }

In [47]:
def search_title(xml_article, keywords=None):
    """
    Print and return the lower-cased text of the article's first title element.

    ``keywords`` is accepted but is not used by the current implementation.
    """
    title_elements = xml_article.findall("{http://purl.org/dc/elements/1.1/}title")
    first_title = title_elements[0]
    title = first_title.text.lower()
    print(title)
    return title

In [51]:
# Spot-check the parser on a single record (the third article under the root).
for article in root[2:3]:
    geo_id, keywords = parse_keywords(article)
    print(geo_id, keywords)


4681 {'paleontology', 'fossil distribution, geographic', 'invertebrata', 'fossil lists'}


In [49]:
# A few of the articles that are not designated as English or French:
# 7579
# 8284
# 11026 

In [76]:
# Build the geoscan-ID -> keywords lookup from every article under the root.
keyword_dict = create_keyword_dict(root)

In [78]:
# Number of articles for which metadata is provided
# (one dictionary entry per distinct geoscan ID).
len(keyword_dict)

12316

In [94]:
# Total number of files extracted as pdf or zip in this directory (there are more in total).
# Each directory is listed exactly once so the printed counts and the total always agree.
zip_count = len(os.listdir(zip_dir))
pdf_count = len(os.listdir(pdf_dir))
print('zip: ', zip_count)
print('pdf: ', pdf_count)
zip_count + pdf_count

zip:  4873
pdf:  7183


12056

In [29]:
# Show the directory that holds the GEOSCAN files for this snapshot.
geoscan_files_dir

'/nrcan_p2/data/01_raw/20201006/geoscan'

In [51]:
# Path of the Parquet file stored alongside the raw GEOSCAN files.
mypath = path.join(geoscan_files_dir, 'GEOSCAN-extract-20200211144755.xml_df.parquet')

In [52]:
# Load the Parquet file into a DataFrame.
df = pd.read_parquet(mypath)

In [53]:
# Display the DataFrame; its columns are named after namespaced metadata tags.
df

Unnamed: 0,{http://purl.org/dc/elements/1.1/}contributor,{http://purl.org/dc/elements/1.1/}title_en,{http://purl.org/dc/elements/1.1/}creator,{http://purl.org/dc/elements/1.1/}subject_en,{http://purl.org/dc/elements/1.1/}subject_fr,{http://purl.org/dc/elements/1.1/}source_en,{http://purl.org/dc/elements/1.1/}source_fr,{http://purl.org/dc/elements/1.1/}description_en,{http://purl.org/dc/elements/1.1/}description_fr,{http://purl.org/dc/elements/1.1/}date,...,{http://purl.org/dc/elements/1.1/}publisher_fr,{http://purl.org/dc/elements/1.1/}format_en,{http://purl.org/dc/elements/1.1/}format_fr,{http://purl.org/dc/elements/1.1/}identifier_info,{http://purl.org/dc/elements/1.1/}coverage,{http://purl.org/dc/elements/1.1/}identifier,{http://purl.org/dc/elements/1.1/}title_fr,{http://purl.org/dc/elements/1.1/}rights_en,{http://purl.org/dc/elements/1.1/}rights_fr,{http://purl.org/dc/elements/1.1/}title
0,[GEOSCAN],"[Voggite, a new hydrated Na-Zr hydroxide-phosp...","[Roberts, A C, Sabina, A P, Ercit, T S, Grice,...","[phosphates, carbonates, minerals, optical pro...","[minéraux de phosphates, carbonates, minéraux,...","[Canadian Mineralogist vol. 28, no. 1, p. 155...","[Canadian Mineralogist vol. 28, no. 1, p. 155...",[None],[None],[1990],...,,,,,,,,,,
1,[GEOSCAN],[The inversion of time-domain airborne electro...,"[Keating, P B, Crossley, D J]","[geophysical surveys, geophysical interpretati...","[levés géophysiques, interprétations géophysiq...","[Geophysics vol. 55, no. 6, p. 705-711; 10.11...","[Geophysics vol. 55, no. 6, p. 705-711; 10.11...",[Airborne electromagnetic (EM) methods were de...,[None],[1990],...,[None],[on-line],[en ligne],[doi/10.1190/1.1442882],,,,,,
2,[GEOSCAN],"[Lithosphere folds in the Eurekan orogen, Arct...","[Stephenson, R A, Ricketts, B D, Cloetingh, S ...","[tectonophysics, orogenies, structural feature...","[tectonophysique, orogénies, caractéristiques ...","[Geology vol. 18, no. 7, p. 603-606; 10.1130/...","[Geology vol. 18, no. 7, p. 603-606; 10.1130/...",[Cornwall and Princess Margaret arches are maj...,[None],[1990],...,[None],[on-line],[en ligne],,"[POLYGON((76.0000 -64.0000, 76.0000 -104.0000,...",[info:doi/10.1130/0091-7613(1990)018<0603:LFIT...,,,,
3,[GEOSCAN],,[Geological Survey of Canada],"[structural analyses, structural interpretatio...","[analyses structurales, interpretations struct...",[Geological Survey of Canada; Géologie de l'Or...,[Commission géologique du Canada; Géologie de ...,[None],[None],[1991],...,,[on-line],[en ligne],[doi/10.4095/270],"[POLYGON((74.0000 -62.0000, 74.0000 -124.0000,...",,[Introduction [Chapitre 12: Phases de déformat...,[https://open.canada.ca/en/open-government-lic...,[https://ouvert.canada.ca/fr/licence-du-gouver...,
4,[GEOSCAN],[Archaean Geology; Dating Old Gold Deposits],"[Thomas, G, Whalley, B J P]","[gold, mineral deposits, radiometric dates, ur...","[or, gisements minéraux, datations radiométriq...","[Nature vol. 346, no. 6287, p. 792-793]","[Nature vol. 346, no. 6287, p. 792-793]",[None],[None],[1990],...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92658,[GEOSCAN],[Clumped isotope temperature calibration for c...,"[Jautzy, J J, Martine, M M, Dhillon, R S, Bern...",[None],,[Geochemical Perspectives Letters],[Geochemical Perspectives Letters],[Clumped isotope (],[None],,...,[None],[on-line],[en ligne],,,,,,,
92659,[GEOSCAN],[An Overview on Isotopic Divergences - Causes ...,"[Savard, M M, Daux, V]",[None],,[Climate of the Past],[Climate of the Past],[Climatic reconstructions based on tree-ring i...,[None],,...,[None],[on-line],[en ligne],,,,,,,
92660,[GEOSCAN],[Catalogue of Mines Branch Publications],[Canada Department of Mines],[None],,"[Canada Mines Branch, Publication 337, , 28 pa...","[Canada Direction des mines, Publication 337, ...",[None],[None],[1917],...,[Canada Division des mines],,,[doi/10.4095/321799],,,,,,
92661,[GEOSCAN],"[Catalogue of Mines Branch Publications, with ...",[Canada Department of Mines],[None],,"[Canada Mines Branch, Publication 337, , 35 pa...","[Canada Direction des mines, Publication 337, ...",[None],[None],[1921],...,[Canada Division des mines],,,,,,,,,


In [48]:
# Inspect the geoscan ID column of the DataFrame.
df['{http://purl.org/dc/elements/1.1/}identifier_geoscanid']

0           [134]
1           [146]
2           [253]
3           [270]
4           [373]
           ...   
92658    [321797]
92659    [321798]
92660    [321799]
92661    [321800]
92662    [321801]
Name: {http://purl.org/dc/elements/1.1/}identifier_geoscanid, Length: 92663, dtype: object