# TOP British Columbia Geology Papers
We will download the following paper repositories:
* GeoFile http://cmscontent.nrs.gov.bc.ca/geoscience/PublicationCatalogue/GeoFile/
* GeologyExplorationMining http://cmscontent.nrs.gov.bc.ca/geoscience/PublicationCatalogue/GeologyExplorationMining/
* GeoscienceBC http://cmscontent.nrs.gov.bc.ca/geoscience/PublicationCatalogue/GeoscienceBC/
* Paper http://cmscontent.nrs.gov.bc.ca/geoscience/PublicationCatalogue/Paper/
* Petroleum Geoscience Publications http://cmscontent.nrs.gov.bc.ca/geoscience/PublicationCatalogue/PetroleumGeosciencePublications/



# Status:
* GeoFile (Done)
* GeologyExplorationMining (Done)
* GeoscienceBC (Done)
* Paper (Done)
* Petroleum Geoscience Publications (Done)

In [None]:
import pandas as pd
import numpy as np
from os import listdir,environ,rename 
from sys import argv
from os.path import isfile,join,basename
from shutil import rmtree,move
import glob
from bs4 import BeautifulSoup
import os
from zipfile import ZipFile
import requests
from requests_html import HTMLSession,AsyncHTMLSession
import json

In [4]:
# Base folders of the project data tree.
root_dir = "/nrcan_p2"
data_dir = join(root_dir, "data")
# All British Columbia downloads are stored below data/01_raw/20210108/bc.
bc_root_dir = join(data_dir, "01_raw", "20210108", "bc")

# One destination folder per publication repository, all directly below
# bc_root_dir.
(
    geofile_dir,
    geology_exploration_mining_dir,
    geoscience_bc_dir,
    paper_dir,
    petrolium_geoscience_publications_dir,
) = (
    join(bc_root_dir, folder_name)
    for folder_name in (
        "geofile",
        "geology_exploration_mining",
        "geoscience_bc",
        "paper",
        "petrolium_geoscience_publications",
    )
)

In [6]:
# Manual switches for the notebook. Only MAKE_DIRS is read in this cell;
# GET_LINKS and DOWNLOAD are not read anywhere in the visible part of the
# notebook.
MAKE_DIRS = False
GET_LINKS = False
DOWNLOAD = False
if MAKE_DIRS:
    # makedirs(..., exist_ok=True) creates any missing parent folders and
    # does not fail when a folder is already there, so this cell can be
    # re-run safely and no folder depends on another one being created first.
    for target_dir in (
        geofile_dir,
        geology_exploration_mining_dir,
        geoscience_bc_dir,
        paper_dir,
        petrolium_geoscience_publications_dir,
    ):
        os.makedirs(target_dir, exist_ok=True)

In [51]:
def get_parsed_content(url, timeout=60):
    """Download *url* and return its body parsed as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without one the call could block indefinitely on a silent server.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``"html.parser"`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # NOTE(review): verify=False turns off TLS certificate verification; it is
    # kept as in the original code -- confirm it is still required.
    response = requests.get(url, verify=False, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were the listing.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def download_articles(article_links,http_root,destination_dir,max_length=100):
    """Download every link in *article_links* into *destination_dir*.

    The local file name is the link with *http_root* removed and every "/"
    replaced by "_", cut down to its last *max_length* characters.

    Args:
        article_links: Iterable of absolute URLs to download.
        http_root: Prefix stripped from each link before building the name.
        destination_dir: Existing folder that receives the files.
        max_length: Number of trailing characters of the name that are kept.

    Returns:
        A list of the links that could not be downloaded (empty when every
        download succeeded). A failing link is reported with ``print`` and
        skipped so that the remaining links are still processed.
    """
    chunk_size = 10000000  # bytes requested per iter_content() chunk
    request_timeout = 60   # seconds; avoids hanging forever on a dead server
    failed_links = []
    for article_link in article_links:
        article_name = article_link.replace(http_root,"").replace("/","_")
        article_name = article_name[-1*max_length:]
        target_path = join(destination_dir,article_name)
        try:
            # The context manager releases the streamed connection even when
            # the transfer is interrupted.
            # NOTE(review): verify=False is kept from the original code.
            with requests.get(article_link,stream=True,verify=False,
                              timeout=request_timeout) as r:
                # Check the status before opening the file so an HTTP error
                # page is never stored under a PDF name.
                r.raise_for_status()
                with open(target_path,"wb") as f:
                    for chunk in r.iter_content(chunk_size):
                        f.write(chunk)
        except requests.RequestException as err:
            # A failure in the middle of a transfer may leave a partial file
            # at target_path; the link is listed in the return value so it can
            # be retried.
            print(f'Failed to download {article_link}: {err}')
            failed_links.append(article_link)
    return failed_links

def download_all_pdf(http_root, link_root, destination_dir):
    """Download the PDF files linked from the page at http_root + link_root.

    The page is fetched with get_parsed_content(). Every <a> tag that has an
    href starting with link_root and containing ".pdf" is turned into an
    absolute URL by prefixing http_root, and the resulting list is passed to
    download_articles() together with the page URL and destination_dir.
    """
    listing_url = f'{http_root}{link_root}'
    listing = get_parsed_content(listing_url)

    required_prefix = f'{link_root}'
    pdf_urls = [
        f'{http_root}{target}'
        for target in (anchor['href'] for anchor in listing.find_all('a', href=True))
        if target.startswith(required_prefix) and ".pdf" in target
    ]

    download_articles(pdf_urls, listing_url, destination_dir)
    
def get_files_count(directory, print_caption=None):
    """Return how many entries directly inside *directory* are files.

    Sub-folders are not counted and are not descended into. When
    *print_caption* is truthy, "<print_caption>: <count>" is also printed.
    """
    nb_files = 0
    for entry_name in os.listdir(directory):
        if os.path.isfile(f"{directory}/{entry_name}"):
            nb_files += 1

    if print_caption:
        print (f'{print_caption}: {nb_files}')
    return nb_files

In [23]:
# Download the PDFs linked from the GeoFile catalogue page into geofile_dir.
download_all_pdf(
    http_root="http://cmscontent.nrs.gov.bc.ca",
    link_root="/geoscience/PublicationCatalogue/GeoFile/",
    destination_dir=geofile_dir,
)

In [24]:
# Download the PDFs linked from the GeologyExplorationMining catalogue page
# into geology_exploration_mining_dir.
download_all_pdf(
    http_root="http://cmscontent.nrs.gov.bc.ca",
    link_root="/geoscience/PublicationCatalogue/GeologyExplorationMining/",
    destination_dir=geology_exploration_mining_dir,
)

In [25]:
# Download the PDFs linked from the GeoscienceBC catalogue page into
# geoscience_bc_dir.
download_all_pdf(
    http_root="http://cmscontent.nrs.gov.bc.ca",
    link_root="/geoscience/PublicationCatalogue/GeoscienceBC/",
    destination_dir=geoscience_bc_dir,
)

In [26]:
# Download the PDFs linked from the Paper catalogue page into paper_dir.
download_all_pdf(
    http_root="http://cmscontent.nrs.gov.bc.ca",
    link_root="/geoscience/PublicationCatalogue/Paper/",
    destination_dir=paper_dir,
)

In [27]:
# Download the PDFs linked from the PetroleumGeosciencePublications catalogue
# page into petrolium_geoscience_publications_dir.
download_all_pdf(
    http_root="http://cmscontent.nrs.gov.bc.ca",
    link_root="/geoscience/PublicationCatalogue/PetroleumGeosciencePublications/",
    destination_dir=petrolium_geoscience_publications_dir,
)

In [55]:
# Print the file count of each destination folder (captioned with the folder
# path), then the sum over all of them.
bc_directories = (
    geofile_dir,
    geology_exploration_mining_dir,
    geoscience_bc_dir,
    paper_dir,
    petrolium_geoscience_publications_dir,
)
total_files = sum(get_files_count(folder, folder) for folder in bc_directories)
print(f'Total: {total_files}')

/nrcan_p2/data/01_raw/20210108/bc/geofile: 218
/nrcan_p2/data/01_raw/20210108/bc/geology_exploration_mining: 164
/nrcan_p2/data/01_raw/20210108/bc/geoscience_bc: 51
/nrcan_p2/data/01_raw/20210108/bc/paper: 1622
/nrcan_p2/data/01_raw/20210108/bc/petrolium_geoscience_publications: 218
Total: 2273
