In [1]:
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from urllib.parse import urljoin
import json
from fabric import Connection
import os
import requests
import re
import lxml
import cchardet
import xarray as xr
import re
import pandas as pd

# Get deposition data from the EMEP THREDDS server

This notebook retrieves annual data from the MET Norway THREDDS server. As of this date (14-04-2021), there are six years of reporting available (2015--2020).
The landing page for the EMEP data is: https://thredds.met.no/thredds/fou-kl/emep.html

In this notebook, we extract the EMEP data for coordinates of interest using the fimex tool (https://wiki.met.no/fimex/start), which interacts remotely with the dataset through OPeNDAP.
We are only interested in yearly data, and only a few years are available, so the links to the yearly files could simply be hardcoded.
First, we crawl the THREDDS server to find all the links to the yearly data.


## Links to the OpenDAP
We parse the THREDDS web pages using Beautiful Soup. The assumptions behind our crawling are that the data we are interested in are under the "Reporting" folders and that the netCDF files holding the yearly data have the string "year" in their filename.

In [2]:
%%time
# Entry page of the EMEP catalogue on the THREDDS server; the crawl starts here.
baseURL = 'https://thredds.met.no/thredds/fou-kl/emep.html'

# One HTTP session reused for every request made during the crawl.
s = requests.Session()

# Parse filter handed to BeautifulSoup: keep only <a> elements that have an href.
only_a_tags = SoupStrainer("a", href=True)

# Getting only tags with links. Only the tags containing the re_str will be included
def getSoup(url, re_str, timeout=60):
    """Return the absolute URLs of the links on a page whose text matches a pattern.

    Parameters
    ----------
    url : str
        Page to download. Relative hrefs found on the page are resolved against it.
    re_str : str
        Regular expression applied to the text of each <a> element
        (Beautiful Soup filters with the pattern's ``search`` method).
    timeout : float, optional
        Seconds to wait for the server before the request is aborted.

    Returns
    -------
    list of str
        One absolute URL per matching link, in document order. Empty when
        nothing matches.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Without this check an
        error page would be parsed and silently produce an empty list.
    """
    response = s.get(url, timeout=timeout)
    response.raise_for_status()
    # Only <a href=...> elements are parsed (see only_a_tags), which keeps parsing cheap.
    soup = BeautifulSoup(response.text, 'lxml', parse_only=only_a_tags)
    # `string=` is the current name of the deprecated `text=` argument.
    matching_links = soup.find_all('a', string=re.compile(re_str))
    return [urljoin(url, link['href']) for link in matching_links]

# Crawl: report catalogues -> yearly dataset pages -> OPeNDAP access pages -> data URLs.
# The session is closed in `finally` so it is released even when a request fails.
try:
    # Catalogue page of every link whose text contains "Reporting".
    allReports = getSoup(baseURL, 'Reporting')

    # Listing all the yearly datafiles for all report years.
    # Keys are the report years parsed from the catalogue URL ("<YYYY>_Reporting").
    allLinks = {}
    for report_url in allReports:
        display(report_url)
        # Raw string: '\_' in a plain string literal is an invalid escape sequence.
        year_match = re.search(r'[0-9]{4}(?=_Reporting)', report_url)
        if year_match is None:
            # The link text mentioned "Reporting" but the URL carries no report year: skip it.
            continue
        allLinks[int(year_match[0])] = getSoup(report_url, 'year')

    # Getting OPENDAP link to nc file: on each dataset page, take the first
    # link whose text starts with "/thredds/dodsC/".
    opendap = {
        report_year: [getSoup(dataset_page, '^/thredds/dodsC/')[0]
                      for dataset_page in dataset_pages]
        for report_year, dataset_pages in allLinks.items()
    }

    # Each access page holds a table row labelled "Data URL:" whose <input>
    # carries the URL we want; replace the page link by that URL in place.
    only_table = SoupStrainer("table")
    for access_pages in opendap.values():
        for position, access_page in enumerate(access_pages):
            response = s.get(access_page, timeout=60)
            response.raise_for_status()
            # Explicit parser, consistent with getSoup (no guessed-parser warning).
            soup = BeautifulSoup(response.text, 'lxml', parse_only=only_table)
            data_url_row = soup.find(string='Data URL:').find_parent('tr')
            access_pages[position] = data_url_row.select_one('input').get('value')

    display(opendap)
finally:
    s.close()

'https://thredds.met.no/thredds/catalog/data/EMEP/2020_Reporting/catalog.html'

'https://thredds.met.no/thredds/catalog/data/EMEP/2019_Reporting/catalog.html'

'https://thredds.met.no/thredds/catalog/data/EMEP/2018_Reporting/catalog.html'

'https://thredds.met.no/thredds/catalog/data/EMEP/2017_Reporting/catalog.html'

'https://thredds.met.no/thredds/catalog/data/EMEP/2016_Reporting/catalog.html'

'https://thredds.met.no/thredds/catalog/data/EMEP/2015_Reporting/catalog.html'

{2020: ['https://thredds.met.no/thredds/dodsC/data/EMEP/2020_Reporting/EMEP01_rv4_35_year.2019met_2018emis.nc',
  'https://thredds.met.no/thredds/dodsC/data/EMEP/2020_Reporting/EMEP01_rv4_35_year.2018met_2018emis.nc'],
 2019: ['https://thredds.met.no/thredds/dodsC/data/EMEP/2019_Reporting/EMEP01_L20EC_rv4_33_year.2018met_2017emis.nc',
  'https://thredds.met.no/thredds/dodsC/data/EMEP/2019_Reporting/EMEP01_L20EC_rv4_33_year.2017met_2017emis.nc',
  'https://thredds.met.no/thredds/dodsC/data/EMEP/2019_Reporting/EMEP01_L20EC_rv4_33_year.2016met_2016emis_rep2019.nc',
  'https://thredds.met.no/thredds/dodsC/data/EMEP/2019_Reporting/EMEP01_L20EC_rv4_33_year.2015met_2015emis_rep2019.nc',
  'https://thredds.met.no/thredds/dodsC/data/EMEP/2019_Reporting/EMEP01_L20EC_rv4_33_year.2014met_2014emis_rep2019.nc',
  'https://thredds.met.no/thredds/dodsC/data/EMEP/2019_Reporting/EMEP01_L20EC_rv4_33_year.2013met_2013emis_rep2019.nc',
  'https://thredds.met.no/thredds/dodsC/data/EMEP/2019_Reporting/EMEP01

CPU times: user 1.98 s, sys: 57.1 ms, total: 2.04 s
Wall time: 5.34 s


## Extracting information for a given set of coordinates
We will extract data for a particular set of coordinates using the fimex utility.

In [3]:
# Read the station table from the Excel sheet, keeping only the identifier
# and the coordinate columns.
station_columns = ['uniqueID','latitude','longitude']
stations_df = pd.read_excel('sites for N.dep.xlsx', usecols=station_columns)
display(stations_df)

# Template of the fimex configuration file. {lat} and {long} are filled in
# later with comma-separated coordinate lists.
cfgTemplate='''[extract]
selectVariables=DDEP_SOX_m2Grid
selectVariables=WDEP_SOX
selectVariables=DDEP_OXN_m2Grid
selectVariables=WDEP_OXN
selectVariables=DDEP_RDN_m2Grid
selectVariables=WDEP_RDN
[interpolate]
method=bilinear
latitudeValues={lat}
longitudeValues={long}
'''

Unnamed: 0,uniqueID,latitude,longitude
0,17019_1000,63.260440,26.415880
1,17026_1001,63.080790,26.753060
2,17071_1002,63.408060,26.536370
3,52757_10021,67.767180,29.456930
4,17276_1003,62.966680,27.137230
...,...,...,...
5380,SE760979-171460-NW760981-171456_NamnlÃ¶s,68.496536,21.053952
5381,SE761025-174651-NW761027-174649_NamnlÃ¶s,68.474500,21.830582
5382,SE761271-172880-NW761274-172881_NamnlÃ¶s,68.511407,21.405789
5383,SE763806-168299-SE763848-714579_Partaljaure,68.771552,20.334780


In [4]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Every slice has ``n`` items except possibly the last, which holds the
    remainder. Nothing is yielded for an empty ``lst``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

In [5]:
# Report year to process; must be one of the keys of `opendap`.
year = 2019

# Start from an empty emep_<year> directory: remove any previous one, then recreate it.
reset_dir_cmd = '''rm -rf emep_{0} && mkdir emep_{0}'''.format(year)
with Connection('localhost') as c:
    c.local(reset_dir_cmd)

In [6]:
# One (latitude, longitude, uniqueID) triple per station; the coordinates are
# converted to strings because they are pasted into the fimex config file.
latlong = [(str(lat), str(lon), uid)
           for lat, lon, uid in zip(stations_df.latitude.values,
                                    stations_df.longitude.values,
                                    stations_df.uniqueID)]

# Stations are sent to fimex in batches of this size.
CHUNK_SIZE = 1000

cnt = 0        # running number of fimex calls, used to keep output file names unique
frames = []    # one DataFrame per (file, chunk); concatenated once at the end

for nc_url in opendap[year]:
    for chunk in chunks(latlong, CHUNK_SIZE):
        cnt += 1

        # Write the fimex configuration for this batch of coordinates.
        with open('emep.cfg', 'w') as f:
            f.write(cfgTemplate.format(lat=','.join(point[0] for point in chunk),
                                       long=','.join(point[1] for point in chunk)))

        outFile = 'emep_{}/{:03d}_{}'.format(year, cnt, nc_url.split('/')[-1])
        with Connection('localhost') as c:
            c.local('fimex-1.5 -c emep.cfg --input.file={} --output.file={}'.format(nc_url, outFile))

        # The context manager closes the file once the data are in the DataFrame;
        # the original left every extracted file open.
        with xr.open_mfdataset(outFile) as ds:
            ds["totn"] = (ds["WDEP_OXN"] + ds["WDEP_RDN"]
                          + ds["DDEP_OXN_m2Grid"] + ds["DDEP_RDN_m2Grid"])
            ds["totn"].attrs["units"] = "mgN/m2"
            ds["tot_DEP_OXN"] = ds["WDEP_OXN"] + ds["DDEP_OXN_m2Grid"]
            ds["tot_DEP_OXN"].attrs["units"] = "mgN/m2"
            ds["tot_DEP_RDN"] = ds["WDEP_RDN"] + ds["DDEP_RDN_m2Grid"]
            ds["tot_DEP_RDN"].attrs["units"] = "mgN/m2"
            ds["tot_DEP_SOX"] = ds["WDEP_SOX"] + ds["DDEP_SOX_m2Grid"]
            # Sulphur deposition, so the unit is mgS/m2 (it was mislabelled mgN/m2).
            # NOTE(review): confirm against ds["WDEP_SOX"].attrs["units"].
            ds["tot_DEP_SOX"].attrs["units"] = "mgS/m2"

            df = ds.to_dataframe()

        # The first index level is the time stamp; keep only its year.
        df['year'] = [index_entry[0].year for index_entry in df.index]
        # Assumes fimex returns one row per requested point, in request order
        # (TODO confirm); pandas raises if the lengths differ.
        df['uniqueID'] = [point[2] for point in chunk]
        frames.append(df.reset_index(drop=True))

# A single concatenation instead of growing result_df inside the loop.
result_df = pd.concat(frames)

In [7]:
# Columns of the final report, in output order; uniqueID becomes the index.
report_columns = [
    'uniqueID', 'year', 'latitude', 'longitude',
    'WDEP_SOX', 'WDEP_OXN', 'WDEP_RDN',
    'DDEP_SOX_m2Grid', 'DDEP_OXN_m2Grid', 'DDEP_RDN_m2Grid',
    'totn', "tot_DEP_OXN", "tot_DEP_RDN", "tot_DEP_SOX",
]
result_df = result_df[report_columns].set_index('uniqueID')

In [8]:
# Write the report table to a CSV file named after the selected report year.
csv_name = 'Deposition_report_year_{}_bilinear.csv'.format(year)
result_df.to_csv(csv_name)

In [9]:
# Display the final report table as the cell output.
result_df

Unnamed: 0_level_0,year,latitude,longitude,WDEP_SOX,WDEP_OXN,WDEP_RDN,DDEP_SOX_m2Grid,DDEP_OXN_m2Grid,DDEP_RDN_m2Grid,totn,tot_DEP_OXN,tot_DEP_RDN,tot_DEP_SOX
uniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
17019_1000,2018,63.260441,26.415880,77.370789,99.361137,77.687241,18.372662,68.050438,21.900921,266.999725,167.411575,99.588165,95.743454
17026_1001,2018,63.080791,26.753059,77.433975,98.861961,73.875710,18.870155,68.445190,23.355230,264.538086,167.307159,97.230942,96.304131
17071_1002,2018,63.408058,26.536369,73.723686,95.520988,73.009819,15.848308,59.782364,20.399359,248.712524,155.303345,93.409180,89.571991
52757_10021,2018,67.767181,29.456930,43.757011,46.199341,25.482288,9.391156,27.042391,4.402149,103.126160,73.241730,29.884438,53.148167
17276_1003,2018,62.966679,27.137230,79.699646,103.168228,82.970818,20.518812,72.950775,25.132427,284.222229,176.119003,108.103241,100.218460
...,...,...,...,...,...,...,...,...,...,...,...,...,...
SE760979-171460-NW760981-171456_NamnlÃ¶s,2000,68.496536,21.053951,72.552795,54.104790,19.219986,5.071574,16.296120,0.943140,90.564034,70.400909,20.163126,77.624367
SE761025-174651-NW761027-174649_NamnlÃ¶s,2000,68.474503,21.830582,84.019669,62.434338,22.772577,5.442154,17.498785,1.082582,103.788284,79.933121,23.855160,89.461823
SE761271-172880-NW761274-172881_NamnlÃ¶s,2000,68.511406,21.405788,77.778206,58.050835,20.834770,4.976905,15.982054,0.926368,95.794029,74.032890,21.761139,82.755112
SE763806-168299-SE763848-714579_Partaljaure,2000,68.771553,20.334780,59.324951,45.241997,15.665462,4.574759,15.624271,0.846571,77.378304,60.866268,16.512033,63.899712
