### Jupyter Notebook with a collection of functions for processing ICOS measurement time series and STILT results

### Import this notebook with 
### %run '~/modules/STILT_modules_plus.ipynb'


### Import tools and libraries

In [1]:
# import required modules
import sys   
import netCDF4 as cdf
import numpy as np
from numpy import nan
import datetime as dt
from datetime import datetime#, timedelta, date, time
import os
import shutil
import zipfile
import fnmatch
import requests
import pickle
import pandas as pd
from operator import methodcaller
import matplotlib.pyplot as p
import matplotlib.colors as mcolors
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings raised by numpy/pandas; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Named colors as hex RGB strings
orange='#ff8c00'
lime='#00ff00'
aqua='#00ffff'
brown='#663300'
lightgray="#C0C0C0"
gray="#808080"

### Get list of ICOS class 1 and class 2 stations from Carbon Portal

In [2]:
def get_station_class():
    """Query the ICOS Carbon Portal for all class 1 and class 2 stations.

    Sends a SPARQL query to https://meta.icos-cp.eu/sparql asking for the
    short name, station class, country and long name of every station entry
    of type ``st:AS`` whose class is "1" or "2", ordered by class and id.

    Returns
    -------
    pandas.DataFrame
        One row per result binding. The columns are the query variables in
        the order reported by the endpoint (``head.vars``). A variable that
        is unbound in a result row is stored as None.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an HTTP error status.
    """
    url = 'https://meta.icos-cp.eu/sparql'

    query = """
    prefix st: <http://meta.icos-cp.eu/ontologies/stationentry/>
    select distinct ?stationId ?stationClass ?country ?longName
    from <http://meta.icos-cp.eu/resources/stationentry/>
    where{
      ?s a st:AS .
      ?s st:hasShortName ?stationId .
      ?s st:hasStationClass ?stationClass .
      ?s st:hasCountry ?country .
      ?s st:hasLongName ?longName .
      filter (?stationClass = "1" || ?stationClass = "2")
    }
    ORDER BY ?stationClass ?stationId 
    """
    r = requests.get(url, params = {'format': 'json', 'query': query})
    # Report a failed request as such instead of failing later with an
    # unrelated JSON decoding error or KeyError.
    r.raise_for_status()
    data = r.json()

    # Flatten the SPARQL JSON result: every binding maps a variable name to
    # {'value': ...}; variables missing from a binding become None.
    cols = data['head']['vars']
    datatable = [[row.get(c, {}).get('value') for c in cols]
                 for row in data['results']['bindings']]

    return pd.DataFrame(datatable, columns=cols)

### Store all STILT station information in a dictionary
Dictionary contains information on
- STILT station id
- Station coordinates (latitude, longitude)
- Altitude of tracer release in STILT simulation
- STILT location identifier
- Station name - if available

In [3]:
def create_STILT_dictionary(path_tmp):
    """Collect information on all STILT stations and cache it as a pickle file.

    Every entry below /data/stiltweb/stations/ becomes a key of the result.
    For entries that are symbolic links with an existing target, the link
    target is parsed for the station location. The station name is then
    looked up in the stiltweb station list and the ICOS station class is
    taken from get_station_class().

    Parameters
    ----------
    path_tmp : str
        Directory in which 'stationsDict.pickle' is written. It is created
        if it does not exist.

    Returns
    -------
    dict
        Maps STILT station id to a dict with the keys
        'lat', 'lon', 'alt', 'locIdent' (only for resolvable links),
        'name' ('' if unknown) and 'stationClass' (np.nan if the station
        is not an ICOS class 1 or class 2 site).
    """
    pathStations='/data/stiltweb/stations/'

    stations = {}

    for ist in sorted(set(os.listdir(pathStations))):
        stations[ist] = {}
        link = os.path.join(pathStations, ist)
        # os.readlink raises OSError on anything that is not a symbolic link,
        # so plain files/directories and dangling links are left without
        # location information instead of aborting the whole scan.
        if not (os.path.islink(link) and os.path.exists(link)):
            continue

        # The slicing assumes the link target ends with a fixed-width
        # identifier: 6 chars latitude + hemisphere, separator, 7 chars
        # longitude + hemisphere, separator, 5 digits altitude.
        loc_ident = os.readlink(link)
        clon = loc_ident[-13:-6]
        lon = float(clon[:-1])          # builtin float: np.float no longer exists
        if clon[-1:] == 'W':
            lon = -lon
        clat = loc_ident[-20:-14]
        lat = float(clat[:-1])
        if clat[-1:] == 'S':
            lat = -lat
        alt = int(loc_ident[-5:])       # builtin int: np.int no longer exists

        stations[ist]['lat']=lat
        stations[ist]['lon']=lon
        stations[ist]['alt']=alt
        stations[ist]['locIdent']=os.path.split(loc_ident)[-1]

    # add information on station name from the station list used in stiltweb
    url="https://stilt.icos-cp.eu/viewer/stationinfo"
    df = pd.read_csv(url)

    # One lookup table instead of filtering the frame per station; the first
    # non-empty name wins if an id is listed more than once.
    named = df.dropna(subset=['STILT name']).drop_duplicates(subset=['STILT id'])
    name_by_id = dict(zip(named['STILT id'], named['STILT name']))
    for ist in stations:
        stations[ist]['name'] = name_by_id.get(ist, '')

    # Get list of ICOS class 1 and class 2 stations from Carbon Portal
    df_datatable = get_station_class()
    icos_classes = list(zip(df_datatable['stationId'], df_datatable['stationClass']))

    # A STILT id belongs to an ICOS station when the ICOS id is a substring
    # of it; if several ICOS ids match, the last one in the table wins.
    for ist in stations:
        stations[ist]['stationClass'] = np.nan
        for istICOS, stationClass in icos_classes:
            if istICOS in ist:
                stations[ist]['stationClass'] = stationClass

    # write dictionary to pickle file for further use
    os.makedirs(path_tmp, exist_ok=True)
    with open(os.path.join(path_tmp, "stationsDict.pickle"), "wb") as pickle_file:
        pickle.dump(stations, pickle_file)

    return stations

### Read dictionary with all stations
Dictionary contains information on
- STILT station id
- Station coordinates (latitude, longitude)
- Altitude of tracer release in STILT simulation
- STILT location identifier
- Station name - if available

In [4]:
def read_STILT_dictionary(path_tmp):
    """Load the STILT station dictionary, creating the cache file if needed.

    Looks for 'stationsDict.pickle' inside ``path_tmp``. If the file is
    missing, create_STILT_dictionary(path_tmp) is called to build it.
    The dictionary content is printed station by station.

    Parameters
    ----------
    path_tmp : str
        Directory holding the pickle file, with or without a trailing
        path separator.

    Returns
    -------
    dict
        The station dictionary (station id -> dict of station properties).
    """
    filename = 'stationsDict.pickle'
    # Join with a separator so that the path matches the one written by
    # create_STILT_dictionary even when path_tmp has no trailing slash.
    pickle_path = os.path.join(path_tmp, filename)

    if not os.path.isfile(pickle_path):
        stations = create_STILT_dictionary(path_tmp)
    else:
        # NOTE: unpickling can execute arbitrary code; only point this at a
        # cache directory you trust.
        stations = pd.read_pickle(pickle_path)

    # print dictionary
    for ist in sorted(stations):
        print ('station:', ist)
        for k in stations[ist]:
            print (k,':', stations[ist][k])

    return stations

### List available footprints and store them in a dictionary

In [5]:
def available_STILT_dictionary():
    """List the years and months for which STILT footprints exist per station.

    Walks /data/stiltweb/stations/<station>/<year>/ and records the directory
    listing of each level. The ICOS station class is added from
    get_station_class().

    Returns
    -------
    dict
        Maps station id to a dict with
        'years'          : list of entries found in the station directory,
        '<year>'         : {'months': listing of the year directory,
                            'nmonths': number of entries in that listing},
        'stationClass'   : class from the ICOS station table, or np.nan if
                           no ICOS station id is contained in the station id.
    """
    pathStations='/data/stiltweb/stations/'

    available = {}

    # fill dictionary with station name, years and months for each year
    for ist in sorted(set(os.listdir(pathStations))):
        station_dir = os.path.join(pathStations, ist)
        # skip dangling links and plain files, which cannot be listed
        if not os.path.isdir(station_dir):
            continue
        years = os.listdir(station_dir)
        available[ist] = {'years': years}
        for yy in sorted(years):
            months = os.listdir(os.path.join(station_dir, yy))
            available[ist][yy] = {'months': months, 'nmonths': len(months)}

    # Get list of ICOS class 1 and class 2 stations from Carbon Portal
    df_datatable = get_station_class()
    icos_classes = list(zip(df_datatable['stationId'], df_datatable['stationClass']))

    # A STILT id belongs to an ICOS station when the ICOS id is a substring
    # of it; if several ICOS ids match, the last one in the table wins.
    for ist in sorted(available):
        available[ist]['stationClass'] = np.nan
        for istICOS, stationClass in icos_classes:
            if istICOS in ist:
                available[ist]['stationClass'] = stationClass

    # The former "latest year" computation was dropped: its result was never
    # used and it raised ValueError for stations without any year directory.
    return available

### Plot STILT footprint availability

In [6]:
def plot_available_STILT(pngfile=''):
    """Plot, per station and year, how many months of STILT footprints exist.

    Calls available_STILT_dictionary() and draws one row per station with one
    marker per year from 2006 up to the latest year found. The marker area
    grows with the square root of the number of months available in that
    year. ICOS class 1 stations are drawn as red diamonds, class 2 stations
    as blue triangles and all other stations in black.

    Parameters
    ----------
    pngfile : str, optional
        If non-empty, the figure is also saved as
        'plots/<pngfile>_<YYYYMMDD>.png' (the 'plots' directory is created
        when missing).
    """
    print ('run available_STILT_dictionary()')
    available = available_STILT_dictionary()

    startyear = 2006
    endyear = int(max([max(available[ist]['years']) for ist in available]))

    ny = endyear - startyear + 1
    yy = np.arange(ny) + startyear
    dy = 0.5

    # per station class: (scatter keywords, marker size factor, label keywords)
    class_styles = {
        '1': ({'c': 'r', 'marker': 'D'}, 30, {'color': 'r'}),
        '2': ({'c': 'b', 'marker': '^'}, 40, {'color': 'b'}),
    }
    other_style = ({'c': 'k'}, 40, {})

    fig = p.figure(figsize=(15, 32))
    for i, ist in enumerate(sorted(available, reverse=True)):
        entry = available[ist]
        # number of months with footprints for every year on the x axis
        nm = [entry[str(year)]['nmonths'] if str(year) in entry.keys() else 0
              for year in yy]
        scatter_kw, size_factor, text_kw = class_styles.get(entry['stationClass'],
                                                            other_style)
        p.scatter(yy, np.ones(np.size(yy))*i+dy,
                  s=size_factor*np.sqrt(np.asarray(nm)), **scatter_kw)
        p.text(startyear-2+0.2, i+dy/2, ist, fontsize=14, **text_kw)

    p.xticks(np.arange(startyear-2, np.max(yy)+2, 1.0))
    p.xlim(startyear-2, np.max(yy)+1)
    p.yticks(np.arange(0, len(available), 1.0), ())
    p.ylim(0, len(available))
    p.grid(axis='y')
    p.tick_params(labeltop=True,labelsize=14)
    p.title('Available STILT footprints (size proportional to number of months per year)\n\n\n', fontsize=18)
    p.figtext(0.4, 0.9, 'ICOS class 1 stations in red', color='r', fontsize=16, ha ='right')
    p.figtext(0.6, 0.9, 'ICOS class 2 stations in blue', color='b', fontsize=16, ha ='left')
    p.show()
    p.close()

    if len(pngfile)>0:
        plotdir='plots'
        if not os.path.exists(plotdir):
            os.mkdir(plotdir)
        stamp = dt.datetime.now().strftime('%Y%m%d')
        fig.savefig(plotdir+'/'+pngfile+'_'+stamp+'.png',dpi=100)

### Convert station longitude and latitude to STILT grid indices

In [7]:
# function to convert station longitude and latitude (slat, slon) to indices of STILT model grid (ix,jy)
def lonlat_2_ixjy(slon,slat,mlon,mlat):
    """Return the grid indices (ix, jy) of the model cell nearest to a station.

    slon, slat : longitude and latitude of the station
    mlon, mlat : 1-dim. longitude and latitude arrays of the model grid

    The index of the smallest absolute difference is taken separately for
    longitude and latitude; on ties the first such index is returned.
    """
    lon_offset = np.abs(mlon - slon)
    lat_offset = np.abs(mlat - slat)
    return lon_offset.argmin(), lat_offset.argmin()

### Read STILT time series (new format)

In [9]:
# function to read STILT concentration time series (new format of STILT results)
def read_stilt_timeseries(station,date_range):
    """Fetch STILT concentration time series for one station from stiltweb.

    Only time slots for which a result directory exists below
    /data/stiltweb/slots/<locIdent>/<YYYY>/<MM>/<YYYY>x<MM>x<DD>x<HH>/ are
    considered. The request covers the dates from the first to the last such
    slot.

    NOTE: the function reads the module-level dictionary ``stations`` (as
    returned by read_STILT_dictionary) to look up the 'locIdent' of
    ``station``; it must be defined before the function is called.

    Parameters
    ----------
    station : str
        STILT station id (key of ``stations``).
    date_range : iterable of datetime-like
        Time slots of interest; each item needs year/month/day/hour and
        strftime.

    Returns
    -------
    pandas.DataFrame
        Float columns as requested from the service plus 'name', 'model' and
        'wind.speed', indexed by 'date' (from 'isodate', seconds since
        epoch). If no slot exists, the request fails or no rows are returned,
        an empty frame with the single column 'A' is returned.
    """
    import json  # local import: json is not among the notebook-level imports

    url = 'https://stilt.icos-cp.eu/viewer/stiltresult'
    headers = {'Content-Type': 'application/json', 'Accept-Charset': 'UTF-8'}
    columns = ["isodate","co2.stilt","co2.fuel","co2.bio","co2.fuel.coal","co2.fuel.oil",
               "co2.fuel.gas","co2.fuel.bio","co2.energy","co2.transport","co2.industry",
               "co2.others","co2.cement","co2.background",
               "co.stilt","co.fuel","co.bio","co.fuel.coal","co.fuel.oil",
               "co.fuel.gas","co.fuel.bio","co.energy","co.transport","co.industry",
               "co.others","co.cement","co.background",
               "rn","rn.era","rn.noah","wind.dir","wind.u","wind.v","latstart","lonstart"]

    # check if STILT results exist
    new_range = []
    for zDate in date_range:
        slot_dir = ('/data/stiltweb/slots/' + stations[station]['locIdent'] + '/'
                    + '{0.year}/{0.month:02d}/'.format(zDate)
                    + '{0.year}x{0.month:02d}x{0.day:02d}x{0.hour:02d}/'.format(zDate))
        if os.path.exists(slot_dir):
            new_range.append(zDate)

    if not new_range:
        return pd.DataFrame({'A' : []})

    payload = {"columns": columns,
               "fromDate": new_range[0].strftime('%Y-%m-%d'),
               "toDate": new_range[-1].strftime('%Y-%m-%d'),
               "stationId": station}
    response = requests.post(url, headers=headers, data=json.dumps(payload))

    # Any failed request yields the same empty frame as "no data"; previously
    # an HTTP 500 left the result undefined and raised UnboundLocalError.
    if not response.ok:
        return pd.DataFrame({'A' : []})

    rows = response.json()
    if not rows:
        return pd.DataFrame({'A' : []})

    df = pd.DataFrame(rows, columns=columns)
    # missing values may arrive as JSON null or as the string 'null'
    df = df.replace('null', np.nan).astype(float)
    df['date'] = pd.to_datetime(df['isodate'], unit='s')
    df = df.set_index('date')
    df['name'] = station
    df['model'] = 'STILT'
    df['wind.speed'] = np.sqrt((df['wind.u']**2)+(df['wind.v']**2))
    return df

In [10]:
#Function that creates and adds a column with datetime objects in a dataframe:
def createDatetimeObjList(df_data):

    """
    Project:         'ICOS Carbon Portal'
    Created:          Mon Nov 05 10:27:00 2019
    Last Changed:     Mon Nov 05 10:27:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Add a 'DateTime' column to the input dataframe. The datetime objects are created
                      from the content of the 'Year', 'Month', 'Day', 'Hour' and 'Minute' columns of
                      every row, read as '%Y/%m/%d %H:%M'.
                      
    Input parameters: ICOS data pandas dataframe (Atmospheric Level-1 or Level-2 Data) containing the
                      columns 'Year', 'Month', 'Day', 'Hour' and 'Minute'
                      (var_name: 'df_data', var_type: pandas dataframe)
                      
    Output:           The same pandas dataframe object (modified in place) with the additional
                      column 'DateTime'. All other columns and the index are left untouched.
                      
    Raises:           ValueError if a row does not form a valid '%Y/%m/%d %H:%M' timestamp.
                            
    """
    
    #Walk through the five columns in parallel. Iterating over the values
    #(instead of looking rows up by the labels 0..n-1) also works for
    #dataframes whose index is not a default range index:
    date_parts = zip(df_data['Year'], df_data['Month'], df_data['Day'],
                     df_data['Hour'], df_data['Minute'])
    
    #Create a list with datetime obj:
    datetimeObj_list = [datetime.strptime('{}/{}/{} {}:{}'.format(year, month, day, hour, minute),
                                          '%Y/%m/%d %H:%M')
                        for year, month, day, hour, minute in date_parts]
    
    #Add list with datetime objects to the data dataframe:
    df_data['DateTime'] = datetimeObj_list
    
    #Return dataframe:
    return df_data


In [11]:
#Function that creates a dataframe with the data values:
def icosDatadf(data, tracer, level=2):

    """
    Project:         'ICOS Carbon Portal'
    Created:          Mon Oct 08 09:27:00 2019
    Last Changed:     Mon Oct 08 09:27:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Read in observations from ICOS Atmosphere Data File (Level-1 or Level-2) to a pandas dataframe.
    
    Input parameters: 1. Text containing column names and all observations (var_name: 'data', var_type: String).
                         The first non-empty line holds the ';'-separated column names, every following
                         non-empty line one ';'-separated observation.
                      2. Name of gas/tracer (var_name: 'tracer', var_type: String) - e.g. 'co2' or 'co' or 'ch4'
                      3. Data level [optional] (var_name: 'level', var_type: Integer)
    
    Default value for level: The default value for data level is "2". Function calls for Level-2 data do not have
                             to include a value for the level input parameter.
    
    Output:           pandas dataframe indexed by 'DateTime' (built from the 'Year', 'Month', 'Day', 'Hour'
                      and 'Minute' columns). All columns of the file are kept as strings.
                      Level 2:     only rows with 'Flag' equal to 'O' are returned.
                      Other level: negative values of the tracer and 'Stdev' columns (e.g. "-999.99")
                                   are replaced by NaN.
                            
    """
    
    #Split data to rows; drop carriage returns and empty rows (e.g. caused by
    #a trailing newline), which would otherwise end up as a bogus observation:
    data_rows = [row.rstrip('\r') for row in data.split('\n')]
    data_rows = [row for row in data_rows if row.strip()]
    
    #The first row contains the column names:
    data_labels = data_rows[0].split(';') if data_rows else []
    
    #Sepparate the columns of the remaining rows:
    d_rows_list = [row.split(';') for row in data_rows[1:]]
    
    #Create dataframe:
    df_data = pd.DataFrame.from_records(d_rows_list, columns=data_labels)
    
    #Add column with DateTime objects:
    df_data = createDatetimeObjList(df_data) 
    
    #Check what Level the data belong to:
    if(level==2):
        
        #Filter values based on flag 'O', which corresponds to quality controlled data:
        df_data = df_data.loc[df_data['Flag']=='O']
    
    else:
        #Set missing values (e.g. "-999.99") to NaN. The comparison is done in
        #full float precision; cells that are not numeric are left untouched:
        for column in (tracer, 'Stdev'):
            numeric_values = pd.to_numeric(df_data[column], errors='coerce')
            df_data.loc[numeric_values < 0, column] = np.nan
    
    #Return Data dataframe:
    return df_data.set_index('DateTime') #set column 'DateTime' as index
    

In [12]:
#Function that creates a dataframe with the metadata values:
def icosMetadatadf(data):
    

    """
    Project:         'ICOS Carbon Portal'
    Created:          Mon Oct 08 10:27:00 2019
    Last Changed:     Mon Oct 08 10:27:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Read in metadata from ICOS Atmosphere Data File (Level-1 or Level-2) to a pandas dataframe.
    
    Input parameters: Text containing the metadata header (var_name: 'data', var_type: String).
                      Every row has the form '# LABEL: value'. Rows starting with '#   ' are treated as
                      continuation lines of the last label (expected to be 'COMMENT:').
    
    Output:           pandas dataframe
    
                      index:   'MetadataLabel' - the label of every metadata row, e.g. 'TITLE',
                               'STATION CODE', ... The last label has its ':' removed ('COMMENT').
                      columns: 'MetadataInfo'  - the text following 'LABEL: '. For the last label the
                               continuation lines are appended, separated by ', '.
                               Rows without a value get an empty string.
                            
    """
    
    #Split off the continuation lines of the label "comment":
    metadata_split = data.split('\n#   ')
    
    #Get the metadata rows:
    metadata_rows = metadata_split.pop(0).split('\n')
    
    metadata_labels = []
    metadata_values = []
    
    for row in metadata_rows:
        
        #Remove the leading "# " only, so that a "# " inside a value survives:
        if row.startswith("# "):
            row = row[2:]
        
        #Split label from value at the first ": ", so that values which
        #contain ": " themselves are kept complete:
        label, separator, value = row.partition(': ')
        
        #Rows without a value look like "LABEL:":
        if not separator:
            label = label.rstrip().rstrip(':')
        
        metadata_labels.append(label)
        metadata_values.append(value)
    
    #Remove ":" from label "COMMENT:"
    metadata_labels[-1] = metadata_labels[-1].replace(":", "")
    
    #Join the continuation lines (and the text on the label row itself, if
    #there is any) to one string. Without continuation lines the value of
    #the last row is kept as it is:
    if metadata_split:
        comment_parts = [metadata_values[-1]] if metadata_values[-1] else []
        metadata_values[-1] = ", ".join(comment_parts + metadata_split)
    
    #Create a dictionary with the metadata-label and -values lists:
    metadata_dict = {'MetadataLabel':metadata_labels,
                     'MetadataInfo':metadata_values}
    
    #Create Metadata dataframe:
    df_metadata = pd.DataFrame(metadata_dict,columns=['MetadataLabel', 'MetadataInfo'])
    
    #Return Metadata dataframe with the 'MetadataLabel'-column as index:
    return df_metadata.set_index(keys=['MetadataLabel'])

In [13]:
def str2dataframe(data, tracer, level=2):
    
    """
    Project:         'ICOS Carbon Portal'
    Created:          Tue Oct 08 08:40:00 2019
    Last Changed:     Tue Oct 08 08:40:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Split the text of an ICOS Level-1 or Level-2 Atmospheric Data file at the
                      separator '\\n#\\n#' into a metadata part (text before the first separator) and
                      a data part (text between the first and the second separator, or up to the end).
                      The metadata part is passed to icosMetadatadf, the data part to icosDatadf.
                      
    Input parameters: 1. Text containing ICOS Level-1 or Level-2 Atmospheric metadata and observation-data
                         (var_name: 'data', var_type: String)
                      2. Name of gas/tracer - e.g. 'co2' or 'co' or 'ch4'
                         (var_name: 'tracer', var_type: String)
                      3. Data level [optional, default 2]
                         (var_name: 'level', var_type: Integer)
    
    Output:           2 pandas dataframes: metadata pandas dataframe, data pandas dataframe 

    """
    
    #parts[0] -- > metadata text, parts[1] -- > data text
    parts = data.split('\n#\n#')
    
    #Metadata first, then observations (same order as the parts in the file):
    metadata_frame = icosMetadatadf(parts[0])
    data_frame = icosDatadf(parts[1], tracer, level)
    
    return metadata_frame, data_frame
    

In [14]:
#Function that converts data with data type "bytes" to data type "string":
def byte2string(databytes):
   
    """
    Project:         'ICOS Carbon Portal'
    Created:          Tue Oct 08 08:40:00 2019
    Last Changed:     Tue Oct 08 08:40:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Decode UTF-8 encoded bytes to a string.
                      
    Input parameters: Binary data, e.g. the content of an ICOS Level-1 or Level-2 Atmospheric Data file
                      (var_name: 'databytes', var_type: Bytes)
    
    Output:           The decoded text (var_type: String).

    """
    
    #Decode and hand back the text in one step:
    return databytes.decode(encoding="utf-8")

In [15]:
#Function that unzips a file at a given directory:
def unzip(fullpath):
    
    """
    Project:         'ICOS Carbon Portal'
    Created:          Tue Oct 08 08:35:00 2019
    Last Changed:     Tue Oct 08 08:35:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Read the first file stored in a zip archive into memory. Nothing is written
                      to disk. Directory entries in the archive are skipped.
                      
    Input parameters: Path to zipped ICOS Level-1 or Level-2 Atmospheric Data file
                      (var_name: 'fullpath', var_type: String)
    
    Output:           Content of the first file in the archive (var_type: Bytes).
    
    Raises:           IndexError if the archive does not contain any file.

    """
    
    #Open zipfile in reading mode:
    with zipfile.ZipFile(fullpath, 'r') as zf:
        
        #Only real files can be read; a directory entry would yield b'':
        members = [info for info in zf.infolist() if not info.is_dir()]
        
        if not members:
            raise IndexError('ERROR: Did not find any file in zip file %s' % fullpath)
        
        return zf.read(members[0])

In [16]:
def read_ICOS_zipfile(filename, tracer, level=2):
    
    """
    Project:         'ICOS Carbon Portal'
    Created:          Tue Oct 08 08:30:00 2019
    Last Changed:     Tue Oct 08 08:30:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Read a zipped ICOS data file: the archive content is fetched with unzip, converted
                      from bytes to text with byte2string if necessary, and handed to str2dataframe,
                      which returns a pandas dataframe with metadata & a pandas dataframe with
                      observation data.
                      
    Input parameters: 1. File name for ICOS Level-1 or Level-2 Atmospheric Data File
                         (var_name: 'filename', var_type: String).
                      2. Name of gas/tracer - e.g. 'co2' or 'co' or 'ch4'
                         (var_name: 'tracer', var_type: String)
                      3. Data level [optional, default 2]
                         (var_name: 'level', var_type: Integer)
    
    Output:           2 pandas dataframes: metadata pandas dataframe, data pandas dataframe 
    
    """
    
    #Unzip file (python 3.x returns data type bytes instead of string):
    content = unzip(filename)

    #Convert to string where needed:
    text = byte2string(content) if type(content) == bytes else content

    #Create a pandas dataframe for metadata & one for data:
    metadata_frame, data_frame = str2dataframe(text, tracer, level)
    
    return metadata_frame, data_frame

In [17]:
# %load /home/ute/Stations/Claudio/helper_functions.py
"""
Created on Wed Oct  3 2018
Last change on Nov  1 2018
@author: Claudio D'Onofrio
"""

# NOTE(review): `__version__` is assigned again by the sparqls cell further
# down; when the notebook is run top to bottom the later assignment wins.
__version__= "0.1.0"

# create helper functions
#-----------------------------------------------------------------

def is_number(num):
    """Return True if ``num`` can be converted with float(), else False.

    Strings such as '1.5', '1e3', 'nan' or 'inf' count as numbers because
    float() accepts them. Values float() cannot handle at all (None, lists,
    dicts, ...) give False instead of raising TypeError.
    """
    try:
        float(num)
    except (TypeError, ValueError):
        return False
    return True

#-----------------------------------------------------------------

def checklib(module):
    """
    Check that every module in a list can be found by the import system.

    The modules are only located, they are not imported. For every module
    that cannot be found a message is printed.

    :param module: module name or list of module names to look for
    :return: 1 if all modules are available (also for an empty list),
             0 if at least one of them is missing
    :rtype: int
    """
    # 'imp' is deprecated and no longer available from Python 3.12 on;
    # importlib.util.find_spec is the supported way to locate a module.
    import importlib.util
    import sys

    # Accept a single module name as well as a list of names.
    if isinstance(module, str):
        module = [module]

    ret = 1
    for mod in module:
        try:
            found = importlib.util.find_spec(mod) is not None
        except ImportError:
            # Raised when the parent package of a dotted name is missing.
            found = False
        except ValueError:
            # Raised for an invalid name or for a module that is already
            # loaded but carries no spec; the latter is still usable.
            found = mod in sys.modules
        if not found:
            print("No module named '" + str(mod) + "'")
            # Keep the failure: one missing module must not be masked by
            # a later module that is available.
            ret = 0
    return ret

#---------------------------------------------------------------------


In [18]:
# %load /home/ute/Stations/Claudio/sparqls.py
"""
Created on Thu Nov 22 16:35:50 2018
contains functions, returning complete sparql queries,
to run against the ICOS Carbon Portal RDF Triple Store
@author: Claudio D'Onofrio

"""

# Version tag of the SPARQL-query code loaded into this cell (see the %load comment above).
# NOTE(review): this re-assigns the kernel-global __version__ that the helper-function
# cell above already set (same value "0.1.0").
__version__ = "0.1.0"

# personal storage of sparql queries 
# -----------------------------------------

def atc_query(tracer,level=2):
    """
        Return SPARQL query to get a list of
        ICOS Atmospheric CO2, CO or MTO, level 2 or level 1 (=NRT) data objects

       :param tracer: name of the tracer, e.g. 'co2', 'co' or 'mto' (case-insensitive)
       :param level: ICOS data level, 2 (default) or 1 (=NRT)
       :return: SPARQL query to get all ATC Level <level> products for tracer <tracer>
       :rtype: string
       :raises ValueError: if level is neither 1 nor 2
    """
    # Suffix of the object specification name for each supported data level.
    dataobject = {1: "NrtGrowingDataObject", 2: "L2DataObject"}
    if level not in dataobject:
        # A positional lookup (list[level-1]) would wrap around for level <= 0
        # and silently build the query for the wrong data level.
        raise ValueError("level must be 1 (NRT) or 2, got " + repr(level))

    # The specification name is built from the tracer in title case, e.g. 'co2' -> 'Co2'.
    tracer = tracer.lower().title()

    query = """
        prefix cpmeta: <http://meta.icos-cp.eu/ontologies/cpmeta/>
        prefix prov: <http://www.w3.org/ns/prov#>
        select ?dobj ?spec ?fileName ?size ?submTime ?timeStart ?timeEnd
        FROM <http://meta.icos-cp.eu/resources/atmprodcsv/>
        where {
                BIND(<http://meta.icos-cp.eu/resources/cpmeta/atc"""+tracer+dataobject[level]+"""> AS ?spec)
                ?dobj cpmeta:hasObjectSpec ?spec .
	
                FILTER NOT EXISTS {[] cpmeta:isNextVersionOf ?dobj}
                ?dobj cpmeta:hasSizeInBytes ?size .
                ?dobj cpmeta:hasName ?fileName .
                ?dobj cpmeta:wasSubmittedBy [
                prov:endedAtTime ?submTime ;
                prov:wasAssociatedWith ?submitter
                ] .
                ?dobj cpmeta:hasStartTime | (cpmeta:wasAcquiredBy / prov:startedAtTime) ?timeStart .
                ?dobj cpmeta:hasEndTime | (cpmeta:wasAcquiredBy / prov:endedAtTime) ?timeEnd .
        }

        """
    return query
##------------------------------------------------------------------------------
    
def atc_stationlist(station,tracer='co2',level=2):
    """
        Return SPARQL query to get a list of
        ICOS Atmospheric CO2, CO or MTO, level 2 or level 1 (=NRT) data objects
        for all stations in list

       :param station: station id or list of station ids; each id is inserted
                       into the query as station resource 'AS_<id>'
       :param tracer: name of the tracer, e.g. 'co2', 'co' or 'mto' (case-insensitive)
       :param level: ICOS data level, 2 (default) or 1 (=NRT)
       :return: SPARQL query to get all ATC products for specific stations, tracer and ICOS-level
       :rtype: string
       :raises ValueError: if level is neither 1 nor 2
    """
    # Suffix of the object specification name for each supported data level.
    dataobject = {1: "NrtGrowingDataObject", 2: "L2DataObject"}
    if level not in dataobject:
        # A positional lookup (list[level-1]) would wrap around for level <= 0
        # and silently build the query for the wrong data level.
        raise ValueError("level must be 1 (NRT) or 2, got " + repr(level))

    # The specification name is built from the tracer in title case, e.g. 'co2' -> 'Co2'.
    tracer = tracer.lower().title()

    # A single station id is treated as a one-element list. isinstance also
    # recognises str subclasses, which would otherwise be iterated character
    # by character below.
    if isinstance(station, str):
        station = [station]

    # Space-separated list of station resources for the VALUES clause.
    strUrl = " " + "".join(
        " <http://meta.icos-cp.eu/resources/stations/AS_" + ist + ">"
        for ist in station)

    query = """
    prefix cpmeta: <http://meta.icos-cp.eu/ontologies/cpmeta/>
    prefix prov: <http://www.w3.org/ns/prov#>
    select ?dobj ?spec ?fileName ?size ?submTime ?timeStart ?timeEnd
    FROM <http://meta.icos-cp.eu/resources/atmprodcsv/>
    where {
        BIND(<http://meta.icos-cp.eu/resources/cpmeta/atc"""+tracer+dataobject[level]+"""> AS ?spec)
        ?dobj cpmeta:hasObjectSpec ?spec .
        VALUES ?station {"""+strUrl+"""} ?dobj cpmeta:wasAcquiredBy/prov:wasAssociatedWith ?station .
        FILTER NOT EXISTS {[] cpmeta:isNextVersionOf ?dobj}
        ?dobj cpmeta:hasSizeInBytes ?size .
        ?dobj cpmeta:hasName ?fileName .
        ?dobj cpmeta:wasSubmittedBy [
            prov:endedAtTime ?submTime ;
            prov:wasAssociatedWith ?submitter
        ] .
        ?dobj cpmeta:hasStartTime | (cpmeta:wasAcquiredBy / prov:startedAtTime) ?timeStart .
        ?dobj cpmeta:hasEndTime | (cpmeta:wasAcquiredBy / prov:endedAtTime) ?timeEnd .
        }
        """
    return query


In [19]:
def get_ICOS_filename(station,tracer='co2',level=2,download=False,path_data=''):
    # %load /home/ute/Stations/Claudio/atc_co2_l2.py
    """
    Query the ICOS Carbon Portal for the atmospheric data objects of one or
    more stations and, optionally, download the corresponding files.

    Created on Thu Nov 22 17:17:27 2018

    @author: Claudio

    Input parameters: 1. Station id or list of station ids
                         (var_name: 'station', var_type: String or list of Strings)
                      2. Name of gas/tracer - e.g. 'co2' or 'co' [optional, default 'co2']
                         (var_name: 'tracer', var_type: String)
                      3. Data level, 2 or 1 (=NRT) [optional, default 2]
                         (var_name: 'level', var_type: Integer)
                      4. Download the listed files [optional, default False]
                         (var_name: 'download', var_type: Boolean)
                      5. Folder the files are downloaded to [optional];
                         an empty string selects '~/tmp_icos/'
                         (var_name: 'path_data', var_type: String)

    Output:           pandas dataframe with one row per data object and one
                      column per variable returned by the SPARQL endpoint
                      (the query selects dobj, spec, fileName, size, submTime,
                      timeStart and timeEnd).

    Files that already exist in the download folder are skipped.
    """

    import sys

    # tqdm only draws the download progress bar, so it is required only
    # when files are actually downloaded.
    modules = ["os", "requests", "pandas"]
    if download:
        modules.append("tqdm")

    # check if the modules are available, otherwise stop execution
    if not checklib(modules):
        sys.exit("module dependencies are not fulfilled")

    import os
    import requests
    import pandas as pd

    # --------------------------------------------------------------
    # this is the bit, where the sparql query is sent and we expect
    # a list of dataobjects
    url = 'https://meta.icos-cp.eu/sparql'

    r = requests.get(url, params={
        'format': 'json',
        'query': atc_stationlist(station,tracer=tracer,level=level)})
    # Report an HTTP error as such instead of failing later while decoding JSON.
    r.raise_for_status()
    result = r.json()

    # --------------------------------------------------------------
    # convert the result into a table: one row per data object,
    # variables missing in a result row become None
    cols = result['head']['vars']
    datatable = [[row.get(c, {}).get('value') for c in cols]
                 for row in result['results']['bindings']]
    dobj_table = pd.DataFrame(datatable, columns=cols)

    # --------------------------------------------------------------
    # download the listed files
    if download:
        from tqdm import tqdm

        if path_data == '':
            HOME = os.path.expanduser("~")
            path_data = HOME+'/tmp_icos/'
        os.makedirs(path_data, exist_ok=True)
        print("download all files: ",dobj_table.fileName, ' to ', path_data)

        # a little hack to provide "yes" to the licence agreement
        prefix = "https://data.icos-cp.eu/licence_accept?ids=%5B%22"
        suffix = "%22%5D"

        # now loop through the results, and download the corresponding file
        # to path_data. If the file does already exist it will be skipped.
        for dobj_url, file_name in zip(dobj_table.dobj, dobj_table.fileName):
            # os.path.join works with or without a trailing separator in
            # path_data; basename keeps the file inside path_data.
            target = os.path.join(path_data, os.path.basename(file_name))
            if os.path.isfile(target):
                print("file already exists, skip..."+file_name)
                continue

            # The object id is taken from element 4 of the '/'-separated dobj url
            # (presumably https://meta.icos-cp.eu/objects/<id> - TODO confirm).
            url = prefix + dobj_url.split("/")[4] + suffix

            with requests.get(url, stream=True) as response:
                if not response.ok:
                    # Do not store an error page under the data file name; it
                    # would be taken for an existing file on the next run.
                    print("download failed (HTTP "+str(response.status_code)+"), skip..."+file_name)
                    continue
                with open(target, "wb") as handle:
                    # Read in blocks: the default chunk size of iter_content
                    # is a single byte, which makes large downloads very slow.
                    for chunk in tqdm(response.iter_content(chunk_size=8192)):
                        handle.write(chunk)

        print("all done")

    return dobj_table


In [20]:
def get_ICOS_list(tracer,level=2):
    # %load /home/ute/Stations/Claudio/atc_co2_l2.py
    """
    Query the ICOS Carbon Portal for the list of atmospheric data objects
    of a tracer and data level. No files are downloaded.

    Created on Thu Nov 22 17:17:27 2018

    @author: Claudio

    Input parameters: 1. Name of gas/tracer - e.g. 'co2' or 'co'
                         (var_name: 'tracer', var_type: String)
                      2. Data level, 2 or 1 (=NRT) [optional, default 2]
                         (var_name: 'level', var_type: Integer)

    Output:           pandas dataframe with one row per data object and one
                      column per variable returned by the SPARQL endpoint
                      (the query selects dobj, spec, fileName, size, submTime,
                      timeStart and timeEnd).
    """

    import sys

    # Only the modules that are really used here are required; nothing is
    # downloaded, so no progress bar (tqdm) is needed.
    modules = ["requests", "pandas"]

    # check if the modules are available, otherwise stop execution
    if not checklib(modules):
        sys.exit("module dependencies are not fulfilled")

    import requests
    import pandas as pd

    # --------------------------------------------------------------
    # this is the bit, where the sparql query is sent and we expect
    # a list of dataobjects
    url = 'https://meta.icos-cp.eu/sparql'
    r = requests.get(url, params={
        'format': 'json',
        'query': atc_query(tracer,level)})
    # Report an HTTP error as such instead of failing later while decoding JSON.
    r.raise_for_status()
    result = r.json()

    # --------------------------------------------------------------
    # convert the result into a table: one row per data object,
    # variables missing in a result row become None
    cols = result['head']['vars']
    datatable = [[row.get(c, {}).get('value') for c in cols]
                 for row in result['results']['bindings']]

    return pd.DataFrame(datatable, columns=cols)


In [21]:
# List all functions that are defined once the cells above have been run.
# %who_ls is an IPython line magic, so this cell only works inside IPython/Jupyter;
# with the argument 'function' it returns the names of the function objects found
# in the interactive namespace.
func = %who_ls function
# "\033[1m" and "\033[0;0m" are ANSI escape sequences that switch bold text on and off.
# NOTE(review): the heading mentions STILT output only, but the list contains every
# function in the namespace (e.g. the ICOS helpers atc_query, get_ICOS_list, ...).
print ("\033[1m" + "Functions defined for handling STILT output:" + "\033[0;0m")
# Print one function name per line.
for ff in func:
    print (ff)

[1mFunctions defined for handling STILT output:[0;0m
atc_query
atc_stationlist
available_STILT_dictionary
byte2string
checklib
createDatetimeObjList
create_STILT_dictionary
get_ICOS_filename
get_ICOS_list
get_station_class
icosDatadf
icosMetadatadf
is_number
lonlat_2_ixjy
plot_available_STILT
read_ICOS_zipfile
read_STILT_dictionary
read_emissions
read_stilt_timeseries
str2dataframe
unzip
