## Jupyter Notebook with modules for reading ICOS atmospheric observations

Code to read ICOS atmospheric data provided by Karolina Pantazatou 
on 28 Nov 2018 in icos_timeseries_static_py2.ipynb

Code to handle sparql queries and download from Carbon Portal provided by Claudio D'Onofrio
on 22 Nov 2018 in sparqls.py, atc_co2_l2.py, and helper_functions.py

<br />    
<font color='red'> 
**Please copy the notebook to your own directory before making changes.** 
</font>

### Preparations
#### Import tools

In [1]:
#import modules:
import sys   
#reload(sys)
#sys.setdefaultencoding('utf8')
import os
import shutil
import zipfile
import pandas as pd
from operator import methodcaller
import numpy as np
from numpy import nan
import warnings
import datetime as dt
from datetime import datetime#, timedelta, date, time
#import matplotlib
#from matplotlib.ticker import FuncFormatter
#import matplotlib.pyplot as plt

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


In [2]:
#Function that creates and adds a column with datetime objects in a dataframe:
def createDatetimeObjList(df_data):

    """
    Project:         'ICOS Carbon Portal'
    Created:          Mon Nov 05 10:27:00 2019
    Last Changed:     Mon Nov 05 10:27:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Add a 'DateTime' column to the input dataframe. Each value is built from the
                      'Year', 'Month', 'Day', 'Hour' and 'Minute' columns of the same row, which are
                      concatenated as text and parsed with the format '%Y/%m/%d %H:%M'.
    
    Input parameters: ICOS data pandas dataframe (Atmospheric Level-1 or Level-2 Data)
                      (var_name: 'df_data', var_type: pandas dataframe).
                      The five date/time columns listed above must hold strings.
    
    Output:           The same pandas dataframe object that was passed in (it is modified in place),
                      with all its original columns plus:
                            'DateTime' (var_type: datetime) - one timestamp per row.
    
    Raises:           KeyError if one of the five date/time columns is missing,
                      ValueError if a row does not form a valid date/time.
                            
    """
    
    date_columns = ('Year', 'Month', 'Day', 'Hour', 'Minute')
    
    #Walk the five columns in lockstep. Iterating the values (instead of looking rows up
    #by the label i) keeps this working for dataframes whose index is not 0..n-1:
    datetimeObj_list = [datetime.strptime(year + "/" + month + "/" + day + " " + hour + ":" + minute,
                                          '%Y/%m/%d %H:%M')
                        for year, month, day, hour, minute
                        in zip(*(df_data[column] for column in date_columns))]
    
    #Add list with datetime objects to the data dataframe (positional assignment):
    df_data['DateTime'] = datetimeObj_list
    
    #Return dataframe:
    return df_data

In [3]:
#Function that creates a dataframe with the data values:
def icosDatadf(data, tracer, level=2):

    """
    Project:         'ICOS Carbon Portal'
    Created:          Mon Oct 08 09:27:00 2019
    Last Changed:     Mon Oct 08 09:27:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Read in observations from ICOS Atmosphere Data File (Level-1 or Level-2) to a pandas dataframe.
    
    Input parameters: 1. Text whose first non-blank line holds the ';'-separated column names and whose
                         remaining lines hold one ';'-separated observation each
                         (var_name: 'data', var_type: String)
                      2. Name of gas/tracer (var_name: 'tracer', var_type: String) - e.g. 'co2' or 'co' or 'ch4'.
                         Must match the name of the concentration column; only used when level is not 2.
                      3. Data level [optional] (var_name: 'level', var_type: Integer)
    
    Default value for level: The default value for data level is "2". Function calls for Level-2 data do not have
                             to include a value for the level input parameter.
    
    Processing:       level == 2:   only rows whose 'Flag' column equals 'O' are kept.
                      other levels: negative values (e.g. "-999.99") in the tracer column and in the 'Stdev'
                                    column are replaced by NaN.
    
    Output:           pandas dataframe indexed by 'DateTime' (built from the 'Year', 'Month', 'Day', 'Hour'
                      and 'Minute' columns). All columns named in the header line are kept and hold the
                      values as Strings (apart from the NaN replacements described above).
    
    Raises:           ValueError if 'data' contains no header line.
                            
    """
    
    #Split data to rows. splitlines() also handles Windows line endings, and blank lines
    #(typically a trailing newline) are dropped so they do not become rows of missing values:
    data_rows = [row for row in data.splitlines() if row.strip()]
    
    if not data_rows:
        raise ValueError("no header line found in ICOS observation data")
    
    #Remove the first row (contains the column names)
    #split its contents and add them to a list:
    data_labels = data_rows.pop(0).split(';')
    
    #Separate the columns of the remaining rows:
    d_rows_list = [row.split(';') for row in data_rows]
    
    #Create dataframe:
    df_data = pd.DataFrame.from_records(d_rows_list, columns=data_labels)
    
    #Add column with DateTime objects:
    df_data = createDatetimeObjList(df_data)
    
    #Check what Level the data belong to:
    if level == 2:
        
        #Filter values based on flag 'O', which corresponds to quality controlled data:
        df_data = df_data.loc[df_data['Flag'] == 'O']
    
    else:
        #Set missing values (e.g. "-999.99") to NaN. The comparison is done in double
        #precision: a half-precision cast overflows above 65504 and flushes tiny
        #negative values to -0.0, both of which can hide the sign of a value.
        df_data.loc[df_data[tracer].astype(float) < 0, tracer] = np.nan
        df_data.loc[df_data['Stdev'].astype(float) < 0, 'Stdev'] = np.nan
    
    #Return Data dataframe:
    return df_data.set_index('DateTime') #set column 'DateTime' as index
    

In [4]:
#Function that creates a dataframe with the metadata values:
def icosMetadatadf(data):
    

    """
    Project:         'ICOS Carbon Portal'
    Created:          Mon Oct 08 10:27:00 2019
    Last Changed:     Mon Oct 08 10:27:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Read in metadata from ICOS Atmosphere Data File (Level-1 or Level-2) to a pandas dataframe.
    
    Input parameters: Text containing the metadata header lines (var_name: 'data', var_type: String).
                      Expected layout:
                          - one "# LABEL: value" line per metadata entry
                            (e.g. 'TITLE', 'FILE NAME', 'STATION CODE', 'LATITUDE', ..., 'COMMENT'),
                          - the last entry may be followed by continuation lines that start with "#   ".
    
    Output:           pandas dataframe
    
                      index:   MetadataLabel (var_type: String) - the text before the first ": " of each line.
                               Every ":" is removed from the label of the last entry (e.g. "COMMENT:").
                      columns: MetadataInfo (var_type: String) - the text after the first ": " of each line
                               (empty String if the line has no ": "). For the last entry the inline value
                               (if any) and all continuation lines are joined with ", ".
                            
    """
    
    #Continuation lines start with "#   "; splitting on that marker leaves the block with
    #the regular "# LABEL: value" lines first, followed by one item per continuation line:
    header_text, *comment_lines = data.split('\n#   ')
    
    metadata_labels = []
    metadata_values = []
    
    for row in header_text.split('\n'):
        
        #Remove only the leading "# " marker (a "# " inside a value must survive):
        if row.startswith("# "):
            row = row[2:]
        
        #Split at the FIRST ": " only, so values that contain ": " are not truncated.
        #Rows without ": " get an empty value instead of raising an IndexError:
        label, _, value = row.partition(': ')
        metadata_labels.append(label)
        metadata_values.append(value)
    
    #Remove ":" from label "COMMENT:"
    metadata_labels[-1] = metadata_labels[-1].replace(":", "")
    
    #Join the inline value of the last entry (if any) and its continuation lines to one string:
    inline_value = [metadata_values[-1]] if metadata_values[-1] else []
    metadata_values[-1] = ", ".join(inline_value + comment_lines)
    
    #Create Metadata dataframe:
    df_metadata = pd.DataFrame({'MetadataLabel': metadata_labels,
                                'MetadataInfo': metadata_values},
                               columns=['MetadataLabel', 'MetadataInfo'])
    
    #Return Metadata dataframe with the 'MetadataLabel'-column as index:
    return df_metadata.set_index('MetadataLabel')

In [5]:
def str2dataframe(data, tracer, level=2):
    
    """
    Project:         'ICOS Carbon Portal'
    Created:          Tue Oct 08 08:40:00 2019
    Last Changed:     Tue Oct 08 08:40:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Cut the text of an ICOS Level-1 or Level-2 Atmospheric Data file at the
                      separator "\\n#\\n#" and hand the two leading pieces to the parsers:
                      the first piece goes to icosMetadatadf(), the second to icosDatadf().
                      
    Input parameters: 1. Text containing ICOS Level-1 or Level-2 Atmospheric metadata and observation-data
                         (var_name: 'data', var_type: String)
                      2. Name of gas/tracer - e.g. 'co2' or 'co' or 'ch4'
                         (var_name: 'tracer', var_type: String)
                      3. Data level [optional], passed on to icosDatadf()
                         (var_name: 'level', var_type: Integer, default: 2)
    
    Output:           2 pandas dataframes: metadata pandas dataframe, data pandas dataframe 

    """
    
    #sections[0] -- > metadata text
    #sections[1] -- > column names + observations
    sections = data.split('\n#\n#')
    
    #Build the ICOS Metadata dataframe first, then the ICOS Data dataframe:
    metadata_frame = icosMetadatadf(sections[0])
    observations_frame = icosDatadf(sections[1], tracer, level)
    
    return metadata_frame, observations_frame
    

In [6]:
#Function that converts data with data type "bytes" to data type "string":
def byte2string(databytes):
   
    """
    Project:         'ICOS Carbon Portal'
    Created:          Tue Oct 08 08:40:00 2019
    Last Changed:     Tue Oct 08 08:40:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Decode UTF-8 encoded bytes to text.
                      
    Input parameters: Binary data, e.g. the raw content of an ICOS Level-1 or Level-2 Atmospheric Data file
                      (var_name: 'databytes', var_type: Bytes)
    
    Output:           The decoded text (var_type: String).

    """
    
    #Decode and hand back the text in one step:
    return databytes.decode(encoding="utf-8")

In [7]:
#Function that unzips a file at a given directory:
def unzip(fullpath):
    
    """
    Project:         'ICOS Carbon Portal'
    Created:          Tue Oct 08 08:35:00 2019
    Last Changed:     Tue Oct 08 08:35:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Read the first member of a zipped ICOS Level-1 or Level-2 Atmospheric Data File
                      into memory. Nothing is extracted to disk; further members are ignored.
                      
    Input parameters: Path to the zip file (var_name: 'fullpath', var_type: String)
    
    Output:           Content of the first archive member (var_type: Bytes).
    
    Raises:           IndexError if the archive has no members.
                      Errors raised by zipfile (e.g. for a file that is not a zip archive) are passed on.

    """
    
    #Open zipfile in reading mode:
    with zipfile.ZipFile(fullpath, 'r') as zf:
        
        members = zf.namelist()
        
        #Fail with a clear message instead of an anonymous "list index out of range":
        if not members:
            raise IndexError('zip file %s is empty' % fullpath)
        
        #The member name comes straight from the archive's own listing, so no special
        #handling of a missing member is needed; any read error is passed on to the caller:
        return zf.read(members[0])

In [8]:
def read_ICOS_zipfile(filename, tracer, level=2):
    
    """
    Project:         'ICOS Carbon Portal'
    Created:          Tue Oct 08 08:30:00 2019
    Last Changed:     Tue Oct 08 08:30:00 2019
    Version:          1.0.0
    Author(s):        Karolina
    
    Description:      Function that reads a zipped ICOS data file with unzip(), converts the content to
                      text with byte2string() if it is binary, and passes the text to str2dataframe() to get
                      a pandas dataframe with metadata & a pandas dataframe with observation data.
                      
    Input parameters: 1. File name for ICOS Level-1 or Level-2 Atmospheric Data File
                         (var_name: 'filename', var_type: String).
                      2. Name of gas/tracer - e.g. 'co2' or 'co' or 'ch4'
                         (var_name: 'tracer', var_type: String)
                      3. Data level [optional]
                         (var_name: 'level', var_type: Integer)
    
    Default value for level: The default value for data level is "2". Function calls for Level-2 data do not have
                             to include a value for the level input parameter.
    
    Output:           2 pandas dataframes: metadata pandas dataframe, data pandas dataframe 
    
    """
    
    #Unzip file: 
    data = unzip(filename) #returns data type bytes in python 3.x (instead of string)

    #Check data type & convert to string (isinstance also accepts subclasses of bytes):
    if isinstance(data, bytes):
        data = byte2string(data)

    #Call function to create a pandas dataframe for metadata & one for data
    #and return both dataframes:
    return str2dataframe(data, tracer, level)

In [9]:
# %load /home/ute/Stations/Claudio/helper_functions.py
"""
Created on Wed Oct  3 2018
Last change on Nov  1 2018
@author: Claudio D'Onofrio
"""

__version__= "0.1.0"

# create helper functions
#-----------------------------------------------------------------

def is_number(num):
    """
    Check if we deal with a number.

    :param num: any object
    :return: True if float(num) succeeds, otherwise False. Note that float()
             also accepts text such as "nan" and "inf", so those count as numbers.
    :rtype: bool
    """
    try:
        float(num)
    except (TypeError, ValueError):
        # ValueError: text that is not a number; TypeError: None, lists, dicts, ...
        return False
    return True

#-----------------------------------------------------------------

def checklib(module):
    """
    Check whether all modules in a list can be found (nothing is imported).

    A message is printed for every module that cannot be found.

    :param module: list of module names; a single name given as a string is accepted as well
    :return: 1 if every module was found (also for an empty list), 0 if at least one is missing
    :rtype: int
    """
    # local import, like the other helper imports in this file; importlib replaces
    # the deprecated "imp" module, which is no longer available in Python 3.12
    import importlib.util

    if isinstance(module, str):
        module = [module]

    ret = 1
    for mod in module:
        try:
            spec = importlib.util.find_spec(mod)
        except (ImportError, ValueError) as imperror:
            # raised e.g. when the parent package of a dotted name is missing
            print(imperror)
            ret = 0
            continue
        if spec is None:
            print("No module named '%s'" % mod)
            # one missing module is enough to fail; keep going to report all of them
            ret = 0
    return ret

#---------------------------------------------------------------------


In [10]:
# %load /home/ute/Stations/Claudio/sparqls.py
"""
Created on Thu Nov 22 16:35:50 2018
contains functions, returning complete sparql queries,
to run against the ICOS Carbon Portal RDF Triple Store
@author: Claudio D'Onofrio

"""

__version__ = "0.1.0"

# personal storage of sparql queries 
# -----------------------------------------

def atc_query(tracer,level=2):
    """
        Return SPARQL query to get a list of
        ICOS Atmospheric CO2, CO or MTO, level 2 or level 1 (=NRT) data objects

       :param tracer: name of the tracer, e.g. 'co2'; upper/lower case does not matter,
                      it is normalised to a capitalised form ('co2' -> 'Co2')
       :param level: 1 (near real time, "NrtGrowingDataObject") or 2 ("L2DataObject")
       :return: SPARQL query to get all ATC Level <level> products for tracer <tracer>
       :rtype: string 
       :raises ValueError: if level is neither 1 nor 2
    """
    dataobject = {1: "NrtGrowingDataObject", 2: "L2DataObject"}

    # an explicit check: with list indexing, level 0 or -1 would silently pick
    # an entry through negative indexing instead of being rejected
    if level not in dataobject:
        raise ValueError("level must be 1 (NRT) or 2 (L2), got %r" % (level,))

    # name of the object specification, e.g. 'atcCo2L2DataObject'
    spec = "atc" + tracer.lower().title() + dataobject[level]
    
    query = """
        prefix cpmeta: <http://meta.icos-cp.eu/ontologies/cpmeta/>
        prefix prov: <http://www.w3.org/ns/prov#>
        select ?dobj ?spec ?fileName ?size ?submTime ?timeStart ?timeEnd
        FROM <http://meta.icos-cp.eu/resources/atmprodcsv/>
        where {
                BIND(<http://meta.icos-cp.eu/resources/cpmeta/"""+spec+"""> AS ?spec)
                ?dobj cpmeta:hasObjectSpec ?spec .
	
                FILTER NOT EXISTS {[] cpmeta:isNextVersionOf ?dobj}
                ?dobj cpmeta:hasSizeInBytes ?size .
                ?dobj cpmeta:hasName ?fileName .
                ?dobj cpmeta:wasSubmittedBy [
                prov:endedAtTime ?submTime ;
                prov:wasAssociatedWith ?submitter
                ] .
                ?dobj cpmeta:hasStartTime | (cpmeta:wasAcquiredBy / prov:startedAtTime) ?timeStart .
                ?dobj cpmeta:hasEndTime | (cpmeta:wasAcquiredBy / prov:endedAtTime) ?timeEnd .
        }

        """
    return query
##------------------------------------------------------------------------------
    
def atc_stationlist(station,tracer='co2',level=2):
    """
        Return SPARQL query to get a list of
        ICOS Atmospheric CO2, CO or MTO, level 2 or level 1 (=NRT) data objects
        for all stations in list

       :param station: station code (string) or list of station codes, e.g. 'HTM' or ['HTM', 'SMR'];
                       each code is inserted verbatim into the station address '.../stations/AS_<code>'
       :param tracer: name of the tracer, e.g. 'co2'; upper/lower case does not matter,
                      it is normalised to a capitalised form ('co2' -> 'Co2')
       :param level: 1 (near real time, "NrtGrowingDataObject") or 2 ("L2DataObject")
       :return: SPARQL query to get all ATC products for specific stations, tracer and ICOS-level
       :rtype: string 
       :raises ValueError: if level is neither 1 nor 2
    """
    dataobject = {1: "NrtGrowingDataObject", 2: "L2DataObject"}

    # an explicit check: with list indexing, level 0 or -1 would silently pick
    # an entry through negative indexing instead of being rejected
    if level not in dataobject:
        raise ValueError("level must be 1 (NRT) or 2 (L2), got %r" % (level,))

    # name of the object specification, e.g. 'atcCo2L2DataObject'
    spec = "atc" + tracer.lower().title() + dataobject[level]
    
    # a single station code is treated like a list with one entry
    if isinstance(station, str):
        station = [station]

    # blank-separated list of station addresses for the VALUES clause
    strUrl = " " + "".join(" <http://meta.icos-cp.eu/resources/stations/AS_" + ist + ">"
                           for ist in station)

    query = """
    prefix cpmeta: <http://meta.icos-cp.eu/ontologies/cpmeta/>
    prefix prov: <http://www.w3.org/ns/prov#>
    select ?dobj ?spec ?fileName ?size ?submTime ?timeStart ?timeEnd
    FROM <http://meta.icos-cp.eu/resources/atmprodcsv/>
    where {
        BIND(<http://meta.icos-cp.eu/resources/cpmeta/"""+spec+"""> AS ?spec)
        ?dobj cpmeta:hasObjectSpec ?spec .
        VALUES ?station {"""+strUrl+"""} ?dobj cpmeta:wasAcquiredBy/prov:wasAssociatedWith ?station .
        FILTER NOT EXISTS {[] cpmeta:isNextVersionOf ?dobj}
        ?dobj cpmeta:hasSizeInBytes ?size .
        ?dobj cpmeta:hasName ?fileName .
        ?dobj cpmeta:wasSubmittedBy [
            prov:endedAtTime ?submTime ;
            prov:wasAssociatedWith ?submitter
        ] .
        ?dobj cpmeta:hasStartTime | (cpmeta:wasAcquiredBy / prov:startedAtTime) ?timeStart .
        ?dobj cpmeta:hasEndTime | (cpmeta:wasAcquiredBy / prov:endedAtTime) ?timeEnd .
        }
        """
    return query


In [11]:
def get_ICOS_filename(station,tracer='co2',level=2,download=False):
    # %load /home/ute/Stations/Claudio/atc_co2_l2.py
    """
    Ask the ICOS Carbon Portal which ATC data objects exist for the given
    station(s), tracer and level, and optionally download the files.
    Created on Thu Nov 22 17:17:27 2018

    @author: Claudio

    :param station: station code or list of station codes (passed to atc_stationlist)
    :param tracer: name of the tracer, e.g. 'co2' (passed to atc_stationlist)
    :param level: data level 1 or 2 (passed to atc_stationlist)
    :param download: if True, every listed file that does not yet exist in the
                     current working directory is downloaded into it
    :return: pandas dataframe with one row per data object; the columns are the
             variables of the SPARQL query (dobj, spec, fileName, size, ...)
    :raises requests.HTTPError: if the portal answers a request with an error status
    """

    import sys

    # set the list of necessary modules to run the code
    modules = ["os", "requests", "pandas", "tqdm"]

    # check if the modules are available and load them, otherwise stop execution
    if not checklib(modules):
        sys.exit("module dependencies are not fulfilled")

    import os
    import requests
    import pandas as pd
    from tqdm import tqdm

    # --------------------------------------------------------------
    # this is the bit, where the sparql query is sent and we expect
    # a list of dataobject
    url = 'https://meta.icos-cp.eu/sparql'

    r = requests.get(url, params={
        'format': 'json',
        'query': atc_stationlist(station,tracer=tracer,level=level)})

    # stop here on an HTTP error instead of trying to parse an error page as JSON
    r.raise_for_status()
    data = r.json()

    #------------------------------------------------------------------------
    # convert the result into a table
    # output is an array, where each row contains
    # information about the data object; variables without a value become None

    cols = data['head']['vars']
    datatable = [[row.get(c, {}).get('value') for c in cols]
                 for row in data['results']['bindings']]

    # named 'objects' (not 'dt') to avoid shadowing the notebook-wide datetime alias
    objects = pd.DataFrame(datatable, columns=cols)

    #------------------------------------------------------------------------
    # download all listed files

    if download:
        print("download all files: ",objects.fileName)

        # a little hack to provide "yes" to the licence agreement
        prefix = "https://data.icos-cp.eu/licence_accept?ids=%5B%22"
        suffix = "%22%5D"

        # read the response in blocks; the default of iter_content() is one byte at a time
        chunk_size = 64 * 1024

        # now loop through the results, and download the corresponding file
        # files are downloaded directly to the current working directory.
        # If the file does already exist it will be skipped.
        for file_name, dobj in zip(objects.fileName, objects.dobj):
            if os.path.isfile(file_name):
                print("file already exists, skip..."+file_name)
                continue

            # the 5th part of the object address is the id of the data object
            download_url = prefix + dobj.split("/")[4] + suffix

            response = requests.get(download_url, stream=True)
            try:
                # do not store an error page under the name of the data file
                response.raise_for_status()

                # write to a temporary name first: an interrupted download must not
                # leave a truncated file that would be skipped as "already exists"
                part_name = file_name + ".part"
                with open(part_name, "wb") as handle:
                    for chunk in tqdm(response.iter_content(chunk_size=chunk_size)):
                        handle.write(chunk)
                os.replace(part_name, file_name)
            finally:
                response.close()

        print("all done")

    return objects


In [12]:
def get_ICOS_list(tracer,level=2):
    # %load /home/ute/Stations/Claudio/atc_co2_l2.py
    """
    Ask the ICOS Carbon Portal which ATC data objects exist for the given
    tracer and level (all stations). Nothing is downloaded.
    Created on Thu Nov 22 17:17:27 2018

    @author: Claudio

    :param tracer: name of the tracer, e.g. 'co2' (passed to atc_query)
    :param level: data level 1 or 2 (passed to atc_query)
    :return: pandas dataframe with one row per data object; the columns are the
             variables of the SPARQL query (dobj, spec, fileName, size, ...)
    :raises requests.HTTPError: if the portal answers the request with an error status
    """

    import sys

    # set the list of necessary modules to run the code
    # (only what this function really uses)
    modules = ["requests", "pandas"]

    # check if the modules are available and load them, otherwise stop execution
    if not checklib(modules):
        sys.exit("module dependencies are not fulfilled")

    import requests
    import pandas as pd

    # --------------------------------------------------------------
    # this is the bit, where the sparql query is sent and we expect
    # a list of dataobject
    url = 'https://meta.icos-cp.eu/sparql'
    r = requests.get(url, params={
        'format': 'json',
        'query': atc_query(tracer,level)})

    # stop here on an HTTP error instead of trying to parse an error page as JSON
    r.raise_for_status()
    data = r.json()

    #------------------------------------------------------------------------
    # convert the result into a table
    # output is an array, where each row contains
    # information about the data object; variables without a value become None

    cols = data['head']['vars']
    datatable = [[row.get(c, {}).get('value') for c in cols]
                 for row in data['results']['bindings']]

    return pd.DataFrame(datatable, columns=cols)


#### outdated modules for reading ICOS data

In [13]:
#from IPython.display import Markdown, display
# Function that takes a 4-digit integer as input (representing a year) and returns its last 2 digits as an integer (omitting millennium and century).
def func_YYYY_to_YY(yyyy):
    """
    Reduce a year to its last two digits.

    :param yyyy: year as integer or as text that int() can convert; the sign is ignored
    :return: the year modulo 100, e.g. 2018 -> 18
    :rtype: int
    """
    year_without_sign = abs(int(yyyy))
    return year_without_sign % 100



# Function that takes 5 string parameters as input - representing year (YY), month (MM), day (DD), hour (HH) and minute (MM) - and returns a formatted string ('YY/MM/DD HH:MM'):
def func_date_format_str(year, month, day, hour, minute):
    """
    Glue five text pieces together to '<year>/<month>/<day> <hour>:<minute>'.

    :param year: year as string
    :param month: month as string
    :param day: day as string
    :param hour: hour as string
    :param minute: minute as string
    :return: the combined date/time text, e.g. '18/01/02 03:04'
    :rtype: string
    """
    
    #Date part first, then the time of day:
    date_part = year + '/' + month + '/' + day
    time_part = hour + ':' + minute
    
    return date_part + ' ' + time_part



# Function that takes a dataframe containing date-columns as input- and returns a list of datetime objects:
def func_list_datetimeObj(df_data):
    """
    Build one datetime object per row of a dataframe with date columns.

    The year is reduced to its last two digits (same rule as func_YYYY_to_YY) and the
    text '<yy>/<month>/<day> <hour>:<minute>' is parsed with the format '%y/%m/%d %H:%M'.
    Because of the two-digit year the century is chosen by the rules of '%y'
    and not taken from the 'Year' column.

    :param df_data: pandas dataframe with the columns 'Year', 'Month', 'Day', 'Hour' and 'Minute'
    :return: list of datetime objects, in row order
    :rtype: list
    """
    
    date_columns = ('Year', 'Month', 'Day', 'Hour', 'Minute')
    
    #List that will store the datetime objects:
    lst_datetimeObj = []
    
    #Walk the five columns in lockstep; iterating the values (instead of looking rows up
    #by the label i) keeps this working for dataframes whose index is not 0..n-1:
    for year, month, day, hour, minute in zip(*(df_data[column] for column in date_columns)):
        
        #Last two digits of the year, zero-padded: '%y' needs exactly two digits,
        #so the years 2000-2009 must become '00'-'09' and not '0'-'9':
        yy = '%02d' % (abs(int(year)) % 100)
        
        lst_datetimeObj.append(dt.datetime.strptime('%s/%s/%s %s:%s' % (yy, month, day, hour, minute),
                                                    '%y/%m/%d %H:%M'))
    
    #Return list with datetime object:
    return lst_datetimeObj


In [14]:
def _read_ICOS_atm_bytes(filename):
    """Return the raw bytes of a data file, or of the last member of a '.zip' archive."""

    #test if zip file or data file
    if filename.lower().endswith('.zip'):
        data = None
        #The context manager closes the archive again, also when reading fails:
        with zipfile.ZipFile(filename, 'r') as zf:
            print ('Zipfile: ',filename)
            #Every archived file is read; the content of the last one is kept:
            for member in zf.namelist():
                data = zf.read(member)
                print('Filename: ', member)
        if data is None:
            raise ValueError('Zip file %s does not contain any files' % filename)
        return data

    print('Filename: ', filename)
    #Read in binary mode so that both branches return bytes (the caller decodes them):
    with open(filename, 'rb') as af:
        return af.read()


def _parse_ICOS_atm_measurements(rows):
    """Convert measurement rows (lists of strings) to a dict of column lists with typed values."""

    #Column name and converter for every position in a measurement row:
    column_converters = (('Site', str),              #0
                         ('SamplingHeight', float),  #1
                         ('Year', str),              #2
                         ('Month', str),             #3
                         ('Day', str),               #4
                         ('Hour', str),              #5
                         ('Minute', str),            #6
                         ('DecimalDate', float),     #7
                         ('CO2', float),             #8
                         ('Stdev', float),           #9
                         ('NbPoints', int),          #10
                         ('Flag', str),              #11
                         ('InstrumentID', str),      #12
                         ('QualityID', str),         #13
                         ('LTR', str),               #14
                         ('CMR', str),               #15
                         ('STTB', str))              #16

    #Create a list for every variable in a measurement:
    d_data = {name: [] for name, _ in column_converters}

    #Loop that populates the variable lists with data:
    for row in rows:
        for position, raw_value in enumerate(row):
            if position < len(column_converters):
                name, convert = column_converters[position]
                d_data[name].append(convert(raw_value))
            else:
                print ('Exception!!!\nNumber of categories exceed the hardcoded number of categories.')

    return d_data


def _parse_ICOS_atm_metadata(header_parts, data_labels):
    """Build the metadata dataframe (label/info pairs, comments, data column names) from the header pieces."""

    #Hard-coded layout: the first 28 pieces are 'label: info' pairs, piece 28 is the
    #comment label and all following pieces are comment lines.
    comment_label_position = 28

    label_list = []
    info_list = []
    comment_list = []

    for position, part in enumerate(header_parts):
        #Remove leading white-space as well as every occurrence of "\n":
        text = part.lstrip().replace('\n', '')

        if position < comment_label_position:
            #Split the piece into its "label-part" and its "info-part":
            label_info = text.split(': ')
            label_list.append(label_info[0])
            info_list.append(label_info[1])
        elif position == comment_label_position:
            #Handle the special case of the "comment-label":
            label_list.append(text.split(': ')[0].replace(':', ''))
        else:
            comment_list.append(text)

    #The comment lines and the data column names are stored as two list-valued infos:
    info_list.append(comment_list)
    info_list.append(data_labels)
    label_list.append('DATA COLUMNS')

    d_metadata = {'MetadataLabel':label_list,
                  'MetadataInfo':info_list,
                 }

    return pd.DataFrame(d_metadata,columns=['MetadataLabel', 'MetadataInfo'])


def read_ICOS_atm_downloaded(filename):
    """
    Description:      Read a downloaded ICOS atmospheric data file and return its
                      measurements and its metadata header as two dataframes.
                      The file content is split at every '#'. The piece at
                      position 39 has to hold the column names (first line) and
                      the ';'-separated measurements (one per line); the pieces
                      before it are treated as metadata header.
                      Negative 'CO2' and 'Stdev' values are replaced by NaN.
    Input parameters: Path to a data file or to a zip archive (var_name:
                      'filename', var_type: String). A path ending with '.zip'
                      (case-insensitive) is opened as an archive; if the archive
                      holds several files, the last one is used.
    Output:           Tuple of two pandas dataframes:
                            1.  df_data, columns: 'Site', 'SamplingHeight',
                                'Year', 'Month', 'Day', 'Hour', 'Minute',
                                'DecimalDate', 'CO2', 'Stdev', 'NbPoints',
                                'Flag', 'InstrumentID', 'QualityID', 'LTR',
                                'CMR', 'STTB', 'DateTime'
                            2.  df_metadata, columns: 'MetadataLabel',
                                'MetadataInfo'
    Raises:           IndexError if the content has fewer than 40 '#'-separated
                      pieces or a 'label: info' header piece has no ': ';
                      ValueError if the zip archive is empty or a value cannot
                      be converted to a number.
    """

    #Hard-coded position of the measurement block among the '#'-separated pieces:
    measurement_position = 39

    #Split the content whenever the '#' is found, to separate the Metadata headers from the Measurement data:
    data_split = _read_ICOS_atm_bytes(filename).decode().split('#')

    #The following list item contains the Measurement data header and the Measurement data:
    data_txt = data_split.pop(measurement_position)

    #----------Format the measurement txt data---------------
    #Split the text data at every '\n' to get the rows:
    data_lines = data_txt.split('\n')

    #The first line contains the Measurement data headers:
    data_labels = data_lines[0].split(';')

    #Blank lines (e.g. produced by a trailing newline) are skipped: they would add an
    #entry to the 'Site' list only and make the dataframe creation fail.
    measurement_data = [line.split(';') for line in data_lines[1:] if line.strip()]

    #Create a pandas data frame that keeps the order of the columns as it was in the initial txt file:
    df_data = pd.DataFrame(_parse_ICOS_atm_measurements(measurement_data),
                           columns=['Site', 'SamplingHeight', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'DecimalDate', 'CO2', 'Stdev', 'NbPoints', 'Flag', 'InstrumentID', 'QualityID', 'LTR', 'CMR', 'STTB'])

    #----------Format the metadata---------------
    #The 1st and the last of the remaining pieces are not used as metadata:
    df_metadata = _parse_ICOS_atm_metadata(data_split[1:-1], data_labels)

    # Convert CO2 missing values (assigned the value -999.990 or -999.999) to NaN
    df_data.loc[df_data.CO2 < 0, 'CO2'] = nan

    # Convert Stdev-values of CO2 missing values (assigned the value -9.990 or -9.999) to NaN
    df_data.loc[df_data.Stdev < 0, 'Stdev'] = nan

    #Call function to add a datetime obj to the data dataframe:
    df_data['DateTime'] = func_list_datetimeObj(df_data)

    return df_data, df_metadata