## How to use the CarbonPortal Binary File extractor

### This notebook shows how to use the binary file access module. Not all data is available this way, but in essence everything that is available in the "preview" on http://data.icos-cp.eu can be accessed and processed directly in Python.


In [None]:
# python lib
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from tqdm import tqdm

### ICOS-specific libraries

In [None]:
#Import module:
import sys

#Set path to ICOS tools:
sys.path.insert(0,'/data/project/pytools')

#Import ICOS tools:
from icoscp.sparql import sparqls
from icoscp.cpb.cpbinfile import CpBinFile
from icoscp.sparql.runsparql import RunSparql


### Get a list of all Atmosphere CO2 level2 data objects

In [None]:
# Query from the ICOS sparql helpers used here to list the atmosphere CO2
# level-2 data objects.
query = sparqls.atc_co2_level2()
# Requested result format.
# NOTE(review): presumably makes run() return a pandas DataFrame - confirm.
fmt = 'pandas'
sparql = RunSparql(query, fmt)
files = sparql.run()
# Bare expression as the last line of the cell so the notebook displays it.
files

### Let's see which columns are inside the "first" object. To extract data you need the persistent identifier (PID) of the digital object. The "files" table from above contains that link in the column 'dobj'.

In [None]:
# Open the first data object, addressed by the PID stored in the 'dobj'
# column of the query result.
f = CpBinFile(files['dobj'][0])
if not f.valid:
    # The exception has to be raised to have any effect; merely constructing
    # SystemExit(...) is a no-op and would let the following cells run against
    # an object without binary data.
    raise SystemExit('no binary data found')

# Names of the columns that can be extracted from this object.
print(f.colNames)

### Extract the data as a pandas dataframe

In [None]:
# Called without a column list.
# NOTE(review): presumably this returns every available column - confirm.
data = f.getColumns()
# Bare expression as the last line of the cell so the notebook displays it.
data

### Extract the units from the metadata

In [None]:
# Look up the column-name list and the metadata table once; each column's
# position in the name list is used as the row position in the table.
column_names = list(f.colNames)
column_meta = f.info[1]

# The time axis is described by the 'valueType' field of the metadata.
time_idx = column_names.index('TIMESTAMP')
time_unit = column_meta.iloc[time_idx]['valueType']
print('time: ' + time_unit)

# The CO2 column is described by the 'unit' field of the metadata.
co2_idx = column_names.index('co2')
co2_unit = column_meta.iloc[co2_idx]['unit']
print('co2: ' + co2_unit)

### Let's create a figure to plot the CO2 series for the first object, with proper labels and a moving window mean

In [None]:
# One figure with a single axes that holds both the raw and the smoothed series.
fig1, ax1 = plt.subplots(figsize=(10,7))
fig1.suptitle("CO2 data for one site \n and a moving window mean  ")

raw_times = data['TIMESTAMP']
raw_co2 = data['co2']

# Smoothed series: an exponentially weighted mean over the CO2 values.
# Going by the timestamps each record is one hour, hence a span of
# 4*7*24 records covers roughly four weeks (about a month).
smooth = raw_co2.ewm(span=4*7*24).mean()

# Raw data first, smoothed curve drawn on top of it.
ax1.plot(raw_times, raw_co2)
ax1.plot(raw_times, smooth)

# Axis labels come from the units extracted from the metadata.
ax1.set_xlabel(time_unit)
ax1.set_ylabel(co2_unit)

ax1.grid(True)
plt.show()

### Since we now know all about the files, loop through datasets and plot the smoothed data only


In [None]:
# One shared axes; every data object contributes one smoothed curve.
fig2, ax2 = plt.subplots(figsize=(10,7))
fig2.suptitle("Moving window mean for all datasets")

for dobjId in tqdm(files['dobj']):
    # Re-point the already created reader at the next data object.
    f.dobj = dobjId
    try:
        data = f.getColumns(['TIMESTAMP','co2'])
        data['TIMESTAMP'] = pd.to_datetime(data['TIMESTAMP'], unit='ms')
        # Same smoothing as for the single-object plot: exponentially
        # weighted mean with a span of 4*7*24 records.
        smooth = data['co2'].ewm(span=4*7*24).mean()
        ax2.plot(data['TIMESTAMP'], smooth)
    except Exception as err:
        # Best effort: an object that cannot be read or plotted is skipped,
        # but it is reported instead of vanishing silently. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupts
        # working. tqdm.write prints without breaking the progress bar.
        tqdm.write('skipping {}: {!r}'.format(dobjId, err))

# set the labels for x and y axis
ax2.set_xlabel(time_unit)
ax2.set_ylabel(co2_unit)
ax2.grid(True)
# Shown explicitly, like the first figure.
plt.show()