In [1]:
import os
import glob
import pandas as pd
import polars as pl
import numpy
from datetime import datetime
from datetime import timezone
import plotly.express as px

# Root folder of the raw Picarro files. The PICARRO_DATA_DIRECTORY environment
# variable wins when it is set; otherwise the local fallback path below is used.
# Copy the measurement files into this folder as M/Day/*.dat.
PICARRO_DATA_DIRECTORY = os.environ.get(
    "PICARRO_DATA_DIRECTORY",
    "/Users/juanbettinelli/Documents/Uni/MasterThesis/4_Scrips_and_Data/4_Data/10_2024_Data/Picarro_Raw",
)



In [3]:
# utility functions

def process_bottle(data: list, ignore_len: bool = False):
    """Return the median of the late part of one bottle measurement.

    The leading samples and the final 5 % of ``data`` are discarded and the
    median of what remains is returned. How much of the start is dropped
    depends on the number of samples:

    * ``ignore_len=True``: always use the 30 %-95 % window, whatever the length.
    * ``50 < len(data) < 70`` ("2nd bottle"): 30 %-95 % window.
    * ``70 <= len(data) < 130`` ("1st bottle"): 50 %-95 % window.
    * any other length: not recognised as a bottle, ``0.0`` is returned.

    A length of exactly 70 used to match neither range and returned ``0.0``.
    It is now assigned to the 50 %-95 % window, which is a subset of the
    30 %-95 % window and therefore only uses later samples.

    Args:
        data: Sequence of numeric readings for a single bottle.
        ignore_len: Skip the length classification and always use the
            30 %-95 % window.

    Returns:
        The median of the selected window as computed by ``numpy.median``, or
        ``0.0`` when ``ignore_len`` is false and the length is outside the
        recognised ranges. If the selected window is empty (very short input
        with ``ignore_len=True``) ``numpy.median`` yields NaN.
    """
    n_samples = len(data)

    if ignore_len or 50 < n_samples < 70:
        start_fraction = 0.3
    elif 70 <= n_samples < 130:
        start_fraction = 0.5
    else:
        # Length does not look like a bottle measurement: sentinel value.
        return 0.0

    window = data[int(n_samples * start_fraction) : int(n_samples * 0.95)]
    return numpy.median(window)
    
def two_point_calibration(measured_values: list, true_values: list):
    """Compute a linear two-point calibration ``y_true = slope * y_meas + intercept``.

    Args:
        measured_values: The two instrument readings, one per reference point.
        true_values: The two reference values, in the same order as
            ``measured_values``.

    Returns:
        A ``(slope, intercept)`` tuple. ``(0, 0)`` is returned when either
        list does not contain exactly two entries.

    Raises:
        ZeroDivisionError: If both measured values are equal, so no line can
            be fitted. Raised explicitly so that plain Python floats and numpy
            floats behave the same way; numpy floats would otherwise produce
            inf/nan silently.
    """
    # Check if input lists have length 2
    if len(measured_values) != 2 or len(true_values) != 2:
        return 0, 0

    measured_span = measured_values[1] - measured_values[0]
    if measured_span == 0:
        raise ZeroDivisionError(
            "two_point_calibration: both measured values are identical, "
            "the calibration slope is undefined"
        )

    # Calculate calibration parameters (slope and intercept)
    slope = (true_values[1] - true_values[0]) / measured_span
    # y_true = m * y_meas + t
    intercept = true_values[0] - slope * measured_values[0]

    return slope, intercept

In [4]:
filenames = glob.glob(PICARRO_DATA_DIRECTORY + "/*/*/*.dat")
if not filenames:
    # Without this check pd.concat fails with "No objects to concatenate",
    # which hides the real cause (wrong or empty data directory).
    raise FileNotFoundError(
        f"No Picarro *.dat files found under {PICARRO_DATA_DIRECTORY!r} "
        "(expected layout: <directory>/*/*/*.dat)"
    )

# Read all *.dat Picarro measurement files and combine them into a single pandas frame.
df_list = []
for filename in filenames:
    # Raw string: "\s" is not a valid Python string escape and triggers a
    # SyntaxWarning on recent Python versions when written as '\s+'.
    df_list.append(pd.read_csv(filename, sep=r"\s+"))

df = pd.concat(df_list, ignore_index=True)
# Build one timestamp column from the separate DATE and TIME text columns.
df["datetime"] = pd.to_datetime(df["DATE"] + " " + df["TIME"])
# glob returns files in arbitrary order, so order the rows by time explicitly.
df = df.sort_values(by="datetime")

In [5]:
# Pandas DF to Polars DF
# Switch from pandas to polars. Note that `df` is rebound to a polars frame here.
df = pl.from_pandas(df)
# Keep only the columns used further down.
df = df.select(
    # Timestamp in microsecond resolution, labelled as UTC and renamed.
    pl.col("datetime")
    .dt.cast_time_unit("us")
    .dt.replace_time_zone("UTC")
    .alias("creation_timestamp"),
    "CO2_dry",
    "h2o_reported",
)
df.head(1)

creation_timestamp,CO2_dry,h2o_reported
"datetime[μs, UTC]",f64,f64
2024-04-17 00:00:00.226 UTC,430.096717,0.894978


In [13]:
# ICOS Bottle 610.95 ppm
# ICOS Bottle 610.95 ppm
start_date = datetime(2024, 4, 17, 11, 53, 0, tzinfo=timezone.utc)
end_date = datetime(2024, 4, 17, 12, 24, 0, tzinfo=timezone.utc)

# Alternative window (2024-05-06), currently disabled:
#start_date = datetime(2024, 5, 6, 12, 1, 0, tzinfo=timezone.utc)
#end_date = datetime(2024, 5, 6, 12, 30, 0, tzinfo=timezone.utc)

# Rows whose timestamp lies inside the window (both bounds included).
df_p_600 = df.filter(
    pl.col("creation_timestamp").is_between(lower_bound=start_date, upper_bound=end_date)
)

# ICOS Bottle 427.38 ppm
start_date = datetime(2024, 4, 17, 12, 25, 0, tzinfo=timezone.utc)
end_date = datetime(2024, 4, 17, 12, 55, 0, tzinfo=timezone.utc)

# Alternative window (2024-05-06), currently disabled:
#start_date = datetime(2024, 5, 6, 11, 31, 0, tzinfo=timezone.utc)
#end_date = datetime(2024, 5, 6, 12, 00, 0, tzinfo=timezone.utc)

df_p_400 = df.filter(
    pl.col("creation_timestamp").is_between(lower_bound=start_date, upper_bound=end_date)
)

In [14]:
# Sanity check: show the first row of the 610.95 ppm bottle selection.
df_p_600.head(n=1)

creation_timestamp,CO2_dry,h2o_reported
"datetime[μs, UTC]",f64,f64
2024-04-17 11:53:00.819 UTC,607.460775,0.01454


In [15]:
# Reference concentrations of the two ICOS bottles, low bottle first.
true_values = [427.38, 610.95]

# 400 ppm bottle: median CO2_dry over the selected window
data = df_p_400.get_column("CO2_dry").to_list()
measured_values = [process_bottle(data=data, ignore_len=True)]

# 600 ppm bottle: appended second so the order matches true_values
data = df_p_600.get_column("CO2_dry").to_list()
measured_values.append(process_bottle(data=data, ignore_len=True))

print(measured_values)

# Linear correction: y_true = picaro_slope * y_meas + picarro_intercept
picaro_slope, picarro_intercept = two_point_calibration(measured_values, true_values)
print(picaro_slope, picarro_intercept)



[424.79570109, 607.22807303]
1.006235889211451 -0.06468001949787094
