# Build Raw Datasets

In [None]:
import os
import json
import pathlib
import pandas as pd

from copy import deepcopy

In [None]:
# Top-level project folders, relative to the notebook location.
data_folder, utils_folder = (
    os.path.join(os.pardir, folder) for folder in ("data", "utils")
)

# Raw inputs are read from one sub-folder, intermediate outputs written to another.
raw_data_path, interm_data_path = (
    os.path.join(data_folder, folder)
    for folder in ("Raw Data", "Intermediate Data")
)

# Individual raw sources: lab workbook, the two online-sensor folders, THM file.
eb_measure_path, eb0_sensor_path, eb1_sensor_path, thm_measure_path = (
    os.path.join(raw_data_path, entry)
    for entry in (
        "EB0_EB1.xlsx",
        "Dades sensor online EB0",
        "Dades sensor online EB1",
        "THMs_ARBOÇ_2022.csv",
    )
)

# EB0 - EB1 laboratory measurements

In [None]:
# Read the laboratory sheets of both stations from the same workbook.
raw_eb0_measure_df, raw_eb1_measure_df = (
    pd.read_excel(eb_measure_path, sheet_name=sheet) for sheet in ("EB0", "EB1")
)

## EB0

In [None]:
raw_eb0_measure_df

In [None]:
# Work on a copy so the raw sheet stays available for inspection.
eb0_measure_df = raw_eb0_measure_df.copy()

# Parse both columns with their day-first formats, then keep only the calendar
# date of "data" and only the clock time of "SAMPDATE".
eb0_sampling_day = pd.to_datetime(eb0_measure_df["data"], format="%d/%m/%Y")
eb0_measure_df["data"] = eb0_sampling_day.dt.date

eb0_sampling_stamp = pd.to_datetime(
    eb0_measure_df["SAMPDATE"], format="%d/%m/%Y %H:%M:%S"
)
eb0_measure_df["SAMPDATE"] = eb0_sampling_stamp.dt.time

In [None]:
eb0_measure_df

In [None]:
# Load the EB0 analyte names used as features and as targets.
# The files are opened explicitly as UTF-8 (the JSON interchange encoding):
# with the platform default encoding, accented analyte names could be decoded
# differently on another machine and would then silently fail to match the
# "ANALYTE" column in the isin() filters below.
with open(
    os.path.join(utils_folder, "lists", "eb0_measure_features.json"),
    "r",
    encoding="utf-8",
) as f:
    eb0_measure_features = json.load(f)

with open(
    os.path.join(utils_folder, "lists", "eb0_measure_targets.json"),
    "r",
    encoding="utf-8",
) as f:
    eb0_measure_targets = json.load(f)

In [None]:
# Keep only the lab rows whose 'ANALYTE' is one of the selected features and
# discard the bookkeeping columns that are not used afterwards.
# drop() is called without inplace=True on the filtered selection so that the
# result is an independent DataFrame: mutating a boolean-filtered selection in
# place (here and in the following cells) raises SettingWithCopyWarning.
eb0_features_df = eb0_measure_df[
    eb0_measure_df["ANALYTE"].isin(eb0_measure_features)
].drop(
    columns=[
        "any_",
        "FOLDERNO",
        "APPRSTS",
        "CLSAMPNO",
        "DISPSTS",
        "TESTNO",
        "SINONYM",
        "SINAC_SENDED",
    ]
)

In [None]:
# Keep only the lab rows whose 'ANALYTE' is one of the selected targets and
# discard the bookkeeping columns that are not used afterwards.
# drop() is called without inplace=True on the filtered selection so that the
# result is an independent DataFrame: mutating a boolean-filtered selection in
# place (here and in the following cells) raises SettingWithCopyWarning.
eb0_targets_df = eb0_measure_df[
    eb0_measure_df["ANALYTE"].isin(eb0_measure_targets)
].drop(
    columns=[
        "any_",
        "FOLDERNO",
        "APPRSTS",
        "CLSAMPNO",
        "DISPSTS",
        "TESTNO",
        "SINONYM",
        "SINAC_SENDED",
    ]
)

In [None]:
# Make sure the measured value is stored as float in both lab frames.
eb0_features_df = eb0_features_df.astype({"RESULTASNUMERIC": float})
eb0_targets_df = eb0_targets_df.astype({"RESULTASNUMERIC": float})

In [None]:
# Combine the date ("data") and time ("SAMPDATE") parts into one timestamp and
# place it as the first column of each lab frame.
for eb0_lab_df in (eb0_features_df, eb0_targets_df):
    eb0_stamp_text = (
        eb0_lab_df["data"].astype(str) + " " + eb0_lab_df["SAMPDATE"].astype(str)
    )
    eb0_lab_df.insert(
        loc=0, column="DateTime", value=pd.to_datetime(eb0_stamp_text)
    )

In [None]:
# The separate date/time parts are redundant once DateTime exists.
for eb0_lab_df in (eb0_features_df, eb0_targets_df):
    eb0_lab_df.drop(columns=["data", "SAMPDATE"], inplace=True)

In [None]:
eb0_features_df

In [None]:
eb0_features_df[eb0_features_df["ANALYTE"] == "Índex UV"]

In [None]:
eb0_targets_df

### Create DataFrame with one sample per unique DateTime

In [None]:
# Long -> wide: one row per DateTime, one column per analyte. Repeated
# (DateTime, analyte) pairs are combined by pivot_table's default aggregation.
eb0_features_df = pd.pivot_table(
    eb0_features_df,
    values="RESULTASNUMERIC",
    index=pd.Grouper(key="DateTime"),
    columns="ANALYTE",
)

In [None]:
eb0_features_df

In [None]:
# Long -> wide: one row per DateTime, one column per target analyte. Repeated
# (DateTime, analyte) pairs are combined by pivot_table's default aggregation.
eb0_targets_df = pd.pivot_table(
    eb0_targets_df,
    values="RESULTASNUMERIC",
    index=pd.Grouper(key="DateTime"),
    columns="ANALYTE",
)

In [None]:
eb0_targets_df

In [None]:
# Discard timestamps for which no analyte at all was measured.
eb0_features_df = eb0_features_df.dropna(how="all")
eb0_targets_df = eb0_targets_df.dropna(how="all")

In [None]:
# Time span covered by the EB0 lab features (the frame is still indexed by
# DateTime here); the sensor readings are clipped to this span further down.
eb0_min_date, eb0_max_date = (
    eb0_features_df.index.min(),
    eb0_features_df.index.max(),
)

In [None]:
# Turn the DateTime index back into a regular column.
eb0_features_df = eb0_features_df.reset_index()
eb0_targets_df = eb0_targets_df.reset_index()

In [None]:
eb0_features_df

In [None]:
eb0_targets_df

In [None]:
# Persist the wide EB0 lab tables (without the positional index).
for eb0_lab_df, eb0_file_name in (
    (eb0_features_df, "EB0_Features_Lab.xlsx"),
    (eb0_targets_df, "EB0_Targets_Lab.xlsx"),
):
    eb0_lab_df.to_excel(
        os.path.join(interm_data_path, eb0_file_name), index=False
    )

## EB1

In [None]:
raw_eb1_measure_df

In [None]:
# Work on a copy so the raw sheet stays available for inspection.
eb1_measure_df = raw_eb1_measure_df.copy()

# Parse both columns with their day-first formats, then keep only the calendar
# date of "data" and only the clock time of "SAMPDATE".
eb1_sampling_day = pd.to_datetime(eb1_measure_df["data"], format="%d/%m/%Y")
eb1_measure_df["data"] = eb1_sampling_day.dt.date

eb1_sampling_stamp = pd.to_datetime(
    eb1_measure_df["SAMPDATE"], format="%d/%m/%Y %H:%M:%S"
)
eb1_measure_df["SAMPDATE"] = eb1_sampling_stamp.dt.time

In [None]:
eb1_measure_df

In [None]:
# Load the EB1 analyte names used as features and as targets.
# The files are opened explicitly as UTF-8 (the JSON interchange encoding):
# with the platform default encoding, accented analyte names could be decoded
# differently on another machine and would then silently fail to match the
# "ANALYTE" column in the isin() filters below.
with open(
    os.path.join(utils_folder, "lists", "eb1_measure_features.json"),
    "r",
    encoding="utf-8",
) as f:
    eb1_measure_features = json.load(f)

with open(
    os.path.join(utils_folder, "lists", "eb1_measure_targets.json"),
    "r",
    encoding="utf-8",
) as f:
    eb1_measure_targets = json.load(f)

In [None]:
# Keep only the lab rows whose 'ANALYTE' is one of the selected features and
# discard the bookkeeping columns that are not used afterwards.
# drop() is called without inplace=True on the filtered selection so that the
# result is an independent DataFrame: mutating a boolean-filtered selection in
# place (here and in the following cells) raises SettingWithCopyWarning.
eb1_features_df = eb1_measure_df[
    eb1_measure_df["ANALYTE"].isin(eb1_measure_features)
].drop(
    columns=[
        "any_",
        "FOLDERNO",
        "APPRSTS",
        "CLSAMPNO",
        "DISPSTS",
        "TESTNO",
        "SINONYM",
        "SINAC_SENDED",
    ]
)

In [None]:
# Keep only the lab rows whose 'ANALYTE' is one of the selected targets and
# discard the bookkeeping columns that are not used afterwards.
# drop() is called without inplace=True on the filtered selection so that the
# result is an independent DataFrame: mutating a boolean-filtered selection in
# place (here and in the following cells) raises SettingWithCopyWarning.
eb1_targets_df = eb1_measure_df[
    eb1_measure_df["ANALYTE"].isin(eb1_measure_targets)
].drop(
    columns=[
        "any_",
        "FOLDERNO",
        "APPRSTS",
        "CLSAMPNO",
        "DISPSTS",
        "TESTNO",
        "SINONYM",
        "SINAC_SENDED",
    ]
)

In [None]:
# Make sure the measured value is stored as float in both lab frames.
eb1_features_df = eb1_features_df.astype({"RESULTASNUMERIC": float})
eb1_targets_df = eb1_targets_df.astype({"RESULTASNUMERIC": float})

In [None]:
# Combine the date ("data") and time ("SAMPDATE") parts into one timestamp and
# place it as the first column of each lab frame.
for eb1_lab_df in (eb1_features_df, eb1_targets_df):
    eb1_stamp_text = (
        eb1_lab_df["data"].astype(str) + " " + eb1_lab_df["SAMPDATE"].astype(str)
    )
    eb1_lab_df.insert(
        loc=0, column="DateTime", value=pd.to_datetime(eb1_stamp_text)
    )

In [None]:
# The separate date/time parts are redundant once DateTime exists.
for eb1_lab_df in (eb1_features_df, eb1_targets_df):
    eb1_lab_df.drop(columns=["data", "SAMPDATE"], inplace=True)

In [None]:
eb1_features_df

In [None]:
eb1_targets_df

### Create DataFrame with one sample per unique DateTime

In [None]:
# Long -> wide: one row per DateTime, one column per analyte. Repeated
# (DateTime, analyte) pairs are combined by pivot_table's default aggregation.
eb1_features_df = pd.pivot_table(
    eb1_features_df,
    values="RESULTASNUMERIC",
    index=pd.Grouper(key="DateTime"),
    columns="ANALYTE",
)

In [None]:
eb1_features_df

In [None]:
# Long -> wide: one row per DateTime, one column per target analyte. Repeated
# (DateTime, analyte) pairs are combined by pivot_table's default aggregation.
eb1_targets_df = pd.pivot_table(
    eb1_targets_df,
    values="RESULTASNUMERIC",
    index=pd.Grouper(key="DateTime"),
    columns="ANALYTE",
)

In [None]:
eb1_targets_df

In [None]:
# Discard timestamps for which no analyte at all was measured.
eb1_features_df = eb1_features_df.dropna(how="all")
eb1_targets_df = eb1_targets_df.dropna(how="all")

In [None]:
# Time span covered by the EB1 lab features (the frame is still indexed by
# DateTime here); the sensor readings are clipped to this span further down.
eb1_min_date, eb1_max_date = (
    eb1_features_df.index.min(),
    eb1_features_df.index.max(),
)

In [None]:
# Turn the DateTime index back into a regular column.
eb1_features_df = eb1_features_df.reset_index()
eb1_targets_df = eb1_targets_df.reset_index()

In [None]:
# Persist the wide EB1 lab tables (without the positional index).
for eb1_lab_df, eb1_file_name in (
    (eb1_features_df, "EB1_Features_Lab.xlsx"),
    (eb1_targets_df, "EB1_Targets_Lab.xlsx"),
):
    eb1_lab_df.to_excel(
        os.path.join(interm_data_path, eb1_file_name), index=False
    )

# Online Sensors Reading

In [None]:
def _repair_csv_lines(lines):
    """Return *lines* with wrapped records re-joined and trailing ';' removed.

    A line with fewer ';'-separated fields than the widest line seen so far is
    treated as a record that was broken across two physical lines and is glued
    to the following line. A line break that directly follows an "n/a;" field
    is removed, and every trailing ';' / newline is stripped so each record
    ends with exactly one newline.
    """
    repaired = []
    line_iter = iter(lines)
    max_len = 0  # largest field count seen so far
    for line in line_iter:
        n_fields = len(line.split(";"))
        if n_fields > max_len:
            max_len = n_fields
        elif n_fields < max_len:
            # Short record: append the next physical line (nothing is appended
            # when the short record is the last line of the file).
            line += next(line_iter, "")

        # Removing the break after "n/a;" joins the two halves of a wrapped
        # record; for an ordinary line ending in "n/a;" it only drops the
        # newline, which is added back below.
        line = line.replace("n/a;\n", "n/a;")
        repaired.append(line.rstrip(";\n") + "\n")
    return repaired


def load_data_from_folder(folder_path):
    """Load every data file directly inside *folder_path* into a DataFrame.

    Sub-directories and hidden files (names starting with ".") are skipped.
    ``.csv`` files are first repaired (see ``_repair_csv_lines``) and the
    repaired text is written back to the same file, then parsed with ';' as
    separator and without NA detection. ``.xlsx`` files are read as they are.

    Parameters
    ----------
    folder_path : str
        Folder containing the sensor exports.

    Returns
    -------
    dict
        Maps each file name without extension to its DataFrame. Files are
        processed in sorted name order so the dict order is reproducible
        (``os.listdir`` returns entries in arbitrary order).

    Raises
    ------
    ValueError
        If a visible regular file is neither ``.csv`` nor ``.xlsx``.
    """
    df_dict = {}
    for filename in sorted(os.listdir(folder_path)):
        path = os.path.join(folder_path, filename)
        if not os.path.isfile(path) or filename.startswith("."):
            continue

        if filename.endswith(".csv"):
            with open(path, "r") as source:
                lines = source.readlines()

            corrected_lines = _repair_csv_lines(lines)

            # Rewrite the file only when the repair changed something, and
            # only after the read handle has been closed.
            if corrected_lines != lines:
                with open(path, "w") as target:
                    target.writelines(corrected_lines)

            df = pd.read_csv(path, sep=";", na_filter=False)
        elif filename.endswith(".xlsx"):
            df = pd.read_excel(path)
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError(f"File format not supported: {filename}")

        df_dict[pathlib.Path(filename).stem] = df
    return df_dict

## EB0 online sensor readings

In [None]:
# Load every EB0 sensor export into a dict keyed by file name (without
# extension). Note: load_data_from_folder rewrites the CSV files in place
# while repairing them.
raw_eb0_sensor_df_dict = load_data_from_folder(eb0_sensor_path)

In [None]:
# Re-key each sensor frame by its measurement name: the part of the file stem
# after the first "_", upper-cased (e.g. "<prefix>_temp_aigua" -> "TEMP_AIGUA").
# The frames are moved as they are; deep-copying the whole dict of DataFrames
# on every iteration is unnecessary because the old dict is replaced right
# here and a working copy is taken in a later cell anyway.
raw_eb0_sensor_df_dict = {
    stem.partition("_")[2].upper(): frame
    for stem, frame in raw_eb0_sensor_df_dict.items()
}

In [None]:
raw_eb0_sensor_df_dict.keys()

In [None]:
raw_eb0_sensor_df_dict["COND"]

In [None]:
# Deep copy, so the in-place clean-up below leaves the raw frames untouched.
eb0_sensor_df_dict = deepcopy(raw_eb0_sensor_df_dict)

In [None]:
# Normalise every EB0 sensor frame: build a single DateTime column, convert the
# remaining columns to float and keep only the period covered by the EB0 lab
# features.
for key, item in eb0_sensor_df_dict.items():
    item["Fecha"] = pd.to_datetime(item["Fecha"], format="%d/%m/%Y").dt.date
    item["HORA"] = pd.to_datetime(item["HORA"], format="%H:%M:%S").dt.time
    item.insert(
        loc=0,
        column="DateTime",
        value=pd.to_datetime(
            item["Fecha"].astype(str) + " " + item["HORA"].astype(str)
        ),
    )
    item.drop(columns=["Fecha", "HORA"], inplace=True)

    value_columns = item.columns[1:]

    # Mark "n/a" cells as missing with an explicit NaN. Passing value=None to
    # replace() is interpreted differently across pandas releases (older ones
    # treat it as "no value given" and fill from the previous row instead), so
    # None cannot be relied on to flag the cell as missing.
    for col in value_columns:
        item[col] = item[col].replace("n/a", float("nan"), regex=True)

    # Drop incomplete rows once, before any numeric conversion.
    item.dropna(inplace=True)

    # Decimal comma -> decimal point, then cast to float.
    for col in value_columns:
        item[col] = item[col].replace(",", ".", regex=True).astype(float)

    # .copy() makes the clipped frame independent, so the in-place renames and
    # drops done in the following cells do not act on a filtered selection.
    eb0_sensor_df_dict[key] = item[
        (item["DateTime"] >= eb0_min_date) & (item["DateTime"] <= eb0_max_date)
    ].copy()

In [None]:
# The measured value lives in the first column whose name contains "_PV";
# rename that column after the measured parameter (the dict key).
for key, item in eb0_sensor_df_dict.items():
    pv_column = next((name for name in item.columns if "_PV" in name), None)
    if pv_column is not None:
        item.rename(columns={pv_column: key}, inplace=True)

In [None]:
# Quality filter: among the columns that follow the measurement (the second
# column), those whose name contains "_VM", "VA" or "F1" must all be 0 for a
# sample to be kept. Afterwards only DateTime and the measurement remain.
for key, item in eb0_sensor_df_dict.items():
    trailing_columns = item.columns[2:].tolist()
    status_columns = [
        name
        for tag in ("_VM", "VA", "F1")
        for name in trailing_columns
        if tag in name
    ]

    # True for rows where at least one status column is non-zero.
    flagged = item[status_columns].ne(0).any(axis=1)

    item.drop(index=item.index[flagged], inplace=True)
    item.drop(columns=trailing_columns, inplace=True)

In [None]:
# Median spacing between consecutive readings, per sensor.
sampling_rates = {
    name: frame["DateTime"].diff().median()
    for name, frame in eb0_sensor_df_dict.items()
}

# Common grid: the spacing shared by the largest number of sensors.
observed_rates = list(sampling_rates.values())
sampling_rate = max(set(observed_rates), key=observed_rates.count)

# Put every sensor on that grid: index by time, keep the first of duplicated
# timestamps, resample and fill the new slots by time-weighted interpolation.
for name in eb0_sensor_df_dict:
    indexed = eb0_sensor_df_dict[name].set_index("DateTime")
    indexed = indexed[~indexed.index.duplicated(keep="first")]
    eb0_sensor_df_dict[name] = (
        indexed.resample(sampling_rate).interpolate(method="time").reset_index()
    )

In [None]:
sampling_rates

In [None]:
# Outer-join all sensor frames on DateTime, starting from the first one in the
# dictionary, then order the result chronologically.
eb0_frames = iter(eb0_sensor_df_dict.values())
eb0_sensor_df = next(eb0_frames)
for other_frame in eb0_frames:
    eb0_sensor_df = eb0_sensor_df.merge(other_frame, on="DateTime", how="outer")

eb0_sensor_df.sort_values("DateTime", inplace=True)

In [None]:
# DateTime first, every other column in alphabetical order.
eb0_value_columns = sorted(
    name for name in eb0_sensor_df.columns if name != "DateTime"
)
eb0_sensor_df = eb0_sensor_df.reindex(columns=["DateTime", *eb0_value_columns])

In [None]:
eb0_sensor_df

### Compare TEMP and TEMP_AIGUA with Temperatura

In [None]:
# Lab temperature next to the two sensor temperature channels, each as its own
# two-column frame for plotting.
lab_temp_df = eb0_features_df.loc[:, ["DateTime", "Temperatura"]].copy()

sensor_temp_df, sensor_temp_aigua_df = (
    eb0_sensor_df.loc[:, ["DateTime", channel]].copy()
    for channel in ("TEMP", "TEMP_AIGUA")
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(20, 10))

# One line per temperature source, all drawn on the same axes.
for temp_frame, temp_column, legend_label in (
    (lab_temp_df, "Temperatura", "Lab"),
    (sensor_temp_df, "TEMP", "Sensor"),
    (sensor_temp_aigua_df, "TEMP_AIGUA", "Sensor_water"),
):
    sns.lineplot(
        data=temp_frame, x="DateTime", y=temp_column, label=legend_label
    )

In [None]:
# Drop the TEMP channel from the merged sensor frame; TEMP_AIGUA is the
# temperature column that is kept.
eb0_sensor_df.drop(columns=["TEMP"], inplace=True)

In [None]:
# Persist the merged EB0 sensor table.
# NOTE(review): unlike the lab exports above, index=False is not passed, so the
# positional index is written as an extra first column -- confirm that the
# readers of this file expect it.
eb0_sensor_df.to_excel(os.path.join(interm_data_path, "EB0_Sensor.xlsx"))

## EB1 online sensor readings

In [None]:
# Load every EB1 sensor export into a dict keyed by file name (without
# extension). Note: load_data_from_folder rewrites the CSV files in place
# while repairing them.
raw_eb1_sensor_df_dict = load_data_from_folder(eb1_sensor_path)

In [None]:
# Re-key each sensor frame by its measurement name: the file stem without its
# last space-separated word, upper-cased (a stem without any space maps to "").
# The frames are moved as they are; deep-copying the whole dict of DataFrames
# on every iteration is unnecessary because the old dict is replaced right
# here and a working copy is taken in a later cell anyway.
raw_eb1_sensor_df_dict = {
    stem.rpartition(" ")[0].upper(): frame
    for stem, frame in raw_eb1_sensor_df_dict.items()
}

In [None]:
raw_eb1_sensor_df_dict.keys()

In [None]:
# The SCAN channels are discarded because they are less reliable than the
# other sensors; a missing key still raises KeyError.
for scan_key in ("TERBOLESA SCAN", "UVA SCAN"):
    raw_eb1_sensor_df_dict.pop(scan_key)

In [None]:
raw_eb1_sensor_df_dict["PH"]

In [None]:
# Deep copy, so the in-place clean-up below leaves the raw frames untouched.
eb1_sensor_df_dict = deepcopy(raw_eb1_sensor_df_dict)

In [None]:
# Normalise every EB1 sensor frame: build a single DateTime column, convert the
# remaining columns to float and keep only the period covered by the EB1 lab
# features.
for key, item in eb1_sensor_df_dict.items():
    item.dropna(inplace=True)
    item["Fecha"] = pd.to_datetime(
        item["Fecha"].astype(str).str.lstrip(),
        format="%d/%m/%Y",
        errors="coerce",
    ).dt.date
    # errors="coerce" turns unparseable dates into NaT; such rows cannot be
    # placed on the time axis, so discard them before DateTime is built.
    item.dropna(subset=["Fecha"], inplace=True)

    if "HORA" in item.columns:
        # Rows whose time field is the literal "0" carry no usable timestamp.
        item.drop(item[item["HORA"] == "0"].index, inplace=True)
        item["HORA"] = pd.to_datetime(item["HORA"], format="%H:%M:%S").dt.time

        item.insert(
            loc=0,
            column="DateTime",
            value=pd.to_datetime(
                item["Fecha"].astype(str) + " " + item["HORA"].astype(str)
            ),
        )
        item.drop(columns=["Fecha", "HORA"], inplace=True)
    else:
        # Date-only export: convert the date objects back to datetime64 so the
        # comparison with the lab Timestamps below and the later resampling
        # (which needs a datetime index) work.
        item.insert(
            loc=0,
            column="DateTime",
            value=pd.to_datetime(item["Fecha"]),
        )
        item.drop(columns=["Fecha"], inplace=True)

    value_columns = item.columns[1:]

    # Mark "n/a" cells as missing with an explicit NaN. Passing value=None to
    # replace() is interpreted differently across pandas releases (older ones
    # treat it as "no value given" and fill from the previous row instead), so
    # None cannot be relied on to flag the cell as missing.
    for col in value_columns:
        item[col] = item[col].replace("n/a", float("nan"), regex=True)

    # Drop incomplete rows once, before any numeric conversion.
    item.dropna(inplace=True)

    # Decimal comma -> decimal point, then cast to float.
    for col in value_columns:
        item[col] = item[col].replace(",", ".", regex=True).astype(float)

    # .copy() makes the clipped frame independent, so the in-place renames and
    # drops done in the following cells do not act on a filtered selection.
    eb1_sensor_df_dict[key] = item[
        (item["DateTime"] >= eb1_min_date) & (item["DateTime"] <= eb1_max_date)
    ].copy()

In [None]:
# The measured value lives in the first column whose name contains "_PV";
# rename that column after the measured parameter (the dict key).
for key, item in eb1_sensor_df_dict.items():
    pv_column = next((name for name in item.columns if "_PV" in name), None)
    if pv_column is not None:
        item.rename(columns={pv_column: key}, inplace=True)

In [None]:
# Quality filter: a sample is kept only when every column after the
# measurement (the second column) equals 0. Afterwards only DateTime and the
# measurement remain.
for key, item in eb1_sensor_df_dict.items():
    extra_columns = item.columns[2:]

    # True for rows where at least one of the extra columns is non-zero.
    invalid = item.iloc[:, 2:].ne(0).any(axis=1)

    item.drop(index=item.index[invalid], inplace=True)
    item.drop(columns=extra_columns, inplace=True)

In [None]:
# Median spacing between consecutive readings, per sensor.
sampling_rates = {
    name: frame["DateTime"].diff().median()
    for name, frame in eb1_sensor_df_dict.items()
}

# Common grid: the spacing shared by the largest number of sensors.
observed_rates = list(sampling_rates.values())
sampling_rate = max(set(observed_rates), key=observed_rates.count)

# Put every sensor on that grid: index by time, keep the first of duplicated
# timestamps, resample and fill the new slots by time-weighted interpolation.
for name in eb1_sensor_df_dict:
    indexed = eb1_sensor_df_dict[name].set_index("DateTime")
    indexed = indexed[~indexed.index.duplicated(keep="first")]
    eb1_sensor_df_dict[name] = (
        indexed.resample(sampling_rate).interpolate(method="time").reset_index()
    )

In [None]:
sampling_rates

In [None]:
# Outer-join all sensor frames on DateTime, starting from the first one in the
# dictionary, then order the result chronologically.
eb1_frames = iter(eb1_sensor_df_dict.values())
eb1_sensor_df = next(eb1_frames)
for other_frame in eb1_frames:
    eb1_sensor_df = eb1_sensor_df.merge(other_frame, on="DateTime", how="outer")

eb1_sensor_df.sort_values("DateTime", inplace=True)

In [None]:
# DateTime first, every other column in alphabetical order.
eb1_value_columns = sorted(
    name for name in eb1_sensor_df.columns if name != "DateTime"
)
eb1_sensor_df = eb1_sensor_df.reindex(columns=["DateTime", *eb1_value_columns])

In [None]:
eb1_sensor_df

In [None]:
# Persist the merged EB1 sensor table.
# NOTE(review): unlike the lab exports above, index=False is not passed, so the
# positional index is written as an extra first column -- confirm that the
# readers of this file expect it.
eb1_sensor_df.to_excel(os.path.join(interm_data_path, "EB1_Sensor.xlsx"))

# THM measurements

In [None]:
# Strip the trailing ";" (and line break) from EVERY line of the THM file so
# that pandas does not see an empty extra column, then rewrite the file in
# place. Each line is terminated with exactly one newline.
with open(thm_measure_path, "r") as file:
    corrected_lines = [line.rstrip(";\n") + "\n" for line in file]

with open(thm_measure_path, "w") as file:
    file.writelines(corrected_lines)

In [None]:
# Parse the cleaned THM file. na_filter=False disables missing-value
# detection, so every field is kept exactly as written in the file.
raw_thm_measure_df = pd.read_csv(thm_measure_path, sep=";", na_filter=False)

In [None]:
raw_thm_measure_df

In [None]:
# Deep copy, so the transformations below leave the raw frame untouched.
thm_measure_df = deepcopy(raw_thm_measure_df)

In [None]:
# Parse the day-first date and the "<hour>h<minute>" time, keeping only the
# calendar date and the clock time respectively.
thm_measure_df = thm_measure_df.assign(
    Data=pd.to_datetime(thm_measure_df["Data"], format="%d/%m/%Y").dt.date,
    Hora=pd.to_datetime(thm_measure_df["Hora"], format="%Hh%M").dt.time,
)

In [None]:
# Cast the four THM species and their total to float.
thm_value_columns = ["TCM", "DCBM", "CDBM", "TBM", "TTHMs"]
thm_measure_df[thm_value_columns] = thm_measure_df[thm_value_columns].astype(
    float
)

In [None]:
# The "PM" column is not used afterwards.
# NOTE(review): presumably the sampling-point identifier -- confirm.
thm_measure_df.drop(columns=["PM"], inplace=True)

In [None]:
# Combine the date and time columns into one timestamp, placed as the first
# column.
thm_stamp_text = (
    thm_measure_df["Data"].astype(str) + " " + thm_measure_df["Hora"].astype(str)
)
thm_measure_df.insert(
    loc=0, column="DateTime", value=pd.to_datetime(thm_stamp_text)
)

In [None]:
# The separate date/time parts are redundant once DateTime exists.
thm_measure_df.drop(columns=["Data", "Hora"], inplace=True)

In [None]:
thm_measure_df

In [None]:
# Persist the THM table.
# NOTE(review): unlike the lab exports above, index=False is not passed, so the
# positional index is written as an extra first column -- confirm that the
# readers of this file expect it.
thm_measure_df.to_excel(os.path.join(interm_data_path, "THMs.xlsx"))