In [1]:
import pandas as pd
import os

In [2]:
# Root folder of the input data: each variable is read from
# <variable_dir>/<variable name>/h2.pkl by the loading cell.
variable_dir = "../../../../data/test_correlation/data_soundscape"
# NOTE: despite its name, this is the path of the output pickle FILE
# (it is passed directly to DataFrame.to_pickle), not a folder.
output_dir = "../../../../data/test_correlation/PCA/PCA.pkl"

In [3]:
# Define MAHY campaigns date range (so that current_speed and other variables match)
date_min = pd.to_datetime("2020-10-13")
date_max = pd.to_datetime("2024-09-27")

# Names of the variables to merge; each one is read from <variable_dir>/<name>/h2.pkl
variables = [
    "wind_speed",
    "current_speed",
    "total_precipitation",
    "wave_height",
    "wave_period",
]

# One single-column DataFrame per variable, keyed by variable name
daily_dfs = {}

for var in variables:
    file_path = os.path.join(variable_dir, var, "h2.pkl")  # h2 is the hydrophone with best correlations to sound

    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    df = pd.read_pickle(file_path)

    # A pickled Series has no columns to rename; promote it to a one-column
    # frame so the rename below really sets the column name.
    if isinstance(df, pd.Series):
        df = df.to_frame()

    df.index = pd.to_datetime(df.index)

    # Aggregate to one value per calendar day.
    if var == "total_precipitation":
        df = df.resample("D").mean()  # Mean filter for total precipitation is more meaningful
    elif var != "current_speed":
        df = df.resample("D").median()
    # current_speed is intentionally left un-resampled -- presumably it is
    # already stored at daily resolution; TODO confirm.

    # Restrict dataframe to MAHY date range. Both bounds are inclusive, but
    # date_max is midnight, so any later timestamp on that day is dropped
    # (only relevant for data that was not resampled to daily above).
    df = df[(df.index >= date_min) & (df.index <= date_max)]

    # Rename the single data column after the variable
    df.columns = [var]

    # Store data
    daily_dfs[var] = df

# Merge all variables on the date index and order rows chronologically
df_final = pd.concat(daily_dfs.values(), axis=1).sort_index()

# to_pickle does not create missing folders, so make sure the parent exists.
# The guard covers a bare file name, for which dirname() is "".
output_parent = os.path.dirname(output_dir)
if output_parent:
    os.makedirs(output_parent, exist_ok=True)

df_final.to_pickle(output_dir)

In [4]:
# Sanity check: show the first five rows of the merged table
print(df_final.head(n=5))

            wind_speed  current_speed  total_precipitation  wave_height  \
2020-10-13    2.507703       0.121684             0.000190     0.803816   
2020-10-14    5.274800       0.123749             0.000440     1.099759   
2020-10-15    2.550296       0.143577             0.000006     0.936439   
2020-10-16    2.283948       0.138994             0.000029     1.107630   
2020-10-17    2.886081       0.154113             0.000050     1.074547   

            wave_period  
2020-10-13     6.389330  
2020-10-14     6.067352  
2020-10-15     6.712574  
2020-10-16     9.223223  
2020-10-17     8.870979  
