In [1]:
import pandas as pd
import os

In [6]:
# Input / output locations, relative to the notebook's working directory.
#   variable_dir: root folder that holds one sub-folder per variable.
#   output_dir:   despite its name, this is the path of the pickle FILE that
#                 the merged table is written to, not a directory.
variable_dir, output_dir = (
    "../../../../../../data/test_correlation/data_soundscape",
    "../../../../../../data/test_correlation/PCA/PCA.pkl",
)

In [9]:
# Build one table with a column per environmental variable, restricted to the
# MAHY campaign period, and save it as a pickle at `output_dir`.

# Define MAHY campaigns date range (so that current_speed and other variables match)
date_min = pd.to_datetime("2020-10-13")
date_max = pd.to_datetime("2024-09-27")

# Name of variables; each one is loaded from <variable_dir>/<name>/h2.pkl
variables = [
    "wind_speed",
    "current_speed",
    "total_precipitation",
    "wave_height",
    "wave_period"
]

# Maps variable name -> its single-column, date-filtered DataFrame
daily_dfs = {}

for var in variables:
    file_path = os.path.join(variable_dir, var, "h2.pkl")  # h2 is the hydrophone with best correlations to sound

    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(file_path)

    # resample() and the date comparison below both need a DatetimeIndex
    df.index = pd.to_datetime(df.index)

    # current_speed is deliberately NOT resampled and keeps its stored sampling
    # (assumes it is already one value per day; TODO confirm).
    if var != "current_speed":
        if var == "total_precipitation":
            df = df.resample("D").mean()  # Mean filter for total precipitation is more meaningful
        else:
            df = df.resample("D").median()

    # Restrict dataframe to MAHY date range (both bounds inclusive)
    df = df[(df.index >= date_min) & (df.index <= date_max)]

    # Rename the data column after the variable; this assignment requires the
    # frame to have exactly one column and raises otherwise.
    df.columns = [var]

    # Store data
    daily_dfs[var] = df

# Merge all variables on the date index (outer alignment: a date missing from
# one variable yields NaN in that column) and sort chronologically.
df_final = pd.concat(daily_dfs.values(), axis=1).sort_index()

# Make sure the destination folder exists before writing; to_pickle does not
# create missing parent directories. `or "."` covers a bare file name.
os.makedirs(os.path.dirname(output_dir) or ".", exist_ok=True)
df_final.to_pickle(output_dir)

In [10]:
# Show the merged table (one column per variable) as this cell's output.
df_final

Unnamed: 0,wind_speed,current_speed,total_precipitation,wave_height,wave_period
2020-10-13,3.260732,0.359753,0.000208,0.880543,6.897312
2020-10-14,5.427910,0.364634,0.000546,1.165013,6.425883
2020-10-15,2.552592,0.362585,0.000002,1.026866,6.963400
2020-10-16,2.530052,0.378530,0.000012,1.235508,9.658520
2020-10-17,3.403784,0.379249,0.000044,1.195181,9.225677
...,...,...,...,...,...
2024-09-23,2.930956,0.065563,0.000054,1.068743,8.404451
2024-09-24,2.463402,0.055704,0.000065,1.382326,8.313869
2024-09-25,4.648966,0.048254,0.000051,1.262825,7.322538
2024-09-26,1.851627,0.045414,0.000053,1.135633,8.522695
