In [1]:
# Import Libraries needed
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from os import listdir
from os.path import join
from datetime import datetime

In [2]:
# Dataset locations: root folder, mechanical rain-gauge CSV, and folder of wav recordings
DATA_PATH = "/kaggle/input/rain-data-master-8k"
MECH_FILE_PATH = "/kaggle/input/rain-data-master-8k/rain_data_mechanical_master.csv"
NON_MECH_PATH = "/kaggle/input/rain-data-master-8k/rainfall_sound_8k"

# Sampling rate handed to librosa when the wav files are loaded
Fs = 8_000
# Maximum number of samples kept per combined clip (1,360,000 / 8,000 = 170 s at Fs)
MAX_LEN = 1_360_000

In [21]:
# Load the mechanical rain-gauge readings. MECH_FILE_PATH is already an
# absolute path, so it is read directly rather than joined onto DATA_PATH.
mech_data = pd.read_csv(MECH_FILE_PATH)

# Parse the "Time" column into pandas datetimes
mech_data["Time"] = pd.to_datetime(mech_data["Time"])

# List the wav recordings in ascending filename order
wave_files = listdir(NON_MECH_PATH)
wave_files.sort()

# Total number of wav files available
N = len(wave_files)

In [22]:
# Function to turn a wav filename into the datetime encoded in its name
def filename_parser(filename):
    """Parse the timestamp embedded in a wav filename.

    The part of ``filename`` before the first "." must consist of exactly
    seven "_"-separated integers: year, month, day, hour, minute, second and
    one trailing field that is ignored.

    Returns:
        datetime: built from the first six fields.

    Raises:
        ValueError: if a field is not an integer or the field count is not seven.
    """
    stem = filename.split(".")[0]
    fields = [int(piece) for piece in stem.split("_")]
    year, month, day, hour, minute, second, _ = fields
    return datetime(year, month, day, hour, minute, second)

# wave_files is sorted, so its first and last entries carry the earliest and
# latest timestamps encoded in the wav filenames.
first_wave_file, last_wave_file = wave_files[0], wave_files[-1]

# Earliest time for which a wav file is available
start_time = filename_parser(first_wave_file)

# Latest time for which a wav file is available
end_time = filename_parser(last_wave_file)

In [23]:
# Keep only the mechanical readings that fall strictly between the first and
# last wav-file timestamps, i.e. where both data sources are available.
row_overlap = (mech_data["Time"] > start_time) & (mech_data["Time"] < end_time)

# Take an explicit copy: the filtered frame gets new columns assigned later,
# and assigning onto a slice of the original frame triggers pandas'
# SettingWithCopyWarning (and may not write where expected).
mech_data = mech_data.loc[row_overlap].copy()

# permanent_setup_day = datetime(2023, 12, 8)
# mech_data = mech_data[mech_data["Time"]>=permanent_setup_day]

In [24]:
# Derive date ("YMD") and time-of-day ("HMS") strings from "Time" WITHOUT
# overwriting the datetime column itself: later cells compare "Time" values
# against datetime objects parsed from wav filenames, and a str-vs-datetime
# comparison raises TypeError.
time_parts = mech_data["Time"].astype("str").str.split(" ", expand=True)

# .assign returns a new frame, so no columns are set on a filtered slice.
mech_data = mech_data.assign(YMD=time_parts[0], HMS=time_parts[1])

# Distinct days present in the overlapping readings (ordered by row count)
mech_data["YMD"].value_counts().keys()

Index(['2023-12-17', '2023-11-23', '2023-12-01', '2023-11-22', '2023-12-02',
       '2024-01-05', '2023-12-08', '2023-12-16', '2023-12-18'],
      dtype='object', name='YMD')

In [None]:
# Function to read one wav file with librosa at a requested sampling rate
def load_wav(file_path, Fs):
    """Load an audio file and report its sampling rate and duration.

    Args:
        file_path: path of the audio file, forwarded to ``librosa.load``.
        Fs: sampling rate forwarded to ``librosa.load`` as ``sr``.

    Returns:
        tuple: ``(samples, sampling_rate, duration)`` where ``sampling_rate``
        is the rate returned by ``librosa.load`` and ``duration`` is the value
        returned by ``librosa.get_duration`` for those samples.
    """
    samples, sample_rate = librosa.load(file_path, sr=Fs)
    length = librosa.get_duration(y=samples, sr=sample_rate)
    return samples, sample_rate, length

In [None]:
# Function to trim unit from target column and convert all values to millimeters
def format_rainfall(rain_fall):
    """Convert a "<number> <unit>" rainfall reading into millimetres.

    Args:
        rain_fall: text such as ``"250 µm"`` or ``"1.2 mm"``. Any amount of
            whitespace may separate (or surround) the number and the unit.

    Returns:
        float: the reading expressed in millimetres.

    Raises:
        ValueError: if the text is not exactly a number followed by a unit,
            the number cannot be parsed, or the unit is not recognised.
            (Previously an unknown unit silently returned the raw string.)
    """
    # Divisor that converts each supported unit to millimetres. The micro
    # prefix exists as two distinct Unicode characters that look identical,
    # so both spellings (plus the ASCII fallback "um") are accepted.
    mm_divisor_by_unit = {
        "\u00b5m": 10**3,  # MICRO SIGN + "m"
        "\u03bcm": 10**3,  # GREEK SMALL LETTER MU + "m"
        "um": 10**3,
        "mm": 1,
    }

    parts = str(rain_fall).split()
    if len(parts) != 2:
        raise ValueError(
            "expected '<value> <unit>', got {!r}".format(rain_fall)
        )
    value, unit = parts

    if unit not in mm_divisor_by_unit:
        raise ValueError(
            "unsupported rainfall unit {!r} in {!r}".format(unit, rain_fall)
        )
    return float(value) / mm_divisor_by_unit[unit]

In [None]:
# Function to pair every mechanical reading with the interval that precedes it
def get_checkpoints(mech_data, wave_files):
    """Build interval starts, interval ends and rainfall targets.

    Each row of ``mech_data`` contributes one checkpoint (its "Time" value)
    and one target (its "device_frmpayload_data_rainfall" value passed through
    ``format_rainfall``). The interval for the first row starts at the
    timestamp parsed from ``wave_files[0]``; every later interval starts at
    the previous row's checkpoint.

    Args:
        mech_data: DataFrame with "Time" and
            "device_frmpayload_data_rainfall" columns.
        wave_files: sequence of wav filenames; only the first one is used.

    Returns:
        tuple: ``(start_times, checkpoints, targets)``, three lists of equal
        length (one entry per row of ``mech_data``).
    """
    first_start = filename_parser(wave_files[0])

    # Single pass over the rows, collecting (checkpoint, target) pairs.
    readings = [
        (row["Time"], format_rainfall(row["device_frmpayload_data_rainfall"]))
        for _, row in mech_data.iterrows()
    ]
    checkpoints = [when for when, _ in readings]
    targets = [amount for _, amount in readings]

    # Each interval begins where the previous one ended; dropping the last
    # element keeps the list the same length as `checkpoints` (empty included).
    start_times = [first_start, *checkpoints][:-1]
    return start_times, checkpoints, targets

In [None]:
# Build the interval boundaries and targets, then report how many of each we got
start_times, checkpoints, targets = get_checkpoints(mech_data, wave_files)
K = len(checkpoints)

for label, size in (
    ("Length of start times: ", len(start_times)),
    ("Length of checkpoints: ", K),
    ("Length of targets: ", len(targets)),
):
    print(label, size)

In [None]:
# Function to check whether a given wav file belongs in a given time interval
def file_flagger(file_name, start_time, checkpoint):
    """Tell whether a wav file's timestamp lies within ``[start_time, checkpoint]``.

    Args:
        file_name: wav filename made of "_"-separated fields; the first six
            are year, month, day, hour, minute, second and the last field
            (which carries the extension) is ignored.
        start_time: inclusive lower bound, either a datetime-like object or a
            timestamp string that ``pandas.to_datetime`` can parse.
        checkpoint: inclusive upper bound, same accepted types as
            ``start_time``.

    Returns:
        bool: True when ``start_time <= file timestamp <= checkpoint``.

    Raises:
        ValueError: if the filename does not contain six integer timestamp
            fields, or a bound cannot be parsed as a timestamp.
    """
    # Drop the trailing "<suffix>.wav" field; the remaining six are the timestamp.
    year, month, day, hour, minute, second = map(int, file_name.split("_")[:-1])
    fname_time = datetime(year, month, day, hour, minute, second)

    # Bounds may be datetime objects or timestamp strings. Normalise both so
    # the comparison never mixes str with datetime (which raises TypeError).
    lower_bound = pd.to_datetime(start_time)
    upper_bound = pd.to_datetime(checkpoint)
    return bool(lower_bound <= fname_time <= upper_bound)

def file_filter(wave_files, start_time, checkpoint):
    """Return the wav files that ``file_flagger`` accepts for the interval.

    Args:
        wave_files: iterable of wav filenames.
        start_time: interval start, forwarded to ``file_flagger``.
        checkpoint: interval end, forwarded to ``file_flagger``.

    Returns:
        list: the accepted filenames, in their original order.
    """
    kept = []
    for candidate in wave_files:
        if file_flagger(candidate, start_time, checkpoint):
            kept.append(candidate)
    return kept

In [None]:
# Code block to combine, for every interval, all wav files whose timestamp
# falls between that interval's start time and its checkpoint. One .npy file
# is written per non-empty interval and a summary table is saved as CSV.
target = np.array([])  # not used in this cell; kept so the name stays defined

# Collect one dict per interval and build the DataFrame once at the end;
# concatenating onto a growing frame inside the loop is quadratic.
records = []

# NOTE(review): index 0 is skipped, as in the original loop -- presumably
# because its interval starts at the first wav file rather than at a
# mechanical reading; confirm this is intended.
for idx in tqdm(range(1, K)):
    # Zero-length interval (start equals checkpoint): nothing to collect.
    if start_times[idx] == checkpoints[idx]:
        continue

    selected_files = file_filter(wave_files, start_times[idx], checkpoints[idx])
    num_files = len(selected_files)
    if not num_files:
        continue

    # Load every file first and join them in one call instead of np.append
    # per file, which re-copies the whole accumulated array each time.
    # The returned sampling rate and duration are not needed here, and not
    # unpacking them avoids rebinding the global Fs.
    chunks = [
        load_wav(join(NON_MECH_PATH, one_file), Fs)[0]
        for one_file in selected_files
    ]
    # Truncate to MAX_LEN samples; cast to float64 to keep the dtype that the
    # previous np.append-onto-empty-array approach produced.
    audio_sample = np.concatenate(chunks)[:MAX_LEN].astype(np.float64)

    fname = "audio_{}.npy".format(idx)
    with open(fname, "wb") as f:
        np.save(f, audio_sample)

    records.append({"start_time": start_times[idx],
                    "checkpoint": checkpoints[idx],
                    "num_files": num_files,
                    "fname": fname,
                    "target": targets[idx]})

data_basic = pd.DataFrame(records)
data_basic.to_csv("rain_data_basic.csv")