In [22]:
import tensorflow as tf
import os
from datetime import datetime

In [23]:
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_visible_devices(gpus[1], 'GPU')
tf.config.experimental.set_memory_growth(gpus[1], True)

AUTOTUNE = tf.data.AUTOTUNE

In [24]:
INPUT_AUDIO_DIR = os.path.dirname(os.getcwd()) + "/data-manipulation/input/audio/"
INPUT_META_DIR = os.path.dirname(os.getcwd()) + "/data-manipulation/input/metadata/"

if not os.path.exists(INPUT_AUDIO_DIR):
    os.makedirs(INPUT_AUDIO_DIR)
if not os.path.exists(INPUT_META_DIR):
    os.makedirs(INPUT_META_DIR)

## Define functions

In [25]:
def split_into_windows(waveform, frame_length=2205, frame_step=1102): # 50ms windows with 50% overlap
    print("Waveform shape before framing:", waveform.shape)
    frames = tf.signal.frame(waveform, frame_length=frame_length, frame_step=frame_step)
    print("Frames shape after framing (with overlap):", frames.shape)
    return frames

def split_into_sequences(frames, sequence_length=3):
    num_frames = tf.shape(frames)[0]
    sequence_step = 1
    start_indices = tf.range(0, num_frames - sequence_length + 1, sequence_step)
    sequences = tf.map_fn(
        lambda start: frames[start:start + sequence_length],
        start_indices,
        fn_output_signature=tf.TensorSpec(shape=(sequence_length, frames.shape[1]), dtype=frames.dtype)
    )
    print("Frames grouped into sequences:", sequences.shape)
    return sequences

In [None]:
def getAudioTimestamps():
    timestamps = []
    for file in os.listdir(INPUT_AUDIO_DIR):
        split_fn = file.split("-")
        split_fn[1] = split_fn[1][:-4] # Remove .wav extension
        split_fn[0] = float(split_fn[0])
        split_fn[1] = float(split_fn[1])
        timestamps.append({"start_time": datetime.fromtimestamp(split_fn[0]), "end_time": datetime.fromtimestamp(split_fn[1])})
    return(timestamps)

def loadMetadata():
    metadata = []
    for file in os.listdir(INPUT_META_DIR):
        with open(INPUT_META_DIR + file, "r") as file:
            fdata = file.readlines()

        for line in fdata:
            line = line.strip("\n").split(",")
            metadata.append({"key": line[0], "time": datetime.fromtimestamp(float(line[1]))})
    return metadata

# Remove keypresses keylogged outside of recording time
def filterMetadata(audio_timestamps, metadata):
    filtered_metadata = []
    for mdata in metadata:
        for audio_ts in audio_timestamps:
            start_time = audio_ts["start_time"]
            end_time = audio_ts["end_time"]
            timestamp = mdata["time"]
            if start_time <= timestamp <= end_time:
                filtered_metadata.append(mdata)
    return filtered_metadata

print(loadMetadata())
print(filterMetadata(getAudioTimestamps(), loadMetadata()))

[{'key': 'a', 'time': datetime.datetime(2024, 12, 1, 16, 23, 25, 747109)}, {'key': 'b', 'time': datetime.datetime(2024, 12, 1, 16, 23, 26, 362888)}, {'key': 'c', 'time': datetime.datetime(2024, 12, 1, 16, 23, 26, 891134)}, {'key': 'd', 'time': datetime.datetime(2024, 12, 1, 16, 23, 27, 507704)}, {'key': 'e', 'time': datetime.datetime(2024, 12, 1, 16, 23, 27, 971362)}, {'key': 'f', 'time': datetime.datetime(2024, 12, 1, 16, 23, 28, 467856)}, {'key': 'g', 'time': datetime.datetime(2024, 12, 1, 16, 23, 29, 106957)}, {'key': 'h', 'time': datetime.datetime(2024, 12, 1, 16, 23, 30, 27894)}, {'key': 'i', 'time': datetime.datetime(2024, 12, 1, 16, 23, 30, 515035)}, {'key': 'j', 'time': datetime.datetime(2024, 12, 1, 16, 23, 30, 955257)}, {'key': 'k', 'time': datetime.datetime(2024, 12, 1, 16, 23, 31, 347058)}, {'key': 'l', 'time': datetime.datetime(2024, 12, 1, 16, 23, 31, 715748)}, {'key': 'm', 'time': datetime.datetime(2024, 12, 1, 16, 23, 32, 43640)}, {'key': 'n', 'time': datetime.datetime(

## Preprocess Data

## Convert Supplementary data to Dataset

[{'start_time': datetime.datetime(2024, 12, 1, 16, 23, 48, 840918),
  'end_time': datetime.datetime(2024, 12, 1, 16, 23, 50, 654815)},
 {'start_time': datetime.datetime(2024, 12, 1, 16, 24, 17, 209348),
  'end_time': datetime.datetime(2024, 12, 1, 16, 24, 18, 944359)},
 {'start_time': datetime.datetime(2024, 12, 1, 16, 23, 34, 717905),
  'end_time': datetime.datetime(2024, 12, 1, 16, 23, 36, 440883)},
 {'start_time': datetime.datetime(2024, 12, 1, 16, 24, 29, 608866),
  'end_time': datetime.datetime(2024, 12, 1, 16, 24, 31, 344969)},
 {'start_time': datetime.datetime(2024, 12, 1, 16, 23, 57, 773402),
  'end_time': datetime.datetime(2024, 12, 1, 16, 23, 59, 504849)},
 {'start_time': datetime.datetime(2024, 12, 1, 16, 23, 40, 41845),
  'end_time': datetime.datetime(2024, 12, 1, 16, 23, 41, 780490)},
 {'start_time': datetime.datetime(2024, 12, 1, 16, 23, 25, 868043),
  'end_time': datetime.datetime(2024, 12, 1, 16, 23, 27, 648627)},
 {'start_time': datetime.datetime(2024, 12, 1, 16, 24, 4

## Save new dataset