### Sequence pre-processing

This notebook takes a CSV containing six input streams: gyroscope and accelerometer readings along each of the three axes. The output is a TensorFlow tensor of shape 6×Y, where Y is the selected sequence length. The model only accepts fixed-length time frames as input, and this notebook prepares the CSV data for that use. 
This notebook has two parameters. The first is `overlap`, a float between 0 and 1 that sets the fraction of overlap between two consecutive time frames. The second is `length`, an integer giving the desired length of each sequence (in seconds).  

In [24]:
from pathlib import Path
from config import config
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [46]:
def pad_with_zeros(df, rows_to_pad):
    """Return ``df`` with ``rows_to_pad`` all-zero rows appended at the bottom.

    The zero rows use the same columns as ``df``. The result is re-indexed
    from 0, so the original index labels are not preserved.
    """
    zero_rows = pd.DataFrame(
        0,
        columns=df.columns,
        index=range(rows_to_pad),
    )
    # Stack the filler block underneath the original rows.
    return pd.concat([df, zero_rows], ignore_index=True)

def create_sequence(metadata_csv_path: Path, overlap: float, length: int):
    """Cut a sensor CSV into fixed-duration, overlapping time windows.

    The CSV is read with its first line skipped. It must contain a ``time``
    column (treated as microseconds — NOTE(review): confirm the unit against
    the CSV exporter). The first two columns are dropped from every window;
    the remaining columns are the window's features.

    Args:
        metadata_csv_path: Path of the CSV file to read.
        overlap: Fraction of each window shared with the next one. Must
            satisfy ``0 <= overlap < 1``; a value of 1 would never advance
            the window start.
        length: Window duration in seconds. Must be positive.

    Returns:
        The result of ``pad_sequences`` over all windows, zero-padded at the
        end (``padding='post'``) as ``float32`` so every window has the same
        number of rows.

    Raises:
        ValueError: If ``overlap`` or ``length`` is out of range, or if the
            file has fewer rows than one window is expected to need.
    """
    # Validate before touching the file so bad arguments fail fast.
    if not 0 <= overlap < 1:
        raise ValueError('Overlap must be at least 0 and strictly less than 1')
    if length <= 0:
        raise ValueError('Length must be a positive number of seconds')

    # GoPro Max 360 has a nominal sample speed of 200/s.
    # NOTE(review): 197 is used rather than 200 — presumably a measured
    # effective rate; confirm.
    data_points = length * 197
    length_microsec = length * 1000000
    # Distance between the starts of two consecutive windows.
    step = length_microsec * (1 - overlap)

    df = pd.read_csv(metadata_csv_path, skiprows=1)
    if data_points > len(df):
        raise ValueError('Given length is longer than inputted video')

    #
    # Call CSV Preprocessing
    #

    tensors = []
    max_time = df['time'].max()
    start = 0

    # Partitioning the dataframe
    while start <= max_time:
        end = start + length_microsec
        in_window = (df['time'] >= start) & (df['time'] < end)
        partition = df[in_window].iloc[:, 2:]
        tensors.append(tf.convert_to_tensor(partition, dtype=tf.float32))
        # This window already reaches past the last sample: nothing left.
        if end > max_time:
            break
        start += step

    return pad_sequences(tensors, padding='post', dtype='float32')
    

In [None]:
# Demo run on the sample recording with overlap=0.5 and length=10.
t = create_sequence(config.DATA_DIR / 'CSVs' / 'GH010038-ACC&GYRO.csv', 0.5, 10)
# `display` is not defined in this file — presumably the IPython/Jupyter
# built-in that renders the value in the cell output.
display(t)
# Size of the first axis of the result, i.e. how many windows were produced.
print(len(t))
