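An early version of the sampler function, which draws consecutive fixed-length samples that overlap by a chosen number of points (60-point samples in the original setup; the default `sequence_length` in the code below is 45).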

In [4]:
import pandas as pd
import numpy as np
import random
import os

In [9]:
def generate_consecutive_samples(read, batch, sequence_length = 45, overlap = 10):
    """
    Draws overlapping, fixed-length samples from Excel sheets stored in a directory.

    A random .xlsx file is picked from read, then sheets of that file are picked
    repeatedly (a sheet's chance is proportional to its number of rows). From every
    picked sheet a run of consecutive windows is cut, starting at a random row; each
    window shares overlap rows with the previous one. When a picked sheet is shorter
    than sequence_length, another random file is drawn.

    read is the directory where Excel files containing variables are stored;
    batch is the number of samples that will be generated;
    sequence_length is the length (number of rows) of each sample;
    overlap is the number of points each sample overlaps with the previous one,
    it has to be smaller than sequence_length.

    Returns a list of batch numpy arrays of shape (sequence_length, 7);
    the list is empty when batch is not positive.
    Raises ValueError if sequence_length is not positive, if overlap is not smaller
    than sequence_length, or if no sheet in the directory is long enough to sample;
    raises FileNotFoundError if the directory contains no .xlsx files.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    # With overlap >= sequence_length the window start would never advance,
    # so the sampling loop could not terminate.
    if overlap >= sequence_length:
        raise ValueError("overlap must be smaller than sequence_length")

    # The 7 variables every sample consists of.
    # NOTE(review): the third entry used to be a second "velocity_y"; "velocity_z"
    # is assumed to be the intended column - confirm against the feature files.
    columns = [
        "velocity_x", "velocity_y", "velocity_z", "tortuosity3d",
        "angular_velocity_yz", "angular_velocity_xz", "angular_velocity_xy",
    ]

    def load_excel(file_path):
        """
        Extracts all sheets of an Excel file as a {sheet name: DataFrame} dict.
        """
        # The context manager closes the workbook handle once all sheets are read
        with pd.ExcelFile(file_path) as excel_file:
            return {
                sheet_name: pd.read_excel(excel_file, sheet_name=sheet_name)
                for sheet_name in excel_file.sheet_names
            }

    def excel_files(read):
        """
        Lists the paths of all non-hidden .xlsx files in a provided directory.
        """
        files = [f for f in os.listdir(read) if f.endswith(".xlsx") and not f.startswith(".")]

        if not files:
            raise FileNotFoundError("No Excel files found in the directory")

        return [os.path.join(read, f) for f in files]

    def sampler(sheet_data, sequence_length, overlap):
        """
        Creates samples from an Excel sheet. Samples include 7 variables, each containing sequence_length of points.
        sheet_data is the sheet from which data is going to be sampled;
        sequence_length is the number of points each sample will contain;
        overlap is the number of points sample a will share with sample a+1.
        Returns an empty list when the sheet is shorter than sequence_length.
        """
        last_start = len(sheet_data) - sequence_length

        if last_start < 0:
            return []

        first_start = random.randint(0, last_start)
        # Consecutive windows start (sequence_length - overlap) rows apart,
        # the range ends at the last start that still fits a whole window
        step = sequence_length - overlap

        return [
            sheet_data.iloc[start : start + sequence_length].to_numpy()
            for start in range(first_start, last_start + 1, step)
        ]

    def weighted_random_choice(sheets):
        """
        Picks a sheet name at random, weighted by the length of each sheet.
        """
        names = list(sheets)
        # random.choices normalises the weights itself, raw lengths are enough
        lengths = [len(sheets[name]) for name in names]

        return random.choices(names, weights=lengths, k=1)[0]

    #Initializes list to store sample arrays
    samples = []

    if batch <= 0:
        return samples

    candidates = excel_files(read)

    while len(samples) < batch:
        if not candidates:
            raise ValueError("No sheet in the directory has at least sequence_length rows")

        file_path = random.choice(candidates)
        sheets = load_excel(file_path)

        #A file without a single long enough sheet can never yield a sample;
        #it is dropped so that the loop cannot keep redrawing it forever
        if not any(len(data) >= sequence_length for data in sheets.values()):
            candidates.remove(file_path)
            continue

        while len(samples) < batch:
            sheet_name = weighted_random_choice(sheets)
            sheet_data = sheets[sheet_name][columns]
            new_samples = sampler(sheet_data, sequence_length, overlap)

            #If the chosen sheet is too short to sample, moves to another random file
            if not new_samples:
                break

            samples.extend(new_samples)

    #The last sheet may have produced more samples than needed, trims to batch size
    return samples[: batch]

In [11]:
# Directory holding the Excel feature files to sample from
directory = "/Users/maks/Documents/MSc_project/data/features"

# Draws 5 samples with the default sequence_length and overlap.
# The sampler defined in this notebook is generate_consecutive_samples;
# no generate_samples is defined here.
test = generate_consecutive_samples(directory, 5)

In [14]:
# Sanity check: shape of the second sample, then the number of samples drawn
print(test[1].shape, len(test), sep="\n")

(45, 7)
5
