In [1]:
import os
import pandas as pd
import tensorflow as tf
from utils.data_utils import * 
import time
import pickle

In [2]:
def save_object(object, file_path):
    """Pickle ``object`` and write the result to ``file_path``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.

    Args:
        object: Any picklable Python object. (The name shadows the builtin
            ``object``; it is kept so keyword callers keep working.)
        file_path: Destination path of the pickle file.

    Returns:
        None.
    """
    with open(file_path, mode='wb') as sink:
        # Pickler(sink).dump(...) is the documented equivalent of pickle.dump(..., sink).
        pickle.Pickler(sink).dump(object)

def load_object(file_path):
    """Read a pickle file and return the object stored in it.

    Args:
        file_path: Path of a file containing a pickled object.

    Returns:
        The unpickled object.

    Warning:
        Unpickling can execute arbitrary code, so only load files that come
        from a trusted source.
    """
    with open(file_path, mode='rb') as source:
        return pickle.load(source)

In [3]:
class NPYSequence(tf.keras.utils.Sequence):
    """Keras ``Sequence`` serving fixed-size batches from a folder of ``.npy.gz`` files.

    Every file in ``folder_path`` whose name ends in ``.npy.gz`` is parsed with
    ``read(file_path, column_names_file)``.
    NOTE(review): ``read`` is not defined in this notebook (presumably it comes
    from the star import of ``utils.data_utils``); this class relies on it
    returning a pandas DataFrame that contains ``target_column`` - confirm.

    The files, taken in sorted file-name order, are treated as one long table.
    Batch ``i`` is rows ``[i * batch_size, (i + 1) * batch_size)`` of that
    table, so any batch can be requested in any order and as often as needed
    (multiple epochs, shuffled indices). A batch may span several files. Only
    one file is kept in memory at a time, so sequential access reads each file
    once per pass.

    Attributes:
        file_names: Sorted names of the data files; never mutated after init.
        total_samples: Total number of rows over all files.
        current_data: DataFrame of the file currently held in memory, or
            ``None`` when ``load_next_file`` ran past the last file.
        current_index: Global row offset of the first row of ``current_data``.
    """

    def __init__(self, folder_path, column_names_file, batch_size=32, target_column = "human"):
        """Index the data files and load the first one.

        Args:
            folder_path: Directory containing the ``.npy.gz`` files.
            column_names_file: Path forwarded as second argument to ``read``.
            batch_size: Number of rows per batch; must be positive.
            target_column: Name of the column returned as ``y``.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        # Harmless on older Keras versions, expected by newer ones.
        super().__init__()
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        self.folder_path = folder_path
        self.column_names_file = column_names_file
        self.batch_size = batch_size
        self.target_column = target_column
        # os.listdir() order is arbitrary; sorting makes batch contents reproducible.
        self.file_names = sorted(
            name for name in os.listdir(folder_path) if name.endswith('.npy.gz')
        )
        self._file_lengths = []   # rows per file, parallel to file_names
        self._file_offsets = []   # global index of each file's first row
        self.total_samples = self.calculate_total_samples()
        self.current_data = None
        self.current_index = 0
        self._current_file = -1   # position in file_names of the loaded file
        self.load_next_file()

    def calculate_total_samples(self):
        """Read every file once, record its row count and return the total.

        Besides returning the total, this refreshes the per-file lengths and
        offsets that ``__getitem__`` uses to map a global row to a file.
        """
        lengths = []
        for file_name in self.file_names:
            file_path = os.path.join(self.folder_path, file_name)
            lengths.append(len(read(file_path, self.column_names_file)))

        offsets = []
        running_total = 0
        for n_rows in lengths:
            offsets.append(running_total)
            running_total += n_rows

        self._file_lengths = lengths
        self._file_offsets = offsets
        return running_total

    def __len__(self):
        """Number of complete batches; a trailing partial batch is dropped."""
        return self.total_samples // self.batch_size

    def __getitem__(self, index):
        """Return batch ``index`` as ``(X, y)`` numpy arrays.

        ``X`` holds every column except ``target_column``; ``y`` holds the
        target column. Both have exactly ``batch_size`` rows.

        Raises:
            IndexError: If ``index`` is outside ``range(len(self))``. This
                also lets iteration through the ``__getitem__`` protocol
                terminate instead of looping forever.
        """
        n_batches = len(self)
        if not 0 <= index < n_batches:
            raise IndexError(
                f"batch index {index} out of range for {n_batches} batches"
            )

        row = index * self.batch_size
        # Because __len__ floors, stop never exceeds total_samples.
        stop = row + self.batch_size
        file_pos = self._locate_file(row)

        pieces = []
        while row < stop:
            self._load_file(file_pos)
            file_start = self._file_offsets[file_pos]
            local_start = row - file_start
            local_stop = min(stop - file_start, self._file_lengths[file_pos])
            pieces.append(self.current_data.iloc[local_start:local_stop])
            # Advance by the recorded length so the loop always terminates,
            # including over empty files.
            row = file_start + local_stop
            file_pos += 1

        batch_data = pieces[0] if len(pieces) == 1 else pd.concat(pieces, axis=0)
        X = batch_data.drop(columns=[self.target_column]).values
        y = batch_data[self.target_column].values
        return X, y

    def load_next_file(self):
        """Load the file following the one currently in memory.

        Sets ``current_data`` to ``None`` when there is no further file.
        ``file_names`` itself is left untouched so the sequence stays
        reusable; ``__getitem__`` reloads whatever file it needs.
        """
        next_pos = self._current_file + 1
        if next_pos >= len(self.file_names):
            self.current_data = None
            return
        self._load_file(next_pos)

    def _locate_file(self, row):
        """Return the position in ``file_names`` of the file holding global row ``row``."""
        for file_pos, file_start in enumerate(self._file_offsets):
            # Empty files never match, so they are skipped automatically.
            if row < file_start + self._file_lengths[file_pos]:
                return file_pos
        raise IndexError(f"row {row} is beyond the last sample")

    def _load_file(self, file_pos):
        """Make ``file_names[file_pos]`` the file held in ``current_data``."""
        if file_pos == self._current_file and self.current_data is not None:
            return  # already in memory
        file_path = os.path.join(self.folder_path, self.file_names[file_pos])
        self.current_data = read(file_path, self.column_names_file)
        self._current_file = file_pos
        self.current_index = self._file_offsets[file_pos]
    
    

# Folder holding the training files, and the text file passed to the reader
# together with each data file.
data_folder = './Temp_Data/Train'
column_names_file = './Temp_Data/Train/column_names.txt'
# Number of rows per batch.
batch_size = 64

# Build the batch generator. Construction reads every matching file in
# `data_folder` to count rows, so this line can take a while.
data_generator_tf = NPYSequence(
    folder_path=data_folder,
    column_names_file=column_names_file,
    batch_size=batch_size,
)

In [None]:
# Number of batches the generator reports (displayed as the cell output).
len(data_generator_tf)

In [None]:
# Time one full pass over the generator. `i` ends up as the number of batches
# served (0 if the generator yields nothing) and `b` as the last batch.
i = 0

start_time = time.time()
for i, b in enumerate(data_generator_tf, start=1):
    pass
end_time = time.time()
elapsed_time_tf = end_time - start_time

In [None]:
# Round-trip the generator through pickle and confirm it still reports its length.
#
# The path is built with os.path.join instead of a hard-coded "\\" separator:
# on Windows the result is identical to the old literal, while on Linux/macOS
# the old literal wrote a file named "Generators\tf_generator.pkl" into the
# working directory instead of into ./Generators.
generator_dir = "./Generators"
generator_path = os.path.join(generator_dir, "tf_generator.pkl")
# open(..., 'wb') does not create missing parent directories.
os.makedirs(generator_dir, exist_ok=True)

save_object(data_generator_tf, generator_path)
data_generator_tf = load_object(generator_path)
len(data_generator_tf)