In [2]:
import bisect
import os

import pandas as pd
import tensorflow as tf
from utils.data_utils import * 
class NPYSequence(tf.keras.utils.Sequence):
    """Keras ``Sequence`` serving fixed-size row batches from a folder of ``.npy.gz`` files.

    Every matching file is read once at construction to record how many rows
    it holds. Batch ``i`` is then always the global row range
    ``[i * batch_size, (i + 1) * batch_size)`` over the files taken in sorted
    name order, so batches can be requested in any order and the sequence can
    be iterated any number of times. Only one file is kept in memory at a
    time.

    Files are loaded with ``read(file_path, column_names_file)`` from the
    project's data utilities; its result is used as a pandas object (``len``,
    ``.iloc`` and ``pd.concat``) -- presumably a DataFrame, confirm against
    ``utils.data_utils``.

    The single-file cache is plain instance state with no locking.
    """

    def __init__(self, folder_path, column_names_file, batch_size=32):
        """Index the folder and preload its first file.

        Args:
            folder_path: Directory scanned for files ending in ``.npy.gz``.
            column_names_file: Path forwarded to ``read`` with every data file.
            batch_size: Number of rows per batch; must be positive.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        super().__init__()
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
        self.folder_path = folder_path
        self.column_names_file = column_names_file
        self.batch_size = batch_size
        # os.listdir returns names in arbitrary order; sorting makes the
        # global row order (and therefore every batch) reproducible.
        self.file_names = sorted(
            name for name in os.listdir(folder_path) if name.endswith('.npy.gz')
        )
        self.current_data = None    # rows of the file currently cached
        self.current_index = 0      # global row index of current_data's first row
        self._current_file = None   # position in file_names of the cached file
        self._file_offsets = [0]    # _file_offsets[k] = global index of file k's first row
        self.total_samples = self.calculate_total_samples()
        self.load_next_file()

    def calculate_total_samples(self):
        """Read every file once, record cumulative row offsets, return the row total."""
        offsets = [0]
        for file_name in self.file_names:
            file_path = os.path.join(self.folder_path, file_name)
            offsets.append(offsets[-1] + len(read(file_path, self.column_names_file)))
        self._file_offsets = offsets
        return offsets[-1]

    def __len__(self):
        """Number of full batches; a trailing partial batch is dropped."""
        return self.total_samples // self.batch_size

    def __getitem__(self, index):
        """Return batch ``index`` as ``batch_size`` consecutive rows.

        The batch may span several files; the pieces are concatenated in file
        order with their original row labels.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        n_batches = len(self)
        if index < 0:
            index += n_batches
        if not 0 <= index < n_batches:
            raise IndexError(f"batch index out of range for {n_batches} batches")

        start = index * self.batch_size
        stop = start + self.batch_size
        # Last file whose first row is <= start; bisect_right steps past any
        # empty files that share the same offset.
        file_pos = bisect.bisect_right(self._file_offsets, start) - 1

        pieces = []
        while start < stop:
            file_start = self._file_offsets[file_pos]
            file_stop = self._file_offsets[file_pos + 1]
            if file_stop > file_start:  # nothing to take from an empty file
                frame = self._load_file(file_pos)
                take_to = min(stop, file_stop)
                pieces.append(frame.iloc[start - file_start:take_to - file_start])
                start = take_to
            file_pos += 1

        if len(pieces) == 1:
            return pieces[0]
        return pd.concat(pieces, axis=0)

    def load_next_file(self):
        """Cache the file after the current one, wrapping to the first at the end.

        Does nothing when the folder holds no matching files. The file list is
        never consumed, so the sequence stays usable for further passes.
        """
        if not self.file_names:
            return
        if self._current_file is None:
            next_pos = 0
        else:
            next_pos = (self._current_file + 1) % len(self.file_names)
        self._load_file(next_pos)

    def _load_file(self, file_pos):
        """Ensure file ``file_pos`` is the cached one and return its rows."""
        if file_pos != self._current_file or self.current_data is None:
            file_path = os.path.join(self.folder_path, self.file_names[file_pos])
            self.current_data = read(file_path, self.column_names_file)
            self._current_file = file_pos
            self.current_index = self._file_offsets[file_pos]
        return self.current_data
    
    

# Folder that NPYSequence scans for files ending in '.npy.gz'.
data_folder = './Small_Preprocessed_Data/Train'
# Column-names file passed to read() together with every data file.
column_names_file = './Small_Preprocessed_Data/Train/column_names.txt'
# Number of rows per batch.
batch_size = 64

# Construction reads every matching file once to count its rows.
data_generator = NPYSequence(data_folder, column_names_file, batch_size=batch_size)



In [3]:
# Number of batches the sequence reports; len() dispatches to NPYSequence.__len__.
len(data_generator)

34

In [8]:
# Smoke test: walk the sequence once, printing each batch's 0-based position.
# Counting from 1 leaves `i` equal to the number of batches seen after the loop
# (and 0 if the sequence yields nothing).
i = 0
for i, b in enumerate(data_generator, start=1):
    print(i - 1)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
