In [9]:
import tensorflow as tf
from sklearn.datasets import make_regression
import pandas as pd

In [11]:
data = make_regression(n_samples=10000, n_features=10, noise=0.1, n_informative=4, random_state=42)

In [21]:
df = pd.DataFrame(data[0], columns=[f"feature_{i}" for i in range(9)] + ['target'])

In [39]:
X_mean = df.iloc[0:, :-1].mean().tolist()
X_std = df.iloc[0:, :-1].std().tolist()

In [40]:
X_tf_mean = tf.constant(X_mean, dtype=tf.float32)
X_tf_std = tf.constant(X_std, dtype=tf.float32)

In [13]:
total_len = df.shape[0]
batch = 2000

for i in range(0, total_len, batch):
    batch_data = df[i:i+batch]
    batch_data.to_csv(f"./dataset/data_batch_{i//batch}.csv", index=False)
    # Here you can process each batch as needed
    print(f"Processing batch from index {i} to {i+batch}")


Processing batch from index 0 to 2000
Processing batch from index 2000 to 4000
Processing batch from index 4000 to 6000
Processing batch from index 6000 to 8000
Processing batch from index 8000 to 10000


In [14]:
filepath_dataset = tf.data.Dataset.list_files("./dataset/batched/*.csv")  # List all CSV files in the 'data' directory
print([path for path in filepath_dataset])

[<tf.Tensor: shape=(), dtype=string, numpy=b'.\\dataset\\batched\\data_batch_1.csv'>, <tf.Tensor: shape=(), dtype=string, numpy=b'.\\dataset\\batched\\data_batch_4.csv'>, <tf.Tensor: shape=(), dtype=string, numpy=b'.\\dataset\\batched\\data_batch_0.csv'>, <tf.Tensor: shape=(), dtype=string, numpy=b'.\\dataset\\batched\\data_batch_3.csv'>, <tf.Tensor: shape=(), dtype=string, numpy=b'.\\dataset\\batched\\data_batch_2.csv'>]


In [15]:
n_readers = 3
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),  # Skip header row
    cycle_length=n_readers,
    num_parallel_calls=tf.data.AUTOTUNE
)

In [19]:
dataset.take(1).as_numpy_iterator().next()

b'0.6468815545208564,0.17572471303499634,-0.10063463692310684,-0.9491805333536313,-0.09470111788131681,0.8965041830082391,-1.8372639073682737,1.8588559982040431,-0.6827956085012468,1.1186035774008911'

In [20]:
for line in dataset.take(5):
    print(line.numpy())

b'-0.0691479480641767,0.14748765748923487,0.6089459953982035,-0.13417113084509655,-0.23640872079370717,1.0259552398235767,0.6397870811226445,-0.03772038552502816,-2.256116402975414,1.1248891474904743'
b'0.47184708617350163,-0.3492270126308773,2.670400963359257,-0.3470051842189429,-0.4746339470649329,-0.3327744179460228,0.32982069852267654,0.9927660033826496,0.3693966293562307,1.8557123544605079'
b'0.6468815545208564,0.17572471303499634,-0.10063463692310684,-0.9491805333536313,-0.09470111788131681,0.8965041830082391,-1.8372639073682737,1.8588559982040431,-0.6827956085012468,1.1186035774008911'
b'0.44667662977524614,0.6850181507179419,0.12046390943776805,1.818911155425829,-1.1263322288406599,0.8020540010313438,0.9216372151068966,0.2868925990530885,0.9973382875080569,0.16664181776854453'
b'-1.075463894423159,0.3203827603963476,0.6683294172786062,-0.43959616061820234,0.1504886143301048,0.3370317759758207,-0.15110624297346256,-1.200796218421231,-0.3783655452626013,-0.8015523709599974'


In [41]:
X_mean = df.iloc[0:, :-1].mean().tolist()
X_std = df.iloc[0:, :-1].std().tolist()

X_mean = tf.constant(X_mean, dtype=tf.float32)
X_std = tf.constant(X_std, dtype=tf.float32)

In [45]:
n_inputs = 9

def preprocess(line):
    defs = [0.] * (n_inputs + 1)  #                                Assuming last column is the target
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs) # Parse CSV line
    features = tf.stack(parsed_fields[:-1])  #                     All but last are features
    target = parsed_fields[-1]  #                                  Last is target

    features = (features - X_mean) / X_std #                       Normalize features
    return features, target

In [47]:
# let's run function preprocess for the one row
features, target = preprocess(dataset.take(1).as_numpy_iterator().next())
features, target

In [49]:
 # all in one
 def csv_reader_dataset(filepaths, repeat=1, n_readers=5, n_parse_calls=5, batch_size=32, shuffle_buffer_size=1000):
    dataset = tf.data.Dataset.list_files(filepaths)
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers,
        num_parallel_calls=tf.data.AUTOTUNE
    )
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_calls)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

(<tf.Tensor: shape=(9,), dtype=float32, numpy=
 array([ 0.48320866, -0.33952922,  2.6982684 , -0.3495661 , -0.47571003,
        -0.34202948,  0.32670018,  0.98212135,  0.3589667 ], dtype=float32)>,
 <tf.Tensor: shape=(), dtype=float32, numpy=1.8557124137878418>)