# TFDV + TensorFlow (Keras) Lab — Iris Dataset

This notebook shows:
- **TensorFlow Data Validation (TFDV)**: statistics, schema inference, anomaly detection
- **TensorFlow (Keras)**: training a small Dense NN classifier

Dataset: **Iris** (local CSV files `train.csv` and `test.csv`, included in `data/`)

**Author:** Manoj  
**Date:** 2026-02-27


In [None]:
# If you are running this in a fresh environment, install requirements first
# (use %pip rather than !pip so packages go into the running kernel's environment):
# %pip install -r requirements.txt

import os
import pandas as pd
import numpy as np


In [None]:
# Dataset locations, resolved against the directory the kernel is running in.
ROOT = os.path.abspath(os.getcwd())
DATA_DIR = os.path.join(ROOT, "data")
TRAIN_CSV, TEST_CSV = (
    os.path.join(DATA_DIR, file_name) for file_name in ("train.csv", "test.csv")
)

print("Train:", TRAIN_CSV)
print("Test :", TEST_CSV)

# Read both splits into DataFrames (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Preview the first rows of the training split.
train_df.head()


In [None]:
# Report (rows, columns) for each split, then summarise the training columns.
for shape_label, frame in (("Train shape:", train_df), ("Test shape :", test_df)):
    print(shape_label, frame.shape)

train_df.describe()


## Part A — Data profiling + schema inference (TFDV)

In [None]:
import tensorflow_data_validation as tfdv


In [None]:
# 1) Profile the training CSV with TFDV and render the resulting statistics.
train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_CSV)
tfdv.visualize_statistics(lhs_statistics=train_stats)


In [None]:
# 2) Derive a schema from the training statistics; it is the reference
#    against which the test split is validated below.
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema=schema)


In [None]:
# 3) Profile the test CSV and check its statistics against the train-derived schema.
test_stats = tfdv.generate_statistics_from_csv(data_location=TEST_CSV)
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)

# Show whatever anomalies the validation step reported.
tfdv.display_anomalies(anomalies)


In [None]:
# 4) Render train and test statistics side by side (useful for drift-style checks).
tfdv.visualize_statistics(
    lhs_statistics=train_stats,
    rhs_statistics=test_stats,
    lhs_name="train",
    rhs_name="test",
)


## Part B — Train a different model (Keras Dense NN)

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split


In [None]:
FEATURE_COLS = ["sepal_length_cm", "sepal_width_cm", "petal_length_cm", "petal_width_cm"]
LABEL_COL = "label"


def _to_arrays(frame):
    """Return `(features, labels)` NumPy arrays extracted from `frame`.

    Features are the `FEATURE_COLS` columns cast to float32; labels are the
    `LABEL_COL` column cast to int32.
    """
    features = frame[FEATURE_COLS].astype("float32").to_numpy()
    labels = frame[LABEL_COL].astype("int32").to_numpy()
    return features, labels


x_train, y_train = _to_arrays(train_df)
x_test, y_test = _to_arrays(test_df)

# Hold out 20% of the training rows for validation, stratified on the label.
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# Standardisation parameters come from the fitting split only, so the
# validation and test rows do not influence the scaling.
mean = x_tr.mean(axis=0, keepdims=True)
std = x_tr.std(axis=0, keepdims=True) + 1e-7  # small epsilon avoids division by zero

x_tr_n, x_val_n, x_test_n = (
    (split - mean) / std for split in (x_tr, x_val, x_test)
)

x_tr_n.shape, y_tr.shape


In [None]:
# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created, so weight initialisation is repeatable on Restart & Run All.
# (Bit-exact reproducibility may additionally depend on hardware/op determinism.)
tf.keras.utils.set_random_seed(42)

# Width of the softmax output layer.
# NOTE(review): SparseCategoricalCrossentropy expects integer labels in
# [0, num_classes) — confirm the `label` column in the CSVs matches.
num_classes = 3

# Small fully connected classifier: input features -> 32 -> 16 -> class probabilities.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(len(FEATURE_COLS),)),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(num_classes, activation="softmax"),
])

# The sparse loss takes integer class ids directly, so labels are not one-hot encoded.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

model.summary()


In [None]:
# Halt training once validation accuracy has gone 15 epochs without improving,
# and roll the model back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    patience=15,
    restore_best_weights=True,
)
callbacks = [early_stopping]

# Fit on the normalised training split; the validation split drives early stopping.
history = model.fit(
    x=x_tr_n,
    y=y_tr,
    batch_size=16,
    epochs=200,
    verbose=1,
    callbacks=callbacks,
    validation_data=(x_val_n, y_val),
)


In [None]:
# Score the normalised test split; the model was compiled with one metric
# (accuracy), so `evaluate` yields the loss followed by that metric.
test_loss, test_acc = model.evaluate(x=x_test_n, y=y_test, verbose=0)
print(f"Test accuracy: {test_acc:.4f} | Test loss: {test_loss:.4f}")


In [None]:
# Persist the trained model under src/saved_model/ (relative to the working directory).
# NOTE(review): the target path has no file extension. Older tf.keras versions
# write a SavedModel directory here, but Keras 3 appears to require a `.keras`
# or `.h5` suffix (or `model.export(...)`) — confirm against the installed
# TensorFlow/Keras version before relying on this cell.
model_path = os.path.join("src", "saved_model", "iris_dense_classifier")
out_dir = os.path.dirname(model_path)
os.makedirs(out_dir, exist_ok=True)

model.save(model_path)
print("Saved model to:", model_path)


In [None]:
# Quick predictions on (up to) the first five test rows.
probs = model.predict(x_test_n[:5], verbose=0)
preds = probs.argmax(axis=1)

# Iterate over the rows actually returned rather than a fixed range(5), so a
# test set with fewer than five rows does not raise IndexError. `zip` stops at
# the shortest input, which keeps `y_test` aligned with the sliced predictions.
for i, (pred, true_label, row_probs) in enumerate(zip(preds, y_test, probs)):
    print(f"row {i}: pred={int(pred)} true={int(true_label)} probs={np.round(row_probs, 3)}")
