In [9]:
import pandas as pd
import tensorflow as tf

# Load dataset
df = pd.read_csv("smaller_dataset.csv")

# Preprocess the dataset
df_encoded = df.drop(columns=[df.columns[0]])  # Drop the first column
X = df_encoded.drop(columns=["DRUH_POVR", "NAZ_LOKALI"])  # Features
y = df_encoded["DRUH_POVR"]  # Target

# Convert features to TensorFlow tensors
X_tensor = tf.convert_to_tensor(X.to_numpy(), dtype=tf.float32)

# Use TensorFlow StringLookup for label encoding
label_lookup = tf.keras.layers.StringLookup(output_mode='int', vocabulary=tf.constant(y.unique()))
y_tensor = label_lookup(y) - 1  # Adjust labels to start from 0

# Create a TensorFlow Dataset
dataset = tf.data.Dataset.from_tensor_slices((X_tensor, y_tensor))

# Shuffle the dataset
dataset_size = len(y_tensor)
dataset = dataset.shuffle(buffer_size=dataset_size, seed=42)

# Split the dataset into train, validation, and test sets
train_size = int(0.7 * dataset_size)
val_size = int(0.15 * dataset_size)
test_size = dataset_size - train_size - val_size

train_dataset = dataset.take(train_size)
remaining = dataset.skip(train_size)
val_dataset = remaining.take(val_size)
test_dataset = remaining.skip(val_size)

# Batch the datasets for training, validation, and evaluation
batch_size = 32
train_dataset = train_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

# Determine the number of classes for the output layer
num_classes = len(label_lookup.get_vocabulary()) - 1  # Adjust for 0-based indexing

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X.shape[1],)),  # Specify input shape
    tf.keras.layers.Dense(num_classes, activation='softmax'),  # Output layer
])

# Compile the model
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']  # Add accuracy as a metric
)

# Train the model with a validation set
history = model.fit(
    train_dataset, 
    validation_data=val_dataset,  # Include validation set
    epochs=10,  # Increased epochs for better training
    verbose=1  # Display progress
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(test_dataset, verbose=0)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.8638, Test Accuracy: 0.4006


In [11]:
y.shape

(32950,)

In [15]:
len(y.unique())

18