In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import torch
from torch.utils.data import DataLoader, TensorDataset

# Preprocessing pipeline for the Zoo dataset: load the CSV, encode the target,
# scale the 'legs' column, split into train/test sets, convert everything to
# PyTorch tensors and wrap the result in DataLoaders.

# Location of the raw data file.
# NOTE: this is an absolute, machine-specific path; change it here when the
# notebook is run on another machine.
DATA_PATH = r'C:\Users\haris\OneDrive\Documents\zoo.data.csv'

# Column names based on zoo.names (the CSV is read with header=None, so the
# names have to be supplied explicitly)
column_names = ['animal_name', 'hair', 'feathers', 'eggs', 'milk', 'airborne',
                'aquatic', 'predator', 'toothed', 'backbone', 'breathes',
                'venomous', 'fins', 'legs', 'tail', 'domestic', 'catsize', 'class_type']

# Load the dataset into a pandas dataframe
df = pd.read_csv(DATA_PATH, header=None, names=column_names)

# Print the first few rows of the original dataset
print("Original Dataset (first 5 rows):\n", df.head())

# Drop the animal_name column as it's not a feature used in training
df = df.drop('animal_name', axis=1)

# Separate the features and the target variable
X = df.drop('class_type', axis=1)  # Features
y = df['class_type']  # Target

# Step 1: Label Encoding for the Target Variable (class_type)
# The encoded labels are integer indices into label_encoder.classes_.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Print the label encoding mapping (position in the list == encoded label)
print("\nLabel Encoding for Target Variable (class_type):\n", list(label_encoder.classes_))

# Step 2: Normalization of the 'legs' attribute
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split below, so test rows contribute to the scaling range. Fit it
# on the training rows only if strict train/test isolation is required.
scaler = MinMaxScaler()
X['legs'] = scaler.fit_transform(X[['legs']])

# Print the first few rows after normalization
print("\nDataset after 'legs' Normalization (first 5 rows):\n", X.head())

# Step 3: No One-Hot Encoding is applied to the remaining feature columns
# (they are treated as already binary)

# Step 4: Train-Test Split (80% training, 20% testing)
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("\nShapes of Training and Testing Sets:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

# Step 5: Handle class imbalance (optional, depending on the class distribution)
# No resampling or class weighting is applied here.
# NOTE(review): the class distribution is not inspected anywhere in this cell;
# confirm it (e.g. with y.value_counts()) before relying on that choice.

# Step 6: Convert the preprocessed data into Tensor format
# Features become float32; labels become long (int64) class indices.
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Print the first few tensors
print("\nFirst 5 Samples of Training Data (as Tensors):\n", X_train_tensor[:5])
print("\nFirst 5 Labels of Training Data (as Tensors):\n", y_train_tensor[:5])

# Step 7: Batching and Shuffling
batch_size = 32

# Create DataLoader for training and testing datasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Print out a single batch of data from the train_loader
for batch_features, batch_labels in train_loader:
    print("\nA single batch of features (training):\n", batch_features)
    print("\nA single batch of labels (training):\n", batch_labels)
    break  # Only print the first batch

Original Dataset (first 5 rows):
   animal_name  hair  feathers  eggs  milk  airborne  aquatic  predator  \
0    aardvark     1         0     0     1         0        0         1   
1    antelope     1         0     0     1         0        0         0   
2        bass     0         0     1     0         0        1         1   
3        bear     1         0     0     1         0        0         1   
4        boar     1         0     0     1         0        0         1   

   toothed  backbone  breathes  venomous  fins  legs  tail  domestic  catsize  \
0        1         1         1         0     0     4     0         0        1   
1        1         1         1         0     0     4     1         0        1   
2        1         1         0         0     1     0     1         0        0   
3        1         1         1         0     0     4     0         0        1   
4        1         1         1         0     0     4     1         0        1   

   class_type  
0           1  
1 

# Imports:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import torch

from torch.utils.data import DataLoader, TensorDataset

pandas: A library for data manipulation and analysis. Here, it's used to load and manage the dataset.


train_test_split: A function from sklearn to split the dataset into training and testing sets.

LabelEncoder: Used to convert categorical labels into a numerical format.

MinMaxScaler: Used to normalize numerical data (i.e., scale the data between a range, usually 0 to 1).

torch: The core library of PyTorch, a deep learning framework.

DataLoader and TensorDataset: Used to create iterable datasets for training in PyTorch.

# Loading the Dataset:

# Column names based on zoo.names

column_names = ['animal_name', 'hair', 'feathers', 'eggs', 'milk', 'airborne', 

                'aquatic', 'predator', 'toothed', 'backbone', 'breathes',

                'venomous', 'fins', 'legs', 'tail', 'domestic', 'catsize', 'class_type']


# Load the dataset into a pandas dataframe

df = pd.read_csv((r'C:\Users\haris\OneDrive\Documents\zoo.data.csv'), header=None, names=column_names)


# Print the first few rows of the original dataset

print("Original Dataset (first 5 rows):\n", df.head())

column_names: Defines the column names for the dataset. Since the dataset doesn’t come with headers, column names are manually set based on the attributes of the "zoo" dataset.


pd.read_csv(): Loads the dataset from the specified file path into a pandas DataFrame. The header=None argument indicates there is no header in the CSV file, so column names are provided.


df.head(): Prints the first 5 rows of the dataset.

# Dropping the Animal Name Column:

df = df.drop('animal_name', axis=1)

df.drop(): Drops the 'animal_name' column since it’s not useful for training the model. The axis=1 specifies that the column (and not a row) is being dropped.

# Separating Features and Target Variable:
X = df.drop('class_type', axis=1)  

y = df['class_type'] 

X: Contains all columns except 'class_type' (which is the target). These are the features used for model training.

y: The 'class_type' column, which represents the classification target.


# Label Encoding the Target Variable:
label_encoder = LabelEncoder()

y_encoded = label_encoder.fit_transform(y)


# Print the label encoding mapping

print("\nLabel Encoding for Target Variable (class_type):\n", list(label_encoder.classes_))

LabelEncoder(): Converts the target variable ('class_type') into numeric values. Each unique class type is mapped to an integer (0, 1, 2, ...), assigned in sorted order of the original values.

fit_transform(y): Fits the encoder to the 'class_type' column and transforms it into a numerical array.

list(label_encoder.classes_): Prints the original class values in encoder order. The position of each value in this list is the numeric label it was encoded to.



# Normalizing the 'Legs' Feature:
scaler = MinMaxScaler()

X['legs'] = scaler.fit_transform(X[['legs']])



# Print the first few rows after normalization

print("\nDataset after 'legs' Normalization (first 5 rows):\n", X.head())

MinMaxScaler(): Scales the 'legs' feature so that the values are between 0 and 1. Normalization is often done to ensure that no feature dominates others due to larger values.

fit_transform(X[['legs']]): Normalizes the 'legs' column.

X.head(): Prints the first 5 rows after normalization to verify the transformation.



# Splitting the Dataset into Training and Testing Sets:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


# Print the shapes of the training and testing sets

print("\nShapes of Training and Testing Sets:")

print("X_train:", X_train.shape)

print("X_test:", X_test.shape)

print("y_train:", y_train.shape)

print("y_test:", y_test.shape)

train_test_split(): Splits the dataset into training (80%) and testing (20%) sets. The random_state=42 ensures reproducibility of the split.

X_train, X_test, y_train, y_test: The resulting feature and target sets for training and testing.

.shape: Prints the dimensions of the training and testing datasets.

# Converting Data to Tensor Format (for PyTorch):
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)


X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

y_test_tensor = torch.tensor(y_test, dtype=torch.long)


# Print the first few tensors

print("\nFirst 5 Samples of Training Data (as Tensors):\n", X_train_tensor[:5])

print("\nFirst 5 Labels of Training Data (as Tensors):\n", y_train_tensor[:5])

torch.tensor(): Converts the training and testing data into PyTorch tensors, which are required for training models in PyTorch.

X_train.values: Converts the pandas DataFrame into a NumPy array before creating a tensor.

dtype=torch.float32 and dtype=torch.long: Specifies the data types for the features and labels, respectively.

[:5]: Displays the first 5 rows of the tensor data for inspection.


# Creating DataLoader for Batching and Shuffling:

batch_size = 32

# Create DataLoader for training and testing datasets

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

TensorDataset(): Combines the features and labels into a dataset that PyTorch can iterate through.

DataLoader(): Creates an iterable data loader that batches the data.

batch_size=32: Each batch contains up to 32 samples (the last batch is smaller when the dataset size is not a multiple of 32).

shuffle=True for training data: Randomly shuffles the training data at each epoch to improve learning.

shuffle=False for testing data: No need to shuffle testing data, because evaluation results do not depend on the order of the samples.


# Displaying a Single Batch of Data:
for batch_features, batch_labels in train_loader:

    print("\nA single batch of features (training):\n", batch_features)

    print("\nA single batch of labels (training):\n", batch_labels)

    break  

This loop extracts a single batch of data from the training loader.

batch_features: A batch of features.

batch_labels: Corresponding labels.

The break ensures that only the first batch is printed.




# Summary of the Code:

This code performs data preprocessing and prepares the Zoo dataset for use in a PyTorch deep learning model. Here's the flow of the script:

# 1:Loading the Dataset:

The dataset is loaded into a pandas DataFrame with specified column names. The 'animal_name' column is dropped as it's irrelevant for training.

# 2:Separating Features and Target:

The features (X) are separated from the target variable (y), which represents the class type (animal classification).

# 3:Label Encoding the Target:

The target variable (class_type) is converted into numerical labels using LabelEncoder, where each unique class is assigned a numerical value (e.g., 0, 1, 2).

# 4:Normalizing a Feature:

The 'legs' feature, which is a numerical value, is scaled to a range of 0 to 1 using MinMaxScaler to ensure consistent scaling during model training.

# 5:Train-Test Split:

The dataset is split into training (80%) and testing (20%) sets using train_test_split to prepare the data for training and evaluation.

# 6:Converting to PyTorch Tensors:

Both the features and labels from the training and testing sets are converted into PyTorch tensors to be used with deep learning models in PyTorch.

# 7:Creating Data Loaders:


DataLoader objects are created for both training and testing sets, which allows the data to be served in batches (with a batch size of 32). Only the training loader shuffles its data; the testing loader keeps the original order.

Batching makes training more efficient by processing several samples per step, and reshuffling the training data each epoch helps the model generalize.

# 8:Displaying a Batch of Data:

The script demonstrates how to extract and display a single batch of training data from the DataLoader, including both features and their corresponding labels.


This preprocessing pipeline converts the Zoo dataset into a format that is ready for deep learning models, ensuring proper data scaling, encoding, batching, and efficient data loading using PyTorch.