# Midterm Exam

### Nicholas Thomson

I will be using a neural network to classify the flowers.

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

### Import the dataset

In [2]:
# Load the dataset
data = pd.read_csv("Iris.csv")

# Display the first few rows of the dataset
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


### Replace Species

The species column is in a string format. It needs to be in numerical form to be used by the neural network.

First, get the unique values in the species column.

In [3]:
# Split the data into features (X) and target variable (y)
X = data.drop('Species', axis=1)
y = data['Species']
y.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

Replace the flower names with numerical values

In [4]:

#Map the species of flowers to float values
y = y.replace(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], [0, 1, 2])
y
print(y.shape)

(150,)


### Split into Train and Test

Use oversampling to ensure that each flower has an equal number of representatives. Also split the data into training and testing variables,

In [5]:
# Use oversampling to address class inbalance issue

from imblearn.over_sampling import RandomOverSampler

oversampler = RandomOverSampler(random_state=42)
X, y = oversampler.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# After standardized, X_train and X_test will be converted from pandas data frame into numpy arrays. To make the type of X data the same as y data (pandas series), we convert X data back to pandas data frame.
# The purpose of this step is to make the code syntax to convert X and y data to PyTorch tensors to be consistent.

X_train = pd.DataFrame(X_train) 
X_test = pd.DataFrame(X_test) 


### Use pytorch tensors for the neural network

In [6]:
# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32) # X_train.values will convert X_train from a pandas dataframe into an numpy array, which is required as the input type for torch.tensor().
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long) 
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long) 

print(X_train_tensor.shape, y_train_tensor.shape, X_test_tensor.shape, y_test_tensor.shape)

torch.Size([120, 5]) torch.Size([120]) torch.Size([30, 5]) torch.Size([30])


### Create data loaders

In [7]:
# Create TensorDataset objects for train and test data
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoader objects for train and test datasets
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

### Define the neural network architecture

In [8]:
# Define the neural network architecture
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

### Create the neural network instance

In [9]:
# Define hyperparameters

input_size = X_train_tensor.shape[1]
hidden_size = X_train_tensor.shape[1] * 2 # The size of hidden layer is arbitrarily chosen and can be tuned.
num_classes = 3 # Number of classes in your multi-class classification problem

# Instantiate the neural network model
model = NeuralNetwork(input_size, hidden_size, num_classes)

### Define the loss function and optimizer

In [10]:
criterion = nn.CrossEntropyLoss() # CrossEntroyLoss 
optimizer = optim.Adam(model.parameters(), lr=0.001)

### Train the neural network

In [11]:
# Training loop
num_epochs = 200
for epoch in range(num_epochs):
    for inputs, labels in train_dataloader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print progress
        if (epoch+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print()

# Evaluate the model's accuracy on the training set
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in train_dataloader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print('Accuracy:', accuracy)
    

Epoch [10/200], Loss: 0.8638
Epoch [10/200], Loss: 0.9111
Epoch [20/200], Loss: 0.8444
Epoch [20/200], Loss: 0.7802
Epoch [30/200], Loss: 0.7684
Epoch [30/200], Loss: 0.7191
Epoch [40/200], Loss: 0.6828
Epoch [40/200], Loss: 0.6776
Epoch [50/200], Loss: 0.5981
Epoch [50/200], Loss: 0.6480
Epoch [60/200], Loss: 0.5928
Epoch [60/200], Loss: 0.5397
Epoch [70/200], Loss: 0.5331
Epoch [70/200], Loss: 0.5059
Epoch [80/200], Loss: 0.5289
Epoch [80/200], Loss: 0.4202
Epoch [90/200], Loss: 0.4489
Epoch [90/200], Loss: 0.4311
Epoch [100/200], Loss: 0.4354
Epoch [100/200], Loss: 0.3734
Epoch [110/200], Loss: 0.3550
Epoch [110/200], Loss: 0.3986
Epoch [120/200], Loss: 0.3386
Epoch [120/200], Loss: 0.3562
Epoch [130/200], Loss: 0.3016
Epoch [130/200], Loss: 0.3434
Epoch [140/200], Loss: 0.2689
Epoch [140/200], Loss: 0.3302
Epoch [150/200], Loss: 0.2502
Epoch [150/200], Loss: 0.3051
Epoch [160/200], Loss: 0.2753
Epoch [160/200], Loss: 0.2333
Epoch [170/200], Loss: 0.2406
Epoch [170/200], Loss: 0.233

### Evaluate the model on testing set

In [12]:
# Evaluate the model's accuracy on the testing set
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_dataloader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print('Accuracy:', accuracy)

Accuracy: 1.0


# Results

The neural network was able to identify all of the testing data correctly. This model is very good, it will be able to predict the type of flower from the three types given.