# Wine Quality Classification

#### Group Members
<div style="display: inline">mohammad1@ontariotechu.net</div> <br>

<div style="display: inline">dewanmohammad.tasinuzzaman@ontariotechu.net</div><br>





In [239]:
import torch
from torch import (nn, optim)
from torch.utils.data import (Dataset, DataLoader, random_split, TensorDataset)
import torchtext
import pandas as pd
import numpy as np
import warnings
from torch.optim import (Optimizer, Adam)
from torch.nn.functional import cross_entropy

## Data Import and Preprocessing

In [240]:
df = pd.read_csv('winequality-white.csv',sep = ';' )


In [241]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [242]:
len(df)

4898

### Normalize features

In [246]:

df.iloc[:, :-1] = (df.iloc[:, :-1] - df.iloc[:, :-1].min()) / (df.iloc[:, :-1].max() - df.iloc[:, :-1].min())

In [247]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.307692,0.186275,0.216867,0.308282,0.106825,0.149826,0.37355,0.267785,0.254545,0.267442,0.129032,6
1,0.240385,0.215686,0.204819,0.015337,0.118694,0.041812,0.285383,0.132832,0.527273,0.313953,0.241935,6
2,0.413462,0.196078,0.240964,0.096626,0.121662,0.097561,0.204176,0.154039,0.490909,0.255814,0.33871,6
3,0.326923,0.147059,0.192771,0.121166,0.145401,0.156794,0.410673,0.163678,0.427273,0.209302,0.306452,6
4,0.326923,0.147059,0.192771,0.121166,0.145401,0.156794,0.410673,0.163678,0.427273,0.209302,0.306452,6


### Convert quality into buckets

The quality of wines (1-10) was converted to four categories for simplier classification

- 1-3 --> 0
- 4-5 --> 1
- 6-7 --> 2
- 8-10 --> 3

In [248]:
df['quality'].unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [236]:
def convert_quality(label):
    if label <= 3:
        return 0
    elif label <= 5:
        return 1
    elif label <= 7:
        return 2
    else:
        return 3

In [237]:
df['quality'] = df['quality'].apply(convert_quality)

In [238]:
df['quality'].unique()

array([0], dtype=int64)

- we now have 4 labels instead of 7

## Split into test and training data

In [230]:
# sepereate measurement data and class
x = df.drop('quality', axis = 1)
y = df['quality']


x_numeric = x.apply(pd.to_numeric, errors='coerce')

# Drop rows with missing values
x_numeric = x_numeric.dropna()

# Convert to PyTorch tensors
x_tensor = torch.tensor(x_numeric.values, dtype=torch.float32)
y_tensor = torch.tensor(y.loc[x_numeric.index].values, dtype=torch.int64)
# convert to tensors
#x_tensor = torch.tensor(x.values, dtype=torch.float32)
#y_tensor = torch.tensor(y.values, dtype=torch.int64)
    
# create dataset
dataset = TensorDataset(x_tensor, y_tensor)

# Define the sizes of the training and test sets
train_size = int(0.7 * len(dataset))
test_size = len(dataset) - train_size

# Split the dataset into training and test sets
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

## Model

This NN includes two BatchNorm1d layers which normalize the input to each layer. This can reduce the internal covariate shift. There is also a residual layer added between the 2nd and third layer and a softmax activation function for the output.

In [231]:
class NeuralNetClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bnorm1 = nn.BatchNorm1d(hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.bnorm2 = nn.BatchNorm1d(hidden_size)
        self.output = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.bnorm1(x)
        x = nn.functional.relu(x)
        residual = x # residual layer
        x = self.fc2(x)
        x = nn.functional.relu(x + residual) # add residual
        x = self.bnorm2(x)
        x = self.output(x)
        return nn.functional.softmax(x, dim=-1)


## Training Function

In [232]:
def train(model: NeuralNetClassifier,
          train_dataset: Dataset,
          learning_rate: float,
          epochs: int,
          max_batches=None) -> pd.DataFrame:
    
    
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()
    
    results = {'train_loss': [], 'train_acc': []}
    
    for epoch in range(epochs):
        losses = []
        
        
        for batch_idx, (x, target) in enumerate(train_data_loader):
            
            # conditon for batch size
            if max_batches is not None and batch_idx >= max_batches:
                break
                
            optimizer.zero_grad()
            y = model(x)
            loss = loss_fn(y, target)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            
        epoch_loss = np.mean(losses)
        results['train_loss'].append(epoch_loss)
        print("Epoch {}: loss={:.4f}".format(epoch, epoch_loss))
        
        # for traing accuracy
        with torch.no_grad():
            success = 0
            total = 0
            for x, target in train_data_loader:
                y = model(x)
                pred = y.argmax(axis=1)
                success += (pred == target).sum().item()
                total += target.shape[0]
            accuracy = success/total
            results['train_acc'].append(accuracy)
            print("Accuracy = {:.2f}".format(accuracy))

    return pd.DataFrame(results)

### Model Training

In [233]:

print(len(train_dataset))


0


In [224]:
# parameters
input_size = 11
hidden_size = 64
output_size = 4

train_data_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_data_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)


neural_net_classifier = NeuralNetClassifier(input_size, hidden_size, output_size)

history_nn = train(neural_net_classifier, train_data_loader, 0.001, epochs = 30, max_batches = 100)

ValueError: num_samples should be a positive integer value, but got num_samples=0

### Model Testing

In [161]:
#test acc
def test_saved_model(model):

    loss = nn.CrossEntropyLoss()
    dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)
    acc = 0
    with torch.no_grad():
        for xs, targets in dataloader:
            xs, targets = xs, targets
            ys = model(xs)
            acc += (ys.argmax(axis=1) == targets).sum().item()
    acc = acc / len(test_dataset) * 100
    print("Saved model has test accuracy = %.2f" % acc)


In [162]:
torch.save(neural_net_classifier, 'mymodel.pt')

In [163]:
model = torch.load('./mymodel.pt')
test_saved_model(model)

Saved model has test accuracy = 73.40
