# Wine dataset - white wine

The dataset explored here contains white wine samples from the north of Portugal. I try to predict wine quality using a PyTorch neural network and an XGBoost classifier.


In [None]:
# imports
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

import torch
import torch.utils.data as data_torch
import torch.nn.functional as F
import torch.nn as nn

import random
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# later `.to(device)` calls still work on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Download the white-wine CSV from the UCI repository; the file uses
# semicolons as field separators.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
    sep=";",
)

# Wine dataset - basic stats
Basic knowledge about this dataset: facts about the data's distribution, correlations, etc.


In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

In [None]:
# (number of rows, number of columns) of the frame.
df.shape

In [None]:
# True if at least one cell anywhere in the frame is missing.
df.isnull().any().any()

In [None]:
# Number of samples for each distinct value of the `quality` column.
df['quality'].value_counts()

In [None]:
# Count plot: one bar per `quality` value showing how many samples have it.
sns.catplot(x='quality', data=df, kind='count')

In [None]:
# Annotated heatmap of the pairwise column correlations returned by df.corr().
plt.figure(figsize= (10,10))
sns.heatmap(df.corr(), color="k", annot=True)

In [None]:
# One box plot per column, placed on a 4x3 grid of subplots.
plt.figure(figsize=(10, 15))
for slot, column in enumerate(df.columns, start=1):
    plt.subplot(4, 3, slot)
    df.boxplot(column)
    plt.grid()
    plt.tight_layout()

In [None]:
# One density histogram with a KDE curve per column, on a 4x3 grid of subplots.
plt.figure(figsize=(20, 16))
for slot, column in enumerate(df.columns, start=1):
    plt.subplot(4, 3, slot)
    sns.histplot(
        df[column],
        color='r',
        kde=True,
        stat="density",
        linewidth=0,
        label='data',
    )
    plt.grid()
    plt.legend(loc='upper right')
    plt.tight_layout()

In [None]:
# Scatter plot for every pair of columns, with KDE curves on the diagonal.
sns.pairplot(data=df, kind='scatter', diag_kind='kde')

# Neural network classifiers

I built two neural networks. The first is a simple model without any additional layers. The second is more sophisticated: it uses dropout, batch normalization, and Xavier weight initialization.

In [None]:
# Training hyperparameters shared by the cells below.
LR = 0.001  # learning rate handed to the Adam optimizer
EPOCHS = 2000  # number of full passes over the training loader
DROPOUT = 0.3  # drop probability of the dropout layers in MyAdvancedClassifier
BATCH_SIZE = 128  # mini-batch size of the training and validation loaders

In [None]:
class MySimpleClassifier(nn.Module):
    """Plain fully connected network with two sigmoid-activated hidden layers.

    Args:
        num_inputs: Number of input features per sample.
        num_hidden: Width of both hidden layers.
        num_outputs: Number of output scores (one per class).
    """

    def __init__(self, num_inputs, num_hidden, num_outputs):
        super().__init__()
        self.linear1 = nn.Linear(num_inputs, num_hidden)
        self.linear2 = nn.Linear(num_hidden, num_hidden)
        self.linear3 = nn.Linear(num_hidden, num_outputs)

    def forward(self, x):
        """Return the raw (un-normalized) class scores for the batch `x`."""
        hidden = self.linear1(x).sigmoid()
        hidden = self.linear2(hidden).sigmoid()
        # No activation on the last layer: the scores are returned as-is.
        return self.linear3(hidden)

In [None]:
# Simple network: 11 inputs, 25 hidden units, 11 outputs, cast to float and
# moved to the selected device (the last expression also displays the model).
model = MySimpleClassifier(11, 25, 11).float()
model.to(device)

In [None]:
class MyAdvancedClassifier(nn.Module):
    """Fully connected network with batch norm, ReLU and dropout per hidden stage.

    The second hidden stage is half as wide as the first. The weights of all
    linear layers are re-initialized with Xavier normal initialization.

    Args:
        num_inputs: Number of input features per sample.
        num_hidden: Width of the first hidden layer.
        num_outputs: Number of output scores (one per class).
    """

    def __init__(self, num_inputs, num_hidden, num_outputs):
        super().__init__()
        half_hidden = num_hidden // 2

        # Stage 1: linear -> batch norm -> ReLU -> dropout
        self.linear1 = nn.Linear(num_inputs, num_hidden)
        self.bn1 = nn.BatchNorm1d(num_hidden)
        self.rel1 = nn.ReLU()
        self.dropout1 = nn.Dropout(DROPOUT)

        # Stage 2: same pattern at half the width
        self.linear2 = nn.Linear(num_hidden, half_hidden)
        self.bn2 = nn.BatchNorm1d(half_hidden)
        self.rel2 = nn.ReLU()
        self.dropout2 = nn.Dropout(DROPOUT)

        # Output layer producing the raw class scores
        self.linear3 = nn.Linear(half_hidden, num_outputs)

        # Only the weights are re-initialized; biases keep their defaults.
        for layer in (self.linear1, self.linear2, self.linear3):
            nn.init.xavier_normal_(layer.weight)

    def forward(self, x):
        """Return the raw (un-normalized) class scores for the batch `x`."""
        hidden = self.dropout1(self.rel1(self.bn1(self.linear1(x))))
        hidden = self.dropout2(self.rel2(self.bn2(self.linear2(hidden))))
        return self.linear3(hidden)

In [None]:
# Advanced network: 11 inputs, 32 hidden units, 11 outputs, cast to float and
# moved to the selected device (the last expression also displays the model).
model = MyAdvancedClassifier(11, 32, 11).float()
model.to(device)

In [None]:
# Every column but the last is a feature; the last column is the target.
X, y = df.values[:, :-1], df.values[:, -1]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Fit the scaler on the training split only, then reuse it for validation.
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Float features and integer class labels as tensors.
X_train_ts = torch.tensor(X_train, dtype=torch.float32)
X_val_ts = torch.tensor(X_val, dtype=torch.float32)
y_train_ts = torch.tensor(y_train, dtype=torch.long)
y_val_ts = torch.tensor(y_val, dtype=torch.long)

train_data = data_torch.TensorDataset(X_train_ts, y_train_ts)
test_data = data_torch.TensorDataset(X_val_ts, y_val_ts)

# Only the training batches are shuffled.
train_loader = data_torch.DataLoader(
    train_data, batch_size=BATCH_SIZE, shuffle=True
)
test_loader = data_torch.DataLoader(
    test_data, batch_size=BATCH_SIZE, shuffle=False
)



In [None]:
# Cross-entropy over the raw class scores, optimized with Adam on all model
# parameters.
loss_module = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [None]:
# learning loop
# Put the model in training mode (this matters for dropout / batch-norm layers).
model.train()
for epoch in range(EPOCHS):
    # Dedicated batch names keep the loop from overwriting the module-level
    # label array `y` with the last mini-batch tensor.
    for batch_inputs, batch_labels in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        loss = loss_module(model(batch_inputs), batch_labels)
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        # Reports the loss of the last mini-batch of the epoch only.
        print(f"Epoch: {epoch}, loss={loss.item():.3}")

# NN evaluation

The basic model reaches 40-50% accuracy, and the more advanced one 55-60%.


In [None]:
# Accuracy on the held-out loader: correct predictions / all predictions.
model.eval()  # evaluation mode (this matters for dropout / batch-norm layers)
true_preds, num_preds = 0, 0
with torch.no_grad():  # no gradients are needed for inference
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Predicted class = index of the highest score in each row.
        predicted = torch.argmax(model(inputs.float()), dim=1)
        # .item() keeps the counter a plain Python int, so the printed
        # accuracy is a number rather than a tensor repr.
        true_preds += (predicted == labels).sum().item()
        num_preds += len(predicted)

print(f"Accuracy: {true_preds / num_preds}")

# XGBoost

As a reference I used the XGBoost library. XGBoost reaches 60-65% accuracy.

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [None]:
# Every column but the last is a feature; the last column is the target.
X = df.values[:, :-1]
# Recent XGBoost releases require class labels to be the consecutive integers
# 0..n_classes-1, which raw quality scores are not guaranteed to be.
# np.unique replaces each score by its index in the sorted array of distinct
# scores, so `quality_classes[y]` recovers the original values.
quality_classes, y = np.unique(df.values[:, -1], return_inverse=True)

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
# XGBoost classifier constructed with the library's default hyperparameters.
xgbc = xgb.XGBClassifier()

In [None]:
# Train the classifier on the training split.
xgbc.fit(X_train,y_train)

In [None]:
# Predicted class label for every held-out sample.
preds = xgbc.predict(X_test)

In [None]:
# Fraction of held-out samples whose predicted label equals the true label.
accuracy = np.count_nonzero(preds == y_test) / len(y_test)
print(f"Accuracy: {accuracy}")