# Supervised Learning
Classifying red and white wine using artificial neural networks and k-nearest neighbors.


By Marco Pleines (12117) for the final presentation in System Simulation SS2017

## Install and import packages

In [2]:
#install
!pip install -U scikit-learn
!pip install tensorflow
!pip install keras
!pip install scipy

#import
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, cohen_kappa_score
from keras.models import Sequential
from keras.layers import Dense
from numpy import *
from sklearn.neighbors import KNeighborsClassifier

Requirement already up-to-date: scikit-learn in c:\users\marco\anaconda3\lib\site-packages


Using TensorFlow backend.


## Read Data

In [3]:
red = pd.read_csv('winequality-red.csv')
white = pd.read_csv('winequality-white.csv')

Taking a look at the data

In [4]:
red.head()
# alternative
# red.sample(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
print(red.info())
# red.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64
chlorides               1599 non-null float64
free sulfur dioxide     1599 non-null float64
total sulfur dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
None


In [6]:
# check for null values
pd.isnull(red)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False,False


In [7]:
white.head()
# alternative
# white.sample(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [8]:
print(white.info())
# white.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
None


In [9]:
# check for null values
pd.isnull(white)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False,False


## Merge Data

In [10]:
# Add 'type' coloum to the data, 1 is reffered to red and 0 is reffered to white
red['type'] = 1
white['type'] = 0

# Merge datasets
wines = red.append(white, ignore_index=True)

## Prepare train and test dataset

In [11]:
# Specify the data 
X=wines.ix[:,0:11]

# Target label
y= np.ravel(wines.type)

# Creating training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Preprocess: Normalize Data

In [12]:
# Define the scaler for normalization
scaler = StandardScaler().fit(X_train)

# Normalize the training dataset
X_train = scaler.transform(X_train)

# Normalize the test dataset
X_test = scaler.transform(X_test)

## Build Neural Network Model

In [13]:
# Initialize the model
model = Sequential()

# Configuring layers
# Input layer using the rectified linear unit activation function
model.add(Dense(12, activation='relu', input_shape=(11,)))

# First hidden layer  using the rectified linear unit activation function
model.add(Dense(6, activation='relu'))

# Second hidden layer  using the hyperbolic tangent activation function
model.add(Dense(3, activation='tanh'))

# Output layer using the sigmoid (s-function) activation function
model.add(Dense(1, activation='sigmoid'))

# Compile and fit the model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
                   
model.fit(X_train, y_train,epochs=20, batch_size=1, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1737ac53c88>

## Predicting Wine (ANN)

In [14]:
y_pred = model.predict(X_test)

# comparing predictions to the actual values
y_pred[:5]
array([[0],
       [1],
       [0],
       [0],
       [0]], dtype=int32)

y_test[:5]
array([0, 1, 0, 0, 0])

score = model.evaluate(X_test, y_test,verbose=0)

# [loss, accuracy]
print(score)

[0.037759876535664851, 0.9939393939393939]


## k-nearest Neighbor Model

In [50]:
# prepare data
test_idx = np.random.uniform(0, 1, len(wines)) <= .2
train = wines[test_idx == True]
test = wines[test_idx == False]

In [74]:
features = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides',
            'free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality']

n = 26   # Neighbors
results = []

clf = KNeighborsClassifier(n_neighbors = n)
clf.fit(train[features], train['type'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=26, p=2,
           weights='uniform')

## Predicting Wine (kNN)

In [75]:
preds = clf.predict(test[features])
accuracy = np.where(preds == test['type'], 1, 0).sum() / float(len(test))*100
print("Neighbors: %d, Accuracy: %2f%%" % (n, round(accuracy, 2)))

Neighbors: 26, Accuracy: 93.100000%


## References
https://www.datacamp.com/community/tutorials/deep-learning-python#gs.4cEYuyU


https://www.springboard.com/blog/beginners-guide-neural-network-in-python-scikit-learn-0-18/


https://archive.ics.uci.edu/ml/datasets/Wine

Problem Set 9: sim9_classkNN.ipynb