# Data preparation

This notebook contains some examples of how to pre-process data.

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
# Seaborn for plotting and styling
import seaborn as sns

## Wine Quality Dataset
This dataset contains instances for red and white wine samples.
The inputs include objective tests (e.g. pH values) and the output is based on sensory data
(median of at least 3 evaluations made by wine experts). Each expert graded the wine quality
between 0 (very bad) and 10 (excellent).

The two datasets are related to red and white variants of the Portuguese "Vinho Verde" wine.
For more details, consult: http://www.vinhoverde.pt/en/ or the reference [Cortez et al., 2009].
Due to privacy and logistical issues, only physicochemical (the inputs) and sensory (the output) variables
are available (e.g. there is no data about grape types, wine brand, wine selling price, etc.).

In [2]:
# Load the red-wine CSV; the file is semicolon-delimited, hence the explicit `sep`.
df = pd.read_csv(
    '../../datasets/winequality-red.csv',
    sep=';',
)
# Preview up to the first 20 rows as this cell's rich output.
df.head(20)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


### Rescale Data

In [3]:
# Min-max rescaling: map the input columns onto the range [0, 1].
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler

array = df.values

# Split the array: columns 0-10 are the inputs, column 11 is the output.
X, Y = array[:, :11], array[:, 11]

# Fit the scaler on X, then apply the fitted scaler to the same data.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit(X).transform(X)

# Summarise the transformed data: first five rows, printed at precision 3.
set_printoptions(precision=3)
print(rescaledX[:5])

[[0.248 0.397 0.    0.068 0.107 0.141 0.099 0.568 0.606 0.138 0.154]
 [0.283 0.521 0.    0.116 0.144 0.338 0.216 0.494 0.362 0.21  0.215]
 [0.283 0.438 0.04  0.096 0.134 0.197 0.17  0.509 0.409 0.192 0.215]
 [0.584 0.11  0.56  0.068 0.105 0.225 0.191 0.582 0.331 0.15  0.215]
 [0.248 0.397 0.    0.068 0.107 0.141 0.099 0.568 0.606 0.138 0.154]]


### Standardise Data

In [4]:
# Standardise data: fit a StandardScaler on X and transform X with it.
from sklearn.preprocessing import StandardScaler

# NOTE: `scaler` and `rescaledX` are rebound here, replacing the min-max
# scaler and its output created in the rescaling cell.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X)

# Summarise the transformed data: first five rows, printed at precision 3.
set_printoptions(precision=3)
print(rescaledX[:5])

[[-0.528  0.962 -1.391 -0.453 -0.244 -0.466 -0.379  0.558  1.289 -0.579
  -0.96 ]
 [-0.299  1.967 -1.391  0.043  0.224  0.873  0.624  0.028 -0.72   0.129
  -0.585]
 [-0.299  1.297 -1.186 -0.169  0.096 -0.084  0.229  0.134 -0.331 -0.048
  -0.585]
 [ 1.655 -1.384  1.484 -0.453 -0.265  0.108  0.412  0.664 -0.979 -0.461
  -0.585]
 [-0.528  0.962 -1.391 -0.453 -0.244 -0.466 -0.379  0.558  1.289 -0.579
  -0.96 ]]


### Normalise Data (length = 1)

In [5]:
# Normalise data so that each row has a length of 1.
from sklearn.preprocessing import Normalizer

# NOTE: `scaler` is rebound once more; from here on it refers to the Normalizer.
scaler = Normalizer()
normalizedX = scaler.fit_transform(X)

# Summarise the transformed data: first five rows, printed at precision 3.
set_printoptions(precision=3)
print(normalizedX[:5])

[[1.952e-01 1.846e-02 0.000e+00 5.011e-02 2.004e-03 2.901e-01 8.966e-01
  2.631e-02 9.257e-02 1.477e-02 2.479e-01]
 [1.072e-01 1.210e-02 0.000e+00 3.575e-02 1.347e-03 3.437e-01 9.212e-01
  1.370e-02 4.400e-02 9.349e-03 1.347e-01]
 [1.355e-01 1.320e-02 6.946e-04 3.994e-02 1.598e-03 2.605e-01 9.378e-01
  1.731e-02 5.661e-02 1.129e-02 1.702e-01]
 [1.744e-01 4.359e-03 8.718e-03 2.958e-02 1.168e-03 2.647e-01 9.341e-01
  1.554e-02 4.920e-02 9.030e-03 1.526e-01]
 [1.952e-01 1.846e-02 0.000e+00 5.011e-02 2.004e-03 2.901e-01 8.966e-01
  2.631e-02 9.257e-02 1.477e-02 2.479e-01]]


### Binarise Data

In [6]:
# Binarisation: values above the 3.0 threshold become 1, all others become 0.
from sklearn.preprocessing import Binarizer

binarizer = Binarizer(threshold=3.0)
binaryX = binarizer.fit_transform(X)

# Summarise the transformed data: first five rows, printed at precision 3.
set_printoptions(precision=3)
print(binaryX[:5])

[[1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1.]
 [1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1.]
 [1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1.]
 [1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1.]
 [1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1.]]
