# <center>Vinho Verde Wine Quality Classification
* ## Stage 1: Data Preprocessing
* ### Importing Essential Libraries

In [1]:
import pandas as pd
import numpy as np

* ### Importing the Dataset

In [2]:
vinho_verde_red_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
vinho_verde_white_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
vinho_verde_red = pd.read_csv(vinho_verde_red_url, delimiter=';')
vinho_verde_white = pd.read_csv(vinho_verde_white_url, delimiter=';')

# standardize column names for ease of use
vinho_verde_white.columns = [c.lower().replace(' ', '_') for c in vinho_verde_white.columns]
vinho_verde_red.columns = [c.lower().replace(' ', '_') for c in vinho_verde_red.columns]

* ### Taking a peek on our Dataset

In [80]:
vinho_verde_red.head(15)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [81]:
vinho_verde_white.head(15)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
5,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6,6
7,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
8,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
9,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.9938,3.22,0.45,11.0,6


* ### What is the type of each feature?

In [5]:
vinho_verde_red.dtypes

fixed_acidity           float64
volatile_acidity        float64
citric_acid             float64
residual_sugar          float64
chlorides               float64
free_sulfur_dioxide     float64
total_sulfur_dioxide    float64
density                 float64
ph                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [6]:
vinho_verde_white.dtypes

fixed_acidity           float64
volatile_acidity        float64
citric_acid             float64
residual_sugar          float64
chlorides               float64
free_sulfur_dioxide     float64
total_sulfur_dioxide    float64
density                 float64
ph                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

* ### Getting a Basic Statistical Description of each Dataset

In [7]:
vinho_verde_white.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [8]:
vinho_verde_red.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


* ### Functions
  * **find_outlier_indices:** Given a DataFrame return the outlier indices in list of tuples in (feature, row_indices) format, *provided that the feature values are greater than Q75 + 1.5 IQR or less that Q25 - 1.5 IQR*
  * **drop_outliers:** Given a DataFrame and the Outlier list of tuples in (feature, row_indices) format, return the DataFrame minus the included in the list row indices

In [9]:
def find_outlier_indices(df):
    quantiles = df.quantile([0.25, 0.75])
    feature_outlier_indices = []
    for feature in quantiles:
        q25, q75 = quantiles[feature].values
        iqr = q75 - q25
        
        outlier_indices = df.index[(df[feature] < q25 - (1.5 * iqr)) | (df[feature] > q75 + (1.5 * iqr))].tolist()
        feature_outlier_indices.append((feature, outlier_indices))
    return feature_outlier_indices

def drop_outliers(df, outliers):
    indices = set([])
    for col, col_indices in outliers:
        indices = indices.union(set(col_indices))
    return df.drop(list(indices))

* ### 1.1. Detecting NaN/Empty Values

In [18]:
for feature in vinho_verde_white:
    print ('Feature: {0} === {1} NaN Values, {2} Missing Values.'.format(feature, np.sum(vinho_verde_white[feature] == np.nan),  np.sum(vinho_verde_white[feature].isnull())))

Feature: fixed_acidity === 0 NaN Values, 0 Missing Values.
Feature: volatile_acidity === 0 NaN Values, 0 Missing Values.
Feature: citric_acid === 0 NaN Values, 0 Missing Values.
Feature: residual_sugar === 0 NaN Values, 0 Missing Values.
Feature: chlorides === 0 NaN Values, 0 Missing Values.
Feature: free_sulfur_dioxide === 0 NaN Values, 0 Missing Values.
Feature: total_sulfur_dioxide === 0 NaN Values, 0 Missing Values.
Feature: density === 0 NaN Values, 0 Missing Values.
Feature: ph === 0 NaN Values, 0 Missing Values.
Feature: sulphates === 0 NaN Values, 0 Missing Values.
Feature: alcohol === 0 NaN Values, 0 Missing Values.
Feature: quality === 0 NaN Values, 0 Missing Values.


In [11]:
for feature in vinho_verde_red:
    print ('Feature: {0} === {1} NaN Values, {2} Misisng Values.'.format(feature, np.sum(vinho_verde_red[feature] == np.nan),  np.sum(vinho_verde_red[feature].isnull())))

Feature: fixed_acidity === 0 NaN Values, 0 Misisng Values.
Feature: volatile_acidity === 0 NaN Values, 0 Misisng Values.
Feature: citric_acid === 0 NaN Values, 0 Misisng Values.
Feature: residual_sugar === 0 NaN Values, 0 Misisng Values.
Feature: chlorides === 0 NaN Values, 0 Misisng Values.
Feature: free_sulfur_dioxide === 0 NaN Values, 0 Misisng Values.
Feature: total_sulfur_dioxide === 0 NaN Values, 0 Misisng Values.
Feature: density === 0 NaN Values, 0 Misisng Values.
Feature: ph === 0 NaN Values, 0 Misisng Values.
Feature: sulphates === 0 NaN Values, 0 Misisng Values.
Feature: alcohol === 0 NaN Values, 0 Misisng Values.
Feature: quality === 0 NaN Values, 0 Misisng Values.


<center>_In that occation we didn't had any NaN or Missing Values, so we move forward to detect (and remove) the Outliers that (may) exist in the dataset_

* ### 1.2. Searching (and Removing) Outliers

In [12]:
feature_outlier_indices = find_outlier_indices(vinho_verde_white)
for feature, feature_indices in feature_outlier_indices:
    print ('Feature: {0} === Outlier Values Count: {1}'.format(feature, len(feature_indices)))

Feature: fixed_acidity === Outlier Values Count: 119
Feature: volatile_acidity === Outlier Values Count: 186
Feature: citric_acid === Outlier Values Count: 270
Feature: residual_sugar === Outlier Values Count: 7
Feature: chlorides === Outlier Values Count: 212
Feature: free_sulfur_dioxide === Outlier Values Count: 50
Feature: total_sulfur_dioxide === Outlier Values Count: 19
Feature: density === Outlier Values Count: 5
Feature: ph === Outlier Values Count: 75
Feature: sulphates === Outlier Values Count: 124
Feature: alcohol === Outlier Values Count: 0
Feature: quality === Outlier Values Count: 200


In [13]:
feature_outlier_indices = find_outlier_indices(vinho_verde_red)
for feature, feature_indices in feature_outlier_indices:
    print ('Feature: {0} === Outlier Values Count: {1}'.format(feature, len(feature_indices)))

Feature: fixed_acidity === Outlier Values Count: 49
Feature: volatile_acidity === Outlier Values Count: 19
Feature: citric_acid === Outlier Values Count: 1
Feature: residual_sugar === Outlier Values Count: 155
Feature: chlorides === Outlier Values Count: 112
Feature: free_sulfur_dioxide === Outlier Values Count: 30
Feature: total_sulfur_dioxide === Outlier Values Count: 55
Feature: density === Outlier Values Count: 45
Feature: ph === Outlier Values Count: 35
Feature: sulphates === Outlier Values Count: 59
Feature: alcohol === Outlier Values Count: 13
Feature: quality === Outlier Values Count: 28


#### *Before Outlier Removal*

In [14]:
vinho_verde_white.shape

(4898, 12)

In [15]:
vinho_verde_red.shape

(1599, 12)

#### *After Outlier Removal*

In [16]:
vinho_verde_white_prep = drop_outliers(vinho_verde_white, feature_outlier_indices)
vinho_verde_white_prep.shape

(4478, 12)

In [17]:
vinho_verde_red_prep = drop_outliers(vinho_verde_red, feature_outlier_indices)
vinho_verde_red_prep.shape

(1179, 12)

* ### 1.3. Correlation Analysis 

In [56]:
vv_white_corr = vinho_verde_white.corr().abs()
vv_white_prep_corr = vinho_verde_white_prep.corr().abs() # Outlier Removed
vv_red_corr = vinho_verde_red.corr().abs()
vv_red_prep_corr = vinho_verde_red_prep.corr().abs() # Outlier Removed

* #### Correlation Analysis on *(Outlier Removed)* Red Wine Samples

In [76]:
vv_red_features = vinho_verde_red.shape[1]
vv_red_prep_features = vinho_verde_red_prep.shape[1]

s = vv_red_corr.unstack().sort_values(ascending=False, kind="mergesort")
print ('Correlation Analysis: Red Wine\n\n', s[vv_red_features:], '\n\n')

s = vv_red_prep_corr.unstack().sort_values(ascending=False, kind="mergesort")
print ('Correlation Analysis: Red Wine - Outlier Free\n\n', s[vv_red_prep_features:])

Correlation Analysis: Red Wine

 ph                    fixed_acidity           0.682978
fixed_acidity         ph                      0.682978
citric_acid           fixed_acidity           0.671703
fixed_acidity         citric_acid             0.671703
density               fixed_acidity           0.668047
fixed_acidity         density                 0.668047
total_sulfur_dioxide  free_sulfur_dioxide     0.667666
free_sulfur_dioxide   total_sulfur_dioxide    0.667666
citric_acid           volatile_acidity        0.552496
volatile_acidity      citric_acid             0.552496
ph                    citric_acid             0.541904
citric_acid           ph                      0.541904
alcohol               density                 0.496180
density               alcohol                 0.496180
quality               alcohol                 0.476166
alcohol               quality                 0.476166
quality               volatile_acidity        0.390558
volatile_acidity      quality   

##### Most Correlated Features
* ph                    fixed_acidity           **0.682978**, Outlier Removed: **0.684378**
* citric_acid           fixed_acidity           **0.671703**, Outlier Removed: **0.659397**
* density               fixed_acidity           **0.668047**, Outlier Removed: **0.627194**
* total_sulfur_dioxide  free_sulfur_dioxide     **0.667667**, Outlier Removed: **0.619675**
* citric_acid           volatile_acidity        **0.552496**, Outlier Removed: **0.610125**
* ph                    citric_acid             **0.541904**, Outlier Removed: **0.538901**

* #### Correlation Analysis on *(Outlier Removed)* White Wine Samples

In [75]:
vv_white_features = vinho_verde_white.shape[1]
vv_white_prep_features = vinho_verde_white_prep.shape[1]

s = vv_white_corr.unstack().sort_values(ascending=False, kind="mergesort")
print ('Correlation Analysis: White Wine\n\n', s[vv_white_features:], '\n\n')

s = vv_white_prep_corr.unstack().sort_values(ascending=False, kind="mergesort")
print ('Correlation Analysis: White Wine - Outlier Free\n\n', s[vv_white_prep_features:], '\n\n')

Correlation Analysis: White Wine

 density               residual_sugar          0.838966
residual_sugar        density                 0.838966
alcohol               density                 0.780138
density               alcohol                 0.780138
total_sulfur_dioxide  free_sulfur_dioxide     0.615501
free_sulfur_dioxide   total_sulfur_dioxide    0.615501
density               total_sulfur_dioxide    0.529881
total_sulfur_dioxide  density                 0.529881
alcohol               residual_sugar          0.450631
residual_sugar        alcohol                 0.450631
alcohol               total_sulfur_dioxide    0.448892
total_sulfur_dioxide  alcohol                 0.448892
quality               alcohol                 0.435575
alcohol               quality                 0.435575
ph                    fixed_acidity           0.425858
fixed_acidity         ph                      0.425858
total_sulfur_dioxide  residual_sugar          0.401439
residual_sugar        total_su

##### Most Correlated Features
* density               residual_sugar          **0.838966**, Outlier Removed: **0.838270**
* alcohol               density                 **0.780138**, Outlier Removed: **0.781072**
* total_sulfur_dioxide  free_sulfur_dioxide     **0.615501**, Outlier Removed: **0.613695**
* density               total_sulfur_dioxide    **0.529881**, Outlier Removed: **0.530652**

In [88]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style;  style.use('ggplot')

# TODO

* ### 1.4. Feature Normalization

* ### 1.5. *(If Applicable)* PCA Dimensionality Reduction

* ### 1.6. Dataset Sampling (and use Pandas describe() function to compare with original's stats)

* ### 1.6. T-SNE Dataset 2-D Plot (Before/After Sampling)

* ### 2. Save the Dataset for use in future Steps. The Dataset should be:
  * ##### cleaned *(free from NaN/Null values)*
  * ##### normalized
  * ##### dimension reduced *(if applicable)*
  * ##### non-outlier removed *(we'll see about that)*
  * ##### non-sampled *(the dataset is already small enough)*