# <center>Vinho Verde Wine Quality Classification</center>
* ## Stage 1: Data Preprocessing
* ### Importing Essential Libraries

In [177]:
import pandas as pd
import numpy as np

* ### Importing the Dataset

In [178]:
# Semicolon-separated CSV files hosted on the UCI archive, one per wine colour.
vinho_verde_red_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
vinho_verde_white_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

# Read each file and standardize its column names in a single chain:
# lower-case every name and swap spaces for underscores for ease of use.
vinho_verde_red = (
    pd.read_csv(vinho_verde_red_url, sep=';')
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
)
vinho_verde_white = (
    pd.read_csv(vinho_verde_white_url, sep=';')
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
)

* ### Taking a Peek at Our Dataset

In [192]:
# Display the first two rows of the red-wine frame to check that it loaded as expected.
vinho_verde_red.head(2)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5


In [193]:
# Display the first two rows of the white-wine frame to check that it loaded as expected.
vinho_verde_white.head(2)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6


* ### What is the type of each feature?

In [188]:
# List the dtype pandas assigned to each red-wine column.
vinho_verde_red.dtypes

fixed_acidity           float64
volatile_acidity        float64
citric_acid             float64
residual_sugar          float64
chlorides               float64
free_sulfur_dioxide     float64
total_sulfur_dioxide    float64
density                 float64
ph                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [189]:
# List the dtype pandas assigned to each white-wine column.
vinho_verde_white.dtypes

fixed_acidity           float64
volatile_acidity        float64
citric_acid             float64
residual_sugar          float64
chlorides               float64
free_sulfur_dioxide     float64
total_sulfur_dioxide    float64
density                 float64
ph                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

* ### Getting a Basic Statistical Description of each Dataset

In [152]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) for the white wines.
vinho_verde_white.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [150]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) for the red wines.
vinho_verde_red.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


* ### Functions
  * **find_outlier_indices:** Given a DataFrame, return the outlier row indices as a list of tuples in (feature, row_indices) format, *where a value counts as an outlier if it is greater than Q75 + 1.5 \* IQR or less than Q25 - 1.5 \* IQR*
  * **drop_outliers:** Given a DataFrame and the outlier list of tuples in (feature, row_indices) format, return the DataFrame without the rows whose indices appear in that list

In [154]:
def find_outlier_indices(df, iqr_factor=1.5):
    """Locate IQR-rule outliers in every numeric column of ``df``.

    A value is an outlier when it lies below ``Q25 - iqr_factor * IQR`` or
    above ``Q75 + iqr_factor * IQR``, where ``IQR = Q75 - Q25`` is computed
    per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan. Only numeric columns are examined; other columns
        (for example string labels) are skipped instead of making the
        quantile computation fail.
    iqr_factor : float, default 1.5
        Multiple of the IQR used to place the lower and upper fences.

    Returns
    -------
    list of (feature, row_indices) tuples
        One tuple per examined column, in column order. ``row_indices`` is
        the list of index labels of ``df`` whose value in that column falls
        outside the fences (empty when there are none).
    """
    # Quantiles are undefined for non-numeric data, so restrict the scan up front.
    numeric_df = df.select_dtypes(include='number')
    quantiles = numeric_df.quantile([0.25, 0.75])

    feature_outlier_indices = []
    for feature in quantiles:
        # Rows of `quantiles` follow the requested order: 0.25 first, then 0.75.
        q25, q75 = quantiles[feature].values
        iqr = q75 - q25
        lower_fence = q25 - (iqr_factor * iqr)
        upper_fence = q75 + (iqr_factor * iqr)

        values = numeric_df[feature]
        is_outlier = (values < lower_fence) | (values > upper_fence)
        feature_outlier_indices.append((feature, numeric_df.index[is_outlier].tolist()))
    return feature_outlier_indices

def drop_outliers(df, outliers):
    """Return ``df`` without the rows listed in ``outliers``.

    ``outliers`` is a list of ``(feature, row_indices)`` tuples. The feature
    name is ignored; a row is dropped as soon as its index label appears under
    any feature. ``df`` itself is left untouched.
    """
    # Collapse the per-feature lists into one set so each label is dropped once.
    rows_to_remove = {row for _feature, rows in outliers for row in rows}
    return df.drop(list(rows_to_remove))

* ### 1.1. Detecting NaN/Empty Values

In [211]:
# Report missing data for every white-wine column.
# NaN never compares equal to anything (itself included), so an equality test
# against np.nan cannot detect it and always counts 0; isna() is the real check.
# isnull() is an alias of isna(), so the two counts agree by construction; both
# are printed only to keep the report format unchanged.
for feature in vinho_verde_white:
    column = vinho_verde_white[feature]
    print('Feature: {0} === {1} NaN Values, {2} Misisng Values.'.format(feature, column.isna().sum(), column.isnull().sum()))

Feature: fixed_acidity === 0 NaN Values, 0 Misisng Values.
Feature: volatile_acidity === 0 NaN Values, 0 Misisng Values.
Feature: citric_acid === 0 NaN Values, 0 Misisng Values.
Feature: residual_sugar === 0 NaN Values, 0 Misisng Values.
Feature: chlorides === 0 NaN Values, 0 Misisng Values.
Feature: free_sulfur_dioxide === 0 NaN Values, 0 Misisng Values.
Feature: total_sulfur_dioxide === 0 NaN Values, 0 Misisng Values.
Feature: density === 0 NaN Values, 0 Misisng Values.
Feature: ph === 0 NaN Values, 0 Misisng Values.
Feature: sulphates === 0 NaN Values, 0 Misisng Values.
Feature: alcohol === 0 NaN Values, 0 Misisng Values.
Feature: quality === 0 NaN Values, 0 Misisng Values.


In [212]:
# Report missing data for every red-wine column.
# NaN never compares equal to anything (itself included), so an equality test
# against np.nan cannot detect it and always counts 0; isna() is the real check.
# isnull() is an alias of isna(), so the two counts agree by construction; both
# are printed only to keep the report format unchanged.
for feature in vinho_verde_red:
    column = vinho_verde_red[feature]
    print('Feature: {0} === {1} NaN Values, {2} Misisng Values.'.format(feature, column.isna().sum(), column.isnull().sum()))

Feature: fixed_acidity === 0 NaN Values, 0 Misisng Values.
Feature: volatile_acidity === 0 NaN Values, 0 Misisng Values.
Feature: citric_acid === 0 NaN Values, 0 Misisng Values.
Feature: residual_sugar === 0 NaN Values, 0 Misisng Values.
Feature: chlorides === 0 NaN Values, 0 Misisng Values.
Feature: free_sulfur_dioxide === 0 NaN Values, 0 Misisng Values.
Feature: total_sulfur_dioxide === 0 NaN Values, 0 Misisng Values.
Feature: density === 0 NaN Values, 0 Misisng Values.
Feature: ph === 0 NaN Values, 0 Misisng Values.
Feature: sulphates === 0 NaN Values, 0 Misisng Values.
Feature: alcohol === 0 NaN Values, 0 Misisng Values.
Feature: quality === 0 NaN Values, 0 Misisng Values.


<center>_In this case we didn't have any NaN or missing values, so we move on to detecting (and removing) the outliers that may exist in the dataset._</center>

* ### 1.2. Searching (and Removing) Outliers

In [155]:
# Find the IQR outliers of every white-wine column and print how many rows each column flags.
feature_outlier_indices = find_outlier_indices(vinho_verde_white)
outlier_report_line = 'Feature: {0} === Outlier Values Count: {1}'
for column_name, outlier_rows in feature_outlier_indices:
    print(outlier_report_line.format(column_name, len(outlier_rows)))

Feature: fixed_acidity === Outlier Values Count: 119
Feature: volatile_acidity === Outlier Values Count: 186
Feature: citric_acid === Outlier Values Count: 270
Feature: residual_sugar === Outlier Values Count: 7
Feature: chlorides === Outlier Values Count: 212
Feature: free_sulfur_dioxide === Outlier Values Count: 50
Feature: total_sulfur_dioxide === Outlier Values Count: 19
Feature: density === Outlier Values Count: 5
Feature: ph === Outlier Values Count: 75
Feature: sulphates === Outlier Values Count: 124
Feature: alcohol === Outlier Values Count: 0
Feature: quality === Outlier Values Count: 200


In [196]:
# Find the IQR outliers of every red-wine column and print how many rows each column flags.
# NOTE: this rebinds `feature_outlier_indices`, replacing the white-wine result computed earlier.
feature_outlier_indices = find_outlier_indices(vinho_verde_red)
outlier_report_line = 'Feature: {0} === Outlier Values Count: {1}'
for column_name, outlier_rows in feature_outlier_indices:
    print(outlier_report_line.format(column_name, len(outlier_rows)))

Feature: fixed_acidity === Outlier Values Count: 49
Feature: volatile_acidity === Outlier Values Count: 19
Feature: citric_acid === Outlier Values Count: 1
Feature: residual_sugar === Outlier Values Count: 155
Feature: chlorides === Outlier Values Count: 112
Feature: free_sulfur_dioxide === Outlier Values Count: 30
Feature: total_sulfur_dioxide === Outlier Values Count: 55
Feature: density === Outlier Values Count: 45
Feature: ph === Outlier Values Count: 35
Feature: sulphates === Outlier Values Count: 59
Feature: alcohol === Outlier Values Count: 13
Feature: quality === Outlier Values Count: 28


#### *Before Outlier Removal*

In [156]:
# (rows, columns) of the white-wine frame before any outlier rows are dropped.
vinho_verde_white.shape

(4898, 12)

In [197]:
# (rows, columns) of the red-wine frame before any outlier rows are dropped.
vinho_verde_red.shape

(1599, 12)

#### *After Outlier Removal*

In [157]:
# Recompute the white-wine outliers in this cell instead of reading the shared
# `feature_outlier_indices` name: the red-wine cell rebinds it, so on a
# top-to-bottom run it would hold red-wine row labels and the wrong rows would
# be dropped from the white-wine frame.
vinho_verde_white_prep = drop_outliers(vinho_verde_white, find_outlier_indices(vinho_verde_white))
vinho_verde_white_prep.shape

(3854, 12)

In [198]:
# Compute the red-wine outliers in this cell so the result does not depend on
# which dataset was last assigned to the shared `feature_outlier_indices` name.
vinho_verde_red_prep = drop_outliers(vinho_verde_red, find_outlier_indices(vinho_verde_red))
vinho_verde_red_prep.shape

(1179, 12)