# Machine Learning - Wine Quality Dataset

## Part 1: 

Import the following datasets and combine into a single data frame  
https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv

https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv 

Create a new variable called ‘color’ with values of ‘red’ or ‘white’ depending on the file source 

Compare the mean ‘quality’ for each of red and white wine with "groupby" function – which has the highest quality? 

View the min, 25%, 50%, 75%, max ‘pH’ values with Pandas describe function 

Use the percentiles to define the limits of a new variable called 'acidity_levels' which contains four acidity level groups 'high', 'mod_high', 'medium', 'low' 

Find the mean quality of each acidity level with "groupby" function 

## Part 2: 
Define a function that includes standard data quality checks that can be applied to a dataframe

# Part 1

In [1]:
# Import libraries for use
import pandas as pd
import time
import sys
import numpy as np
from IPython.display import display, HTML

In [2]:
# Set default options for the notebook
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,.4f}'.format # Comma and four decimal places for float variables

In [3]:
# Import csv files
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
#url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
df = pd.read_csv(url, sep=';')
df.head()
#print(f'{df.shape}')
#print(help(pd.read_csv))

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Search for string value in URL location
print(url.find("red"))
print(url.find("white"))
checkList = ['red','white']
c = [x for x in checkList if url.find(x) > 0]
print(c)

83
-1
['red']


In [5]:
# Import csv files
url1 = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
url2 = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
df1 = pd.read_csv(url1, sep=';')
df2 = pd.read_csv(url2, sep=';')
print(f'shape of df1 : {df1.shape}')
print(f'shape of df2 : {df2.shape}')

shape of df1 : (1599, 12)
shape of df2 : (4898, 12)


In [6]:
# Add new variable
for n in checkList:
    if url1.find(n) > 0:
        df1['colour'] = n
    elif url2.find(n) > 0:
        df2['colour'] = n
    else:
        pass

In [7]:
# Combine DataFrames
df = pd.concat([df1,df2])
df.shape

(6497, 13)

In [8]:
# Find the mean value by colour and sort descending
df.groupby('colour')['quality'].mean().sort_values(ascending=False)

colour
white   5.8779
red     5.6360
Name: quality, dtype: float64

In [9]:
# Find the quantile values
df.describe(include='all')

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,colour
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497
unique,,,,,,,,,,,,,2
top,,,,,,,,,,,,,white
freq,,,,,,,,,,,,,4898
mean,7.2153,0.3397,0.3186,5.4432,0.056,30.5253,115.7446,0.9947,3.2185,0.5313,10.4918,5.8184,
std,1.2964,0.1646,0.1453,4.7578,0.035,17.7494,56.5219,0.003,0.1608,0.1488,1.1927,0.8733,
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.9871,2.72,0.22,8.0,3.0,
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.9923,3.11,0.43,9.5,5.0,
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.9949,3.21,0.51,10.3,6.0,
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.997,3.32,0.6,11.3,6.0,


In [10]:
# Creating new variable "acidity_levels"
labels = ['low','medium','mod_high','high']
df['acidity_levels'] = pd.qcut(df['pH'],4,labels=labels)

In [11]:
# Adding a ranking value for variable
df['pHRank'] = df.pH.rank(pct=True)

In [15]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,colour,acidity_levels,pHRank
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,high,0.9547
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red,medium,0.4837
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red,mod_high,0.6264
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red,medium,0.3795
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,high,0.9547


In [18]:
# Review the boundaries
df_s = df.groupby('acidity_levels').agg(
    {
        "pH":['min','max','count']
        ,"pHRank":['min','max']
    })
df_s.columns = ["_".join(x) for x in df_s.columns.ravel()]
df_s

Unnamed: 0_level_0,pH_min,pH_max,pH_count,pHRank_min,pHRank_max
acidity_levels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
low,2.72,3.11,1718,0.0002,0.2541
medium,3.12,3.21,1643,0.2764,0.5073
mod_high,3.22,3.32,1562,0.5316,0.7477
high,3.33,4.01,1574,0.7644,0.9999


In [19]:
# Find the mean quality by each acidity level
df.groupby('acidity_levels')['quality'].mean()

acidity_levels
low        5.7835
medium     5.7845
mod_high   5.8508
high       5.8596
Name: quality, dtype: float64

In [20]:
pcts = [x for x in np.linspace(0,100,num=5,endpoint=True)]
print(pcts)
pctsList = [x for x in np.percentile(df['pH'],pcts)]
pctsList

[0.0, 25.0, 50.0, 75.0, 100.0]


[2.72, 3.11, 3.21, 3.32, 4.01]

In [21]:
# Try to use the cut method
labels = ['low','medium','mod_high','high']
#df['acidity_level_bins'] = pd.cut(df['pH'],bins=[df['pH'].min(),3.11,3.21,3.32,4.01])
df['acidity_level_bins'] = pd.cut(df['pH'], bins=pctsList, include_lowest=True)
df['acidity_level1'] = pd.cut(df['pH'], bins=pctsList, labels=labels, include_lowest=True)
df.sample(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,colour,acidity_levels,pHRank,acidity_level_bins,acidity_level1
1466,7.3,0.48,0.32,2.1,0.062,31.0,54.0,0.9973,3.3,0.65,10.0,7,red,mod_high,0.7079,"(3.21, 3.32]",mod_high
2423,7.2,0.24,0.29,3.0,0.036,17.0,117.0,0.9941,3.36,0.68,10.1,6,white,high,0.8139,"(3.32, 4.01]",high
1297,7.2,0.53,0.14,2.1,0.064,15.0,29.0,0.9932,3.35,0.61,12.1,6,red,high,0.7965,"(3.32, 4.01]",high
759,7.1,0.13,0.4,1.2,0.047,54.0,134.0,0.9932,3.3,0.97,9.8,7,white,mod_high,0.7079,"(3.21, 3.32]",mod_high
1582,6.1,0.715,0.1,2.6,0.053,13.0,27.0,0.9936,3.57,0.5,11.9,5,red,high,0.9783,"(3.32, 4.01]",high


In [22]:
# Review the boundaries
df_s1 = df.groupby('acidity_level1').agg(
    {
        "pH":['min','max','count']
        ,"pHRank":['min','max']
    })
df_s1.columns = ["_".join(x) for x in df_s1.columns.ravel()]
df_s1

Unnamed: 0_level_0,pH_min,pH_max,pH_count,pHRank_min,pHRank_max
acidity_level1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
low,2.72,3.11,1718,0.0002,0.2541
medium,3.12,3.21,1643,0.2764,0.5073
mod_high,3.22,3.32,1562,0.5316,0.7477
high,3.33,4.01,1574,0.7644,0.9999


# Part 2

In [23]:
# Define a function for data quality checks
class DataQualityChecks():
    # Constructor
    def __init__(self, df):
        self.df = df
        self.df_miss = None

    # Method 1 - summary stats
    def summaryData(self):
        tmp = self.df.describe(include='all')
        tmp1 = self.df.isnull().sum().sort_values(ascending=False)
        tmp2 = self.df.dtypes
        return print(f'Describe: \n{tmp}\nMissing data:\n{tmp1}\nVariable Types:\n{tmp2}')
        
    # Method 2 - missing data
    def missingData(self):
        # Which columns have missing values
        self.df_miss = self.df.isnull().sum().sort_values(ascending=False)
        return self.df_miss

In [24]:
dq = DataQualityChecks(df)
dq.missingData()
dq.df_miss

acidity_level1          0
density                 0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
pH                      0
acidity_level_bins      0
sulphates               0
alcohol                 0
quality                 0
colour                  0
acidity_levels          0
pHRank                  0
fixed acidity           0
dtype: int64

In [25]:
dq.summaryData()

Describe: 
        fixed acidity  volatile acidity  citric acid  residual sugar  \
count      6,497.0000        6,497.0000   6,497.0000      6,497.0000   
unique            nan               nan          nan             nan   
top               nan               nan          nan             nan   
freq              nan               nan          nan             nan   
mean           7.2153            0.3397       0.3186          5.4432   
std            1.2964            0.1646       0.1453          4.7578   
min            3.8000            0.0800       0.0000          0.6000   
25%            6.4000            0.2300       0.2500          1.8000   
50%            7.0000            0.2900       0.3100          3.0000   
75%            7.7000            0.4000       0.3900          8.1000   
max           15.9000            1.5800       1.6600         65.8000   

        chlorides  free sulfur dioxide  total sulfur dioxide    density  \
count  6,497.0000           6,497.0000           