# WINE ANALYTICS MP2

In [5]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import os
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_classif

In [83]:
# Setup: locate the two Excel workbooks relative to the notebook's directory
DATA_PATH = Path("../data")
redWine = DATA_PATH / "winequality-red.xlsx"
whiteWine = DATA_PATH / "winequality-white.xlsx"

# Load each workbook into its own dataframe.
# header=1 takes the column names from the second row, skipping the first row (title of the file).
df_red_original = pd.read_excel(redWine, header=1)
df_white_original = pd.read_excel(whiteWine, header=1)

## Data Cleaning

In [None]:
# (rows, columns) of the raw red frame, then of the raw white frame, one per line
print(df_red_original.shape, df_white_original.shape, sep="\n")

(1599, 13)
(4898, 13)
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
wine_type               0
dtype: int64


### Add wine type column to dataframe

In [84]:
# Work on copies so the frames loaded from disk (df_*_original) stay untouched
df_red = df_red_original.copy()
df_white = df_white_original.copy()

In [86]:
# Tag every row with its wine colour so the two sets remain distinguishable
# once they are combined into a single dataframe.
df_red['wine_type'] = 'red'
df_white['wine_type'] = 'white'


In [30]:
# Ensure the new column has been added (the column count should now be 13)
df_red.shape

(1599, 13)

### Finding NA data (if any) and removing duplicate data before analysis

In [33]:
# Missing values per column: red frame first, then white frame
print(df_red.isna().sum(), df_white.isna().sum(), sep="\n")

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
wine_type               0
dtype: int64
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
wine_type               0
dtype: int64


Since no NA data exists, we move on to dropping duplicate data from the dataframes:

In [34]:
# Count the rows flagged as duplicates in each frame (red first, then white)
print(df_red.duplicated().sum())
print(df_white.duplicated().sum())


240
937


In [43]:
# De-duplicate into new frames; df_red / df_white themselves are left unchanged
df_red_clean = df_red.drop_duplicates()
df_white_clean = df_white.drop_duplicates()

# Re-count duplicates on the cleaned frames (should be 0) and report the new shapes
print(f"Sum of red duplicates: {df_red_clean.duplicated().sum()}\nNew df_red shape: {df_red_clean.shape}")
print(f"\nSum of white duplicates: {df_white_clean.duplicated().sum()}\nNew df_white shape: {df_white_clean.shape}")

Sum of red duplicates: 0
New df_red shape: (1359, 13)

Sum of white duplicates: 0
New df_white shape: (3961, 13)


### Combining the two dataframes into one for processing

In [None]:
# Stack the de-duplicated red rows on top of the de-duplicated white rows.
# ignore_index=True gives the result a fresh 0..n-1 index instead of keeping
# the (overlapping) row labels of the two source frames.
df_combined = pd.concat([df_red_clean, df_white_clean], ignore_index=True)

# Row count should equal the sum of the two cleaned frames
df_combined.shape

(5320, 13)

In [87]:
# Making sure both white and red (wine_type) exist in the df
# NOTE(review): a random sample of 10 rows is not guaranteed to contain both types;
# a value_counts() on 'wine_type' would be a deterministic check.
df_combined.sample(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type
3887,6.3,0.28,0.3,3.1,0.039,24.0,115.0,0.9942,3.05,0.43,8.6,5,white
1494,6.4,0.22,0.56,14.5,0.055,27.0,159.0,0.998,2.98,0.4,9.1,5,white
5205,6.6,0.19,0.25,1.2,0.052,34.0,181.0,0.99352,3.3,0.42,9.4,7,white
4990,7.1,0.28,0.26,2.8,0.039,50.0,118.0,0.9908,3.06,0.59,11.2,6,white
1153,9.2,0.54,0.31,2.3,0.112,11.0,38.0,0.99699,3.24,0.56,10.9,5,red
3048,6.3,0.21,0.28,1.5,0.051,46.0,142.0,0.9928,3.23,0.42,10.1,6,white
1522,6.6,0.28,0.28,8.5,0.052,55.0,211.0,0.9962,3.09,0.55,8.9,6,white
3938,5.9,0.33,0.32,8.1,0.038,9.0,34.0,0.9911,3.22,0.36,12.7,7,white
1298,6.6,0.44,0.09,2.2,0.063,9.0,18.0,0.99444,3.42,0.69,11.3,6,red
2230,7.9,0.64,0.46,10.6,0.244,33.0,227.0,0.9983,2.87,0.74,9.1,3,white


## Data Exploring 

### Exploring the features of the three df's separately

In [88]:
# Red wine: show one random row of the raw frame to inspect its columns
df_red_original.sample()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1452,6.6,0.58,0.02,2.0,0.062,37.0,53.0,0.99374,3.35,0.76,11.6,7


In [90]:
# White wine: show one random row of the raw frame to inspect its columns
df_white_original.sample()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
4387,7.1,0.21,0.33,1.2,0.039,34.0,97.0,0.99112,3.11,0.75,11.2,6


In [91]:
# Combined (red + white) wine: show one random row to inspect its columns
df_combined.sample()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type
5095,6.2,0.28,0.41,5.0,0.043,50.0,188.0,0.99318,3.23,0.64,10.8,6,white


The original dataframes for the red and white wine each have a single dependent variable, 'quality'; all other columns are treated as independent variables (at least as an initial assumption).
The combined dataframe potentially has two dependent variables, now that we have added the 'wine_type' variable.

## Exploring features

In [93]:
# Summary statistics for the red wine frame.
# NOTE(review): df_red still includes the duplicate rows (df_red_clean is the
# de-duplicated frame), so these figures are not directly comparable with the
# statistics of df_combined, which is built from the cleaned frames. Confirm
# which frame is intended here.
df_red.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [94]:
# Summary statistics for the white wine frame.
# NOTE(review): df_white still includes the duplicate rows (df_white_clean is the
# de-duplicated frame), so these figures are not directly comparable with the
# statistics of df_combined, which is built from the cleaned frames. Confirm
# which frame is intended here.
df_white.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [95]:
# Summary statistics for the combined, de-duplicated frame.
# 'wine_type' holds the strings 'red' / 'white' at this point, so it does not
# appear in the numeric summary.
df_combined.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,5320.0,5320.0,5320.0,5320.0,5320.0,5320.0,5320.0,5320.0,5320.0,5320.0,5320.0,5320.0
mean,7.215179,0.34413,0.318494,5.048477,0.05669,30.036654,114.109023,0.994535,3.224664,0.533357,10.549241,5.795677
std,1.319671,0.168248,0.147157,4.50018,0.036863,17.805045,56.774223,0.002966,0.160379,0.149743,1.185933,0.879772
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,3.0
25%,6.4,0.23,0.24,1.8,0.038,16.0,74.0,0.9922,3.11,0.43,9.5,5.0
50%,7.0,0.3,0.31,2.7,0.047,28.0,116.0,0.99465,3.21,0.51,10.4,6.0
75%,7.7,0.41,0.4,7.5,0.066,41.0,153.25,0.99677,3.33,0.6,11.4,6.0
max,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,9.0


### Transforming categorical data (wine_type)

In [101]:
# Replace the 'red' / 'white' labels with integer category codes.
# Categories are ordered alphabetically, so red -> 0 and white -> 1.
# Note: this overwrites the column in place; the string labels are no longer
# available on df_combined after this cell has run.
df_combined['wine_type'] = df_combined['wine_type'].astype('category').cat.codes

In [105]:
# Spot-check the encoding: 'wine_type' should now show 0 / 1 instead of text
df_combined.sample(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type
2611,6.2,0.15,0.49,0.9,0.033,17.0,51.0,0.9932,3.3,0.7,9.4,6,1
3917,6.3,0.17,0.32,4.2,0.04,37.0,117.0,0.99182,3.24,0.43,11.3,6,1
436,7.2,0.34,0.32,2.5,0.09,43.0,113.0,0.9966,3.32,0.79,11.1,5,0
58,7.5,0.52,0.16,1.9,0.085,12.0,35.0,0.9968,3.38,0.62,9.5,7,0
3566,6.6,0.12,0.25,1.4,0.039,21.0,131.0,0.99114,3.2,0.45,11.2,7,1


In [106]:
# Keep only the categorical feature ('wine_type').
# The column is selected explicitly instead of dropping every numeric column by
# name: a drop-list silently keeps any column it forgets to mention (the
# 'total sulfur dioxide' column was missing from it and therefore leaked into
# this frame). The double brackets keep the result a DataFrame, and .copy()
# makes it independent of df_combined.
categorical_data = df_combined[['wine_type']].copy()

In [107]:
# Number of rows per encoded wine type (0 = red, 1 = white)
wine_type_count = categorical_data['wine_type'].value_counts()

In [108]:
# Display the per-type row counts
wine_type_count

wine_type
1    3961
0    1359
Name: count, dtype: int64