# Import libraries

In [4]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier


# Import data
Needed to use the "sep" argument because the data is separated by semicolons

In [8]:
# Both wine files are semicolon-delimited, so they share the same read options.
df_red, df_white = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in ("Data/winequality-red.csv", "Data/winequality-white.csv")
)


In [9]:
# Preview the first five rows of the red wine data
df_red.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [10]:
# Preview the first five rows of the white wine data
df_white.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


# clean data 
(turns out no cleaning was really needed)

In [11]:
# Report (rows, columns) of each frame before any cleaning
print(
    f'red wine DF initial: {df_red.shape}',
    f'white wine DF initial: {df_white.shape}',
    sep='\n',
)

red wine DF initial: (1599, 12)
white wine DF initial: (4898, 12)


In [12]:
# Discard every row that contains at least one missing value
df_red, df_white = (wine_df.dropna() for wine_df in (df_red, df_white))

In [13]:
# Report (rows, columns) again to see whether dropna removed anything
print(
    f'red wine DF after dropNA: {df_red.shape}',
    f'white wine DF dropNA: {df_white.shape}',
    sep='\n',
)

red wine DF after dropNA: (1599, 12)
white wine DF dropNA: (4898, 12)


In [30]:

# Despite the "_count" suffix, these hold the sorted distinct quality ratings
# found in each dataset, not how many there are.
red_labels_count = np.sort(df_red['quality'].unique())
white_labels_count = np.sort(df_white['quality'].unique())
print(
    f'red wine unique quality ratings recorded: {red_labels_count}',
    f'white wine unique quality ratings recorded: {white_labels_count}',
    sep='\n',
)

red wine unique quality ratings recorded: [3 4 5 6 7 8]
white wine unique quality ratings recorded: [3 4 5 6 7 8 9]


#### Interesting to note that neither the red nor the white datasets contain wines with all possible ratings (0 to 10)

# train_test_split

In [14]:
# Targets (y-values): the "quality" rating column of the red and white frames
red_targets, white_targets = df_red["quality"], df_white["quality"]

In [15]:
# Features (X-values): every column except the "quality" target

# red wine
red_features = df_red.drop(columns=["quality"])

# white wine
white_features = df_white.drop(columns=["quality"])


In [16]:
# Split each dataset into train/test partitions. The fixed random_state makes
# the shuffle repeatable; test_size is left at scikit-learn's default.
(
    X_train_red,
    X_test_red,
    y_train_red,
    y_test_red,
) = train_test_split(red_features, red_targets, random_state=43)

(
    X_train_white,
    X_test_white,
    y_train_white,
    y_test_white,
) = train_test_split(white_features, white_targets, random_state=43)

# One hot encode y-values

Note that the one-hot encoded red values have 9 positions and the white ones have 10 positions. This is because the encoding has one position for every integer from 0 up to the maximum label: the max quality value in the red data set is 8 and the max value in the white set is 9. See the printed output below for an example of this.

In [69]:
# Fix the one-hot width from the full label range of each dataset instead of
# letting to_categorical infer it from whichever labels happen to land in a
# given split. Without this, a split that misses the top rating would be
# encoded narrower than its counterpart (and than the model's output layer).
red_num_classes = int(df_red["quality"].max()) + 1      # labels 0..8 -> 9 positions
white_num_classes = int(df_white["quality"].max()) + 1  # labels 0..9 -> 10 positions

oh_y_train_red = to_categorical(y_train_red, num_classes=red_num_classes)
oh_y_test_red = to_categorical(y_test_red, num_classes=red_num_classes)
oh_y_train_white = to_categorical(y_train_white, num_classes=white_num_classes)
oh_y_test_white = to_categorical(y_test_white, num_classes=white_num_classes)

# Show one encoded row from each array as a sanity check on the widths
print(oh_y_train_red[1])
print(oh_y_test_red[1])
print(oh_y_train_white[1])
print(oh_y_test_white[1])

[0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]


# scale data

In [51]:
# Each scaler learns its min/max from the training split only and is then
# reused unchanged on the matching test split.

# red wine
X_red_scaler = MinMaxScaler()
X_train_red_scaled = X_red_scaler.fit_transform(X_train_red)
X_test_red_scaled = X_red_scaler.transform(X_test_red)

# white wine
X_white_scaler = MinMaxScaler()
X_train_white_scaled = X_white_scaler.fit_transform(X_train_white)
X_test_white_scaled = X_white_scaler.transform(X_test_white)

# instantiate and train the model

In [64]:
red_features_count = len(red_features.columns)


# output layer units needs to be set to 9 because max score in the red dataset is "8"
# (the one-hot labels cover 0..8, i.e. 9 positions)

def create_red_model(hidden_units=20, output_units=9):
    """Build and compile the red wine quality classifier.

    The network is a single hidden ReLU layer followed by a softmax output
    layer, compiled with the Adam optimizer and categorical cross-entropy.
    The input width is read from the notebook-level ``red_features_count``.

    Parameters
    ----------
    hidden_units : int, default 20
        Number of units in the hidden layer.
    output_units : int, default 9
        Number of units in the softmax layer; must equal the width of the
        one-hot encoded red targets.

    Returns
    -------
    A compiled ``Sequential`` model, not yet trained.
    """
    model = Sequential()
    model.add(Dense(units=hidden_units, activation="relu", input_dim=red_features_count))
    model.add(Dense(units=output_units, activation="softmax"))
    model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Calling with no arguments keeps the original 20-unit / 9-class architecture
red_model = create_red_model()



In [65]:
# Train on the scaled red features against the one-hot targets.
# NOTE(review): verbose=4 is outside the 0/1/2 levels Keras documents; judging
# by the recorded output it prints only the "Epoch i/100" headers - confirm
# that is intended.
red_model.fit(
    X_train_red_scaled,
    oh_y_train_red,
    epochs=100,
    shuffle=True,
    verbose=4,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1690b5edac0>

In [70]:
white_features_count = len(white_features.columns)

# output layer units needs to be set to 10 because max quality score in the white dataset is "9"
# (the one-hot labels cover 0..9, i.e. 10 positions)

def create_white_model(hidden_units=20, output_units=10):
    """Build and compile the white wine quality classifier.

    The network is a single hidden ReLU layer followed by a softmax output
    layer, compiled with the Adam optimizer and categorical cross-entropy.
    The input width is read from the notebook-level ``white_features_count``.

    Parameters
    ----------
    hidden_units : int, default 20
        Number of units in the hidden layer.
    output_units : int, default 10
        Number of units in the softmax layer; must equal the width of the
        one-hot encoded white targets.

    Returns
    -------
    A compiled ``Sequential`` model, not yet trained.
    """
    model = Sequential()
    model.add(Dense(units=hidden_units, activation="relu", input_dim=white_features_count))
    model.add(Dense(units=output_units, activation="softmax"))
    model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Calling with no arguments keeps the original 20-unit / 10-class architecture
white_model = create_white_model()




In [71]:
# Train on the scaled white features against the one-hot targets.
# NOTE(review): verbose=4 is outside the 0/1/2 levels Keras documents; judging
# by the recorded output it prints only the "Epoch i/100" headers - confirm
# that is intended.
white_model.fit(
    X_train_white_scaled,
    oh_y_train_white,
    epochs=100,
    shuffle=True,
    verbose=4,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1690cae9f40>

# test models

In [84]:
# Score the trained red model on the held-out split; evaluate() yields the
# loss followed by the accuracy metric the model was compiled with.
red_model_loss, red_model_accuracy = red_model.evaluate(
    X_test_red_scaled,
    oh_y_test_red,
    verbose=3,
)

print(f'red model accuracy: {red_model_accuracy}')

red model accuracy: 0.6424999833106995


In [83]:
# Score the trained white model on the held-out split; evaluate() yields the
# loss followed by the accuracy metric the model was compiled with.
white_model_loss, white_model_accuracy = white_model.evaluate(
    X_test_white_scaled,
    oh_y_test_white,
    verbose=3,
)

print(f'white model accuracy: {white_model_accuracy}')


white model accuracy: 0.5510203838348389


# try to optimize by tweaking the model

# try using red to predict white (and vice versa)

In [None]:
# This won't work with the models as built: the red and white data sets have different max quality values, so the red model has 9 output units while the white model has 10.

# grid search to optimize hyperparameters

# create a prediction tool with user input