In [27]:
import os, math
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
layers = keras.layers

print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 1.6.0


In [26]:
# Get the data: original source is here: https://www.kaggle.com/zynicide/wine-reviews/data
URL = "https://storage.googleapis.com/sara-cloud-ml/wine_data.csv"
path = tf.keras.utils.get_file(URL.split('/')[-1], URL)

In [81]:
data = pd.read_csv(path)

In [82]:
# Shuffle the data
data = data.sample(frac=1)

# first 5
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
66190,66190,New Zealand,"Honey, melon, pineapple, apricot and vanilla a...",,85,15.0,Marlborough,,,Chardonnay,Cairnbrae
16041,16041,US,"La Fenetre scores with this lush, forward Pino...",Le Bon Climat Vineyard,91,59.0,California,Santa Maria Valley,Central Coast,Pinot Noir,La Fenêtre
125412,125412,Italy,There's a meaty or slightly burned quality her...,Passo del Lupo,88,30.0,Central Italy,Conero Riserva,,Montepulciano,Fazi Battaglia
77727,77727,France,89–91. Barrel sample. The wine has good acidit...,Barrel Sample,90,,Bordeaux,Sauternes,,Bordeaux-style White Blend,Château Rabaud-Promis
118334,118334,US,New smoky oak dominates this Chardonnay. Under...,Searby Vineyard,85,45.0,California,Russian River Valley,Sonoma,Chardonnay,Nickel & Nickel


In [83]:
# Clean data
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]
data = data.drop(data.columns[0], axis=1)

In [89]:
variety_threshold = 1000 # Anything that occurs less than this will be removed.
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index
data.replace(to_remove, np.nan, inplace=True)
data = data[pd.notnull(data['variety'])]

In [90]:
train_size = int(len(data) * .8)
print("Train size: %d" % (train_size))
print("Test size: %d" % ((len(data) - train_size))) 

Train size: 87340
Test size: 21836


In [91]:
# Train features
description_train = data['description'][:train_size]
variety_train = data['variety'][:train_size]

# Train labels
labels_train = data['price'][:train_size]

# Test features
description_test = data['description'][train_size:]
variety_test = data['variety'][train_size:]

# Test labels
labels_test = data['price'][train_size:]

In [92]:
vocab_size = 12000
tokenizer = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)
tokenizer.fit_on_texts(description_train)

In [93]:
description_bow_train = tokenizer.texts_to_matrix(description_train)
description_bow_test  = tokenizer.texts_to_matrix(description_test)

In [94]:
# Wide feature 2: one-hot vector of variety categories

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(variety_train)
variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)
num_classes = np.max(variety_train) + 1

# Convert labels to one hot
variety_train = keras.utils.to_categorical(variety_train, num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes)

In [97]:
# Define our wide model with the functional API
bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
merged_layers = layers.concatenate([bow_inputs, variety_inputs])
merged_layers = layers.Dense(256, activation='relu')(merged_layers)
predictions = layers.Dense(1)(merged_layers)
wide_model = keras.Model(inputs=[bow_inputs, variety_inputs], outputs=predictions)

In [98]:
wide_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
print(wide_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 26)           0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12026)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 256)          3078912     concatenate_1[0][0]              
__________

In [101]:
# Deep model feature: word embeddings of wine descriptions
train_embed = tokenizer.texts_to_sequences(description_train)
test_embed  = tokenizer.texts_to_sequences(description_test)

max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(
    train_embed, maxlen=max_seq_length, padding='post')
test_embed = keras.preprocessing.sequence.pad_sequences(
    test_embed, maxlen=max_seq_length, padding='post')

In [102]:
# Define our deep model with the Functional API
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
print(deep_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 170)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 170, 8)            96000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 1360)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 1361      
Total params: 97,361
Trainable params: 97,361
Non-trainable params: 0
_________________________________________________________________
None


In [104]:
deep_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [105]:
# Combine wide and deep into one model
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)
print(combined_model.summary())

combined_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 26)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 170)          0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12026)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________

In [106]:
# Running 
combined_model.fit([description_bow_train, variety_train] + [train_embed], labels_train, epochs=10, batch_size=128)

Epoch 1/10

Epoch 2/10

Epoch 3/10

Epoch 4/10

Epoch 5/10

Epoch 6/10

Epoch 7/10

Epoch 8/10

Epoch 9/10

Epoch 10/10



<tensorflow.python.keras._impl.keras.callbacks.History at 0x7fa5cbe2f588>

In [107]:
combined_model.evaluate([description_bow_test, variety_test] + [test_embed], labels_test, batch_size=128)




[532.7880193539473, 0.06750320571123801]

In [108]:
# Generate predictions
predictions = combined_model.predict([description_bow_test, variety_test] + [test_embed])

In [109]:
# Compare predictions with actual values for the first few items in our test dataset
num_predictions = 40
diff = 0

for i in range(num_predictions):
    val = predictions[i]
    print(description_test.iloc[i])
    print('Predicted: ', val[0], 'Actural: ', labels_test.iloc[i], '\n')
    diff += abs(val[0] - labels_test.iloc[i])

Too ripe and obvious in fruit, with raisin, currant and black cherry jam flavors, accented with smoky, caramelized oak. Good and rich, but lacks subtlety. Drink now.
Predicted:  49.227608 Actural:  28.0 

Tough in tannins and acidic in the Mayacamas style. Feels hard and dry and earthy. The cherry-berry flavors have a tomatoey edge. But this Merlot, which contains 17% Cabernet Sauvignon, has a classy elegance, and opens up as it breathes in the glass.
Predicted:  35.123184 Actural:  35.0 

A spectacular Pinot Noir, but you don't want to open it anytime soon. Shows the pedigree of this vineyard, a superbly balanced wine rich in acidity and minerals. The fruit? Black and red cherries, blackberries, cola, cocoa powder and red currants, enriched with smoky cedarwood. Beautiful, but needs time. The window is 2011–2013.
Predicted:  70.1012 Actural:  70.0 

Ebullient blossom and berry notes waft from this unique carbonated Pinot Noir (blended with proportions of Cabernet Franc and Cayuga grap

In [110]:
# Compare the average difference between actual price and the model's predicted price
print('Average prediction difference: ', diff / num_predictions)

Average prediction difference:  9.318470692634582
