In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [None]:
# Install the pinned TensorFlow release (1.7.0) rather than the latest one;
# a later cell notes that this notebook was tested with TensorFlow v1.7.
!pip install -q -U tensorflow==1.7.0

[K     |████████████████████████████████| 48.0MB 80kB/s 
[K     |████████████████████████████████| 3.1MB 33.7MB/s 
[K     |████████████████████████████████| 890kB 26.4MB/s 
[?25h  Building wheel for html5lib (setup.py) ... [?25l[?25hdone


In [None]:
import itertools
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
# Short alias for the Keras layers namespace used by the model-building cells.
layers = keras.layers

# Report the installed TensorFlow version (this code was tested with v1.7).
tf_version = tf.__version__
print("You have Tensorflow version", tf_version)

You have Tensorflow version 1.7.0


In [None]:
# NOTE(review): pandas is already imported by an earlier cell, so this install
# runs too late to affect that import; the version is also unpinned.
!pip install pandas



In [None]:
# Fetch the wine-review CSV with keras' get_file helper and keep the returned path.
# Original data source: https://www.kaggle.com/zynicide/wine-reviews/data
URL = "https://storage.googleapis.com/sara-cloud-ml/wine_data.csv"
file_name = URL.rsplit('/', 1)[-1]  # last path component of the URL
path = tf.keras.utils.get_file(file_name, URL)

Downloading data from https://storage.googleapis.com/sara-cloud-ml/wine_data.csv


In [None]:
# Load the downloaded CSV into a pandas DataFrame.
data = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Shuffle the rows. A fixed random_state makes the shuffle (and therefore the
# positional train/test split done later) the same on every
# "Restart Kernel -> Run All".
data = data.sample(frac=1, random_state=42)

# Show the first 5 rows
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
80186,80186,Italy,"Centine is an easy, well-priced white wine fro...",Centine,86,11.0,Tuscany,Toscana,,White Blend,Banfi
142921,142921,US,Clearly stands above the competition for its e...,Reserve Pagani Vineyard,93,45.0,California,Sonoma Valley,Sonoma,Zinfandel,St. Francis
64952,64952,Italy,This wine seems hotter than it actually is at ...,,85,9.0,Northeastern Italy,Alto Adige,,Pinot Grigio,Josef Weger
137398,137398,Austria,"A highly perfumed wine from old vines, aged fo...",Dürrau Cuvée,94,,Burgenland,,,Blaufränkisch,Weninger
2371,2371,US,Minty cedar and red raspberry jam provide a fr...,Old Vine Reserve,90,38.0,California,Dry Creek Valley,Sonoma,Zinfandel,Flinch


In [None]:
# Preprocessing: drop incomplete rows and limit the number of wine varieties.

# Keep only rows that have both a country and a price (price is the label).
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]
# Drop the first column by position.
# NOTE(review): this is positional, so re-running this cell without reloading
# `data` drops one more column each time.
data = data.drop(data.columns[0], axis=1)

variety_threshold = 500  # Varieties that occur this many times or fewer are removed.
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index

# Filter on the 'variety' column only. The previous DataFrame-wide
# `data.replace(to_remove, np.nan)` also blanked any cell in *other* columns
# whose value happened to equal a rare variety name.
# Rows with a missing variety are dropped as well, as before.
keep_mask = pd.notnull(data['variety']) & ~data['variety'].isin(to_remove)
data = data[keep_mask]

In [None]:
# Work out the 80/20 train/test split sizes from the row count.
total_rows = len(data)
train_size = int(total_rows * .8)
test_size = total_rows - train_size
print("Train size: %d" % train_size)
print("Test size: %d" % test_size)

Train size: 95646
Test size: 23912


In [None]:
# Positional split: the first `train_size` rows are train, the rest are test.
description_col = data['description']
variety_col = data['variety']
price_col = data['price']

# Train features
description_train = description_col.iloc[:train_size]
variety_train = variety_col.iloc[:train_size]

# Train labels
labels_train = price_col.iloc[:train_size]

# Test features
description_test = description_col.iloc[train_size:]
variety_test = variety_col.iloc[train_size:]

# Test labels
labels_test = price_col.iloc[train_size:]

In [None]:
# Build a word-level tokenizer for the text descriptions.
vocab_size = 12000  # Hyperparameter: experiment with different values for your dataset
tokensize = keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    char_level=False,
)
# Fit on the training descriptions only, never on the test set.
tokensize.fit_on_texts(description_train)

In [None]:
# Wide features 1: bag-of-words (bow) matrix built from each description,
# using the tokenizer fitted on the training text.
to_bow = tokensize.texts_to_matrix
description_bow_train = to_bow(description_train)
description_bow_test = to_bow(description_test)

In [None]:
# Wide features 2: one-hot vector of variety categories

# Use sklearn's LabelEncoder to turn each variety string into an integer index.
# The encoder is fitted on the training varieties and reused for the test set.
encoder = LabelEncoder()
variety_train = encoder.fit_transform(variety_train)
variety_test = encoder.transform(variety_test)
num_classes = variety_train.max() + 1

# Expand the integer indices into one-hot rows of width num_classes.
to_one_hot = keras.utils.to_categorical
variety_train = to_one_hot(variety_train, num_classes)
variety_test = to_one_hot(variety_test, num_classes)

In [None]:
# Wide model (functional API): the bag-of-words and one-hot variety inputs are
# concatenated, passed through one hidden layer, and reduced to a single output.
bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))

wide_features = layers.concatenate([bow_inputs, variety_inputs])
wide_hidden = layers.Dense(256, activation='relu')(wide_features)
wide_output = layers.Dense(1)(wide_hidden)

wide_model = keras.Model(
    inputs=[bow_inputs, variety_inputs],
    outputs=wide_output,
)


In [None]:
# Compile the wide model for price regression.
# 'accuracy' is a classification metric and carries no meaning for a continuous
# target trained with MSE, so mean absolute error is tracked instead.
wide_model.compile(loss='mse', optimizer='adam', metrics=['mae'])

# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
wide_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12040)        0           input_2[0][0]                    
                                                                 input_3[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 256)          3082496     concatenate_1[0][0]              
__________

In [None]:
# Deep model features: each description becomes a sequence of word indices
# (for the embedding layer), padded at the end ("post") up to max_seq_length.
max_seq_length = 170
pad_sequences = keras.preprocessing.sequence.pad_sequences

train_embed = pad_sequences(
    tokensize.texts_to_sequences(description_train),
    maxlen=max_seq_length,
    padding="post",
)
test_embed = pad_sequences(
    tokensize.texts_to_sequences(description_test),
    maxlen=max_seq_length,
    padding="post",
)

In [None]:
# Deep model (functional API): padded word-index sequences are embedded into
# 8-dimensional vectors, flattened, and reduced to a single output.
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)

# summary() prints the table itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
deep_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 170)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 170, 8)            96000     
_________________________________________________________________
flatten_2 (Flatten)          (None, 1360)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 1361      
Total params: 97,361
Trainable params: 97,361
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Compile the deep model for price regression.
# The keyword was misspelled as `mertrics`, so no metric was actually configured.
# Mean absolute error is used because 'accuracy' is a classification metric and
# is not meaningful for a continuous target.
deep_model.compile(loss='mse',
                   optimizer='adam',
                   metrics=['mae'])

In [None]:
# Combine wide and deep into one model: concatenate the two single-unit outputs
# and feed them to a final Dense(1) layer that produces the prediction.
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
# wide_model.input is a list (two inputs), so the deep input is appended to it.
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)

# summary() prints the table itself and returns None, so it is not wrapped in print().
combined_model.summary()

# Regression setup: MSE loss, with mean absolute error as the reported metric
# ('accuracy' is a classification metric and is meaningless for a price target).
combined_model.compile(loss='mse',
                       optimizer='adam',
                       metrics=['mae'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 170)          0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12040)        0           input_2[0][0]                    
                                                                 input_3[0][0]                    
__________

In [None]:
# Train the combined model. Inputs are passed in the order the model was built
# with: bag-of-words, one-hot variety, then the padded sequences.
train_inputs = [description_bow_train, variety_train, train_embed]
combined_model.fit(train_inputs, labels_train, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras._impl.keras.callbacks.History at 0x7f57f7c0ecc0>

In [None]:
# Evaluate on the held-out test split; the result is shown as the cell output.
test_inputs = [description_bow_test, variety_test, test_embed]
combined_model.evaluate(test_inputs, labels_test, batch_size=128)



[540.7186602135891, 0.0637336902036221]

In [None]:
# Generate price predictions for every row of the test split.
predictions = combined_model.predict(
    [description_bow_test, variety_test, test_embed]
)

In [None]:
# Print predicted vs. actual price for the first few test items and accumulate
# the absolute error in `diff` (the next cell averages it).
num_predictions = 40
diff = 0

for idx in range(num_predictions):
    predicted_price = predictions[idx][0]
    actual_price = labels_test.iloc[idx]
    print(description_test.iloc[idx])
    print('Predicted: ', predicted_price, 'Actual: ', actual_price, '\n')
    diff += abs(predicted_price - actual_price)

This refreshing red offers black cherry and pepper on the nose and on the palate, a clean array of red berry, spice and cedar. Its dry style will pair well with food—think poultry or duck.
Predicted:  18.395033 Actual:  17.0 

This wine's pungent bouquet comes close to resembling ammonia in its sweatry intensity, yet it rounds out surprisingly well on the palate, where melon fruit and a touch of sweetness emerges. Drink up.
Predicted:  10.944654 Actual:  14.0 

This Pinot Grigio from Trentino—part of which is barrel aged—offers aromas of pear, apple, oak and flint. The linear palate delivers restrained lemon peel, apple and oak sensations in a lean but elegant style.
Predicted:  21.576347 Actual:  22.0 

This simple Chardonnay is soft and sweet, with some vanilla oaky notes. It's made in the popular style.
Predicted:  15.181451 Actual:  12.0 

Begins with bold mint and prune aromas, and it's also sort of foxy and sweaty. The palate is better, with saucy, tart fruit that leans toward to

In [None]:
# Average absolute difference between actual and predicted price over the
# `num_predictions` items compared in the previous cell.
average_diff = diff / num_predictions
print('Average prediction difference: ', average_diff)

Average prediction difference:  6.300436505675316
