In [1]:
import keras

In [2]:
keras.__version__

'2.6.0'

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [4]:
import itertools
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.compat.v1 as tfc

from sklearn.preprocessing import LabelEncoder
from tensorflow import keras

In [5]:
layers = keras.layers

In [6]:
data = pd.read_csv(r'wine_data\winemag-data_first150k.csv')

In [7]:
pd.set_option('max_columns' , 12)

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [9]:
data.shape

(150930, 11)

In [10]:
pd.notnull(data['country']).value_counts()

True     150925
False         5
Name: country, dtype: int64

In [11]:
data.isnull().sum()

Unnamed: 0         0
country            5
description        0
designation    45735
points             0
price          13695
province           5
region_1       25060
region_2       89977
variety            0
winery             0
dtype: int64

In [12]:
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]
data = data.drop(data.columns[0] , axis = 1)

In [13]:
data.isnull().sum()

country            0
description        0
designation    42311
points             0
price              0
province           0
region_1       22837
region_2       76526
variety            0
winery             0
dtype: int64

In [14]:
variety_threshold = 500 #Anything that occurs less than this will be removed
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index

In [15]:
data.replace(to_remove , np.nan , inplace = True)

In [16]:
data = data[pd.notnull(data['variety'])]

In [17]:
train_size = int(len(data) * 0.8)
print('Train size: %d' % train_size)
print('Test size: %d' % (len(data) - train_size))

Train size: 95647
Test size: 23912


In [18]:
description_train = data['description'][:train_size]
variety_train = data['variety'][:train_size]
labels_train = data['price'][:train_size]

description_test = data['description'][train_size:]
variety_test = data['variety'][train_size:]
labels_test = data['price'][train_size:]

In [19]:
vocab_size = 12000 #This is hyperparameter , try with different values
tokensize = keras.preprocessing.text.Tokenizer(num_words = vocab_size ,char_level = False)
tokensize.fit_on_texts(description_train)

In [22]:
description_bow_train = tokensize.texts_to_matrix(description_train).astype('uint8')
description_bow_test = tokensize.texts_to_matrix(description_test).astype('uint8')

In [25]:
encoder = LabelEncoder()
encoder.fit(variety_train)
variety_train = encoder.transform(variety_train).astype('uint8')
variety_test = encoder.transform(variety_test).astype('uint8')

In [26]:
num_classes = np.max(variety_train) + 1

In [30]:
variety_train = keras.utils.to_categorical(variety_train , num_classes).astype('uint8')
variety_test = keras.utils.to_categorical(variety_test , num_classes).astype('uint8')

In [31]:
variety_train.shape

(95647, 40)

In [32]:
num_classes

40

In [33]:
bow_inputs = layers.Input(shape = (vocab_size , ))
variety_inputs = layers.Input(shape = (num_classes , ))
merged_layer = layers.concatenate([bow_inputs , variety_inputs])
merged_layer = layers.Dense(256 , activation = 'relu')(merged_layer)
predictions = layers.Dense(1)(merged_layer)
wide_model = keras.Model(inputs = [bow_inputs , variety_inputs] , outputs = predictions)

In [34]:
wide_model.compile(loss = 'mse' , optimizer = 'adam' , metrics = ['accuracy'])

In [35]:
wide_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 12000)]      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 12040)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 256)          3082496     concatenate[0][0]            

In [36]:
#Deep model feature: word embeddings of wine descriptions

train_embed = tokensize.texts_to_sequences(description_train)
test_embed = tokensize.texts_to_sequences(description_test)

max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(
    train_embed , maxlen = max_seq_length , padding = 'post')
test_embed = keras.preprocessing.sequence.pad_sequences(
    test_embed , maxlen = max_seq_length , padding = 'post')

In [37]:
deep_inputs = layers.Input(shape = (max_seq_length , ))
embedding = layers.Embedding(vocab_size , 8 , input_length = max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs = deep_inputs , outputs = embed_out)
print(deep_model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 170)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 170, 8)            96000     
_________________________________________________________________
flatten (Flatten)            (None, 1360)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1361      
Total params: 97,361
Trainable params: 97,361
Non-trainable params: 0
_________________________________________________________________
None


In [38]:
deep_model.compile(loss = 'mse' , optimizer = 'adam' , metrics = ['accuracy'])

In [39]:
merged_out = layers.concatenate([wide_model.output , deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input+[deep_model.input] , merged_out)
combined_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 12000)]      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 170)]        0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 12040)        0           input_1[0][0]                    
                                                                 input_2[0][0]              

In [40]:
combined_model.compile(loss = 'mse' , optimizer = 'adam' , metrics = ['accuracy'])

In [41]:
combined_model.fit([description_bow_train , variety_train] + [train_embed] , labels_train , epochs = 10 , batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b3e6d81730>

In [42]:
combined_model.evaluate([description_bow_test , variety_test] + [test_embed] , labels_test , batch_size = 32)



[352.71807861328125, 0.0]

In [43]:
predictions = combined_model.predict([description_bow_test , variety_test] + [test_embed])

In [44]:
num_predictions = 40
diff = 0

for i in range(num_predictions):
    val = predictions[i]
    print(description_test.iloc[i])
    print('Predicted: ' , val[0], 'Actual: ' , labels_test.iloc[i] , '\n')
    diff += abs(val[0] - labels_test.iloc[i])

H3 is Columbia Crest's line of wines from the Horse Heaven Hills AVA—the winery's home turf—and the wines seem to have a bit more cut and flair than the slightly less expensive Columbia Valley bottlings. Here you will find a more complex blend of fruits—starfruit, apple, melon and a hint of banana—and less focus on butter and toast. In other words, more of a food and sipping wine.
Predicted:  16.56811 Actual:  15.0 

Very ripe and fruity in the modern style, almost too strong in blackberry jam, black currant and oak flavors that are not particularly subtle or delicate, although the wine is thoroughly dry. The tannins are significant, but negotiable. The suggestion is ageability. Give it 2–4 years and try again.
Predicted:  48.076202 Actual:  45.0 

Notes of olive and bay leaves add some interesting complexity to the nose of this blend, otherwise dominated by typical red berry flavors. Ripe and rich on the palate with medium acidity and lighter tannins. The finish, dominated by beautifu

In [45]:
print('Average Prediciton Difference: ' , diff / num_predictions)

Average Prediciton Difference:  2.7815239667892455
