In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [3]:
import itertools
import pandas as pd
import numpy as np
import tensorflow as tf
import math

from sklearn.preprocessing import LabelEncoder
from tensorflow import keras

layers = keras.layers #import keras layers

  from ._conv import register_converters as _register_converters


In [5]:
data_path = "F:/NN/kaggle_competition/wine_reviews/wine-reviews/winemag-data_first150k.csv"

In [6]:
data = pd.read_csv(data_path)
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [7]:
#preprocessing to limit the dataset
data = data[pd.notnull(data['country'])] 
data = data[pd.notnull(data['price'])]
data = data.drop(data.columns[0],axis=1)

threshold = 500 #anything that comes under 500 will automatically removed
value_counts = data['variety'].value_counts() #count variety of wines available
to_remove = value_counts[value_counts <= threshold].index
data.replace(to_remove,np.nan,inplace=True)
data = data[pd.notnull(data['variety'])]

In [10]:
value_counts.head()

Chardonnay            13775
Pinot Noir            13625
Cabernet Sauvignon    12671
Red Blend              9377
Sauvignon Blanc        6054
Name: variety, dtype: int64

In [11]:
train_size = int(len(data)*.8)
print("Training Data Size:{total_values}".format(total_values=train_size))

Training Data Size:95647


In [12]:
print("Test Data Size:{test_datasize}".format(test_datasize=len(data)-train_size))

Test Data Size:23912


In [13]:
description_train = data['description'][:train_size]
variety_train = data['variety'][:train_size]
labels_train = data['price'][:train_size]

description_test = data['description'][train_size:]
variety_test = data['variety'][train_size:]
labels_test = data['price'][train_size:]

In [15]:
#create a tokenizer to preprocess our text description
vocab_size = 12000
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size,char_level=False)
tokenize.fit_on_texts(description_train) #only fit on train

In [16]:
tokenize

<tensorflow.python.keras._impl.keras.preprocessing.text.Tokenizer at 0x20bfffed550>

In [17]:
#bow
description_bow_train = tokenize.texts_to_matrix(description_train)
description_bow_test = tokenize.texts_to_matrix(description_test)

In [18]:
#one hot Encode for variety categories
encoder = LabelEncoder()
encoder.fit(variety_train)
encoder.fit(variety_test)
variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)

num_classes = np.max(variety_train) + 1

In [19]:
num_classes

40

In [20]:
variety_train = keras.utils.to_categorical(variety_train,num_classes)
variety_test = keras.utils.to_categorical(variety_test,num_classes)

In [23]:
variety_train.shape,variety_test.shape

((95647, 40), (23912, 40))

In [41]:
#defining wide model with functional API
bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
merged_layer = layers.concatenate([bow_inputs,variety_inputs])
merged_layer = layers.Dense(256,activation='relu')(merged_layer)
predicted_layer = layers.Dense(1)(merged_layer)
wide_model = keras.Model(inputs = [bow_inputs,variety_inputs],outputs=predicted_layer)

In [42]:
merged_layer

<tf.Tensor 'dense_7/Relu:0' shape=(?, 256) dtype=float32>

In [43]:
wide_model.compile(loss='mse',optimizer='adam',metrics=['accuracy'])
print(wide_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
concatenate_5 (Concatenate)     (None, 12040)        0           input_7[0][0]                    
                                                                 input_8[0][0]                    
__________________________________________________________________________________________________
dense_8 (Dense)                 (None, 256)          3082496     concatenate_5[0][0]              
__________

In [44]:
#now defining deep model with functional API
train_embed = tokenize.texts_to_sequences(description_train)
test_embed = tokenize.texts_to_sequences(description_test)

max_seq_length = 180
train_embed = keras.preprocessing.sequence.pad_sequences(train_embed,maxlen=max_seq_length,padding="post")
test_embed = keras.preprocessing.sequence.pad_sequences(test_embed,maxlen=max_seq_length,padding="post")

In [45]:
train_embed.shape, test_embed.shape

((95647, 180), (23912, 180))

In [46]:
#deep model with functional API
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size,8,input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs,outputs=embed_out)
print(deep_model.summary)

<bound method Network.summary of <tensorflow.python.keras._impl.keras.engine.training.Model object at 0x0000020B80E79550>>


In [47]:
deep_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 180)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 180, 8)            96000     
_________________________________________________________________
flatten_2 (Flatten)          (None, 1440)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 1441      
Total params: 97,441
Trainable params: 97,441
Non-trainable params: 0
_________________________________________________________________


In [48]:
#combine wide and deep into one neural network model
merged_out = layers.concatenate([wide_model.output,deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input+[deep_model.input],merged_out)

print(combined_model)

<tensorflow.python.keras._impl.keras.engine.training.Model object at 0x0000020B80E83898>


In [49]:
combined_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            (None, 180)          0                                            
__________________________________________________________________________________________________
concatenate_5 (Concatenate)     (None, 12040)        0           input_7[0][0]                    
                                                                 input_8[0][0]                    
__________

In [50]:
combined_model.compile(loss='mse',optimizer='adam',metrics=['accuracy'])

In [51]:
combined_model.fit([description_bow_train,variety_train] + [train_embed],labels_train,epochs=10,batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras._impl.keras.callbacks.History at 0x20b80d37780>

In [52]:
#increase the epochs will increase the accuracy
combined_model.evaluate([description_bow_test,variety_test] + [test_embed], labels_test, batch_size=128)



[383.64589795558976, 0.06431917029231358]

In [53]:
#generate predictions
predictions = combined_model.predict([description_bow_test,variety_test] + [test_embed])

In [55]:
predictions

array([[18.173372],
       [38.892647],
       [23.121758],
       ...,
       [20.134066],
       [34.42501 ],
       [17.20518 ]], dtype=float32)

In [56]:
num_predictions = 40
diff = 0

for i in range(num_predictions):
    val = predictions[i]
    print("Description:{des}".format(des=description_test.iloc[i]))
    print("Predicted:{price}, Actual:{actual_price} \n".format(price=val[0],actual_price=labels_test.iloc[i]))

Description:H3 is Columbia Crest's line of wines from the Horse Heaven Hills AVA—the winery's home turf—and the wines seem to have a bit more cut and flair than the slightly less expensive Columbia Valley bottlings. Here you will find a more complex blend of fruits—starfruit, apple, melon and a hint of banana—and less focus on butter and toast. In other words, more of a food and sipping wine.
Predicted:18.173372268676758, Actual:15.0 

Description:Very ripe and fruity in the modern style, almost too strong in blackberry jam, black currant and oak flavors that are not particularly subtle or delicate, although the wine is thoroughly dry. The tannins are significant, but negotiable. The suggestion is ageability. Give it 2–4 years and try again.
Predicted:38.89264678955078, Actual:45.0 

Description:Notes of olive and bay leaves add some interesting complexity to the nose of this blend, otherwise dominated by typical red berry flavors. Ripe and rich on the palate with medium acidity and li

In [58]:
diff = 0

for i in range(num_predictions):
    val = predictions[i]
    diff += abs(val[0]-labels_test.iloc[i])

print("Average Prediction Difference is:{difference}".format(difference=diff/num_predictions))

Average Prediction Difference is:5.070724081993103
