In [None]:
import pandas as pd

from google.datalab.ml import TensorBoard
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import callbacks
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import preprocessing

In [None]:
# Let long description strings render in full when frames are displayed.
pd.options.display.max_colwidth = 500

In [None]:
# Training loop.
batch_size, epoch_count = 128, 10

# Text representation: vocabulary cap and padded token-sequence length.
vocabulary_size = 12000
sequence_length = 170

# Layer sizes: embedding width for the sequence branch, hidden width for the wide branch.
embedding_size = 8
wide_size = 256

# Data preparation.
split_ratio = 7 / 3  # train:test ratio; turned into a fraction where the split is computed
variety_threshold = 500  # minimum number of rows a variety needs to be kept

# Data

In [None]:
# Load the raw table and keep only the text, the category and the price
# (the price is the regression target used during training).
columns = ['description', 'variety', 'price']
data = pd.read_csv('data/data.csv')[columns]
# Rows without a price cannot serve as training targets.
data = data.dropna(subset=['price'])

In [None]:
# Count rows per variety and keep the varieties that reach the threshold.
variety_counts = data['variety'].value_counts()
varieties = variety_counts.loc[variety_counts >= variety_threshold].index
# Drop every row whose variety did not make the cut.
data = data.loc[data['variety'].isin(varieties)]

In [None]:
# Shuffle once with a fixed seed so the split is reproducible.
data = data.sample(frac=1, random_state=42)
# `split_ratio` is train:test, so the training share is ratio / (1 + ratio).
split = int(split_ratio / (1 + split_ratio) * len(data))
# The two slices must be disjoint: the first `split` rows train the model and
# the remaining rows are held out. (Previously both names received
# `data[:split]`, so evaluation and inference ran on the training rows.)
data_train, data_test = data.iloc[:split], data.iloc[split:]

# Encoding

In [None]:
# Learn the variety -> integer code mapping from the training split only.
encoder = LabelEncoder().fit(data_train['variety'])
# Number of distinct varieties; sizes the variety input of the wide branch.
variety_count = encoder.classes_.shape[0]

In [None]:
# Integer-encode the variety of each split and reshape to a single column,
# the 2-D layout the one-hot encoder is fitted on next.
variety_wide_train, variety_wide_test = (
    encoder.transform(frame['variety']).reshape(-1, 1)
    for frame in (data_train, data_test)
)

In [None]:
# Fit the one-hot encoder on the integer codes of the training split.
# NOTE: this rebinds `encoder`, which held the LabelEncoder until now, so the
# label-encoding cell cannot be re-run after this one without refitting it first.
encoder = OneHotEncoder().fit(variety_wide_train)

In [None]:
# Replace the integer variety codes of both splits with their one-hot encoding.
variety_wide_train, variety_wide_test = (
    encoder.transform(codes)
    for codes in (variety_wide_train, variety_wide_test)
)

# Tokenization

In [None]:
# Bag-of-words over the training descriptions: binary presence flags stored
# as booleans, with the vocabulary capped at `vocabulary_size` entries.
vectorizer = CountVectorizer(
    max_features=vocabulary_size,
    binary=True,
    dtype=bool,
).fit(data_train['description'])

In [None]:
# Vectorize the descriptions of both splits with the vocabulary learned on
# the training split.
description_wide_train, description_wide_test = (
    vectorizer.transform(frame['description'])
    for frame in (data_train, data_test)
)

In [None]:
def convert(descriptions):
    """Turn each description into a list of vocabulary indices.

    Every description is tokenized with the module-level ``analyzer`` and each
    token is looked up in ``vectorizer.vocabulary_``. Tokens that are not in
    the vocabulary are mapped to index 0.

    NOTE(review): index 0 is presumably also a valid vocabulary index and the
    default padding value, so unknown tokens may be indistinguishable from
    both; confirm before relying on the embedding of index 0.

    Args:
        descriptions: Iterable of description strings.

    Returns:
        A list with one list of integer indices per description, in order.
    """
    sequences = []
    for description in descriptions:
        indices = []
        for token in analyzer(description):
            indices.append(vectorizer.vocabulary_.get(token, 0))
        sequences.append(indices)
    return sequences

# Reuse the vectorizer's own tokenization so the tokens match its vocabulary.
analyzer = vectorizer.build_analyzer()
# Index sequences for both splits, padded/truncated to `sequence_length`.
description_deep_train, description_deep_test = (
    preprocessing.sequence.pad_sequences(
        convert(frame['description'].values),
        maxlen=sequence_length,
    )
    for frame in (data_train, data_test)
)

# Modeling

## Deep (Actually Another Wide)

In [None]:
# Sequence branch: token indices -> embeddings -> flattened -> one linear unit.
deep_input = layers.Input(shape=(sequence_length,))
deep_embedding = layers.Embedding(
    vocabulary_size,
    embedding_size,
    input_length=sequence_length,
)(deep_input)
deep_layer = layers.Flatten()(deep_embedding)
deep_output = layers.Dense(1, activation='linear')(deep_layer)

## Wide

In [None]:
# Wide branch: one input for the bag-of-words vector and one for the one-hot
# variety, concatenated and passed through a ReLU hidden layer.
wide_input = [
    layers.Input(shape=(width,))
    for width in (vocabulary_size, variety_count)
]
wide_features = layers.concatenate(wide_input)
wide_layer = layers.Dense(wide_size, activation='relu')(wide_features)
wide_output = layers.Dense(1)(wide_layer)

## Composite

In [None]:
# Combine the single-unit outputs of both branches with one final dense unit.
branch_outputs = [deep_output, wide_output]
composite_layer = layers.concatenate(branch_outputs)
composite_output = layers.Dense(1)(composite_layer)

In [None]:
# Inputs are ordered: token sequences, bag-of-words vector, one-hot variety.
composite_model = models.Model(inputs=[deep_input] + wide_input, outputs=composite_output)
# The target is a continuous price, so track mean absolute error alongside the
# MSE loss. Classification accuracy (used previously) is not meaningful for a
# regression output.
composite_model.compile(loss='mse', optimizer='adam', metrics=['mae'])
composite_model.summary()

# Training

In [None]:
# Start the Datalab TensorBoard viewer and create the Keras callback, both
# pointed at the same log directory.
log_dir = 'output'
TensorBoard.start(log_dir)
tensorboard = callbacks.TensorBoard(log_dir)

In [None]:
# Feed the three inputs in the order the model declares them:
# token sequences, bag-of-words vector, one-hot variety.
train_inputs = [description_deep_train, description_wide_train, variety_wide_train]
composite_model.fit(
    train_inputs,
    data_train['price'],
    batch_size=batch_size,
    epochs=epoch_count,
    callbacks=[tensorboard],
)

# Evaluation

In [None]:
# Score the model on the test split, with inputs in the same order as in training.
test_inputs = [description_deep_test, description_wide_test, variety_wide_test]
composite_model.evaluate(
    test_inputs,
    data_test['price'],
    batch_size=batch_size,
)

# Inference

In [None]:
# Predict a price for every test row.
raw_predictions = composite_model.predict(
    [description_deep_test, description_wide_test, variety_wide_test]
)
# Index the flattened predictions like the test split so they can be joined to it.
prediction_frame = pd.DataFrame(
    {'prediction': raw_predictions.flatten()},
    index=data_test.index,
)
# Put the actual and predicted price side by side, followed by the inputs.
report_columns = ['price', 'prediction', 'variety', 'description']
predictions = data_test.join(prediction_frame)[report_columns]

In [None]:
# Spot-check predictions for one variety. The sample is seeded so the table is
# the same on every run, and its size is capped so the cell still works when
# fewer than 20 rows of this variety are available.
pinot_noir = predictions[predictions['variety'] == 'Pinot Noir']
pinot_noir.sample(n=min(20, len(pinot_noir)), random_state=42)