In [None]:
import numpy as np
import pandas as pd

from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import preprocessing

In [None]:
# Train-to-test size ratio: 7 parts training data for every 3 parts test data.
split_ratio = 7 / 3
# Upper bound on the number of vocabulary terms kept by the vectorizer.
vocabulary_size = 12_000
# Length that every token-id sequence is padded or truncated to.
sequence_length = 170
# Number of units in the hidden dense layer of the wide model.
dense_unit_count = 256

# Data

In [None]:
# Read the raw table and keep only the description, variety and price columns.
data = pd.read_csv('data/data.csv').loc[:, ['description', 'variety', 'price']]

In [None]:
# split_ratio is the train:test size ratio r, so the training fraction is
# r / (1 + r) (7/3 -> 70% of the rows).
split = int(split_ratio / (1 + split_ratio) * len(data))
# Shuffle all rows with a fixed seed so the split is reproducible.
data = data.sample(frac=1, random_state=42)
# The first `split` rows form the training set and the remaining rows form the
# test set, so the two sets are disjoint and together cover every row.
data_train, data_test = data.iloc[:split], data.iloc[split:]

# Encoding

In [None]:
# Learn an integer id for every variety that occurs in the training split.
encoder = LabelEncoder().fit(data_train['variety'])
# Number of distinct varieties seen during fitting; used as the width of the
# variety input of the wide model.
variety_count = len(encoder.classes_)

In [None]:
# Encode varieties as integer ids and reshape them into single-column 2-D arrays.
# NOTE(review): LabelEncoder.transform raises ValueError for labels that were
# not present at fit time -- confirm every variety in data_test also occurs in
# data_train.
variety_wide_train = encoder.transform(data_train['variety']).reshape(-1, 1)
variety_wide_test = encoder.transform(data_test['variety']).reshape(-1, 1)

In [None]:
# Rebind `encoder` to a one-hot encoder fitted on the integer variety ids of
# the training split (the label encoder bound to this name is no longer
# reachable through it).
encoder = OneHotEncoder().fit(variety_wide_train)

In [None]:
# Replace the integer id columns with their one-hot encodings, train first,
# then test. The inputs are overwritten, so this cell is meant to run once
# after the encoder has been fitted.
variety_wide_train, variety_wide_test = map(
    encoder.transform,
    (variety_wide_train, variety_wide_test),
)

# Tokenization

In [None]:
# Binary bag-of-words vectorizer limited to `vocabulary_size` terms; the
# vocabulary is learned from the training descriptions only.
vectorizer = CountVectorizer(
    binary=True,
    dtype=bool,
    max_features=vocabulary_size,
).fit(data_train['description'])

In [None]:
def convert(descriptions):
    """Turn each description into a list of vocabulary indices.

    Each description is tokenized with the module-level ``analyzer`` and every
    token is looked up in ``vectorizer.vocabulary_``. Tokens that are not in
    the vocabulary are mapped to 0.

    NOTE(review): 0 can also be the index of a real vocabulary term, and it is
    the default padding value of ``pad_sequences``, so unknown tokens and
    padding look the same as that term -- confirm this is intended.

    Returns a list with one list of integer indices per description, in the
    same order as the input.
    """
    sequences = []
    for text in descriptions:
        indices = [vectorizer.vocabulary_.get(word, 0) for word in analyzer(text)]
        sequences.append(indices)
    return sequences

# Tokenizer that applies the same preprocessing as the fitted vectorizer;
# `convert` reads it as a global.
analyzer = vectorizer.build_analyzer()

# Token-id sequences for the deep branch, padded/truncated to `sequence_length`.
description_deep_train = preprocessing.sequence.pad_sequences(
    convert(data_train['description'].values),
    maxlen=sequence_length,
)
description_deep_test = preprocessing.sequence.pad_sequences(
    convert(data_test['description'].values),
    maxlen=sequence_length,
)

In [None]:
# Binary bag-of-words features for the wide branch, one matrix per split
# (train first, then test).
description_wide_train, description_wide_test = (
    vectorizer.transform(frame['description'])
    for frame in (data_train, data_test)
)

# Deep Model

# Wide Model

In [None]:
# Two inputs: the bag-of-words description vector and the one-hot variety vector.
description_inputs = layers.Input(shape=(vocabulary_size,))
variety_inputs = layers.Input(shape=(variety_count,))

# Concatenate both inputs and pass them through one hidden ReLU layer followed
# by a single linear output unit.
wide_inputs = layers.concatenate([description_inputs, variety_inputs])
wide_layer = layers.Dense(dense_unit_count, activation='relu')(wide_inputs)
wide_outputs = layers.Dense(1)(wide_layer)

wide_model = models.Model(
    inputs=[description_inputs, variety_inputs],
    outputs=wide_outputs,
)
# NOTE(review): the model has one linear output trained with MSE, which looks
# like regression; 'accuracy' is not informative for continuous targets --
# consider an error metric such as 'mae' once downstream uses are checked.
wide_model.compile(
    loss='mse',
    optimizer='adam',
    metrics=['accuracy'],
)