In [None]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import layers
from tensorflow.keras import models

In [None]:
# Train-to-test odds: 7 parts of the rows go to training for every 3 parts held out.
split_ratio = 7 / 3
# Cap on the bag-of-words vocabulary (and therefore the width of the description input).
vocabulary_size = 12_000
# Number of units in the hidden dense layer of the wide model.
dense_unit_count = 256

# Data

In [None]:
# Read the raw CSV and keep only the three columns the notebook works with,
# in this fixed order.
data = pd.read_csv('data/data.csv').loc[:, ['description', 'variety', 'price']]

In [None]:
# Turn the train:test odds into a row count: ratio / (1 + ratio) is the training
# fraction (7/3 -> 0.7), truncated to a whole number of rows.
split = int(split_ratio / (1 + split_ratio) * len(data))
# Shuffle all rows with a fixed seed so the partition is reproducible.
data = data.sample(frac=1, random_state=42)
# The first `split` shuffled rows are for training; the remaining rows are held out.
# Both halves used to be sliced with `[:split]`, which made the test set an exact
# copy of the training set.
data_train, data_test = data.iloc[:split], data.iloc[split:]
# NOTE(review): the held-out rows can contain `variety` values that never occur in
# `data_train`; confirm that the label encoding applied to `data_test` tolerates
# unseen classes.

# Encoding

In [None]:
# Learn an integer id for every variety that appears in the training rows.
encoder = LabelEncoder().fit(data_train['variety'])
# Number of distinct varieties the encoder learned.
variety_count = encoder.classes_.shape[0]

In [None]:
# Map variety names to integer ids and reshape each result into a single-column
# 2-D array (one row per sample).
variety_train, variety_test = (
    encoder.transform(frame['variety']).reshape(-1, 1)
    for frame in (data_train, data_test)
)

In [None]:
# Rebind `encoder` to a one-hot encoder fitted on the training variety ids.
# `fit` returns the encoder itself, so construction and fitting can be chained.
encoder = OneHotEncoder().fit(variety_train)

In [None]:
# Replace the integer variety ids with their one-hot encoding, training split first.
variety_train, variety_test = (
    encoder.transform(variety_ids)
    for variety_ids in (variety_train, variety_test)
)

# Tokenization

In [None]:
# Bag-of-words vectorizer producing boolean presence/absence features, with the
# vocabulary capped at `vocabulary_size` terms and learned from the training
# descriptions only.
vectorizer = CountVectorizer(
    max_features=vocabulary_size,
    binary=True,
    dtype=bool,
).fit(data_train['description'])

In [None]:
# Vectorize the description text of both splits with the fitted vocabulary,
# training split first.
description_train, description_test = (
    vectorizer.transform(frame['description'])
    for frame in (data_train, data_test)
)

# Wide Model

In [None]:
# Two inputs: one vector of width `vocabulary_size` for the description and one
# vector of width `variety_count` for the variety.
description_inputs = layers.Input(shape=(vocabulary_size,))
variety_inputs = layers.Input(shape=(variety_count,))
# Concatenate both inputs into one wide feature vector.
wide_inputs = layers.concatenate([description_inputs, variety_inputs])
# One hidden ReLU layer followed by a single linear output unit.
wide_layer = layers.Dense(dense_unit_count, activation='relu')(wide_inputs)
wide_outputs = layers.Dense(1)(wide_layer)
wide_model = models.Model(inputs=[description_inputs, variety_inputs], outputs=wide_outputs)
# The model emits one continuous value and is trained with an MSE loss, i.e. it is
# a regressor. Classification accuracy is not meaningful for that, so mean absolute
# error is tracked instead.
wide_model.compile(loss='mse', optimizer='adam', metrics=['mae'])