In [1]:
import itertools
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
layers = keras.layers

# This code was tested with TensorFlow v1.9
print("You have TensorFlow version", tf.__version__)

  from ._conv import register_converters as _register_converters


You have TensorFlow version 1.9.0


In [6]:
# Read in the data
data = pd.read_csv('dannon_james.csv')
# Shuffle the data
data = data.sample(frac=1)
# Print the first 5 rows
data.head()

Unnamed: 0,watchrate,taxonomy_type,tag_name,globalviewcount,videocount,avgduration,avgtotalsecondsperview,totalsecondsperview,sumtotalvideoviews,proscore
3111,201.640005,Rekognition-Celebrities,Anastasia Rodionova,28368.0,4,44.634,90,360,72,yogurt - rfg yogurt - dannon
1115,56.23981,Rekognition-Labels,Musical,62413580.0,162,403.628673,227,36835,7367,yogurt - rfg yogurt - dannon
234,63.194697,Rekognition-Labels,Rice,12384690000.0,2557,58.549217,37,96460,19292,yogurt - rfg yogurt - dannon
703,56.115021,Rekognition-Celebrities,Suzana Ćebić,587336200.0,519,90.884756,51,26625,5325,yogurt - rfg yogurt - dannon
22,53.433986,Rekognition-Labels,Word,1158450000000.0,26203,91.701937,49,1300015,260003,yogurt - rfg yogurt - dannon


In [7]:
# Do some preprocessing
data = data[pd.notnull(data['watchrate'])]
data = data[pd.notnull(data['tag_name'])]

In [8]:
# Split data into train and test
train_size = int(len(data) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data) - train_size))

Train size: 3061
Test size: 766


In [9]:
# Train features
tag_name_train = data['tag_name'][:train_size]
taxonomy_type_train = data['taxonomy_type'][:train_size]

# Train labels
labels_train = data['watchrate'][:train_size]

# Test features
tag_name_test = data['tag_name'][train_size:]
taxonomy_type_test = data['taxonomy_type'][train_size:]

# Test labels
labels_test = data['watchrate'][train_size:]

In [10]:
# Create a tokenizer to preprocess our tag names
vocab_size = 12000 # This is a hyperparameter, experiment with different values for the dataset
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)
tokenize.fit_on_texts(tag_name_train) # only fit on train

In [11]:
# Wide model feature 1: sparse bag of words (bow) vocab_size vector 
tag_name_bow_train = tokenize.texts_to_matrix(tag_name_train)
tag_name_bow_test = tokenize.texts_to_matrix(tag_name_test)

In [12]:
# Wide feature 2: one-hot vector of taxonomy types

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(taxonomy_type_train)
taxonomy_type_train = encoder.transform(taxonomy_type_train)
taxonomy_type_test = encoder.transform(taxonomy_type_test)
num_classes = np.max(taxonomy_type_train) + 1

# Convert labels to one hot
taxonomy_type_train = keras.utils.to_categorical(taxonomy_type_train, num_classes)
taxonomy_type_test = keras.utils.to_categorical(taxonomy_type_test, num_classes)

In [13]:
# Define our wide model with the functional API
bow_inputs = layers.Input(shape=(vocab_size,))
taxonomy_type_inputs = layers.Input(shape=(num_classes,))
merged_layer = layers.concatenate([bow_inputs, taxonomy_type_inputs])
merged_layer = layers.Dense(256, activation='relu')(merged_layer)
predictions = layers.Dense(1)(merged_layer)
wide_model = keras.Model(inputs=[bow_inputs, taxonomy_type_inputs], outputs=predictions)

In [14]:
wide_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
print(wide_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 12010)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 256)          3074816     concatenate[0][0]                
__________

In [15]:
# Deep model feature: word embeddings of video tag names
train_embed = tokenize.texts_to_sequences(tag_name_train)
test_embed = tokenize.texts_to_sequences(tag_name_test)

max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(
    train_embed, maxlen=max_seq_length, padding="post")
test_embed = keras.preprocessing.sequence.pad_sequences(
    test_embed, maxlen=max_seq_length, padding="post")

In [16]:
# Define our deep model with the Functional API
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
print(deep_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 170)               0         
_________________________________________________________________
embedding (Embedding)        (None, 170, 8)            96000     
_________________________________________________________________
flatten (Flatten)            (None, 1360)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1361      
Total params: 97,361
Trainable params: 97,361
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
deep_model.compile(loss='mse',
                       optimizer='adam',
                       metrics=['accuracy'])

In [21]:
# Combine wide and deep into one model
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Flatten()(merged_out)
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)
print(combined_model.summary())

combined_model.compile(loss='mse',
                       optimizer='adam',
                       metrics=['accuracy'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 170)          0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 12010)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________

In [22]:
# Run training
combined_model.fit([tag_name_train, taxonomy_type_train] + [train_embed], labels_train, epochs=10, batch_size=128)

ValueError: Error when checking input: expected input_1 to have shape (12000,) but got array with shape (1,)