In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import feature_column
from tensorflow.keras.layers import Embedding, Input, Dot, Dense, Flatten, Multiply, Concatenate, DenseFeatures
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adagrad, Adam, SGD, RMSprop
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError
import datetime

# Preprocess data

In [76]:
def preprocess_data(data_path):
    header_data = ['user_id', 'item_id', 'rating', 'timestamp']
    data = pd.read_csv(data_path, sep='\t', names=header_data)
    data = data.drop("timestamp", axis=1).astype(int)
    
    num_items = data["item_id"].max()
    num_users = data["user_id"].max()
    
    y = data["rating"]
    y = np.where(y==5, 1, 0)

    X = data.drop("rating", axis=1)
#     X = X.astype('category')
    
    return X, y, num_items, num_users

In [77]:
# Read data
# User data
header_user = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
data_user = pd.read_csv('data/u.user', sep='|', names=header_user)
data_user = data_user.drop(['zip_code'], axis=1)
# data_user = data_user.astype('category')
# data_user["age"] = data_user["age"].astype('int')

# Item data
header_item = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
        'Thriller', 'War', 'Western']
data_item = pd.read_csv('data/u.item', sep='|', names=header_item, encoding = "ISO-8859-1")
data_item = data_item.drop(['release_date', 'title', 'video_release_date', 'IMDb_URL'], axis=1)
# data_item = data_item.astype('category')

# User Item interaction data
X_train, y_train, num_items, num_users = preprocess_data("data/ub.base")
X_test , y_test, _, _ = preprocess_data("data/ub.test")


In [78]:
# Data merge and label separation
X_train = X_train.merge(data_user, how="left", left_on="user_id", right_on="user_id", suffixes=(False, False))
X_train = X_train.merge(data_item, left_on="item_id", right_on="item_id")
X_test = X_test.merge(data_user, left_on="user_id", right_on="user_id")
X_test = X_test.merge(data_item, left_on="item_id", right_on="item_id")
X_train = X_train.drop(["user_id", "item_id"], axis=1)
X_test = X_test.drop(["user_id", "item_id"], axis=1)

In [79]:
X_train.head()

Unnamed: 0,age,gender,occupation,unknown,Action,Adventure,Animation,Children,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,24,M,technician,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,53,F,other,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,33,F,other,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,42,M,executive,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,53,M,lawyer,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(90570, 22) (90570,) (9430, 22) (9430,)


In [81]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(x, y, shuffle=True, batch_size=32):
  x = x.copy()
  ds = tf.data.Dataset.from_tensor_slices((dict(x), y))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(x))
  ds = ds.batch(batch_size)
  return ds

In [82]:
batch_size = 32 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(X_train, y_train, batch_size=batch_size)
val_ds = df_to_dataset(X_test, y_test, shuffle=False, batch_size=batch_size)

In [83]:
for feature_batch, label_batch in train_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of ages:', feature_batch['age'])
  print('A batch of Animation:', feature_batch['Animation'])
  print('A batch of targets:', label_batch )

Every feature: ['age', 'gender', 'occupation', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
A batch of ages: tf.Tensor(
[26 50 20 51 44 60 44 40 42 27 26 30 19 27 20 52 24 30 60 21 25 43 36 22
 44 25 31 18 19 22 57 32], shape=(32,), dtype=int64)
A batch of Animation: tf.Tensor([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(32,), dtype=int64)
A batch of targets: tf.Tensor([0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(32,), dtype=int64)


# Feature Engineering

In [11]:
# Numeric based feature
numeric = {
    'age' : tf.feature_column.numeric_column("age"),
}

# Category based feature
# Binary category feature
category = {
        categorical_feature: tf.feature_column.categorical_column_with_identity(categorical_feature,num_buckets=2)
        for categorical_feature in ['unknown','Action', 'Adventure', 'Animation', 'Children', 
                                    'Comedy', 'Crime','Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
                                    'Horror', 'Musical','Mystery', 'Romance', 'Sci-Fi', 'Thriller', 
                                    'War', 'Western'] 
}
# Non-binary category feature
category.update({
        'occupation': tf.feature_column.categorical_column_with_vocabulary_list(
            'occupation', vocabulary_list=['technician', 'executive', 'administrator', 'programmer',
                                           'marketing', 'student', 'artist', 'engineer', 'librarian',
                                           'educator', 'other', 'scientist', 'homemaker', 'salesman',
                                           'healthcare', 'entertainment', 'retired', 'writer', 'none',
                                           'lawyer', 'doctor']),
        'gender': tf.feature_column.categorical_column_with_vocabulary_list('gender', vocabulary_list=['M', 'F']),
        'age_buckets': tf.feature_column.bucketized_column(numeric["age"], boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
})

inputs = {
    colname : tf.keras.layers.Input(name=colname, shape=(), dtype='int8') 
          for colname in numeric.keys()
}
inputs.update({
    colname : tf.keras.layers.Input(name=colname, shape=(), dtype='int8') 
          for colname in ['unknown','Action', 'Adventure', 'Animation', 'Children', 
                                    'Comedy', 'Crime','Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
                                    'Horror', 'Musical','Mystery', 'Romance', 'Sci-Fi', 'Thriller', 
                                    'War', 'Western']
})
inputs.update({
    colname : tf.keras.layers.Input(name=colname, shape=(), dtype='string') 
          for colname in ["occupation", "gender", "age_buckets"]
})

In [12]:
wide = {col_name: column for col_name, column in category.items()}
wide.update({
    "agebuckets_occupation": tf.feature_column.crossed_column([category["age_buckets"], category['occupation']], 10*21),
    "gender_occupation": tf.feature_column.crossed_column([category["gender"], category['occupation']], 2*21)
})
# one-hot encode the sparse columns
wide = {
    colname : tf.feature_column.indicator_column(col)
          for colname, col in wide.items()
}

In [13]:
wide.keys()

dict_keys(['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'occupation', 'gender', 'age_buckets', 'agebuckets_occupation', 'gender_occupation'])

In [14]:
deep = {
    f"embed_{colname}": tf.feature_column.embedding_column(col, 10) for colname, col in category.items()
}
deep["age"] = numeric["age"]

In [15]:
deep.keys()

dict_keys(['embed_unknown', 'embed_Action', 'embed_Adventure', 'embed_Animation', 'embed_Children', 'embed_Comedy', 'embed_Crime', 'embed_Documentary', 'embed_Drama', 'embed_Fantasy', 'embed_Film-Noir', 'embed_Horror', 'embed_Musical', 'embed_Mystery', 'embed_Romance', 'embed_Sci-Fi', 'embed_Thriller', 'embed_War', 'embed_Western', 'embed_occupation', 'embed_gender', 'embed_age_buckets', 'age'])

# Get the model

In [16]:
DNN_HIDDEN_UNITS = '64,32'

In [17]:
# Build a wide-and-deep model.
def wide_and_deep_classifier(inputs, wide_columns, deep_columns, dnn_hidden_units):
    deep = tf.keras.layers.DenseFeatures(deep_columns, name='deep_inputs')(inputs)
    layers = [int(x) for x in dnn_hidden_units.split(',')]
    for layerno, numnodes in enumerate(layers):
        deep = tf.keras.layers.Dense(numnodes, activation='relu', name='dnn_{}'.format(layerno+1))(deep)        
    wide = tf.keras.layers.DenseFeatures(wide_columns, name='wide_inputs')(inputs)
    both = tf.keras.layers.concatenate([deep, wide], name='both')
    output = tf.keras.layers.Dense(1, activation='sigmoid', name='pred')(both)
    model = tf.keras.Model(inputs, output)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [18]:
model = wide_and_deep_classifier(
    inputs,
    wide_columns = wide.values(),
    deep_columns = deep.values(),
    dnn_hidden_units = DNN_HIDDEN_UNITS)

In [26]:
wide

{'unknown': IndicatorColumn(categorical_column=IdentityCategoricalColumn(key='unknown', number_buckets=2, default_value=None)),
 'Action': IndicatorColumn(categorical_column=IdentityCategoricalColumn(key='Action', number_buckets=2, default_value=None)),
 'Adventure': IndicatorColumn(categorical_column=IdentityCategoricalColumn(key='Adventure', number_buckets=2, default_value=None)),
 'Animation': IndicatorColumn(categorical_column=IdentityCategoricalColumn(key='Animation', number_buckets=2, default_value=None)),
 'Children': IndicatorColumn(categorical_column=IdentityCategoricalColumn(key='Children', number_buckets=2, default_value=None)),
 'Comedy': IndicatorColumn(categorical_column=IdentityCategoricalColumn(key='Comedy', number_buckets=2, default_value=None)),
 'Crime': IndicatorColumn(categorical_column=IdentityCategoricalColumn(key='Crime', number_buckets=2, default_value=None)),
 'Documentary': IndicatorColumn(categorical_column=IdentityCategoricalColumn(key='Documentary', number

In [86]:
history = model.fit(x=X_train,
                    y=y_train,
                    validation_data = (X_test, y_test),
                    epochs=50, 
                    steps_per_epoch=20
                   )

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [70]:
test = X_train.astype("category")
test = test.drop(["age"], axis=1)
# test = test[["age","occupation"]]

In [71]:
test.dtypes

gender         category
occupation     category
unknown        category
Action         category
Adventure      category
Animation      category
Children       category
Comedy         category
Crime          category
Documentary    category
Drama          category
Fantasy        category
Film-Noir      category
Horror         category
Musical        category
Mystery        category
Romance        category
Sci-Fi         category
Thriller       category
War            category
Western        category
dtype: object

In [72]:
tf.convert_to_tensor(test)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [None]:
X_train.dtypes

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [None]:
import pathlib

dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
csv_file = 'datasets/petfinder-mini/petfinder-mini.csv'

tf.keras.utils.get_file('petfinder_mini.zip', dataset_url,
                        extract=True, cache_dir='.')
dataframe = pd.read_csv(csv_file)

In [None]:
# In the original dataset "4" indicates the pet was not adopted.
dataframe['target'] = np.where(dataframe['AdoptionSpeed']==4, 0, 1)

# Drop un-used columns.
dataframe = dataframe.drop(columns=['AdoptionSpeed', 'Description'])

In [None]:
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

In [None]:
train.dtypes

In [None]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('target')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds