In [1]:
from collections import Counter
from itertools import combinations
import random
import numpy as np
import pandas as pd
import tensorflow
from tensorflow.keras.layers import Concatenate, Dense, Dot, Dropout, Embedding, Input, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import Callback, ModelCheckpoint

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Setting random seeds to replicate results easily
random.seed(0)
np.random.seed(0)
tensorflow.random.set_seed(0)

# Root Mean Squared Error (RMSE)
In this tutorial, RMSE is used to evaluate the performance of a model. A model that reports lower RMSE indicates that it is a better model and vice versa.

In [2]:
def rmse(pred, actual):
    '''
    params:
        pred <np.array>: an array containing all predicted ratings
        actual <np.array>: an array containing all ground truth ratings

    return:
        a scalar whose value is the rmse
    '''
    # Ignore ratings with value zero.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return np.sqrt(mean_squared_error(pred, actual))

# Wide and Deep Learning (WDL) Model Implementation
The Wide and Deep Learning Model can be divided into two parts, the "wide" part and the "deep" part.

**The wide component** is a generalized linear model that takes in the raw input features and the cross-product transformation of categorical features, which enables it to **learn the co-occurrence patterns of items or features**.

**The deep component** is a Feed-forward Neural Network (FNN) which takes in both continuous and categorical features as input. Specifically, the normalized values of continuous features are concatenated with the low-dimensional dense embedding vectors converted from categorical features. This concatenated vector is then fed into the FNN during each foward pass. This mechanism tends to **increase the diversity of recommendations**.

In [3]:
def build_wdl_model(len_continuous, deep_vocab_lens, len_wide, embed_size):
    '''
    params:
        len_continuous: number of continuous features
        deep_vocab_lens: an array of integers where deep_vocab_lens[i] represents the number of unique values of (i+1)-th deep categorical feature
        len_wide: number of wide features
        embed_size: dimension of the embedding vectors of deep categorical features

    return:
        a keras Model object for the constructed wdl model 
    '''
    # A list containing all input layers
    input_list = []
    
    # Input layer for continuous features
    continuous_input = Input(shape=(len_continuous,), dtype='float32', name='continuous_input')
    input_list.append(continuous_input)

    # Get embeddings for all deep categorical features
    emb_list = []
    for vocab_size in deep_vocab_lens:
        _input = Input(shape=(1,), dtype='int32')
        input_list.append(_input)
        _emb = Embedding(output_dim=embed_size, input_dim=vocab_size, input_length=1)(_input)
        _emb = Reshape((embed_size,))(_emb)
        emb_list.append(_emb)

    # Create input layer for deep component by concatenating the embeddings and continuous features' input layer
    deep_input = Concatenate()(emb_list + [continuous_input])

    # Construct deep component
    dense_1 = Dense(256, activation='relu')(deep_input)
    dense_1_dp = Dropout(0.3)(dense_1)
    dense_2 = Dense(128, activation='relu')(dense_1_dp)
    dense_2_dp = Dropout(0.3)(dense_2)
    dense_3 = Dense(64, activation='relu')(dense_2_dp)
    dense_3_dp = Dropout(0.3)(dense_3)

    # Create input layer for wide component
    wide_input = Input(shape=(len_wide,), dtype='float32')
    input_list.append(wide_input)

    # Concatenate the outputs of deep and wide components and feed the concatenated vector into the finall fully connected layer
    fc_input = Concatenate()([dense_3_dp, wide_input])
    model_output = Dense(1)(fc_input)

    model = Model(inputs=input_list, outputs=model_output)
    return model

# Data Preprocessing
Below are utility functions that helps us retrieve the numerical values of different features from the dataset, and generate combinations of features to be used by the WDL model.

### Retrieving Continuous Features

In [4]:
def get_continuous_features(df, continuous_columns):
    '''
    params:
        df: input dataframe
        continuous_columns: column names of continuous features

    return: 
        a numpy array where each row contains the values of continuous features in the corresponding row of the input dataframe
    '''
    continuous_features = df[continuous_columns].values
    return continuous_features

### Categorical Features Cross Product Transformation
This function is used to generate a variety of feature combinations that occurred frequently in the dataset.

For example, the following items occurred frequently in the dataset:

|Item Name|Occurrence|
|---|---|
|A|4|
|B|3|
|C|2|
|D|1|

If we set `topk=3`, it means that we will only consider the top 3 items with the highest occurrence to generate combinations.

If we set `comb_p=2`, it means that we will generate combinations with 2 items in each generated combinations.

In this case, the following combinations will be generated: `[('A', 'B'), ('A', 'C'), ('B', 'C')]`

Test code: `get_top_k_p_combinations(pd.DataFrame({'item_categories': ['A, B, C, D', 'A, B, C', 'A, B', 'A']}), comb_p=2, topk=3, output_freq=False)`

In [5]:
def get_top_k_p_combinations(df, comb_p, topk, output_freq=False):
    '''
    params:
        df: input dataframe
        comb_p: number of elements in each combination (e.g., there are two elements in the combination {fried chicken, chicken and waffle}, and three elements in the combination {fried chicken, chicken and waffle, chicken fried rice})
        topk: number of most frequent combinations to retrieve
        output_freq: whether to return the frequencies of retrieved combinations

    return:
        1. output_freq = True: a list X where each element is a tuple containing a combination tuple and corresponding frequency, and the elements are stored in the descending order of their frequencies
        2. output_freq = False: a list X where each element is a tuple containing a combination tuple, and the elements are stored in the descending order of their frequencies
    '''
    def get_category_combinations(categories_str, comb_p=2):
        categories = categories_str.split(', ')
        return list(combinations(categories, comb_p))
    all_categories_p_combos = df["item_categories"].apply(lambda x: get_category_combinations(x, comb_p)).values.tolist()
    all_categories_p_combos = [tuple(t) for item in all_categories_p_combos for t in item]
    tmp = dict(Counter(all_categories_p_combos))
    sorted_categories_combinations = list(sorted(tmp.items(), key=lambda x: x[1], reverse=True))
    if output_freq:
        return sorted_categories_combinations[:topk]
    else:
        return [t[0] for t in sorted_categories_combinations[:topk]]

### Building Wide Features

In [6]:
def get_wide_features(df, selected_categories_to_idx, top_combinations):
    '''
    params:
        df: input dataframe
        selected_categories_to_idx: a dictionary mapping item categories to corrresponding integral indices
        top_combinations: a list containing retrieved mostly frequent combinantions of item categories

    return:
        a numpy array where each row contains the categorical features' binary encodings and cross product transformations for the corresponding row of the input dataframe
    '''
    def categories_to_binary_output(categories):
        binary_output = [0 for _ in range(len(selected_categories_to_idx))]
        for category in categories.split(', '):
            if category in selected_categories_to_idx:
                binary_output[selected_categories_to_idx[category]] = 1
            else:
                binary_output[0] = 1
        return binary_output
    def categories_cross_transformation(categories):
        current_category_set = set(categories.split(', '))
        corss_transform_output = [0 for _ in range(len(top_combinations))]
        for k, comb_k in enumerate(top_combinations):
            if len(current_category_set & comb_k) == len(comb_k):
                corss_transform_output[k] = 1
            else:
                corss_transform_output[k] = 0
        return corss_transform_output

    category_binary_features = np.array(df.item_categories.apply(lambda x: categories_to_binary_output(x)).values.tolist())
    category_cross_transform_features = np.array(df.item_categories.apply(lambda x: categories_cross_transformation(x)).values.tolist())
    return np.concatenate((category_binary_features, category_cross_transform_features), axis=1)

# Ratings Prediction

### Loading train, validation and test rating tables

In [7]:
tr_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/valid.csv")
te_df = pd.read_csv("data/test.csv")

tr_ratings = tr_df['stars'].values
val_ratings = val_df['stars'].values

### Loading content feautures tables of users and items

In [8]:
user_df = pd.read_csv("data/user.csv", index_col=0)
item_df = pd.read_csv("data/business.csv", index_col=0)

# Renaming columns by adding prefixes to column names
user_df = user_df.rename(index=str, columns={t: 'user_' + t for t in user_df.columns if t != 'user_id'})
item_df = item_df.rename(index=str, columns={t: 'item_' + t for t in item_df.columns if t != 'business_id'})

### Expanding the table by using user_id and business_id
Expanding the train, valiation and test dataset by using `user_id` and `business_id` to query more features from `user_df` and `item_df`.

In [9]:
tr_df = pd.merge(pd.merge(tr_df, user_df, on='user_id'), item_df, on='business_id').reset_index(drop=True)
val_df = pd.merge(pd.merge(val_df, user_df, on='user_id'), item_df, on='business_id').reset_index(drop=True)
te_df = pd.merge(pd.merge(te_df, user_df, on='user_id'), item_df, on='business_id').reset_index(drop=True)

### Preparing continuous features

In [10]:
# Specify the columns containing conitnuous features
continuous_columns = ["user_average_stars", "user_cool", "user_fans", 
                      "user_review_count", "user_useful", "user_funny",
                      "item_is_open", "item_latitude", "item_longitude", 
                      "item_review_count", "item_stars"]

# Get values of continous features for train/validation/test sets using the utility function defined previously

tr_continuous_features = get_continuous_features(tr_df, continuous_columns)
val_continuous_features = get_continuous_features(val_df, continuous_columns)
te_continuous_features = get_continuous_features(te_df, continuous_columns)

# Standardize each feature by removing the mean of the training samples and scaling to unit variance.
# See https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html for more details.
scaler = StandardScaler().fit(tr_continuous_features)

tr_continuous_features = scaler.transform(tr_continuous_features)
val_continuous_features = scaler.transform(val_continuous_features)
te_continuous_features = scaler.transform(te_continuous_features)

### Preparing deep categorical features

In [11]:
# Sepcify column names of deep categorical features
item_deep_columns = ["item_city", "item_postal_code", "item_state"]

# An array of integers where deep_vocab_lens[i] represents the number of unique values of (i+1)-th deep categorical feature
# Transforming words into indices for each categorical columns
item_deep_vocab_lens = []
for col_name in item_deep_columns:
    # Getting unique values of this deep categorical feature
    unique_values = item_df[col_name].unique()
    
    # Creating a dictionary to map from unique values to the corresponding index
    vocab = dict(zip(unique_values, range(1, len(unique_values)+1)))
    
    # Getting the number of unique values of this deep categorical features
    item_deep_vocab_lens.append(len(vocab)+1)
    
    # Creating a new column where each entry stores the index of this deep categorical feature's value in the same row
    item_df[col_name + "_idx"] = item_df[col_name].apply(lambda x: vocab[x])


# Creating a dictionary mapping each business id to corresponding values of deep categorical features ('business_id' -> ['item_city_idx', 'item_postal_code_idx', 'item_state_idx'] in this case)
item_deep_idx_columns = [t + "_idx" for t in item_deep_columns]
item_to_deep_categorical_features = dict(zip(item_df['business_id'].values, item_df[item_deep_idx_columns].values.tolist()))

# Creating numpy arrays storing corresponding deep categorical features' values of train/validation/test sets using the above mapping
tr_deep_categorical_features = np.array(tr_df['business_id'].apply(lambda x: item_to_deep_categorical_features[x]).values.tolist())
val_deep_categorical_features = np.array(val_df['business_id'].apply(lambda x: item_to_deep_categorical_features[x]).values.tolist())
te_deep_categorical_features = np.array(te_df['business_id'].apply(lambda x: item_to_deep_categorical_features[x]).values.tolist())

### Preparing wide features

##### Preparing binary encoding for each selected category

In [12]:
# Collect the categories of all items 
all_categories = [category for category_list in item_df.item_categories.values for category in category_list.split(", ")]

# Sort all unique values of the item categories by their frequencies in descending order
category_sorted = sorted(Counter(all_categories).items(), key=lambda x: x[1], reverse=True)

# Select top 500 most frequent categories
selected_categories = [t[0] for t in category_sorted[:500]]

# Create a dictionary mapping each secleted category to a unique integral index
selected_categories_to_idx = dict(zip(selected_categories, range(1, len(selected_categories) + 1)))

# Map all categories unseen in the item df to index 0
selected_categories_to_idx['unk'] = 0

# Create a dictionary mapping each integral index to corresponding category
idx_to_selected_categories = {val: key for key, val in selected_categories_to_idx.items()}

##### Preparing cross product transformation for categories

In [13]:
# Get most frequent categories combinantions using the utility function defined previously and store them in the folloing list
top_combinations = []

# Get top 50 most frequent two-categories combinantions in the train set

top_combinations += get_top_k_p_combinations(tr_df, 2, 50, output_freq=False)

# Get top 30 most frequent three-categories combinantions in the train set
top_combinations += get_top_k_p_combinations(tr_df, 3, 30, output_freq=False)

# Get top 20 most frequent four-categories combinantions in the train set
top_combinations += get_top_k_p_combinations(tr_df, 4, 20, output_freq=False)

# Convert each combinantion in the list to a set data structure
top_combinations = [set(t) for t in top_combinations]

In [14]:
# Getting values of wide features for train/validation/test sets using the utility function defined previously
# The following matrices should have a shape of (n_samples, len(selected_categories_to_idx)+len(top_combinations))
tr_wide_features = get_wide_features(tr_df, selected_categories_to_idx, top_combinations)
val_wide_features = get_wide_features(val_df, selected_categories_to_idx, top_combinations)
te_wide_features = get_wide_features(te_df, selected_categories_to_idx, top_combinations)

### Concatenating continuous features, deep categorical features and wide features as an input list

In [15]:
tr_features = []
tr_features.append(tr_continuous_features)
tr_features += [tr_deep_categorical_features[:, i] for i in range(tr_deep_categorical_features.shape[1])]
tr_features.append(tr_wide_features)

val_features = []
val_features.append(val_continuous_features)
val_features += [val_deep_categorical_features[:, i] for i in range(val_deep_categorical_features.shape[1])]
val_features.append(val_wide_features)

te_features = []
te_features.append(te_continuous_features)
te_features += [te_deep_categorical_features[:, i] for i in range(te_deep_categorical_features.shape[1])]
te_features.append(te_wide_features)

### Building the WDL

In [16]:
wdl_model = build_wdl_model(len(tr_continuous_features[0]), item_deep_vocab_lens,  len(tr_wide_features[0]), embed_size=100)

2022-05-06 04:35:40.087537: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory
2022-05-06 04:35:40.089698: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-05-06 04:35:40.090155: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Train the model using Adagrad optimizer and mean squared error loss

In [17]:
wdl_model.compile(optimizer='adagrad', loss='mse')

history = wdl_model.fit(
        tr_features, 
        tr_ratings, 
        epochs=1, verbose=1, callbacks=[ModelCheckpoint('models/model.h5')])



### Evaluate the model on train and validation sets using RMSE¶

In [18]:
y_pred = wdl_model.predict(tr_features)
print("TRAIN RMSE: ", rmse(y_pred, tr_ratings))
y_pred = wdl_model.predict(val_features)
print("VALID RMSE: ", rmse(y_pred, val_ratings))

TRAIN RMSE:  1.238755676383003
VALID RMSE:  1.2352477734807417
