# Read in and prepare the data

In [None]:
!gdown --id '1XG6tpnEmp74aD0KR6hI80wsX5KkC1Ixc' # reviews
!gdown --id '1DCu90-aikc9TP1wTo1h2_CVSRxYLkvyY' # listings

Downloading...
From: https://drive.google.com/uc?id=1XG6tpnEmp74aD0KR6hI80wsX5KkC1Ixc
To: /content/customers_final_version.csv
42.8MB [00:00, 91.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1DCu90-aikc9TP1wTo1h2_CVSRxYLkvyY
To: /content/listings_final_version.csv
22.5MB [00:00, 137MB/s]


In [2]:
import pandas as pd
import numpy as np

In [None]:
listings=pd.read_csv("listings_final_version.csv")
customers=pd.read_csv("customers_final_version.csv")

In [None]:
customers.polarity_score.describe()

count    130602.000000
mean          0.757119
std           0.314819
min          -0.995800
25%           0.680800
50%           0.875000
75%           0.950400
max           0.999700
Name: polarity_score, dtype: float64

In [None]:
df_pair[df_pair.host_is_superhost == 0].polarity_score.mean()

0.7289552548024946

In [None]:
df_pair[df_pair.host_is_superhost == 1].polarity_score.mean()

0.8193620239796193

In [None]:
df_pair.host_is_superhost.value_counts()

0    87558
1    32194
Name: host_is_superhost, dtype: int64

In [None]:
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12222 entries, 0 to 12221
Data columns (total 68 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              12222 non-null  int64  
 1   description                     12222 non-null  object 
 2   host_since                      12222 non-null  object 
 3   host_is_superhost               12222 non-null  int64  
 4   latitude                        12222 non-null  float64
 5   longitude                       12222 non-null  float64
 6   accommodates                    12222 non-null  int64  
 7   bathrooms                       12222 non-null  float64
 8   bedrooms                        12222 non-null  float64
 9   beds                            12222 non-null  float64
 10  amenities                       12222 non-null  object 
 11  price                           12222 non-null  float64
 12  minimum_nights                  

In [None]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130602 entries, 0 to 130601
Data columns (total 14 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   reviewer_id                  130602 non-null  int64  
 1   listing_id                   130602 non-null  int64  
 2   date                         130602 non-null  object 
 3   comments                     130602 non-null  object 
 4   price                        130602 non-null  float64
 5   review_scores_rating         130602 non-null  float64
 6   review_scores_accuracy       130602 non-null  float64
 7   review_scores_cleanliness    130602 non-null  float64
 8   review_scores_checkin        130602 non-null  float64
 9   review_scores_communication  130602 non-null  float64
 10  review_scores_location       130602 non-null  float64
 11  review_scores_value          130602 non-null  float64
 12  polarity_score               130602 non-null  float64
 13 

In [None]:
listings.columns

Index(['id', 'description', 'host_since', 'host_is_superhost', 'latitude',
       'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'amenities', 'price', 'minimum_nights', 'maximum_nights',
       'maximum_nights_avg_ntm', 'number_of_reviews_ltm', 'instant_bookable',
       'calculated_host_listings_count', 'nearby_sites_rating',
       'nearest_station_dist', 'nearest_site_dist', 'nearest_site_rating',
       'host_response_time_cross_rate', 'email', 'phone', 'reviews', 'jumio',
       'offline_government_id', 'selfie', 'government_id', 'identity_manual',
       'work_email', 'manual_online', 'manual_offline', 'x0_Central Region',
       'x0_East Region', 'x0_North Region', 'x0_North-East Region',
       'x0_West Region', 'x0_apartment', 'x0_bed and breakfast', 'x0_boat',
       'x0_boutique hotel', 'x0_bungalow', 'x0_condominium', 'x0_guest suite',
       'x0_hostel', 'x0_hotel', 'x0_house', 'x0_loft', 'x0_other',
       'x0_townhouse', 'x0_Entire home/apt', 'x0_Ho

In [None]:
# Combine description and amenties.
listings['dtld_descr'] = listings['description'].str.cat(listings['amenities_list'].apply(lambda x:' '.join(x[1:-1].split(',')).replace("'",'')))

# Hybrid Recommender

1. Baseline linear model
2. Random Forest
3. XGBoost
4. Deep Neural Network

*Side note:* <br>
Tensorflow version is 1.15. <br>
*Lazy* approach. Hard to maintain but efficient.


## Data Preparation

In [None]:
!pip install tensorflow==1.15
import os
import tensorflow as tf
import tensorflow_hub as hub

In [None]:
# Determine csv and label columns
num_factor = 10
text_feat = 100

NON_FACTOR_COLUMNS = ["preference",'listing_id',
       'host_is_superhost', 'latitude',
       'longitude','price','number_of_reviews_ltm','calculated_host_listings_count','nearby_sites_rating',
       'nearest_station_dist', 'host_response_time_cross_rate']
FACTOR_COLUMNS1 = ["user_factor_{}".format(i) for i in range(num_factor)] + ["item_factor_{}".format(i) for i in range(num_factor)]
FACTOR_COLUMNS2 = ["text_factor_{}".format(i) for i in range(text_feat)] 
CSV_COLUMNS = NON_FACTOR_COLUMNS + FACTOR_COLUMNS1 + FACTOR_COLUMNS2
LABEL_COLUMN = "preference"

# Set default values for each CSV column
NON_FACTOR_DEFAULTS = [[0.0],['Unknown'],[0],[0.0],[0.0],[0.0],[0.0],[0.0],[0.0],[0.0],[0.0]]
FACTOR_DEFAULTS1 = [[0.0] for i in range(num_factor)] + [[0.0] for i in range(num_factor)] # user and item
FACTOR_DEFAULTS2 = [[0.0] for i in range(text_feat)]
DEFAULTS = NON_FACTOR_DEFAULTS + FACTOR_DEFAULTS1 + FACTOR_DEFAULTS2

In [None]:
# Text manipulation
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english',max_features=text_feat,ngram_range=(1,1))

tfidf_matrix = tfidf.fit_transform(listings['dtld_descr'])

tfidf_matrix.shape

(12222, 100)

In [None]:
df_pair = customers.drop(['price'],axis=1).merge(listings,left_on='listing_id',right_on='id')

avg_review = (df_pair.review_scores_rating/100 + (df_pair.review_scores_accuracy + df_pair.review_scores_cleanliness + df_pair.review_scores_checkin + \
df_pair.review_scores_communication + df_pair.review_scores_location + df_pair.review_scores_value)/10)/7

df_pair['preference'] = (df_pair['polarity_score']+1)*avg_review/2
df_pair = df_pair[['dtld_descr']+NON_FACTOR_COLUMNS]

In [None]:
df_pair.nunique()

dtld_descr                         6572
preference                        66281
listing_id                         7006
host_is_superhost                     2
latitude                           4217
longitude                          4721
price                               477
number_of_reviews_ltm                92
calculated_host_listings_count      139
nearby_sites_rating                6829
nearest_station_dist               6829
host_response_time_cross_rate       139
dtype: int64

In [None]:
df_pair.describe()

Unnamed: 0,reviewer_id,preference,listing_id,host_is_superhost,latitude,longitude,price,number_of_reviews_ltm,calculated_host_listings_count,nearby_sites_rating,nearest_station_dist,host_response_time_cross_rate
count,119752.0,119752.0,119752.0,119752.0,119752.0,119752.0,119752.0,119752.0,119752.0,119752.0,119752.0,119752.0
mean,96154530.0,0.825014,17252740.0,0.268839,1.311528,103.857428,135.892453,19.508927,15.305239,1.994652,0.493154,3.148956
std,84046370.0,0.161178,10440450.0,0.443358,0.027086,0.039913,129.097117,22.354198,30.487381,1.064402,0.371672,1.138508
min,277.0,0.00164,49091.0,0.0,1.24387,103.68746,14.0,0.0,1.0,0.84735,0.003055,0.0
25%,27917300.0,0.772561,8313733.0,0.0,1.29645,103.840018,65.0,3.0,2.0,1.174453,0.250401,2.79
50%,69172220.0,0.872043,16525300.0,0.0,1.30936,103.85275,99.0,12.0,6.0,1.589049,0.403779,3.8
75%,147190400.0,0.931327,24168250.0,1.0,1.31593,103.88292,169.0,29.0,14.0,2.455412,0.59405,4.0
max,385047300.0,0.99985,47282300.0,1.0,1.45379,103.97397,5000.0,191.0,352.0,7.146364,3.350734,4.0


In [None]:
rev_uniq = df_pair.reviewer_id.unique()
lst_uniq = df_pair.listing_id.unique()

In [None]:
!pip install surprise
from surprise import SVD, Reader, Dataset
from surprise.model_selection import KFold

reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(df_pair[['reviewer_id', 'listing_id', 'preference']], reader)

SVD = SVD(n_factors=10)

SVD.fit(data.build_full_trainset())

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 5.7MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617572 sha256=c750425766c940ada1d62da0b6990bac661e66a1bc1655fb864fbe917cf6c196
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff1ba5e6710>

In [None]:
item_vec = SVD.qi
user_vec = SVD.pu

print(item_vec.shape,user_vec.shape)

(7006, 10) (106648, 10)


In [None]:
df_cross = df_pair.copy()

item_feats = []
user_feats = []

for (id, pair) in df_cross.iterrows():
  item_feats = item_feats + [item_vec[np.where(lst_uniq == pair.listing_id)[0][0]]]
  user_feats = user_feats + [user_vec[np.where(rev_uniq == pair.reviewer_id)[0][0]]]

for i in range(10):
  df_cross["item_factor_{}".format(i)] = pd.DataFrame(item_feats)[i]
  df_cross["user_factor_{}".format(i)] = pd.DataFrame(user_feats)[i]
# Time: 6min 12s

In [None]:
# Attach the textual features
for i in range(text_feat):
  df_cross["text_factor_{}".format(i)] = 0.0

tf_df = pd.DataFrame(tfidf_matrix.toarray())
temp = []

for (ind, row) in df_cross.iterrows():
  lst_id = row.listing_id
  lst_ord = listings[listings.id == lst_id].index[0]
  temp.append(list(tf_df.iloc[lst_ord,:].values))

# TIME: 1min 44s

In [None]:
temp_df = pd.DataFrame(temp)

for i in range(text_feat):
  df_cross["text_factor_{}".format(i)] = temp_df[i]

In [None]:
number_of_listing_id = df_cross.listing_id.nunique()
# number_of_reviewer_id = df_cross.reviewer_id.nunique()

In [None]:
# KEEP A COPY
# df_cross.to_csv('df_cross.csv')

In [3]:
# IN CASE: df_cross is missing:
!gdown --id '1R80ZMlGRXrcd874ei05Ov9WqlRzzaQD_'
df_cross = pd.read_csv('df_cross.csv',error_bad_lines=False).iloc[:,1:]

Downloading...
From: https://drive.google.com/uc?id=1R80ZMlGRXrcd874ei05Ov9WqlRzzaQD_
To: /content/df_cross.csv
195MB [00:01, 164MB/s]


In [9]:
df_cross.groupby(['reviewer_id']).count().sort_values(by=['listing_id'])['listing_id']

reviewer_id
277            1
116320402      1
116319937      1
116315076      1
116314455      1
            ... 
56833080      23
6406727       25
20715591      44
106345007     57
44250196     109
Name: listing_id, Length: 106648, dtype: int64

In [None]:
# Export all possible listings
all_listing = listings[['id'] + df_pair.columns[3:].tolist()].join(tf_df)
all_listing.columns = ['listing_id']+all_listing.columns.tolist()[1:-text_feat]+FACTOR_COLUMNS2

customer_record = df_cross[['listing_id','reviewer_id']].groupby(['reviewer_id']).agg(set).reset_index()

for i in FACTOR_COLUMNS1[:10]:
  customer_record[i] = df_cross[[i,'reviewer_id']].groupby(['reviewer_id']).mean().values

past_listing = df_cross.listing_id.unique()
count = 0
for (ind, listing) in all_listing.iterrows():
  if listing.listing_id in past_listing:
    all_listing.loc[ind, FACTOR_COLUMNS1[10:]] = df_cross.loc[ind, FACTOR_COLUMNS1[:10]].values
  else:
    all_listing.loc[ind, FACTOR_COLUMNS1[10:]] = 0
    count += 1
# TIME: 43s

In [None]:
# Non-reviewed listing count
count

5216

In [None]:
# all_listing.to_csv('all_listing.csv',index=False)
# customer_record.to_csv('customer_record.csv',index=False)

In [None]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest = train_test_split(df_cross, test_size=0.2, random_state=1)
Xtrain, Xval = train_test_split(Xtrain, test_size=0.2, random_state=1)
print(f"Shape of train data: {Xtrain.shape}")
print(f"Shape of validation data: {Xval.shape}")
print(f"Shape of test data: {Xtest.shape}")

Xtrain.to_csv('train.csv',header=False,index=False)
Xval.to_csv('val.csv',header=False,index=False)
Xtest.to_csv('test.csv',header=False,index=False)

Shape of train data: (76640, 131)
Shape of validation data: (19161, 131)
Shape of test data: (23951, 131)


In [None]:
# df_cross_txt = df_cross.drop(df_cross.columns[-101:-1],axis=1)

In [None]:
# df_cross_txt.to_csv('df_cross_txt.csv',index=False)

### Make auxiliary functions

In [None]:
lat_min = df_cross.latitude.min()
lat_max = df_cross.latitude.max()

lon_min = df_cross.longitude.min()
lon_max = df_cross.longitude.max()

norl_mean = df_cross.number_of_reviews_ltm.mean()
norl_std = df_cross.number_of_reviews_ltm.std()

chlc_mean = df_cross.calculated_host_listings_count.mean()
chlc_std = df_cross.calculated_host_listings_count.std()

nsr_mean = df_cross.nearby_sites_rating.mean()
nsr_std = df_cross.nearby_sites_rating.std()

nsd_mean = df_cross.nearest_station_dist.mean()
nsd_std = df_cross.nearest_station_dist.std()

In [None]:
# Create input function for train and eval
def read_dataset(filename, mode, batch_size = 512):
    def _input_fn(): 
        def decode_csv(value_column):
            columns = tf.decode_csv(records = value_column, record_defaults = DEFAULTS)
            features = dict(zip(CSV_COLUMNS, columns))          
            label = features.pop(LABEL_COLUMN)         
            return features, label

        # Create list of files that match pattern
        file_list = tf.gfile.Glob(filename = filename)

        # Create dataset from file list
        dataset = tf.data.TextLineDataset(filenames = file_list).map(map_func = decode_csv)

        if mode == tf.estimator.ModeKeys.TRAIN:
            num_epochs = None # indefinitely
            dataset = dataset.shuffle(buffer_size = 10 * batch_size)
        else:
            num_epochs = 1 # end-of-input after this

        dataset = dataset.repeat(count = num_epochs).batch(batch_size = batch_size)
        res = dataset.make_one_shot_iterator().get_next()

        # if res.keys() in df_cross.columns:
        #   return dataset.make_one_shot_iterator().get_next()
        # else:
        return res
    return _input_fn

In [None]:
num_listing_embedding = 100
#num_reviewer_embedding = 100

# Create feature columns to be used in model
def create_feature_columns():
    # Create listing id feature column
    listing_id_column = tf.feature_column.categorical_column_with_hash_bucket(
        key = "listing_id",
        hash_bucket_size = number_of_listing_id)
    
    embedded_listing_id_column = tf.feature_column.embedding_column(
        categorical_column = listing_id_column,
        dimension = num_listing_embedding)    

    # Create reviewer id feature column
    # reviewer_id_column = tf.feature_column.categorical_column_with_hash_bucket(
    #     key = "reviewer_id",
    #     hash_bucket_size = number_of_reviewer_id)
    
    # embedded_reviewer_id_column = tf.feature_column.embedding_column(
    #     categorical_column = reviewer_id_column,
    #     dimension = num_reviewer_embedding)    

    # Create superhost feature column
    categorical_superhost_column = tf.feature_column.categorical_column_with_identity(
        key = 'host_is_superhost',
        num_buckets = 2)
    
    indicator_superhost_column = tf.feature_column.indicator_column(
        categorical_column = categorical_superhost_column
    )

    # Create bucketized price column
    bucketized_price_column = tf.feature_column.bucketized_column(
        tf.feature_column.numeric_column(key = 'price'),
        boundaries=list(np.arange(0,1000,100)))

    # Create numeric columns: 'number_of_reviews_ltm','calculated_host_listings_count','nearby_sites_rating',
    #'nearest_station_dist','host_response_time_cross_rate'

    number_of_reviews_ltm_column = tf.feature_column.numeric_column(
        key = 'number_of_reviews_ltm')#, 
        #normalizer_fn=lambda x:(x-norl_mean) / norl_std)
    
    calculated_host_listings_count_column = tf.feature_column.numeric_column(
        key = 'calculated_host_listings_count')
        #,normalizer_fn=lambda x:(x-chlc_mean) / chlc_std)
    
    nearby_sites_rating_column = tf.feature_column.numeric_column(
        key = 'nearby_sites_rating')
        #,normalizer_fn=lambda x:(x-nsr_mean) / nsr_std)
    
    nearest_station_dist_column = tf.feature_column.numeric_column(
        key = 'nearest_station_dist')
        #,normalizer_fn=lambda x:(x-nsd_mean) / nsd_std)

    host_response_time_cross_rate_column = tf.feature_column.numeric_column(
        key = 'host_response_time_cross_rate')
  

    # Create lat,lon boundaries list for our binning
    lat_boundaries = list(np.arange(lat_min, lat_max+0.01, 0.05))
    lon_boundaries = list(np.arange(lon_min, lon_max+0.01, 0.05))

    lat_column = tf.feature_column.numeric_column(
        key = "latitude")
    lon_column = tf.feature_column.numeric_column(
        key = "longitude")   
    
    # Create bucketized lat,lon feature column using our boundaries
    lat_bucketized_column = tf.feature_column.bucketized_column(
        source_column = lat_column,
        boundaries = lat_boundaries)
    
    lon_bucketized_column = tf.feature_column.bucketized_column(
        source_column = lon_column,
        boundaries = lon_boundaries)    
    
    # Cross bucketized lat column and bucketized lon column
    crossed_lat_lon_column = tf.feature_column.crossed_column(
        keys = [lat_bucketized_column, lon_bucketized_column],
        hash_bucket_size = len(lat_bucketized_column) * len(lon_bucketized_column))

    # Convert crossed categorical category and bucketized months since epoch column into indicator column so that it can be used in a DNN
    indicator_crossed_lat_lon_column = tf.feature_column.indicator_column(
            categorical_column = crossed_lat_lon_column)    

    # Create user and item factor feature columns from our trained WALS model
    user_factors = [tf.feature_column.numeric_column(key = "user_factor_" + str(i)) for i in range(num_factor)]
    item_factors =  [tf.feature_column.numeric_column(key = "item_factor_" + str(i)) for i in range(num_factor)]
    text_factors = [tf.feature_column.numeric_column(key = "text_factor_" + str(i)) for i in range(text_feat)]

    # Create list of feature columns
    feature_columns = [
      embedded_listing_id_column,    
      # embedded_reviewer_id_column,       
      indicator_superhost_column,
      bucketized_price_column,
      calculated_host_listings_count_column,
      nearby_sites_rating_column,
      nearest_station_dist_column,
      host_response_time_cross_rate_column,
      indicator_crossed_lat_lon_column] + user_factors + item_factors + text_factors

    return feature_columns

In [None]:
# Create custom model function for our custom estimator
def model_fn(features, labels, mode, params):
    # Create neural network input layer using our feature columns defined above
    net = tf.feature_column.input_layer(features = features, feature_columns = params["feature_columns"])

    # Create hidden layers by looping through hidden unit list
    for units in params["hidden_units"]:
        net = tf.nn.dropout(
          net, params['dropout_rate'], seed=1
        )
        net = tf.layers.dense(inputs = net, 
                              units = units, 
                              activation = tf.nn.relu,
                              kernel_regularizer = tf.keras.regularizers.l1_l2(l1=params["l1"],l2=params["l2"]))

    x_out = tf.layers.Dense(1, activation=tf.nn.sigmoid)(net)

    # If the mode is prediction
    if mode == tf.estimator.ModeKeys.PREDICT:
        # Create predictions dict
        predictions_dict = {
            "preference": tf.expand_dims(input = x_out, axis = -1),
        }

        # Create export outputs
        export_outputs = {"predict_export_outputs": tf.estimator.export.PredictOutput(outputs = predictions_dict)}

        return tf.estimator.EstimatorSpec( # return early since we"re done with what we need for prediction mode
          mode = mode,
          predictions = predictions_dict,
          loss = None,
          train_op = None,
          eval_metric_ops = None,
          export_outputs = export_outputs)


    # Compute loss using sparse softmax cross entropy since this is classification and our labels (content id indices) and probabilities are mutually exclusive
    loss = tf.losses.mean_squared_error(labels, x_out[0])

    # If the mode is evaluation
    if mode == tf.estimator.ModeKeys.EVAL:
        # Metrics
        rmse = tf.metrics.root_mean_squared_error(labels, x_out)
        mae = tf.metrics.mean_absolute_error(labels, x_out)

        # Put eval metrics into a dictionary
        eval_metric_ops = {
            "RMSE": rmse,
            "MAE": mae}

        # Create scalar summaries to see in TensorBoard
        tf.summary.scalar(name = "RMSE", tensor = rmse[1])
        tf.summary.scalar(name = "MAE", tensor = mae[1])   
        
        return tf.estimator.EstimatorSpec( # return early since we"re done with what we need for evaluation mode
            mode = mode,
            predictions = None,
            loss = loss,
            train_op = None,
            eval_metric_ops = eval_metric_ops,
            export_outputs = None)

    # Continue on with training mode

    # If the mode is training
    assert mode == tf.estimator.ModeKeys.TRAIN

    # Create a custom optimizer
    optimizer = tf.train.AdagradOptimizer(learning_rate = params["learning_rate"])

    # Create train op
    train_op = optimizer.minimize(loss = loss, global_step = tf.train.get_global_step())

    return tf.estimator.EstimatorSpec( # final return since we"re done with what we need for training mode
        mode = mode,
        predictions = None,
        loss = loss,
        train_op = train_op,
        eval_metric_ops = None,
        export_outputs = None)

In [None]:
# Create serving input function
def serving_input_fn():  
    feature_placeholders = {
        colname : tf.placeholder(dtype = tf.float64, shape = [None]) \
        for colname in NON_FACTOR_COLUMNS[3:]
    }

 #   feature_placeholders[NON_FACTOR_COLUMNS[0]] = tf.placeholder(dtype = tf.string, shape = [None])
    feature_placeholders[NON_FACTOR_COLUMNS[0]] = tf.placeholder(dtype = tf.float64, shape = [None])
    feature_placeholders[NON_FACTOR_COLUMNS[1]] = tf.placeholder(dtype = tf.string, shape = [None])
    feature_placeholders[NON_FACTOR_COLUMNS[2]] = tf.placeholder(dtype = tf.int64, shape = [None])

    for colname in FACTOR_COLUMNS1 + FACTOR_COLUMNS2:
        feature_placeholders[colname] = tf.placeholder(dtype = tf.float64, shape = [None])

    features = {
        key: tf.expand_dims(tensor, -1) \
        for key, tensor in feature_placeholders.items()
    }

    return tf.estimator.export.ServingInputReceiver(features = features, receiver_tensors = feature_placeholders)

### Train, evaluate and predict

In [None]:
# Create train and evaluate loop to combine all of the pieces together.
tf.logging.set_verbosity(tf.logging.INFO)
def train_and_evaluate(args):
    estimator = tf.estimator.Estimator(
        model_fn = model_fn,
        model_dir = args["output_dir"],
        params = {
        "feature_columns": create_feature_columns(),
        "hidden_units": args["hidden_units"],
        "learning_rate": args["learning_rate"],
        "dropout_rate": args["dropout_rate"],
        "l1": args["l1"],
        "l2": args["l2"]
        }
    )

    train_spec = tf.estimator.TrainSpec(
        input_fn = read_dataset(filename = args["train_data_paths"], mode = tf.estimator.ModeKeys.TRAIN, batch_size = args["batch_size"]),
        max_steps = args["train_steps"])

    exporter = tf.estimator.LatestExporter(name = "exporter", 
                                           serving_input_receiver_fn = serving_input_fn)

    eval_spec = tf.estimator.EvalSpec(
        input_fn = read_dataset(filename = args["eval_data_paths"], mode = tf.estimator.ModeKeys.EVAL, batch_size = args["batch_size"]),
        steps = None,
        start_delay_secs = args["start_delay_secs"],
        throttle_secs = args["throttle_secs"],
        exporters = exporter)

    tf.estimator.train_and_evaluate(estimator = estimator, train_spec = train_spec, eval_spec = eval_spec)
    return estimator

In [None]:
# Call train and evaluate loop
import shutil

outdir = "hybrid_recommendation_trained"
shutil.rmtree(path = outdir, ignore_errors = True) # start fresh each time

arguments = {
    "train_data_paths": "train.csv",
    "eval_data_paths": "test.csv",
    "output_dir": outdir,
    "batch_size": 128,
    "learning_rate": 0.1,
    "dropout_rate": 0.75,
    "l2": 0.01,
    "l1": 0.01,
    "hidden_units": [512, 256, 128, 64, 32],
    "train_steps": 1000,
    "start_delay_secs": 30,
    "throttle_secs": 30
}

model = train_and_evaluate(arguments)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'hybrid_recommendation_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fa725df06d0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Not using Distribute Coordi

In [None]:
test_pred_raw = [x['preference'] for x in list(model.predict(input_fn=read_dataset('test.csv',mode=tf.estimator.ModeKeys.PREDICT)))]
test_pred = [x[0][0] for x in test_pred_raw]
testset_mse = sum(np.square(Xtest.preference - test_pred))/len(test_pred)
testset_mse

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from hybrid_recommendation_trained/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


0.03931196245572145

In [None]:
export_dir = '/content/exported_model/1'
model.export_saved_model(export_dir,serving_input_fn)

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Classify: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: None


INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict_export_outputs', 'serving_default']


INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict_export_outputs', 'serving_default']


INFO:tensorflow:Signatures INCLUDED in export for Train: None


INFO:tensorflow:Signatures INCLUDED in export for Train: None


INFO:tensorflow:Signatures INCLUDED in export for Eval: None


INFO:tensorflow:Signatures INCLUDED in export for Eval: None


INFO:tensorflow:Restoring parameters from hybrid_recommendation_trained/model.ckpt-500


INFO:tensorflow:Restoring parameters from hybrid_recommendation_trained/model.ckpt-500


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: /content/exported_model/4/temp-b'1617604220'/saved_model.pb


INFO:tensorflow:SavedModel written to: /content/exported_model/4/temp-b'1617604220'/saved_model.pb


b'/content/exported_model/4/1617604220'

In [None]:
from tensorflow.contrib import predictor

predict_fn = predictor.from_saved_model('/content/exported_model/4/1617604220')
predictions = predict_fn(Xtest.astype({'listing_id':'string','number_of_reviews_ltm':'float64','calculated_host_listings_count':'float64'}))
print(predictions)

INFO:tensorflow:Restoring parameters from /content/exported_model/4/1617604220/variables/variables


INFO:tensorflow:Restoring parameters from /content/exported_model/4/1617604220/variables/variables


{'preference': array([[[0.88810277]],

       [[0.794929  ]],

       [[0.8260367 ]],

       ...,

       [[0.8244647 ]],

       [[0.8540371 ]],

       [[0.8230056 ]]], dtype=float32)}
