In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict

import torch
import torch.nn as nn
import torch.optim as optim
#from livelossplot import PlotLosses

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Load the dataset for recommenders

In [None]:
data_path = os.path.join("drive","MyDrive","Colab Notebooks","data", "hotel_data")

interactions_df = pd.read_csv(os.path.join(data_path, "hotel_data_interactions_df.csv"), index_col=0)

base_item_features = ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']

column_values_dict = {
    'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],
    'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],
    'rate_plan': ['Standard', 'Nonref'],
    'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],
    'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],
    'weekend_stay': ['True', 'False']
}

interactions_df.loc[:, 'term'] = pd.Categorical(
    interactions_df['term'], categories=column_values_dict['term'])
interactions_df.loc[:, 'length_of_stay_bucket'] = pd.Categorical(
    interactions_df['length_of_stay_bucket'], categories=column_values_dict['length_of_stay_bucket'])
interactions_df.loc[:, 'rate_plan'] = pd.Categorical(
    interactions_df['rate_plan'], categories=column_values_dict['rate_plan'])
interactions_df.loc[:, 'room_segment'] = pd.Categorical(
    interactions_df['room_segment'], categories=column_values_dict['room_segment'])
interactions_df.loc[:, 'n_people_bucket'] = pd.Categorical(
    interactions_df['n_people_bucket'], categories=column_values_dict['n_people_bucket'])
interactions_df.loc[:, 'weekend_stay'] = interactions_df['weekend_stay'].astype('str')
interactions_df.loc[:, 'weekend_stay'] = pd.Categorical(
    interactions_df['weekend_stay'], categories=column_values_dict['weekend_stay'])

display(HTML(interactions_df.head(15).to_html()))

Unnamed: 0,user_id,item_id,term,length_of_stay_bucket,rate_plan,room_segment,n_people_bucket,weekend_stay
0,1,0,WinterVacation,[2-3],Standard,[260-360],[5-inf],True
1,2,1,WinterVacation,[2-3],Standard,[160-260],[3-4],True
2,3,2,WinterVacation,[2-3],Standard,[160-260],[2-2],False
3,4,3,WinterVacation,[4-7],Standard,[160-260],[3-4],True
4,5,4,WinterVacation,[4-7],Standard,[0-160],[2-2],True
5,6,5,Easter,[4-7],Standard,[260-360],[5-inf],True
6,7,6,OffSeason,[2-3],Standard,[260-360],[5-inf],True
7,8,7,HighSeason,[2-3],Standard,[160-260],[1-1],True
8,9,8,HighSeason,[2-3],Standard,[0-160],[1-1],True
9,8,7,HighSeason,[2-3],Standard,[160-260],[1-1],True


In [None]:
def prepare_users_df(interactions_df):

    features = interactions_df[['user_id','term','length_of_stay_bucket','rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']]
    users_df = pd.get_dummies(features)
    users_df = users_df[['user_id','term_WinterVacation', 'term_Easter', 'term_OffSeason', 'term_HighSeason', 'term_LowSeason', 'term_MayLongWeekend', 'term_NewYear', 'term_Christmas',
                        'length_of_stay_bucket_[0-1]', 'length_of_stay_bucket_[2-3]', 'length_of_stay_bucket_[4-7]', 'length_of_stay_bucket_[8-inf]',
                        'rate_plan_Standard',
                        'room_segment_[0-160]', 'room_segment_[160-260]','room_segment_[260-360]','room_segment_[360-500]', 'room_segment_[500-900]',
                        'n_people_bucket_[2-2]', 'n_people_bucket_[3-4]','weekend_stay_True' ]]
    #users_df = users_df.groupby('user_id').sum().reset_index()
    users_df = users_df.rename(columns=lambda x: 'user_'+x )
    users_df = users_df.rename(columns={"user_user_id":"user_id"})
    #users_df = users_df.drop_duplicates(subset = ["user_id"])
    user_features = list(users_df)
    user_features.remove('user_id')
    return users_df, user_features
    

users_df_1, user_features = prepare_users_df(interactions_df)

print(user_features)

display(users_df_1.loc[users_df_1['user_id']])

['user_term_WinterVacation', 'user_term_Easter', 'user_term_OffSeason', 'user_term_HighSeason', 'user_term_LowSeason', 'user_term_MayLongWeekend', 'user_term_NewYear', 'user_term_Christmas', 'user_length_of_stay_bucket_[0-1]', 'user_length_of_stay_bucket_[2-3]', 'user_length_of_stay_bucket_[4-7]', 'user_length_of_stay_bucket_[8-inf]', 'user_rate_plan_Standard', 'user_room_segment_[0-160]', 'user_room_segment_[160-260]', 'user_room_segment_[260-360]', 'user_room_segment_[360-500]', 'user_room_segment_[500-900]', 'user_n_people_bucket_[2-2]', 'user_n_people_bucket_[3-4]', 'user_weekend_stay_True']


Unnamed: 0,user_id,user_term_WinterVacation,user_term_Easter,user_term_OffSeason,user_term_HighSeason,user_term_LowSeason,user_term_MayLongWeekend,user_term_NewYear,user_term_Christmas,user_length_of_stay_bucket_[0-1],...,user_length_of_stay_bucket_[8-inf],user_rate_plan_Standard,user_room_segment_[0-160],user_room_segment_[160-260],user_room_segment_[260-360],user_room_segment_[360-500],user_room_segment_[500-900],user_n_people_bucket_[2-2],user_n_people_bucket_[3-4],user_weekend_stay_True
1,2,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,1
2,3,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
3,4,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,1
4,5,1,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,1,0,1
5,6,0,1,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14315,14165,0,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,1
14364,706,0,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,1
14384,14228,0,0,0,0,1,0,0,0,1,...,0,1,0,1,0,0,0,1,0,1
14454,14298,0,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,1,0,1


In [None]:
def prepare_users_df(interactions_df):
    features = interactions_df[['user_id','term','length_of_stay_bucket','rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']]
    users_df_1=features.replace(['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 
                               'NewYear', 'Christmas','[0-1]', '[2-3]', '[4-7]', '[8-inf]','Standard', 'Nonref',
                               '[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]','[1-1]', '[2-2]', 
                               '[3-4]', '[5-inf]','True', 'False'],[3,5,4,7,8,6,2,1,
                                                                    1,2,3,4,1,2,1,2,3,4,5,1,2,3,4,1,2])
    users_df_1=users_df_1.sort_values(by='user_id')

    i=0
    user=1.0
    users_df_train=pd.DataFrame(columns=['user_id','term','length_of_stay_bucket','rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay'])
    users_df_test=pd.DataFrame(columns=['user_id','term','length_of_stay_bucket','rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay'])
    
    for index, row in users_df_1.iterrows():
      if row[0]==user and i<20:
        users_df=users_df.append(row,ignore_index=True)
      elif row[0]!=user:
        user=row[0]
        users_df=users_df.append(row,ignore_index=True)
        i=0
      i=i+1
    i=0
    user=1.0
    users_df_np=[]
    #print(users_df)
    
    #print(users_df_np)
    
    for index,row in users_df.iterrows():
      if row[0]!=user:
        while i < 25:
          users_df_np.append([row[0]-1,0,0,0,0,0,0])
          i=i+1
        i=0
        user=row[0]
      i=i+1
      users_df_np.append([row[0],row[1],row[2],row[3],row[4],row[5],row[6]])
      while i < 25:
          users_df_np.append([user,0,0,0,0,0,0])
          i=i+1
    users_df_np_2=np.array(users_df_np)
    return users_df_np_2, users_df
    

users_df, users_df_2 = prepare_users_df(interactions_df)
np.savetxt("users.csv", users_df, delimiter=",")


#display(users_df)

In [None]:
users_df_3=pd.DataFrame(users_df,columns=['user_id','term','length_of_stay_bucket','rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay'])
display(users_df)
display(users_df_2)
display(users_df_3)

array([[1.0000e+00, 3.0000e+00, 2.0000e+00, ..., 3.0000e+00, 4.0000e+00,
        1.0000e+00],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [1.4502e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.4502e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.4502e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]])

Unnamed: 0,user_id,term,length_of_stay_bucket,rate_plan,room_segment,n_people_bucket,weekend_stay
0,1.0,3.0,2.0,1.0,3.0,4.0,1.0
1,1.0,3.0,2.0,2.0,2.0,2.0,1.0
2,1.0,4.0,2.0,1.0,2.0,2.0,1.0
3,1.0,4.0,3.0,1.0,2.0,3.0,1.0
4,1.0,7.0,3.0,2.0,2.0,2.0,1.0
...,...,...,...,...,...,...,...
15183,14498.0,4.0,2.0,1.0,2.0,1.0,1.0
15184,14499.0,4.0,4.0,1.0,2.0,2.0,1.0
15185,14500.0,1.0,2.0,1.0,3.0,3.0,1.0
15186,14501.0,4.0,2.0,1.0,2.0,3.0,2.0


Unnamed: 0,user_id,term,length_of_stay_bucket,rate_plan,room_segment,n_people_bucket,weekend_stay
0,1.0,3.0,2.0,1.0,3.0,4.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
346335,14502.0,0.0,0.0,0.0,0.0,0.0,0.0
346336,14502.0,0.0,0.0,0.0,0.0,0.0,0.0
346337,14502.0,0.0,0.0,0.0,0.0,0.0,0.0
346338,14502.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def prepare_items_df(interactions_df):
    
    features = interactions_df[["item_id","term","length_of_stay_bucket","rate_plan", "room_segment", "n_people_bucket", "weekend_stay"]]
    items_df = pd.get_dummies(features)
    items_df = items_df[['item_id','term_WinterVacation', 'term_Easter', 'term_OffSeason', 'term_HighSeason', 'term_LowSeason', 'term_MayLongWeekend', 'term_NewYear', 'term_Christmas',
                        'length_of_stay_bucket_[0-1]', 'length_of_stay_bucket_[2-3]', 'length_of_stay_bucket_[4-7]', 'length_of_stay_bucket_[8-inf]',
                        'rate_plan_Standard',
                        'room_segment_[0-160]', 'room_segment_[160-260]','room_segment_[260-360]','room_segment_[360-500]', 'room_segment_[500-900]',
                        'n_people_bucket_[2-2]', 'n_people_bucket_[3-4]','weekend_stay_True' ]]
    items_df = items_df.rename(columns=lambda x: 'item_'+x )
    items_df = items_df.rename(columns={"item_item_id":"item_id"})
    items_df = items_df.drop_duplicates(subset = ["item_id"])
    item_features = list(items_df)
    item_features.remove('item_id')
    
    return items_df, item_features


items_df, item_features = prepare_items_df(interactions_df)

print(item_features)

display(items_df.loc[items_df['item_id'].isin([0, 1, 2, 3, 4, 5, 6])].head(15))

NameError: ignored

In [None]:
import tensorflow as tf
def prepare_data(interactions_df):
    data=interactions_df[['user_id','item_id']]
    #data['counted']= 1
    #data=data.groupby(['user_id','item_id'],as_index=False).count()
    return data
rec_data=prepare_data(interactions_df)
print(rec_data.loc[rec_data['user_id'].isin([1])])

rec_data['user_id']=rec_data.user_id.astype(np.str)
rec_data['item_id']=rec_data.item_id.astype(np.str)
rec_data['counted']=rec_data.counted.astype(np.float32)

dataset = tf.data.Dataset.from_tensor_slices((tf.cast(rec_data['user_id'].values.reshape(-1,1), tf.string),    tf.cast(rec_data['item_id'].values.reshape(-1,1), tf.string),
tf.cast(rec_data['counted'].values.reshape(-1,1),tf.float32)))


items = rec_data.item_id.values
users = rec_data.user_id.values
unique_items = np.unique(list(items))
unique_users = np.unique(list(users))

    user_id  item_id  counted
0         1        0        1
1         1       14        1
2         1       21        2
3         1       23        3
4         1       32        3
5         1       51        1
6         1       55        3
7         1       60        1
8         1       61        1
9         1       88        1
10        1      115        1
11        1      117        1
12        1      157        1
13        1      260        1
14        1      329        1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  # Remove the CWD from sys.path while we load stuff.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  # This is added back by InteractiveShellApp.init_path()


In [None]:
@tf.function
def rename(x0,x1,x2):
    y = {}
    y["user_id"] = x0
    y['item_id'] = x1
    y['counted'] = x2
    return y

dataset = dataset.map(rename)

In [None]:
class RankingModel(tf.keras.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_users, mask_token=None),
      tf.keras.layers.Embedding(len(unique_users) + 1, embedding_dimension)
    ])

    # Compute embeddings for books.
    self.book_embeddings = tf.keras.Sequential([
      tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_items, mask_token=None),
      tf.keras.layers.Embedding(len(unique_items) + 1, embedding_dimension)
    ])

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
  ])

  def __call__(self, x):
    
    user_id, item_id = x
    user_embedding = self.user_embeddings(user_id)
    item_embedding = self.book_embeddings(item_id)

    return self.ratings(tf.concat([user_embedding, item_embedding], axis=1))

In [None]:
!pip install tensorflow-recommenders
import tensorflow_recommenders as tfrs
class RecommenderModel(tfrs.models.Model):

  def __init__(self):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tfrs.metrics.FactorizedTopK(candidates=items)]
    )

  def compute_loss(self, features, training=False) -> tf.Tensor:
    print(features)
    rating_predictions = self.ranking_model((features['user_id'], features["item_id"]))

    # The task computes the loss and the metrics.
    return self.task(labels=features["counted"], predictions=rating_predictions)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datetime import datetime
import keras
model = RecommenderModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
# Cache the dataset 
cache_dataset = dataset.cache()
# Tensorboard 
logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
# Training 
model.fit(cache_dataset, epochs=15)

Epoch 1/15
{'user_id': <tf.Tensor 'IteratorGetNext:2' shape=(1,) dtype=string>, 'item_id': <tf.Tensor 'IteratorGetNext:1' shape=(1,) dtype=string>, 'counted': <tf.Tensor 'IteratorGetNext:0' shape=(1,) dtype=float32>}


TypeError: ignored

In [None]:
self.user_id_embedding(inputs["user_id"]),
self.term_WinterVacation_embedding(inputs["term_WinterVacation"]),
self.term_Easter_embedding(inputs["term_Easter"]),
self.term_OffSeason_embedding(inputs["term_OffSeason"]),
self.term_HighSeason_embedding(inputs["term_HighSeason"]),
self.term_LowSeason_embedding(inputs["term_LowSeason"]),
self.term_MayLongWeekend_embedding(inputs["term_MayLongWeekend"]),
self.term_NewYear_embedding(inputs["term_NewYear"]),
self.term_Christmas_embedding(inputs["term_Christmas"]),
self.length_of_stay_bucket_1_embedding(inputs["length_of_stay_bucket_[0-1]"]),
self.length_of_stay_bucket_2_embedding(inputs["length_of_stay_bucket_[2-3]"]),
self.length_of_stay_bucket_4_embedding(inputs["length_of_stay_bucket_[4-7]"]),
self.length_of_stay_bucket_8_embedding(inputs["length_of_stay_bucket_[8-inf]"]),
self.rate_plan_Standard_embedding(inputs["rate_plan_Standard"]),
self.room_segment_0_embedding(inputs["room_segment_[0-160]"]),
self.room_segment_160_embedding(inputs["room_segment_[160-260]"]),
self.room_segment_260_embedding(inputs["room_segment_[260-360]"]),
self.room_segment_360_embedding(inputs["room_segment_[360-500]"]),
self.room_segment_500_embedding(inputs["room_segment_[500-900]"]),
self.n_people_bucket_2_embedding(inputs["n_people_bucket_[2-2]"]),
self.n_people_bucket_3_embedding(inputs["n_people_bucket_[3-4]"]),
self.weekend_stay_True_embedding(inputs["weekend_stay_True_embedding"]),