In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import tensorflow as tf
import tensorflow_ranking as tfr
from sklearn.utils import shuffle

2022-06-02 11:18:09.881108: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-02 11:18:09.881155: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# What is GAM

Interpretable Learning-to-Rank
Transparency and interpretability are important factors in deploying LTR models in ranking systems that can be involved in determining the outcomes of processes such as loan eligibility assessment, advertisement targeting, or guiding medical treatment decisions. In such cases, the contribution of each individual feature to the final ranking should be examinable and understandable to ensure transparency, accountability and fairness of the outcomes.

One possible way to achieve this is using generalized additive models (GAMs) — intrinsically interpretable machine learning models that are linearly composed of smooth functions of individual features. However, while GAMs have been extensively studied on regression and classification tasks, it is less clear how to apply them in a ranking setting. For instance, while GAMs can be straightforwardly applied to model each individual item in the list, modeling both item interactions and the context in which these items are ranked is a more challenging research problem. To this end, we have developed a neural ranking GAM — an extension of generalized additive models to ranking problems.

Unlike standard GAMs, a neural ranking GAM can take into account both the features of the ranked items and the context features (e.g., query or user profile) to derive an interpretable, compact model. This ensures that not only the contribution of each item-level feature is interpretable, but also the contribution of the context features. For example, in the figure below, using a neural ranking GAM makes visible how distance, price, and relevance, in the context of a given user device, contribute to the final ranking of the hotel. Neural ranking GAMs are now available as a part of TF-Ranking,

# Input
Items, Contexts (Features)

# Generalized Generative Model

Facilities + Prize + Distance 

# Output
Hotels Ranking

1. Understanding data (EDA)
2. Build Ranking Dataset
3. Build tfr.keras.layers.GAMLayer 


# Preprocessing

## Raw Data

In [3]:
df = pd.read_csv("hotels3.csv")


In [4]:
df

Unnamed: 0,Hotel,Star,Rating,Reviews,Harga,Places Nearby,Facil + Akomod
0,Hotel Indonesia Kempinski Jakarta,5.0,8.9,4363,2.480.500,Nearby Places\n\nJia Jia - Grand Indonesia (De...,Food and Drinks\nA la carte breakfast\nA la ca...
1,"The Langham, Jakarta",5.0,8.8,238,3.823.600,Nearby Places\n\nPig Me Up! - Ashta District 8...,Food and Drinks\nA la carte dinner\nA la carte...
2,Manhattan Hotel,5.0,8.5,8405,6.231.500,Nearby Places\n\nJia Jia - Grand Indonesia (De...,Hotel Services\nBellhop\nConcierge\nMoney chan...
3,Aloft South Jakarta,4.0,8.9,424,762.300,Nearby Places\n\nSouthside Rooftop Bar & Loung...,Public Facilities\nParking\nCoffee shop\nEleva...
4,"The Mayflower, Jakarta - Marriott Executive Ap...",5.0,9.0,393,1.756.254,Nearby Places\n\nSudirman Plaza\nBusiness\n2 m...,Food and Drinks\nA la carte breakfast\nA la ca...
...,...,...,...,...,...,...,...
84,POP! Hotel Airport Jakarta,2.0,7.9,11026,"385.200,00",Nearby Places\r\n\r\nsTREATs Restaurant - Ibis...,Hotel Services\r\nBellhop\r\n24-hour security\...
85,Sheraton Grand Jakarta Gandaria City Hotel,5.0,8.9,1511,"2.420.000,00",Nearby Places\r\n\r\nAnigre at Sheraton Gandar...,Food and Drinks\r\nA la carte breakfast\r\nA l...
86,Horison Suites & Residences Rasuna Jakarta,4.0,7.8,517,"688.000,00",Nearby Places\r\n\r\nMeZZa Restaurant at Aston...,General\r\nAC\r\nBallroom\r\nBanquet\r\nFamily...
87,grandkemang Hotel,4.0,8.4,2717,"431.250,00",Nearby Places\r\n\r\nSparca Lounge at grandkem...,Food and Drinks\r\nA la carte dinner\r\nA la c...


## Count Each Facilities and Acomodation

In [5]:
facilities_columns = ['Food and Drinks','Hotel Services','In-room Facilities', 'Business Facilities', 'Nearby Facilities', 'Public Facilities', 'General', 'Things to Do', 'Accessibilty', 'Connectivity', 'Transportation', 'Kids and Pets', 'Sports and Recreations', 'Shuttle Service']

facilities_columns.reverse()

for index , row in df.iterrows():
    
    # split per fasil and akomod
    arr = row['Facil + Akomod'].splitlines() 
    #iterate over fasil and akomod

    i = 0
    count = 0

    for  item in reversed(arr):
      count += 1
      if item in facilities_columns:
        df.at[index,item ] = count
        count = 0
        i += 1

In [6]:
df = df.fillna(0)

In [7]:
for index , row in df.iterrows():
    
    # split per fasil and akomod
    arr = row['Places Nearby'].splitlines() 
    
    #iterate over fasil and akomod

    i = 0
    count = 0
    
    for ind, item in enumerate(arr):
        itemsplits = item.split()
        for x in itemsplits:
            if x.isdigit():
                if itemsplits[1] == "km":
                    meters = itemsplits[0] * 1000
                else:
                    meters = itemsplits[0]
                    
                if meters.isdigit():
                    df.at[index,arr[ind-1]] = meters
                    print(meters)
                
                
                

0
80
140
460
35
65
116
152
0
80
140
460
1
76
258
496
2
6
122
455
23
445
552
641
40
165
802
840
26
395
427
491
4
318
365
778
11
147
210
699
39
648
890
963
20
163
303
304
39
513
698
849
13
153
190
377
32
119
189
198
25
68
115
812
34
230
472
625
741
30
162
418
586
144
169
246
453
42
367
388
598
25
155
187
324
3
34
201
232
21
72
166
331
18
154
172
218
30
181
781
181
25
68
115
812
52
111
160
258
21
72
166
331
65
129
329
21
98
223
652
36
353
545
675
16
158
197
478
31
129
189
307
4
235
359
374
21
80
161
275
11
32
223
346
32
119
189
198
16
133
476
904
84
307
481
560
19
352
360
636
14
244
354
436
10
330
399
18
245
499
16
507
563
629
107
363
492
742
613
7
144
193
525
7
85
129
162
10
267
740
894
58
149
385
884
58
214
372
415
15
402
472
538
15
402
472
538
86
217
354
563
13
142
361
582
26
151
213
349
57
254
501
515
10
41
8
317
335
787
19
48
506
539
3
70
203
784
31
210
372
977
25
268
313
412
21
578
583
729
21
728
58
377
640
700
46
475
583
4
253
858
19
236
326
591
13
466
535
738
9
125
389
833
25
408


In [8]:
print(df.columns)

Index(['Hotel', 'Star', 'Rating', 'Reviews', 'Harga', 'Places Nearby',
       'Facil + Akomod', 'Shuttle Service', 'Sports and Recreations',
       'Kids and Pets', 'Transportation', 'Connectivity', 'Accessibilty',
       'Things to Do', 'General', 'Public Facilities', 'Nearby Facilities',
       'Business Facilities', 'In-room Facilities', 'Hotel Services',
       'Food and Drinks', 'Fast Food', 'Shop & Gifts', 'Business',
       'Transportation Hub', 'Casual Dining', 'Nightlife', 'Park & Zoo',
       'Public Service', 'Arts & Sciences', 'Fine Dining', 'Sport',
       'Quick Bites', 'Education', 'Street Food', 'Activity & Games', 'Cafe',
       'Entertainment', 'Food Court', 'Sight & Landmark'],
      dtype='object')


In [9]:


df.Harga = df['Harga'].str.replace('.','', regex = True)
df.Harga = df['Harga'].str.replace(',','.', regex = True)
df.Harga = df['Harga'].astype(float).astype(int)


df.Reviews = df['Reviews'].str.replace('.','', regex = True)
df.Reviews = df['Reviews'].str.replace(',','.', regex = True)
df.Reviews = df['Reviews'].astype(float).astype(int)

In [10]:
df = df.fillna(10000)

In [11]:
c = df.select_dtypes(object).columns
df[c] = df[c].apply(pd.to_numeric,errors='coerce')

In [12]:
df.dtypes

Hotel                     float64
Star                      float64
Rating                    float64
Reviews                     int64
Harga                       int64
Places Nearby             float64
Facil + Akomod            float64
Shuttle Service           float64
Sports and Recreations    float64
Kids and Pets             float64
Transportation            float64
Connectivity              float64
Accessibilty              float64
Things to Do              float64
General                   float64
Public Facilities         float64
Nearby Facilities         float64
Business Facilities       float64
In-room Facilities        float64
Hotel Services            float64
Food and Drinks           float64
Fast Food                   int64
Shop & Gifts                int64
Business                    int64
Transportation Hub          int64
Casual Dining               int64
Nightlife                   int64
Park & Zoo                  int64
Public Service              int64
Arts & Science

In [13]:
df.describe()

Unnamed: 0,Hotel,Star,Rating,Reviews,Harga,Places Nearby,Facil + Akomod,Shuttle Service,Sports and Recreations,Kids and Pets,...,Fine Dining,Sport,Quick Bites,Education,Street Food,Activity & Games,Cafe,Entertainment,Food Court,Sight & Landmark
count,0.0,89.0,89.0,89.0,89.0,0.0,0.0,89.0,89.0,89.0,...,89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0
mean,,4.168539,8.555056,2939.516854,1597875.0,,,1.741573,2.651685,1.224719,...,9439.123596,9788.685393,9887.977528,8924.932584,9775.752809,9574.730337,9439.685393,9786.516854,9887.876404,9781.629213
std,,0.828964,0.249094,3582.740905,1606771.0,,,1.695771,2.751451,1.952579,...,2311.935395,1401.633524,1056.817886,3039.363797,1487.391541,1972.272603,2309.621076,1416.933903,1057.771884,1448.477244
min,,0.0,7.8,1.0,382400.0,,,0.0,0.0,0.0,...,0.0,545.0,30.0,142.0,21.0,181.0,10.0,158.0,21.0,193.0
25%,,4.0,8.5,9.0,675000.0,,,0.0,0.0,0.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
50%,,4.0,8.6,1755.0,847000.0,,,2.0,3.0,0.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
75%,,5.0,8.7,4678.0,1757813.0,,,3.0,4.0,3.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
max,,5.0,9.0,18892.0,7327822.0,,,10.0,11.0,9.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0


In [14]:
df

Unnamed: 0,Hotel,Star,Rating,Reviews,Harga,Places Nearby,Facil + Akomod,Shuttle Service,Sports and Recreations,Kids and Pets,...,Fine Dining,Sport,Quick Bites,Education,Street Food,Activity & Games,Cafe,Entertainment,Food Court,Sight & Landmark
0,,5.0,8.9,4363,2480500,,,2.0,3.0,4.0,...,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
1,,5.0,8.8,238,3823600,,,0.0,2.0,3.0,...,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
2,,5.0,8.5,8405,6231500,,,2.0,3.0,3.0,...,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
3,,4.0,8.9,424,762300,,,0.0,2.0,0.0,...,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
4,,5.0,9.0,393,1756254,,,2.0,3.0,7.0,...,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,,2.0,7.9,11026,385200,,,2.0,0.0,0.0,...,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
85,,5.0,8.9,1511,2420000,,,2.0,0.0,0.0,...,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
86,,4.0,7.8,517,688000,,,0.0,0.0,0.0,...,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
87,,4.0,8.4,2717,431250,,,2.0,3.0,0.0,...,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000


In [15]:
training = df.iloc[:20,:]
evaluation = shuffle(df.iloc[20:,:])

In [16]:
training = training[['Star','Rating','Reviews','Harga']]

# Understanding the Data (EDA)

# Build Ranking Dataset

training = tfr.data.build_ranking_dataset_with_parsing_fn(
    training,
    parsingfn,
    10,
    reader=tfr.keras.pipeline.DatasetHparams.dataset_reader,
    reader_args=None,
    num_epochs=None,
    shuffle=True,
    shuffle_buffer_size=10000,
    shuffle_seed=None,
    prefetch_buffer_size=tf.data.experimental.AUTOTUNE,
    reader_num_threads=tf.data.experimental.AUTOTUNE,
    sloppy_ordering=False,
    drop_final_batch=False,
    num_parser_threads=tf.data.experimental.AUTOTUNE
)

# Build tfr.keras.layers.GAMLayer