In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Convert SQLite to Dataframe

In [2]:
con = sqlite3.connect('../Dataset/hotel-directories-ORI.sqlite3')
df_room = pd.read_sql_query("SELECT * FROM hotel_rooms", con)
df_hotel = pd.read_sql_query("SELECT * FROM hotels", con)

In [3]:
df_room.to_csv('../Dataset/rooms.csv', index=False)
df_hotel.to_csv('../Dataset/hotels.csv', index=False)

In [4]:
df_room.head(2)

Unnamed: 0,id,hotelId,name,images,roomType,description,originalDescription,bedDescription,size,rate,...,numExtraBeds,numChargedRooms,numRemainingRooms,numBreakfastIncluded,isBreakfastIncluded,isWifiIncluded,isRefundable,hasLivingRoom,extraBedIsIncluded,facilities
0,1000009181,3000020003208,Family,"[""https://ik.imagekit.io/tvlk/generic-asset/dg...",Family,,,,30.0,"{""amount"":""989182"",""currency"":""IDR"",""tax"":""207...",...,0,1,3,,1,1,1,0,0,"[""AIR_CONDITIONING"",""BATHROBES"",""BLACKOUT_DRAP..."
1,1000009219,3000020003208,President Suite,"[""https://ik.imagekit.io/tvlk/generic-asset/dg...",President Suite,,,,75.0,"{""amount"":""3272727"",""currency"":""IDR"",""tax"":""68...",...,0,1,1,,1,1,1,0,0,"[""AIR_CONDITIONING"",""BATHROBES"",""BLACKOUT_DRAP..."


In [5]:
df_hotel.head(2)

Unnamed: 0,id,type,name,displayName,description,link,address,region,city,province,...,images,facilities,properties,nearestPointOfInterests,starRating,userRating,userRatingInfo,numReviews,latitude,longitude
0,3000020003208,Hotel,Kyriad Hotel Muraya Aceh,Kyriad Hotel Muraya Aceh,<p><b>Lokasi</b><br>Kyriad Hotel Muraya Aceh b...,https://www.traveloka.com/id-id/hotel/detail?s...,Jalan Tengku H. Mohd Daud Beureueuh No. 5 Kuta...,Aceh,Banda Aceh,Aceh,...,"[""https://ik.imagekit.io/tvlk/apr-asset/dgXfoy...","[""CARPARK"",""COFFEE_OR_TEA_IN_LOBBY"",""COFFEE_SH...","{""checkInTime"":""14:00"",""checkOutTime"":""12:00"",...","[{""landmarkId"":""6254734"",""geoId"":null,""name"":""...",4.0,8.6,Mengesankan,1845.0,5.556686,95.322269
1,3000010003879,Hotel,Hermes Palace by BENCOOLEN,Hermes Palace by BENCOOLEN,<p><b>Lokasi</b><br>Hermes Palace by BENCOOLEN...,https://www.traveloka.com/id-id/hotel/detail?s...,Jalan. T. Panglima Nyak Makam Banda Aceh Aceh ...,Aceh,Banda Aceh,Aceh,...,"[""https://ik.imagekit.io/tvlk/apr-asset/dgXfoy...","[""CARPARK"",""COFFEE_SHOP"",""ELEVATOR"",""HAS_24_HO...","{""checkInTime"":""14:00"",""checkOutTime"":""12:00"",...","[{""landmarkId"":""6254734"",""geoId"":null,""name"":""...",5.0,8.4,Mengesankan,2525.0,5.556202,95.344114


## Drop Unnecessary Column

In [6]:
room_drop = ['images', 'description', 'bedDescription', 'originalDescription', 'rate', 'maxOccupancy', 'numRemainingRooms', 'numBreakfastIncluded', 'extraBedIsIncluded', 'numExtraBeds', 'numChargedRooms']
for column_name in room_drop:
    df_room.drop(column_name, axis=1, inplace=True)

df_room.head(2)

Unnamed: 0,id,hotelId,name,roomType,size,originalRate,baseOccupancy,maxChildOccupancy,maxChildAge,isBreakfastIncluded,isWifiIncluded,isRefundable,hasLivingRoom,facilities
0,1000009181,3000020003208,Family,Family,30.0,"{""amount"":""999174"",""currency"":""IDR"",""tax"":""209...",2,1,10,1,1,1,0,"[""AIR_CONDITIONING"",""BATHROBES"",""BLACKOUT_DRAP..."
1,1000009219,3000020003208,President Suite,President Suite,75.0,"{""amount"":""3305785"",""currency"":""IDR"",""tax"":""69...",2,1,10,1,1,1,0,"[""AIR_CONDITIONING"",""BATHROBES"",""BLACKOUT_DRAP..."


In [7]:
hotel_drop = ['displayName', 'description', 'link', 'address', 'region', 'country', 'geodirectoryId', 'postal', 'image', 'images', 'userRating', 'latitude', 'longitude']
for column_name in hotel_drop:
    df_hotel.drop(column_name, axis=1, inplace=True)

df_hotel.head(2)

Unnamed: 0,id,type,name,city,province,facilities,properties,nearestPointOfInterests,starRating,userRatingInfo,numReviews
0,3000020003208,Hotel,Kyriad Hotel Muraya Aceh,Banda Aceh,Aceh,"[""CARPARK"",""COFFEE_OR_TEA_IN_LOBBY"",""COFFEE_SH...","{""checkInTime"":""14:00"",""checkOutTime"":""12:00"",...","[{""landmarkId"":""6254734"",""geoId"":null,""name"":""...",4.0,Mengesankan,1845.0
1,3000010003879,Hotel,Hermes Palace by BENCOOLEN,Banda Aceh,Aceh,"[""CARPARK"",""COFFEE_SHOP"",""ELEVATOR"",""HAS_24_HO...","{""checkInTime"":""14:00"",""checkOutTime"":""12:00"",...","[{""landmarkId"":""6254734"",""geoId"":null,""name"":""...",5.0,Mengesankan,2525.0


## Combine Dataset Based on ID

In [8]:
# rename columns

df_hotel.rename(columns={'id': 'hotelId'}, inplace=True)
df_hotel.rename(columns={'name': 'hotelName'}, inplace=True)
df_hotel.rename(columns={'facilities': 'hotelFacilities'}, inplace=True)
df_room.rename(columns={'facilities': 'roomFacilities'}, inplace=True)
df_hotel.head(2)

Unnamed: 0,hotelId,type,hotelName,city,province,hotelFacilities,properties,nearestPointOfInterests,starRating,userRatingInfo,numReviews
0,3000020003208,Hotel,Kyriad Hotel Muraya Aceh,Banda Aceh,Aceh,"[""CARPARK"",""COFFEE_OR_TEA_IN_LOBBY"",""COFFEE_SH...","{""checkInTime"":""14:00"",""checkOutTime"":""12:00"",...","[{""landmarkId"":""6254734"",""geoId"":null,""name"":""...",4.0,Mengesankan,1845.0
1,3000010003879,Hotel,Hermes Palace by BENCOOLEN,Banda Aceh,Aceh,"[""CARPARK"",""COFFEE_SHOP"",""ELEVATOR"",""HAS_24_HO...","{""checkInTime"":""14:00"",""checkOutTime"":""12:00"",...","[{""landmarkId"":""6254734"",""geoId"":null,""name"":""...",5.0,Mengesankan,2525.0


In [9]:
df = pd.merge(df_hotel, df_room, on='hotelId', how='inner')
df.head(2)

Unnamed: 0,hotelId,type,hotelName,city,province,hotelFacilities,properties,nearestPointOfInterests,starRating,userRatingInfo,...,size,originalRate,baseOccupancy,maxChildOccupancy,maxChildAge,isBreakfastIncluded,isWifiIncluded,isRefundable,hasLivingRoom,roomFacilities
0,3000020003208,Hotel,Kyriad Hotel Muraya Aceh,Banda Aceh,Aceh,"[""CARPARK"",""COFFEE_OR_TEA_IN_LOBBY"",""COFFEE_SH...","{""checkInTime"":""14:00"",""checkOutTime"":""12:00"",...","[{""landmarkId"":""6254734"",""geoId"":null,""name"":""...",4.0,Mengesankan,...,30.0,"{""amount"":""999174"",""currency"":""IDR"",""tax"":""209...",2,1,10,1,1,1,0,"[""AIR_CONDITIONING"",""BATHROBES"",""BLACKOUT_DRAP..."
1,3000020003208,Hotel,Kyriad Hotel Muraya Aceh,Banda Aceh,Aceh,"[""CARPARK"",""COFFEE_OR_TEA_IN_LOBBY"",""COFFEE_SH...","{""checkInTime"":""14:00"",""checkOutTime"":""12:00"",...","[{""landmarkId"":""6254734"",""geoId"":null,""name"":""...",4.0,Mengesankan,...,75.0,"{""amount"":""3305785"",""currency"":""IDR"",""tax"":""69...",2,1,10,1,1,1,0,"[""AIR_CONDITIONING"",""BATHROBES"",""BLACKOUT_DRAP..."


### Filter for only Bali Province

In [10]:
df = df[df['province'] == 'Bali']
df.reset_index(drop=True, inplace=True)
df.shape

(7221, 24)

In [11]:
# drop another columns
drop = ['hotelId', 'hotelName', 'province', 'properties', 'userRatingInfo', 'numReviews', 'id', 'name', 'roomType']
for column_name in drop:
    df.drop(column_name, axis=1, inplace=True)
print(df.shape)
df.head(3)

(7221, 15)


Unnamed: 0,type,city,hotelFacilities,nearestPointOfInterests,starRating,size,originalRate,baseOccupancy,maxChildOccupancy,maxChildAge,isBreakfastIncluded,isWifiIncluded,isRefundable,hasLivingRoom,roomFacilities
0,Hotel,Badung,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...","[{""landmarkId"":""91589773100576"",""geoId"":null,""...",4.0,46.0,"{""amount"":""1227273"",""currency"":""IDR"",""tax"":""25...",3,1,5,1,1,1,0,"[""AIR_CONDITIONING"",""BALCONY_TERRACE"",""BATHROB..."
1,Hotel,Badung,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...","[{""landmarkId"":""91589773100576"",""geoId"":null,""...",4.0,31.0,"{""amount"":""596694"",""currency"":""IDR"",""tax"":""125...",2,1,5,0,1,1,0,"[""AIR_CONDITIONING"",""BATHROBES"",""BATHTUB"",""BLA..."
2,Hotel,Badung,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...","[{""landmarkId"":""91589773100576"",""geoId"":null,""...",4.0,52.0,"{""amount"":""1450413"",""currency"":""IDR"",""tax"":""30...",2,1,5,1,1,1,1,"[""AIR_CONDITIONING"",""BALCONY_TERRACE"",""BATHROB..."


In [12]:
df.to_csv('../Dataset/bali_lodging.csv', index=False)

In [13]:
# Plotting Hotel Type
# plt.figure(figsize=(13, 5))
# sns.countplot(x='type', data=df)
# plt.show()

## Data Cleaning

In [14]:
df.isnull().sum()

type                          0
city                          0
hotelFacilities               0
nearestPointOfInterests       0
starRating                    0
size                       2084
originalRate                  0
baseOccupancy                 0
maxChildOccupancy             0
maxChildAge                   0
isBreakfastIncluded           0
isWifiIncluded                0
isRefundable                  0
hasLivingRoom                 0
roomFacilities                0
dtype: int64

In [15]:
df.duplicated().any()

True

In [16]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.shape

(5137, 15)

## Feature Processing

### Original Rate/Target Processing

In [17]:
df.loc[0, 'originalRate']

'{"amount":"1227273","currency":"IDR","tax":"257727"}'

In [18]:
# Exstract original rate
for i in range (len(df['originalRate'])):
    df.loc[i, 'originalRate'] = df['originalRate'][i].split('"')[3]
    # df['originalRate'][i] = int(df['originalRate'][i].split('"')[11])
df['originalRate'] = df['originalRate'].astype(int)
df.head(2)

Unnamed: 0,type,city,hotelFacilities,nearestPointOfInterests,starRating,size,originalRate,baseOccupancy,maxChildOccupancy,maxChildAge,isBreakfastIncluded,isWifiIncluded,isRefundable,hasLivingRoom,roomFacilities
0,Hotel,Badung,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...","[{""landmarkId"":""91589773100576"",""geoId"":null,""...",4.0,46.0,1227273,3,1,5,1,1,1,0,"[""AIR_CONDITIONING"",""BALCONY_TERRACE"",""BATHROB..."
1,Hotel,Badung,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...","[{""landmarkId"":""91589773100576"",""geoId"":null,""...",4.0,31.0,596694,2,1,5,0,1,1,0,"[""AIR_CONDITIONING"",""BATHROBES"",""BATHTUB"",""BLA..."


In [19]:
df.shape

(5137, 15)

In [20]:
# Remove originalRate below 1
df = df[df['originalRate'] > 10]
df.reset_index(drop=True, inplace=True)
df.shape

(5137, 15)

In [21]:
# Checking the data type
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5137 entries, 0 to 5136
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   type                     5137 non-null   object 
 1   city                     5137 non-null   object 
 2   hotelFacilities          5137 non-null   object 
 3   nearestPointOfInterests  5137 non-null   object 
 4   starRating               5137 non-null   float64
 5   size                     5137 non-null   object 
 6   originalRate             5137 non-null   int32  
 7   baseOccupancy            5137 non-null   int64  
 8   maxChildOccupancy        5137 non-null   int64  
 9   maxChildAge              5137 non-null   int64  
 10  isBreakfastIncluded      5137 non-null   int64  
 11  isWifiIncluded           5137 non-null   int64  
 12  isRefundable             5137 non-null   int64  
 13  hasLivingRoom            5137 non-null   int64  
 14  roomFacilities          

In [22]:
# change size data type to float
df['size'] = df['size'].astype(float)
df['size'].dtype

dtype('float64')

In [23]:
dfPrice = df[['originalRate']]
dfPrice.head(2)

Unnamed: 0,originalRate
0,1227273
1,596694


### Feature Processing Function

In [24]:
# Rooms facilities processing
# How to use : Facilities = getFacilities(df['Facilities'].tolist())
def getFacilities(facilitiesList):
    facilities = []
    for i in facilitiesList:
        m = int(i.count('"') / 2)
        for j in range(1, m+1, 2):
            temp = i.split('"')[j]
            if temp not in facilities:
                facilities.append(temp)
    return facilities

In [25]:
# Remove underscore and capitalize
def columnName(text):
    index1 = text.find('_')
    index2 = index1 + 1
    index3 = index2 + 1
    text = text[: index1]+ text[index2].swapcase()+text[index3 :]
    finalText = text.strip('_')
    return finalText

In [26]:
# hotel facilites will get error use nameFormatHotel instead
def nameFormat(facilitiesName):
    for i in range (len(facilitiesName)):
        facilitiesName[i] = facilitiesName[i].lower()
        if facilitiesName[i].find('_') > 0:
            while True:
                temp = columnName(facilitiesName[i])
                temporary = temp.isalpha()
                if temporary == False:
                    temp = columnName(facilitiesName[i])
                    facilitiesName[i] = columnName(temp)
                    if facilitiesName[i].isalpha() == True:
                        break
                if temporary == True:
                    facilitiesName[i] = temp
                    break
    return facilitiesName

In [27]:
# only for hotel facilities
def nameFormatHotel(facilitiesName):
    for i in range (len(facilitiesName)):
        facilitiesName[i] = facilitiesName[i].lower()
    return facilitiesName

### Getting extraxcted features

In [28]:
# Hotel Facilities to List
hotelFacilities = getFacilities(df['hotelFacilities'].tolist())
hotelFacilities[:5]

['CARPARK',
 'ELEVATOR',
 'HAS_24_HOUR_ROOM_SERVICE',
 'RESTAURANT',
 'RESTAURANT_FOR_BREAKFAST']

In [29]:
# Room Facilities to List
roomFacilities = getFacilities(df['roomFacilities'].tolist())
roomFacilities[:5]

['AIR_CONDITIONING',
 'BALCONY_TERRACE',
 'BATHROBES',
 'BATHTUB',
 'BLACKOUT_DRAPES_CURTAINS']

In [30]:
# nearestPointOfInterests to List
# landmarkType is present at indexes [19, 49, 79, 109]

landmark = df['nearestPointOfInterests'].to_list()
searchText = 'landmarkType'
nearestPoint = []
loopCount = 0
for x in range (len(landmark)):
    loopCount += 1
    i = 0
    column = landmark[x].split('"')
    matched_indexes = []
    while i < len(column):
        if searchText == column[i]:
            matched_indexes.append(i)
        i += 1
    for y in matched_indexes:
        if column[y+2] not in nearestPoint:
            nearestPoint.append(column[y+2])
if 'OTHERS' in nearestPoint:
    nearestPoint.remove('OTHERS')
nearestPoint[:5]

['SHOPPING_AREA', 'OFFICIAL_BUILDING', 'RESTAURANT', 'ATTRACTION', 'BEACH']

In [31]:
# Name formatting and create new dataframe
# hotelFacilities = nameFormatHotel(hotelFacilities)
# roomFacilities = nameFormat(roomFacilities)
# nearestPoint = nameFormat(nearestPoint)

In [32]:
# Create new dataframe
dfHotelEncode = pd.DataFrame(columns=hotelFacilities)
dfRoomEncode = pd.DataFrame(columns=roomFacilities)
dfPointEncode = pd.DataFrame(columns=nearestPoint)

### Extra Features

In [33]:
dfHotelEx = df[['originalRate','type','city','starRating']]
dfRoomEx = df[['originalRate','size', 'baseOccupancy', 'maxChildOccupancy', 'maxChildAge', 'isBreakfastIncluded', 'isWifiIncluded', 'isRefundable', 'hasLivingRoom']]
print(dfHotelEx.shape)
print(dfRoomEx.shape)

(5137, 4)
(5137, 9)


#### Hotel Extra Features

In [34]:
dfHotelEx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5137 entries, 0 to 5136
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   originalRate  5137 non-null   int32  
 1   type          5137 non-null   object 
 2   city          5137 non-null   object 
 3   starRating    5137 non-null   float64
dtypes: float64(1), int32(1), object(2)
memory usage: 140.6+ KB


In [35]:
dfHotelEx['city'].unique()

array(['Badung', 'Denpasar', 'Gianyar', 'Sanur', 'Bangli', 'Buleleng',
       'Klungkung', 'Tabanan', 'Jembrana', 'Karangasem'], dtype=object)

In [36]:
dfHotelEx['type'].unique()

array(['Hotel', 'Resor', 'Vila', 'Apartemen', 'Guest House', 'Homestay',
       'Hostel', 'B&B', 'Camping', 'Hotel Kapsul'], dtype=object)

In [37]:
cityEncode = pd.get_dummies(dfHotelEx['city'], prefix='City')
typeEncode = pd.get_dummies(dfHotelEx['type'], prefix='Type')
dfHotelEx = pd.concat([dfHotelEx, cityEncode, typeEncode], axis=1)

In [38]:
dfHotelEx.drop(['city', 'type'], axis=1, inplace=True)
print(dfHotelEx.shape)
dfHotelEx.head(2)

(5137, 22)


Unnamed: 0,originalRate,starRating,City_Badung,City_Bangli,City_Buleleng,City_Denpasar,City_Gianyar,City_Jembrana,City_Karangasem,City_Klungkung,...,Type_Apartemen,Type_B&B,Type_Camping,Type_Guest House,Type_Homestay,Type_Hostel,Type_Hotel,Type_Hotel Kapsul,Type_Resor,Type_Vila
0,1227273,4.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,596694,4.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [39]:
dfHotelEx.describe()

Unnamed: 0,originalRate,starRating,City_Badung,City_Bangli,City_Buleleng,City_Denpasar,City_Gianyar,City_Jembrana,City_Karangasem,City_Klungkung,...,Type_Apartemen,Type_B&B,Type_Camping,Type_Guest House,Type_Homestay,Type_Hostel,Type_Hotel,Type_Hotel Kapsul,Type_Resor,Type_Vila
count,5137.0,5137.0,5137.0,5137.0,5137.0,5137.0,5137.0,5137.0,5137.0,5137.0,...,5137.0,5137.0,5137.0,5137.0,5137.0,5137.0,5137.0,5137.0,5137.0,5137.0
mean,2411533.0,2.989877,0.507105,0.008565,0.05256,0.047693,0.197975,0.008176,0.032509,0.075141,...,0.015573,0.005061,0.00292,0.085848,0.04088,0.015379,0.447343,0.000973,0.16936,0.216663
std,3758227.0,1.557471,0.499998,0.092161,0.223175,0.213137,0.398512,0.090059,0.177365,0.263645,...,0.12383,0.07097,0.053963,0.280166,0.198031,0.123065,0.497268,0.031186,0.375106,0.412011
min,40313.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,545455.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1210744.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2892562.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,60000000.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
abs(dfHotelEx.corr('spearman')['originalRate']).sort_values(ascending=False)[:11]

originalRate        1.000000
starRating          0.642493
Type_Vila           0.399987
Type_Guest House    0.296345
Type_Resor          0.294015
Type_Homestay       0.242916
Type_Hotel          0.237248
City_Denpasar       0.222190
City_Klungkung      0.165832
Type_Hostel         0.160855
City_Gianyar        0.142126
Name: originalRate, dtype: float64

##### `dfHotelEx` (Feature with Correlation => 0.5)
- starRating          0.642493

In [41]:
dfHotelExPick = dfHotelEx[['starRating']]
dfHotelExPick.head(2)

Unnamed: 0,starRating
0,4.0
1,4.0


#### Room Extra Features

In [42]:
dfRoomEx.head(2)

Unnamed: 0,originalRate,size,baseOccupancy,maxChildOccupancy,maxChildAge,isBreakfastIncluded,isWifiIncluded,isRefundable,hasLivingRoom
0,1227273,46.0,3,1,5,1,1,1,0
1,596694,31.0,2,1,5,0,1,1,0


In [43]:
abs(round(dfRoomEx.corr('spearman'),2)['originalRate']).sort_values(ascending=False)[:11]

originalRate           1.00
size                   0.73
isBreakfastIncluded    0.31
baseOccupancy          0.28
isRefundable           0.19
hasLivingRoom          0.17
maxChildAge            0.08
isWifiIncluded         0.02
maxChildOccupancy      0.01
Name: originalRate, dtype: float64

##### `dfRoomEx` (Feature with Correlation => 0.5)
- size                   0.73

In [44]:
dfRoomExPick = dfRoomEx[['size']]
dfRoomExPick.head(2)

Unnamed: 0,size
0,46.0
1,31.0


### Hotel Feature Selection

In [45]:
dfHotel = pd.concat([df.pop('hotelFacilities'), dfHotelEncode], axis=1)
dfHotel = pd.concat([dfPrice, dfHotel], axis=1)
print(dfHotel.shape)
dfHotel.head(2)

(5137, 224)


Unnamed: 0,originalRate,hotelFacilities,CARPARK,ELEVATOR,HAS_24_HOUR_ROOM_SERVICE,RESTAURANT,RESTAURANT_FOR_BREAKFAST,RESTAURANT_FOR_DINNER,RESTAURANT_FOR_LUNCH,SAFETY_DEPOSIT_BOX,...,CRIBS,PRIVATE_BEACH_NEARBY,RECEPTION_HALL,BEACH_VOLLEYBALL,MINI_GOLF,PRIVATE_BEACH,CHILDREN_CLUB,DARTS,ENTERTAINMENT_PROGRAMME_FOR_CHILDREN,KARAOKE
0,1227273,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",,,,,,,,,...,,,,,,,,,,
1,596694,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",,,,,,,,,...,,,,,,,,,,


In [46]:
for i in range (len(dfHotel)):
    value = dfHotel['hotelFacilities'][i]
    if value == '[]':
        dfHotel.drop([i], axis=0, inplace=True)
        continue
    for j in range (2, len(dfHotel.columns)):
        column_name = dfHotel.columns[j]
        if column_name in value:
            dfHotel.loc[i, column_name] = 1
        else:
            dfHotel.loc[i, column_name] = 0
print(dfHotel.shape)
dfHotel.head()

(5074, 224)


Unnamed: 0,originalRate,hotelFacilities,CARPARK,ELEVATOR,HAS_24_HOUR_ROOM_SERVICE,RESTAURANT,RESTAURANT_FOR_BREAKFAST,RESTAURANT_FOR_DINNER,RESTAURANT_FOR_LUNCH,SAFETY_DEPOSIT_BOX,...,CRIBS,PRIVATE_BEACH_NEARBY,RECEPTION_HALL,BEACH_VOLLEYBALL,MINI_GOLF,PRIVATE_BEACH,CHILDREN_CLUB,DARTS,ENTERTAINMENT_PROGRAMME_FOR_CHILDREN,KARAOKE
0,1227273,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,1,0,0,0
1,596694,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,1,0,0,0
2,1450413,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,1,0,0,0
3,855372,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,1,0,0,0
4,2545455,"[""CARPARK"",""COFFEE_SHOP"",""ELEVATOR"",""HAS_24_HO...",1,1,1,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,0


In [47]:
# Remove hotelFacilities column
dfHotel.drop('hotelFacilities', axis=1, inplace=True)

In [48]:
for i in range (1, len(dfHotel.columns)):
    dfHotel = dfHotel.astype({dfHotel.columns[i]: int})

In [49]:
abs(round(dfHotel.corr('spearman'),2)['originalRate']).sort_values(ascending=False)[:11]

originalRate               1.00
HAIR_DRYER                 0.44
SAFETY_DEPOSIT_BOX         0.43
BATHTUB                    0.43
BATHROBE                   0.42
IN_ROOM_SAFE               0.41
BABYSITTING                0.41
POOL                       0.38
SPA                        0.37
SEPARATE_SHOWER_AND_TUB    0.35
MASSAGE                    0.35
Name: originalRate, dtype: float64

In [50]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
import xgboost as xgb

y = dfHotel['originalRate']
X = dfHotel.drop('originalRate', axis=1)
select = SelectKBest(score_func=f_regression, k=5)
X_new = select.fit(X, y)
X_new.get_feature_names_out()


array(['BATHTUB', 'BATHROBE', 'SEPARATE_SHOWER_AND_TUB', 'BABYSITTING',
       'WEDDING_SERVICE'], dtype=object)

##### `dfHotel` (Feature with Correlation > 0.5)
- No hotel feature that has more than 0.5 correlation with target feature

### Point of Interest Feature Selection

In [51]:
dfPoint = pd.concat([df.pop('nearestPointOfInterests'), dfPointEncode], axis=1)
dfPoint = pd.concat([dfPrice, dfPoint], axis=1)
dfPoint.head(2)

Unnamed: 0,originalRate,nearestPointOfInterests,SHOPPING_AREA,OFFICIAL_BUILDING,RESTAURANT,ATTRACTION,BEACH,MONUMENT,TERMINAL,PARK,HOSPITAL,MUSEUM,GALLERY,PLACE_OF_WORSHIP,TRAIN_STATION,ZOO,ENTERTAINMENT,GARDEN,THEATER
0,1227273,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",,,,,,,,,,,,,,,,,
1,596694,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",,,,,,,,,,,,,,,,,


In [52]:
for i in range (len(dfPoint)):
    value = dfPoint['nearestPointOfInterests'][i]
    if value == '[]':
        dfPoint.drop([i], axis=0, inplace=True)
        continue
    for j in range (2, len(dfPoint.columns)):
        column_name = dfPoint.columns[j]
        if column_name in value:
            dfPoint.loc[i, column_name] = 1
        else:
            dfPoint.loc[i, column_name] = 0
print(dfPoint.shape)
dfPoint.head()

(5113, 19)


Unnamed: 0,originalRate,nearestPointOfInterests,SHOPPING_AREA,OFFICIAL_BUILDING,RESTAURANT,ATTRACTION,BEACH,MONUMENT,TERMINAL,PARK,HOSPITAL,MUSEUM,GALLERY,PLACE_OF_WORSHIP,TRAIN_STATION,ZOO,ENTERTAINMENT,GARDEN,THEATER
0,1227273,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,596694,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1450413,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,855372,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2545455,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [53]:
dfPoint.drop('nearestPointOfInterests', axis=1, inplace=True)

In [54]:
# Convert to int
for i in range (1, len(dfPoint.columns)):
    dfPoint = dfPoint.astype({dfPoint.columns[i]: int})

In [55]:
abs(round(dfPoint.corr('spearman'),2)['originalRate']).sort_values(ascending=False)[:11]

originalRate         1.00
ATTRACTION           0.19
MONUMENT             0.18
TERMINAL             0.15
OFFICIAL_BUILDING    0.14
SHOPPING_AREA        0.11
PARK                 0.09
HOSPITAL             0.08
GALLERY              0.06
RESTAURANT           0.05
MUSEUM               0.05
Name: originalRate, dtype: float64

##### `dfPoint` (Feature with Correlation > 0.5)
- No Near Point of Interests that has more than 0.5 correlation with target feature

### Room Feature Selection

In [56]:
dfRoom = pd.concat([df.pop('roomFacilities'), dfRoomEncode], axis=1)
dfRoom = pd.concat([dfPrice, dfRoom], axis=1)
print(dfRoom.shape)
dfRoom.head(2)

(5137, 54)


Unnamed: 0,originalRate,roomFacilities,AIR_CONDITIONING,BALCONY_TERRACE,BATHROBES,BATHTUB,BLACKOUT_DRAPES_CURTAINS,COFFEE_TEA_MAKER,COMPLIMENTARY_BOTTLED_WATER,DESK,...,SOFA_BED,HOUSEKEEPING,IRONING_BOARD_ON_REQUEST,BALCONY,SHARED_BATHROOM,TWENTY_FOUR_HOUR_ROOM_SERVICE,FREE_CRIBS,JACUZZI_BATHTUB,SEPARATE_DINING_AREA,EXTRA_BEDS_AVAILABLE
0,1227273,"[""AIR_CONDITIONING"",""BALCONY_TERRACE"",""BATHROB...",,,,,,,,,...,,,,,,,,,,
1,596694,"[""AIR_CONDITIONING"",""BATHROBES"",""BATHTUB"",""BLA...",,,,,,,,,...,,,,,,,,,,


In [57]:
for i in range(len(dfRoom)):
    value = dfRoom['roomFacilities'][i]
    if value == '[]':
        dfRoom.drop([i], axis=0, inplace=True)
        continue
    for j in range (2, len(dfRoom.columns)):
        column_name = dfRoom.columns[j]
        if column_name in value:
            dfRoom.loc[i, column_name] = 1
        else:
            dfRoom.loc[i, column_name] = 0
print(dfRoom.shape)
dfRoom.head()

(4948, 54)


Unnamed: 0,originalRate,roomFacilities,AIR_CONDITIONING,BALCONY_TERRACE,BATHROBES,BATHTUB,BLACKOUT_DRAPES_CURTAINS,COFFEE_TEA_MAKER,COMPLIMENTARY_BOTTLED_WATER,DESK,...,SOFA_BED,HOUSEKEEPING,IRONING_BOARD_ON_REQUEST,BALCONY,SHARED_BATHROOM,TWENTY_FOUR_HOUR_ROOM_SERVICE,FREE_CRIBS,JACUZZI_BATHTUB,SEPARATE_DINING_AREA,EXTRA_BEDS_AVAILABLE
0,1227273,"[""AIR_CONDITIONING"",""BALCONY_TERRACE"",""BATHROB...",1,1,1,1,1,1,1,1,...,0,0,0,1,0,0,0,0,0,0
1,596694,"[""AIR_CONDITIONING"",""BATHROBES"",""BATHTUB"",""BLA...",1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1450413,"[""AIR_CONDITIONING"",""BALCONY_TERRACE"",""BATHROB...",1,1,1,1,1,1,1,1,...,0,0,0,1,0,0,0,0,0,0
3,855372,"[""AIR_CONDITIONING"",""BALCONY_TERRACE"",""BATHROB...",1,1,1,1,1,1,1,1,...,0,0,0,1,0,0,0,0,0,0
4,2545455,"[""AIR_CONDITIONING"",""BATHROBES"",""COFFEE_TEA_MA...",1,0,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [58]:
dfRoom.drop('roomFacilities', axis=1, inplace=True)

In [59]:
for i in range (1, len(dfRoom.columns)):
    dfRoom = dfRoom.astype({dfRoom.columns[i]: int})

In [60]:
abs(round(dfRoom.corr('spearman'),2)['originalRate']).sort_values(ascending=False)[:11]

originalRate                   1.00
HAIR_DRYER                     0.56
BATHTUB                        0.53
PRIVATE_POOL                   0.53
IN_ROOM_SAFE                   0.50
BATHROBES                      0.49
COFFEE_TEA_MAKER               0.44
MINIBAR                        0.41
COMPLIMENTARY_BOTTLED_WATER    0.37
SEPARATE_DINING_AREA           0.37
KITCHEN                        0.36
Name: originalRate, dtype: float64

##### `dfRoom` (Feature with Correlation => 0.5)
- HAIR_DRYER                     0.56
- BATHTUB                        0.53
- PRIVATE_POOL                   0.53
- IN_ROOM_SAFE                   0.50

In [61]:
dfRoomPick = dfRoom[['HAIR_DRYER', 'BATHTUB', 'PRIVATE_POOL', 'IN_ROOM_SAFE']]
dfRoomPick.head(2)

Unnamed: 0,HAIR_DRYER,BATHTUB,PRIVATE_POOL,IN_ROOM_SAFE
0,1,1,0,1
1,1,1,0,1


### Feature Selected
- `size`                           0.73
- `starRating`                     0.64
- `AIR_DRYER`                      0.56
- `BATHTUB`                        0.53
- `PRIVATE_POOL`                   0.53
- `IN_ROOM_SAFE`                   0.50

In [62]:
# combine all selected feture into dfReg
dfReg = df[['originalRate']]
dfReg = dfReg.join(dfHotelExPick)
dfReg = dfReg.join(dfRoomExPick)
dfReg = dfReg.join(dfRoomPick)
dfReg.head(2)

Unnamed: 0,originalRate,starRating,size,HAIR_DRYER,BATHTUB,PRIVATE_POOL,IN_ROOM_SAFE
0,1227273,4.0,46.0,1.0,1.0,0.0,1.0
1,596694,4.0,31.0,1.0,1.0,0.0,1.0


In [63]:
dfReg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5137 entries, 0 to 5136
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   originalRate  5137 non-null   int32  
 1   starRating    5137 non-null   float64
 2   size          5137 non-null   float64
 3   HAIR_DRYER    4948 non-null   float64
 4   BATHTUB       4948 non-null   float64
 5   PRIVATE_POOL  4948 non-null   float64
 6   IN_ROOM_SAFE  4948 non-null   float64
dtypes: float64(6), int32(1)
memory usage: 430.0+ KB


In [64]:
dfReg.to_csv('../Dataset/selected_feature.csv', index=False)

## Without Feature Selection

In [65]:
dfRoom.drop('originalRate', axis= 1, inplace=True)
dfHotel.drop('originalRate', axis= 1, inplace=True)
dfPoint.drop('originalRate', axis= 1, inplace=True)
dfFull = df[['originalRate']]


In [66]:
combine = [dfFull,  dfRoomExPick, dfHotelExPick, dfRoom, dfHotel, dfPoint]
dfFull = pd.concat(combine, axis=1)


In [67]:
dfFull.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5137 entries, 0 to 5136
Columns: 294 entries, originalRate to THEATER
dtypes: float64(293), int32(1)
memory usage: 11.5+ MB


In [68]:
dfFull.to_csv('../Dataset/full_feature.csv', index=False)