In [1]:
import pandas as pd
import numpy as np

## Input Data

In [2]:
df = pd.read_csv('../Dataset/clean_data.csv')
df.head(3)

Unnamed: 0,type,city,hotelFacilities,nearestPointOfInterests,starRating,size,originalRate,baseOccupancy,maxChildOccupancy,maxChildAge,isBreakfastIncluded,isWifiIncluded,isRefundable,hasLivingRoom,roomFacilities
0,Hotel,Badung,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...","[{""landmarkId"":""91589773100576"",""geoId"":null,""...",4.0,46.0,1227273,3,1,5,1,1,1,0,"[""AIR_CONDITIONING"",""BALCONY_TERRACE"",""BATHROB..."
1,Hotel,Badung,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...","[{""landmarkId"":""91589773100576"",""geoId"":null,""...",4.0,31.0,596694,2,1,5,0,1,1,0,"[""AIR_CONDITIONING"",""BATHROBES"",""BATHTUB"",""BLA..."
2,Hotel,Badung,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...","[{""landmarkId"":""91589773100576"",""geoId"":null,""...",4.0,52.0,1450413,2,1,5,1,1,1,1,"[""AIR_CONDITIONING"",""BALCONY_TERRACE"",""BATHROB..."


In [3]:
print(f'Total number of rows: {df.shape[0]}')
print(f'Total number of columns: {df.shape[1]}')

Total number of rows: 4683
Total number of columns: 15


### Feature Processing Function

In [4]:
# Rooms facilities processing
# How to use : Facilities = getFacilities(df['Facilities'].tolist())
def getFacilities(facilitiesList):
    facilities = []
    for i in facilitiesList:
        m = int(i.count('"') / 2)
        for j in range(1, m+1, 2):
            temp = i.split('"')[j]
            if temp not in facilities:
                facilities.append(temp)
    return facilities

In [5]:
# seperate the target variable
dfPrice = df[['originalRate']]
dfPrice.head(2)

Unnamed: 0,originalRate
0,1227273
1,596694


### Getting extraxcted features

In [6]:
# Hotel Facilities to List
hotelFacilities = getFacilities(df['hotelFacilities'].tolist())
hotelFacilities[:5]

['CARPARK',
 'ELEVATOR',
 'HAS_24_HOUR_ROOM_SERVICE',
 'RESTAURANT',
 'RESTAURANT_FOR_BREAKFAST']

In [7]:
# Room Facilities to List
roomFacilities = getFacilities(df['roomFacilities'].tolist())
roomFacilities[:5]

['AIR_CONDITIONING',
 'BALCONY_TERRACE',
 'BATHROBES',
 'BATHTUB',
 'BLACKOUT_DRAPES_CURTAINS']

In [8]:
# nearestPointOfInterests to List
# landmarkType is present at indexes [19, 49, 79, 109]

landmark = df['nearestPointOfInterests'].to_list()
searchText = 'landmarkType'
nearestPoint = []
loopCount = 0
for x in range (len(landmark)):
    loopCount += 1
    i = 0
    column = landmark[x].split('"')
    matched_indexes = []
    while i < len(column):
        if searchText == column[i]:
            matched_indexes.append(i)
        i += 1
    for y in matched_indexes:
        if column[y+2] not in nearestPoint:
            nearestPoint.append(column[y+2])
if 'OTHERS' in nearestPoint:
    nearestPoint.remove('OTHERS')
nearestPoint[:5]

['SHOPPING_AREA', 'OFFICIAL_BUILDING', 'RESTAURANT', 'ATTRACTION', 'BEACH']

In [9]:
# Create new dataframe
dfHotelEncode = pd.DataFrame(columns=hotelFacilities)
dfRoomEncode = pd.DataFrame(columns=roomFacilities)
dfPointEncode = pd.DataFrame(columns=nearestPoint)

### Extra Features

In [10]:
dfHotelEx = df[['originalRate','type','city','starRating']]
dfRoomEx = df[['originalRate','size', 'baseOccupancy', 'maxChildOccupancy', 'maxChildAge', 'isBreakfastIncluded', 'isWifiIncluded', 'isRefundable', 'hasLivingRoom']]
print(dfHotelEx.shape)
print(dfRoomEx.shape)

(4683, 4)
(4683, 9)


#### Hotel Extra Features

In [12]:
dfHotelEx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4683 entries, 0 to 4682
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   originalRate  4683 non-null   int64  
 1   type          4683 non-null   object 
 2   city          4683 non-null   object 
 3   starRating    4683 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 146.5+ KB


In [13]:
dfHotelEx['city'].unique()

array(['Badung', 'Denpasar', 'Gianyar', 'Sanur', 'Bangli', 'Buleleng',
       'Klungkung', 'Tabanan', 'Jembrana', 'Karangasem'], dtype=object)

In [14]:
dfHotelEx['type'].unique()

array(['Hotel', 'Resor', 'Vila', 'Apartemen', 'Guest House', 'Homestay',
       'Hostel', 'B&B', 'Camping', 'Hotel Kapsul'], dtype=object)

In [15]:
cityEncode = pd.get_dummies(dfHotelEx['city'], prefix='City')
typeEncode = pd.get_dummies(dfHotelEx['type'], prefix='Type')
dfHotelEx = pd.concat([dfHotelEx, cityEncode, typeEncode], axis=1)

In [16]:
dfHotelEx.drop(['city', 'type'], axis=1, inplace=True)
print(dfHotelEx.shape)
dfHotelEx.head(2)

(4683, 22)


Unnamed: 0,originalRate,starRating,City_Badung,City_Bangli,City_Buleleng,City_Denpasar,City_Gianyar,City_Jembrana,City_Karangasem,City_Klungkung,...,Type_Apartemen,Type_B&B,Type_Camping,Type_Guest House,Type_Homestay,Type_Hostel,Type_Hotel,Type_Hotel Kapsul,Type_Resor,Type_Vila
0,1227273,4.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,596694,4.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [17]:
dfHotelEx.describe()

Unnamed: 0,originalRate,starRating,City_Badung,City_Bangli,City_Buleleng,City_Denpasar,City_Gianyar,City_Jembrana,City_Karangasem,City_Klungkung,...,Type_Apartemen,Type_B&B,Type_Camping,Type_Guest House,Type_Homestay,Type_Hostel,Type_Hotel,Type_Hotel Kapsul,Type_Resor,Type_Vila
count,4683.0,4683.0,4683.0,4683.0,4683.0,4683.0,4683.0,4683.0,4683.0,4683.0,...,4683.0,4683.0,4683.0,4683.0,4683.0,4683.0,4683.0,4683.0,4683.0,4683.0
mean,1834629.0,2.934764,0.497971,0.009396,0.054452,0.049541,0.196882,0.008755,0.03438,0.077087,...,0.016442,0.005552,0.003203,0.09054,0.043562,0.014948,0.457613,0.000854,0.162716,0.20457
std,1805658.0,1.527116,0.500049,0.096485,0.226932,0.217018,0.397685,0.093168,0.182222,0.266758,...,0.127183,0.074313,0.056511,0.286985,0.20414,0.121356,0.498253,0.029217,0.369146,0.40343
min,40313.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,525950.5,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1120868.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2520661.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,8760331.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Room Extra Features

In [18]:
dfRoomEx.head(2)

Unnamed: 0,originalRate,size,baseOccupancy,maxChildOccupancy,maxChildAge,isBreakfastIncluded,isWifiIncluded,isRefundable,hasLivingRoom
0,1227273,46.0,3,1,5,1,1,1,0
1,596694,31.0,2,1,5,0,1,1,0


### Hotel Feature Selection

In [19]:
dfHotel = pd.concat([df.pop('hotelFacilities'), dfHotelEncode], axis=1)
dfHotel = pd.concat([dfPrice, dfHotel], axis=1)
print(dfHotel.shape)
dfHotel.head(2)

(4683, 219)


Unnamed: 0,originalRate,hotelFacilities,CARPARK,ELEVATOR,HAS_24_HOUR_ROOM_SERVICE,RESTAURANT,RESTAURANT_FOR_BREAKFAST,RESTAURANT_FOR_DINNER,RESTAURANT_FOR_LUNCH,SAFETY_DEPOSIT_BOX,...,WEDDING_SERVICE,WATER_PARK_ACCESS_SURCHARGE,SURFING,CRIBS,PRIVATE_BEACH_NEARBY,RECEPTION_HALL,CHILDREN_CLUB,DARTS,ENTERTAINMENT_PROGRAMME_FOR_CHILDREN,KARAOKE
0,1227273,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",,,,,,,,,...,,,,,,,,,,
1,596694,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",,,,,,,,,...,,,,,,,,,,


In [20]:
for i in range (len(dfHotel)):
    value = dfHotel['hotelFacilities'][i]
    if value == '[]':
        dfHotel.drop([i], axis=0, inplace=True)
        continue
    for j in range (2, len(dfHotel.columns)):
        column_name = dfHotel.columns[j]
        if column_name in value:
            dfHotel.loc[i, column_name] = 1
        else:
            dfHotel.loc[i, column_name] = 0
print(dfHotel.shape)
dfHotel.head()

(4630, 219)


Unnamed: 0,originalRate,hotelFacilities,CARPARK,ELEVATOR,HAS_24_HOUR_ROOM_SERVICE,RESTAURANT,RESTAURANT_FOR_BREAKFAST,RESTAURANT_FOR_DINNER,RESTAURANT_FOR_LUNCH,SAFETY_DEPOSIT_BOX,...,WEDDING_SERVICE,WATER_PARK_ACCESS_SURCHARGE,SURFING,CRIBS,PRIVATE_BEACH_NEARBY,RECEPTION_HALL,CHILDREN_CLUB,DARTS,ENTERTAINMENT_PROGRAMME_FOR_CHILDREN,KARAOKE
0,1227273,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",1,1,1,1,1,1,1,1,...,1,0,0,0,0,0,1,0,0,0
1,596694,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",1,1,1,1,1,1,1,1,...,1,0,0,0,0,0,1,0,0,0
2,1450413,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",1,1,1,1,1,1,1,1,...,1,0,0,0,0,0,1,0,0,0
3,855372,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",1,1,1,1,1,1,1,1,...,1,0,0,0,0,0,1,0,0,0
4,2545455,"[""CARPARK"",""COFFEE_SHOP"",""ELEVATOR"",""HAS_24_HO...",1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Remove hotelFacilities column
dfHotel.drop('hotelFacilities', axis=1, inplace=True)

In [22]:
for i in range (1, len(dfHotel.columns)):
    dfHotel = dfHotel.astype({dfHotel.columns[i]: int})

### Point of Interest Feature Selection

In [23]:
dfPoint = pd.concat([df.pop('nearestPointOfInterests'), dfPointEncode], axis=1)
dfPoint = pd.concat([dfPrice, dfPoint], axis=1)
dfPoint.head(2)

Unnamed: 0,originalRate,nearestPointOfInterests,SHOPPING_AREA,OFFICIAL_BUILDING,RESTAURANT,ATTRACTION,BEACH,MONUMENT,TERMINAL,PARK,HOSPITAL,MUSEUM,GALLERY,PLACE_OF_WORSHIP,TRAIN_STATION,ZOO,ENTERTAINMENT,GARDEN,THEATER
0,1227273,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",,,,,,,,,,,,,,,,,
1,596694,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",,,,,,,,,,,,,,,,,


In [24]:
for i in range (len(dfPoint)):
    value = dfPoint['nearestPointOfInterests'][i]
    if value == '[]':
        dfPoint.drop([i], axis=0, inplace=True)
        continue
    for j in range (2, len(dfPoint.columns)):
        column_name = dfPoint.columns[j]
        if column_name in value:
            dfPoint.loc[i, column_name] = 1
        else:
            dfPoint.loc[i, column_name] = 0
print(dfPoint.shape)
dfPoint.head()

(4659, 19)


Unnamed: 0,originalRate,nearestPointOfInterests,SHOPPING_AREA,OFFICIAL_BUILDING,RESTAURANT,ATTRACTION,BEACH,MONUMENT,TERMINAL,PARK,HOSPITAL,MUSEUM,GALLERY,PLACE_OF_WORSHIP,TRAIN_STATION,ZOO,ENTERTAINMENT,GARDEN,THEATER
0,1227273,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,596694,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1450413,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,855372,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2545455,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
dfPoint.drop('nearestPointOfInterests', axis=1, inplace=True)

In [26]:
# Convert to int
for i in range (1, len(dfPoint.columns)):
    dfPoint = dfPoint.astype({dfPoint.columns[i]: int})

### Room Feature Selection

In [27]:
dfRoom = pd.concat([df.pop('roomFacilities'), dfRoomEncode], axis=1)
dfRoom = pd.concat([dfPrice, dfRoom], axis=1)
print(dfRoom.shape)
dfRoom.head(2)

(4683, 52)


Unnamed: 0,originalRate,roomFacilities,AIR_CONDITIONING,BALCONY_TERRACE,BATHROBES,BATHTUB,BLACKOUT_DRAPES_CURTAINS,COFFEE_TEA_MAKER,COMPLIMENTARY_BOTTLED_WATER,DESK,...,SOFA_BED,HOUSEKEEPING,IRONING_BOARD_ON_REQUEST,BALCONY,SHARED_BATHROOM,TWENTY_FOUR_HOUR_ROOM_SERVICE,FREE_CRIBS,JACUZZI_BATHTUB,SEPARATE_DINING_AREA,EXTRA_BEDS_AVAILABLE
0,1227273,"[""AIR_CONDITIONING"",""BALCONY_TERRACE"",""BATHROB...",,,,,,,,,...,,,,,,,,,,
1,596694,"[""AIR_CONDITIONING"",""BATHROBES"",""BATHTUB"",""BLA...",,,,,,,,,...,,,,,,,,,,


In [28]:
for i in range(len(dfRoom)):
    value = dfRoom['roomFacilities'][i]
    if value == '[]':
        dfRoom.drop([i], axis=0, inplace=True)
        continue
    for j in range (2, len(dfRoom.columns)):
        column_name = dfRoom.columns[j]
        if column_name in value:
            dfRoom.loc[i, column_name] = 1
        else:
            dfRoom.loc[i, column_name] = 0
print(dfRoom.shape)


(4561, 52)


In [29]:
dfRoom.drop('roomFacilities', axis=1, inplace=True)

In [30]:
for i in range (1, len(dfRoom.columns)):
    dfRoom = dfRoom.astype({dfRoom.columns[i]: int})

In [31]:
dfRoom.head()

Unnamed: 0,originalRate,AIR_CONDITIONING,BALCONY_TERRACE,BATHROBES,BATHTUB,BLACKOUT_DRAPES_CURTAINS,COFFEE_TEA_MAKER,COMPLIMENTARY_BOTTLED_WATER,DESK,DVD_PLAYER,...,SOFA_BED,HOUSEKEEPING,IRONING_BOARD_ON_REQUEST,BALCONY,SHARED_BATHROOM,TWENTY_FOUR_HOUR_ROOM_SERVICE,FREE_CRIBS,JACUZZI_BATHTUB,SEPARATE_DINING_AREA,EXTRA_BEDS_AVAILABLE
0,1227273,1,1,1,1,1,1,1,1,0,...,0,0,0,1,0,0,0,0,0,0
1,596694,1,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1450413,1,1,1,1,1,1,1,1,0,...,0,0,0,1,0,0,0,0,0,0
3,855372,1,1,1,1,1,1,1,1,0,...,0,0,0,1,0,0,0,0,0,0
4,2545455,1,0,1,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


## Without Feature Selection

In [32]:
dfRoom.drop('originalRate', axis= 1, inplace=True)
dfHotel.drop('originalRate', axis= 1, inplace=True)
dfPoint.drop('originalRate', axis= 1, inplace=True)
dfHotelEx.drop('originalRate', axis= 1, inplace=True)
dfRoomEx.drop('originalRate', axis= 1, inplace=True)
df = df[['originalRate']]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfRoomEx.drop('originalRate', axis= 1, inplace=True)


In [33]:
combine = [df,  dfRoomEx, dfHotelEx, dfRoom, dfHotel, dfPoint]
df = pd.concat(combine, axis=1)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4490 entries, 0 to 4489
Columns: 314 entries, originalRate to THEATER
dtypes: float64(286), int64(8), uint8(20)
memory usage: 10.2 MB


In [35]:
df.to_csv('../Dataset/encoded_data.csv', index=False)