In [1]:
import pandas as pd
import numpy as np

## Input Data

In [2]:
df = pd.read_csv('../Dataset/clean_data.csv')
df.head(3)

Unnamed: 0,type,city,hotelFacilities,nearestPointOfInterests,starRating,size,originalRate,baseOccupancy,maxChildOccupancy,maxChildAge,isBreakfastIncluded,isWifiIncluded,isRefundable,hasLivingRoom,roomFacilities
0,Hotel,Badung,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...","[{""landmarkId"":""91589773100576"",""geoId"":null,""...",4,46.0,1227273,3,1,5,1,1,1,0,"[""AIR_CONDITIONING"",""BALCONY_TERRACE"",""BATHROB..."
1,Hotel,Badung,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...","[{""landmarkId"":""91589773100576"",""geoId"":null,""...",4,31.0,596694,2,1,5,0,1,1,0,"[""AIR_CONDITIONING"",""BATHROBES"",""BATHTUB"",""BLA..."
2,Hotel,Badung,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...","[{""landmarkId"":""91589773100576"",""geoId"":null,""...",4,52.0,1450413,2,1,5,1,1,1,1,"[""AIR_CONDITIONING"",""BALCONY_TERRACE"",""BATHROB..."


In [3]:
print(f'Total number of rows: {df.shape[0]}')
print(f'Total number of columns: {df.shape[1]}')

Total number of rows: 5239
Total number of columns: 15


### Feature Processing Function

In [4]:
# Rooms facilities processing
# How to use : Facilities = getFacilities(df['Facilities'].tolist())
def getFacilities(facilitiesList):
    facilities = []
    for i in facilitiesList:
        m = int(i.count('"') / 2)
        for j in range(1, m+1, 2):
            temp = i.split('"')[j]
            if temp not in facilities:
                facilities.append(temp)
    return facilities

In [5]:
# seperate the target variable
dfPrice = df[['originalRate']]
dfPrice.head(2)

Unnamed: 0,originalRate
0,1227273
1,596694


### Getting extraxcted features

In [6]:
# Hotel Facilities to List
hotelFacilities = getFacilities(df['hotelFacilities'].tolist())
print(f' Total Unique Hotel Facilities: {len(hotelFacilities)}')

 Total Unique Hotel Facilities: 240


In [7]:
# Room Facilities to List
roomFacilities = getFacilities(df['roomFacilities'].tolist())
print(f' Total Unique Hotel Facilities: {len(roomFacilities)}')

 Total Unique Hotel Facilities: 110


In [8]:
# nearestPointOfInterests to List
# landmarkType is present at indexes [19, 49, 79, 109]

landmark = df['nearestPointOfInterests'].to_list()
searchText = 'landmarkType'
nearestPoint = []
loopCount = 0
for x in range (len(landmark)):
    loopCount += 1
    i = 0
    column = landmark[x].split('"')
    matched_indexes = []
    while i < len(column):
        if searchText == column[i]:
            matched_indexes.append(i)
        i += 1
    for y in matched_indexes:
        if column[y+2] not in nearestPoint:
            nearestPoint.append(column[y+2])
if 'OTHERS' in nearestPoint:
    nearestPoint.remove('OTHERS')
print(f' Total Unique Hotel Facilities: {len(nearestPoint)}')

 Total Unique Hotel Facilities: 19


In [9]:
# Create new dataframe
dfHotelEncode = pd.DataFrame(columns=hotelFacilities)
dfRoomEncode = pd.DataFrame(columns=roomFacilities)
dfPointEncode = pd.DataFrame(columns=nearestPoint)

### Extra Features

In [10]:
dfHotelEx = df[['originalRate','type','city','starRating']]
dfRoomEx = df[['originalRate','size', 'baseOccupancy', 'maxChildOccupancy', 'maxChildAge', 'isBreakfastIncluded', 'isWifiIncluded', 'isRefundable', 'hasLivingRoom']]
print(dfHotelEx.shape)
print(dfRoomEx.shape)

(5239, 4)
(5239, 9)


#### Hotel Extra Features

In [11]:
dfHotelEx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5239 entries, 0 to 5238
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   originalRate  5239 non-null   int64 
 1   type          5239 non-null   object
 2   city          5239 non-null   object
 3   starRating    5239 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 163.8+ KB


In [12]:
dfHotelEx['city'].unique()

array(['Badung', 'Denpasar', 'Gianyar', 'Sanur', 'Bangli', 'Buleleng',
       'Klungkung', 'Tabanan', 'Jembrana', 'Karangasem'], dtype=object)

In [13]:
dfHotelEx['type'].unique()

array(['Hotel', 'Resor', 'Apartemen', 'Vila', 'Guest House', 'Homestay',
       'B&B', 'Hostel', 'Camping', 'Lainnya', 'Hotel Kapsul'],
      dtype=object)

In [14]:
cityEncode = pd.get_dummies(dfHotelEx['city'], prefix='City')
typeEncode = pd.get_dummies(dfHotelEx['type'], prefix='Type')
dfHotelEx = pd.concat([dfHotelEx, cityEncode, typeEncode], axis=1)

In [15]:
dfHotelEx.drop(['city', 'type'], axis=1, inplace=True)
print(dfHotelEx.shape)
dfHotelEx.head(2)

(5239, 23)


Unnamed: 0,originalRate,starRating,City_Badung,City_Bangli,City_Buleleng,City_Denpasar,City_Gianyar,City_Jembrana,City_Karangasem,City_Klungkung,...,Type_B&B,Type_Camping,Type_Guest House,Type_Homestay,Type_Hostel,Type_Hotel,Type_Hotel Kapsul,Type_Lainnya,Type_Resor,Type_Vila
0,1227273,4,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,596694,4,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [16]:
dfHotelEx.describe()

Unnamed: 0,originalRate,starRating,City_Badung,City_Bangli,City_Buleleng,City_Denpasar,City_Gianyar,City_Jembrana,City_Karangasem,City_Klungkung,...,Type_B&B,Type_Camping,Type_Guest House,Type_Homestay,Type_Hostel,Type_Hotel,Type_Hotel Kapsul,Type_Lainnya,Type_Resor,Type_Vila
count,5239.0,5239.0,5239.0,5239.0,5239.0,5239.0,5239.0,5239.0,5239.0,5239.0,...,5239.0,5239.0,5239.0,5239.0,5239.0,5239.0,5239.0,5239.0,5239.0,5239.0
mean,1314786.0,3.35293,0.427753,0.011071,0.073869,0.047337,0.196984,0.01298,0.053636,0.099256,...,0.020996,0.002672,0.128841,0.053064,0.020996,0.472418,0.000764,0.001145,0.137049,0.145829
std,1245683.0,1.162317,0.4948,0.104644,0.261583,0.21238,0.397758,0.113197,0.22532,0.299033,...,0.143386,0.05163,0.335056,0.224182,0.143386,0.499286,0.027624,0.033825,0.343932,0.352969
min,40313.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,454545.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,867769.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1714786.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,6611571.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Room Extra Features

In [17]:
dfRoomEx.head(2)

Unnamed: 0,originalRate,size,baseOccupancy,maxChildOccupancy,maxChildAge,isBreakfastIncluded,isWifiIncluded,isRefundable,hasLivingRoom
0,1227273,46.0,3,1,5,1,1,1,0
1,596694,31.0,2,1,5,0,1,1,0


### Hotel Feature Selection

In [18]:
dfHotel = pd.concat([df.pop('hotelFacilities'), dfHotelEncode], axis=1)
dfHotel = pd.concat([dfPrice, dfHotel], axis=1)
print(dfHotel.shape)
dfHotel.head(2)

(5239, 242)


Unnamed: 0,originalRate,hotelFacilities,CARPARK,ELEVATOR,HAS_24_HOUR_ROOM_SERVICE,RESTAURANT,RESTAURANT_FOR_BREAKFAST,RESTAURANT_FOR_DINNER,RESTAURANT_FOR_LUNCH,SAFETY_DEPOSIT_BOX,...,PETS_ALLOWED,SMALL_PETS_ALLOWED,WATER_PARK_ACCESS_SURCHARGE,PRIVATE_BEACH,SURFING,PRIVATE_BEACH_NEARBY,BEACH_SUN_LOUNGERS,DARTS,ENTERTAINMENT_PROGRAMME_FOR_CHILDREN,KARAOKE
0,1227273,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",,,,,,,,,...,,,,,,,,,,
1,596694,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",,,,,,,,,...,,,,,,,,,,


In [19]:
for i in range (len(dfHotel)):
    value = dfHotel['hotelFacilities'][i]
    if value == '[]':
        dfHotel.drop([i], axis=0, inplace=True)
        continue
    for j in range (2, len(dfHotel.columns)):
        column_name = dfHotel.columns[j]
        if column_name in value:
            dfHotel.loc[i, column_name] = 1
        else:
            dfHotel.loc[i, column_name] = 0
print(dfHotel.shape)
dfHotel.head()

(5177, 242)


Unnamed: 0,originalRate,hotelFacilities,CARPARK,ELEVATOR,HAS_24_HOUR_ROOM_SERVICE,RESTAURANT,RESTAURANT_FOR_BREAKFAST,RESTAURANT_FOR_DINNER,RESTAURANT_FOR_LUNCH,SAFETY_DEPOSIT_BOX,...,PETS_ALLOWED,SMALL_PETS_ALLOWED,WATER_PARK_ACCESS_SURCHARGE,PRIVATE_BEACH,SURFING,PRIVATE_BEACH_NEARBY,BEACH_SUN_LOUNGERS,DARTS,ENTERTAINMENT_PROGRAMME_FOR_CHILDREN,KARAOKE
0,1227273,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,596694,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1450413,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,855372,"[""CARPARK"",""ELEVATOR"",""HAS_24_HOUR_ROOM_SERVIC...",1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,2545455,"[""CARPARK"",""COFFEE_SHOP"",""ELEVATOR"",""HAS_24_HO...",1,1,1,1,1,1,1,1,...,0,0,0,1,0,0,0,0,0,0


In [20]:
# Remove hotelFacilities column
dfHotel.drop('hotelFacilities', axis=1, inplace=True)

In [21]:
for i in range (1, len(dfHotel.columns)):
    dfHotel = dfHotel.astype({dfHotel.columns[i]: int})

### Point of Interest Feature Selection

In [22]:
dfPoint = pd.concat([df.pop('nearestPointOfInterests'), dfPointEncode], axis=1)
dfPoint = pd.concat([dfPrice, dfPoint], axis=1)
dfPoint.head(2)

Unnamed: 0,originalRate,nearestPointOfInterests,SHOPPING_AREA,OFFICIAL_BUILDING,RESTAURANT,ATTRACTION,BEACH,MONUMENT,TERMINAL,PARK,...,MUSEUM,GALLERY,PLACE_OF_WORSHIP,TRAIN_STATION,ZOO,ENTERTAINMENT,GARDEN,THEATER,STORE,SCHOOL
0,1227273,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",,,,,,,,,...,,,,,,,,,,
1,596694,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",,,,,,,,,...,,,,,,,,,,


In [23]:
for i in range (len(dfPoint)):
    value = dfPoint['nearestPointOfInterests'][i]
    if value == '[]':
        dfPoint.drop([i], axis=0, inplace=True)
        continue
    for j in range (2, len(dfPoint.columns)):
        column_name = dfPoint.columns[j]
        if column_name in value:
            dfPoint.loc[i, column_name] = 1
        else:
            dfPoint.loc[i, column_name] = 0
print(dfPoint.shape)
dfPoint.head()

(5222, 21)


Unnamed: 0,originalRate,nearestPointOfInterests,SHOPPING_AREA,OFFICIAL_BUILDING,RESTAURANT,ATTRACTION,BEACH,MONUMENT,TERMINAL,PARK,...,MUSEUM,GALLERY,PLACE_OF_WORSHIP,TRAIN_STATION,ZOO,ENTERTAINMENT,GARDEN,THEATER,STORE,SCHOOL
0,1227273,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,596694,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1450413,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,855372,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2545455,"[{""landmarkId"":""91589773100576"",""geoId"":null,""...",1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
dfPoint.drop('nearestPointOfInterests', axis=1, inplace=True)

In [25]:
# Convert to int
for i in range (1, len(dfPoint.columns)):
    dfPoint = dfPoint.astype({dfPoint.columns[i]: int})

### Room Feature Selection

In [26]:
dfRoom = pd.concat([df.pop('roomFacilities'), dfRoomEncode], axis=1)
dfRoom = pd.concat([dfPrice, dfRoom], axis=1)
print(dfRoom.shape)
dfRoom.head(2)

(5239, 112)


Unnamed: 0,originalRate,roomFacilities,AIR_CONDITIONING,BALCONY_TERRACE,BATHROBES,BATHTUB,BLACKOUT_DRAPES_CURTAINS,COFFEE_TEA_MAKER,COMPLIMENTARY_BOTTLED_WATER,DESK,...,EXTRA_BEDS_AVAILABLE,FREE_INTERNATIONAL_CALLS,ROLLAWAY_OR_EXTRA_BEDS,YARD,VIDEO_GAMES,IN_ROOM_SAFE_SURCHARGE,HOUSEKEEPING_ON_REQUEST,NO_HOUSEKEEPING,CHANGING_TABLE,MICROWAVE_SURCHARGE
0,1227273,"[""AIR_CONDITIONING"",""BALCONY_TERRACE"",""BATHROB...",,,,,,,,,...,,,,,,,,,,
1,596694,"[""AIR_CONDITIONING"",""BATHROBES"",""BATHTUB"",""BLA...",,,,,,,,,...,,,,,,,,,,


In [27]:
for i in range(len(dfRoom)):
    value = dfRoom['roomFacilities'][i]
    if value == '[]':
        dfRoom.drop([i], axis=0, inplace=True)
        continue
    for j in range (2, len(dfRoom.columns)):
        column_name = dfRoom.columns[j]
        if column_name in value:
            dfRoom.loc[i, column_name] = 1
        else:
            dfRoom.loc[i, column_name] = 0
print(dfRoom.shape)


(5007, 112)


In [28]:
dfRoom.drop('roomFacilities', axis=1, inplace=True)

In [29]:
for i in range (1, len(dfRoom.columns)):
    dfRoom = dfRoom.astype({dfRoom.columns[i]: int})

In [30]:
dfRoom.head()

Unnamed: 0,originalRate,AIR_CONDITIONING,BALCONY_TERRACE,BATHROBES,BATHTUB,BLACKOUT_DRAPES_CURTAINS,COFFEE_TEA_MAKER,COMPLIMENTARY_BOTTLED_WATER,DESK,DVD_PLAYER,...,EXTRA_BEDS_AVAILABLE,FREE_INTERNATIONAL_CALLS,ROLLAWAY_OR_EXTRA_BEDS,YARD,VIDEO_GAMES,IN_ROOM_SAFE_SURCHARGE,HOUSEKEEPING_ON_REQUEST,NO_HOUSEKEEPING,CHANGING_TABLE,MICROWAVE_SURCHARGE
0,1227273,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,596694,1,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1450413,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,855372,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2545455,1,0,1,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# print dfRoom with mean value below 0.3
total1 = 0
total2 = 0
deleteIndex = []
for i in range (1, len(dfRoom.columns)):
    if dfRoom[dfRoom.columns[i]].mean() < 0.1:
        total1 +=1
        deleteIndex.append(dfRoom.columns[i])
        # print(dfRoom.columns[i], dfRoom[dfRoom.columns[i]].mean())
    if dfRoom[dfRoom.columns[i]].mean() > 0.9:
        total2 +=1
        deleteIndex.append(dfRoom.columns[i])
        # print(dfRoom.columns[i], dfRoom[dfRoom.columns[i]].mean())
print(f'Total Columns : {len(dfRoom.columns)}')

dfRoom.drop(deleteIndex, axis=1, inplace=True)
print(f'Total Mean Below 0.1 : {total1}')
print(f'Total Mean Above 0.9 : {total2}')
print(f'Total deleted index : {len(deleteIndex)}')
print(f'Remaining Columns : {len(dfRoom.columns)}')

Total Columns : 111
Total Mean Below 0.1 : 75
Total Mean Above 0.9 : 2
Total deleted index : 77
Remaining Columns : 34


In [32]:
# print dfHotel with mean value below 0.3
total1 = 0
total2 = 0
deleteIndex = []
for i in range (1, len(dfHotel.columns)):
    if dfHotel[dfHotel.columns[i]].mean() < 0.1:
        total1 +=1
        deleteIndex.append(dfHotel.columns[i])
        # print(dfHotel.columns[i], dfHotel[dfHotel.columns[i]].mean())
    if dfHotel[dfHotel.columns[i]].mean() > 0.9:
        total2 +=1
        deleteIndex.append(dfHotel.columns[i])
        # print(dfHotel.columns[i], dfHotel[dfHotel.columns[i]].mean())
print(f'Total Columns : {len(dfHotel.columns)}')

dfHotel.drop(deleteIndex, axis=1, inplace=True)
print(f'Total Mean Below 0.1 : {total1}')
print(f'Total Mean Above 0.9 : {total2}')
print(f'Total deleted index : {len(deleteIndex)}')
print(f'Remaining Columns : {len(dfHotel.columns)}')

Total Columns : 241
Total Mean Below 0.1 : 124
Total Mean Above 0.9 : 0
Total deleted index : 124
Remaining Columns : 117


## Combine Dataframe

In [33]:
dfRoom.drop('originalRate', axis= 1, inplace=True)
dfHotel.drop('originalRate', axis= 1, inplace=True)
dfPoint.drop('originalRate', axis= 1, inplace=True)
dfHotelEx.drop('originalRate', axis= 1, inplace=True)
dfRoomEx.drop('originalRate', axis= 1, inplace=True)
df = df[['originalRate']]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfRoomEx.drop('originalRate', axis= 1, inplace=True)


In [34]:
combine = [df,  dfRoomEx, dfHotelEx, dfRoom, dfHotel, dfPoint]
df = pd.concat(combine, axis=1)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4941 entries, 0 to 4940
Columns: 199 entries, originalRate to SCHOOL
dtypes: float64(169), int64(9), uint8(21)
memory usage: 6.8 MB


In [36]:
df.to_csv('../Dataset/encoded_data.csv', index=False)