# DSA5104 Project Data Cleaning

In [1]:
filename_calendar = 'calendar.csv'
filename_listings_sum = 'listings_summary.csv'
filename_listings = 'listings.csv'
filename_reviews_sum = 'reviews_summary.csv'
filename_reviews = 'reviews.csv'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
calendar = pd.read_csv(filename_calendar, dtype = str)
listings_summary = pd.read_csv(filename_listings_sum, dtype = str)
listings = pd.read_csv(filename_listings, dtype = str)
reviews_summary = pd.read_csv(filename_reviews_sum, dtype = str)
reviews = pd.read_csv(filename_reviews, dtype = str)

# Data cleaning process

## calendar

In [5]:
calendar.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,645048906856594097,2022-06-20,t,$890.00,$890.00,1,365
1,645048906856594097,2022-06-21,t,$890.00,$890.00,1,365
2,645048906856594097,2022-06-22,t,$890.00,$890.00,1,365
3,645048906856594097,2022-06-23,t,$890.00,$890.00,1,365
4,645048906856594097,2022-06-24,t,$890.00,$890.00,1,365


In [6]:
calendar.shape

(6231645, 7)

In [7]:
calendar.isnull().sum()[calendar.isnull().sum() > 0]

minimum_nights    2
maximum_nights    2
dtype: int64

- for "minimum_nights" and "maximum_nights", we simply drop the null records since they are very few

In [8]:
calendar.drop(index = calendar[calendar.minimum_nights.isnull() == True].index, axis = 0, inplace = True)

In [9]:
calendar.isnull().sum()[calendar.isnull().sum() > 0]

Series([], dtype: int64)

- for 'price' and 'adjusted_price', we change it to float!

In [10]:
calendar['price'] = calendar['price'].apply(lambda x: x.replace('$', ''))
calendar['adjusted_price'] = calendar['adjusted_price'].apply(lambda x: x.replace('$', ''))

In [11]:
calendar.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,645048906856594097,2022-06-20,t,890.0,890.0,1,365
1,645048906856594097,2022-06-21,t,890.0,890.0,1,365
2,645048906856594097,2022-06-22,t,890.0,890.0,1,365
3,645048906856594097,2022-06-23,t,890.0,890.0,1,365
4,645048906856594097,2022-06-24,t,890.0,890.0,1,365


## listings

### Firstly, get some insights around data

In [18]:
listings[['neighbourhood_cleansed', 'neighbourhood_group_cleansed']].head()

Unnamed: 0,neighbourhood_cleansed,neighbourhood_group_cleansed
0,Bang Sue,
1,Bang Sue,
2,Lak Si,
3,Lak Si,
4,Khlong Sam Wa,


In [9]:
listings.head(1)['amenities']

0    ["Kitchen", "Air conditioning", "Free parking ...
Name: amenities, dtype: object

In [19]:
listings[['property_type', 'room_type']].head()

Unnamed: 0,property_type,room_type
0,Entire condo,Entire home/apt
1,Private room in rental unit,Private room
2,Private room in condo,Private room
3,Private room in condo,Private room
4,Private room in home,Private room


In [20]:
listings[['bathrooms','bathrooms_text']].head()

Unnamed: 0,bathrooms,bathrooms_text
0,,1 bath
1,,1 bath
2,,1 bath
3,,1 private bath
4,,2 shared baths


In [24]:
listings[['bedrooms', 'beds']].head()

Unnamed: 0,bedrooms,beds
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1


    - future x days, the availability situation

In [28]:
listings[['has_availability', 'availability_30', 'availability_60', 'availability_90', 'availability_365']].head(10)

Unnamed: 0,has_availability,availability_30,availability_60,availability_90,availability_365
0,t,30,60,90,365
1,t,27,57,87,362
2,t,28,58,88,89
3,t,30,60,90,181
4,t,27,57,87,88
5,t,24,54,84,359
6,t,27,57,87,362
7,t,23,53,83,358
8,t,30,60,90,365
9,t,30,60,90,365


    - ltm: last 12 months
    - l30d: last 30 days

In [32]:
listings[['number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review']].head(10)

Unnamed: 0,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review
0,0,0,0,,
1,0,0,0,,
2,6,0,0,2019-11-15,2020-03-05
3,3,0,0,2019-06-01,2019-12-07
4,0,0,0,,
5,0,0,0,,
6,65,0,0,2012-04-07,2020-01-06
7,0,0,0,,
8,0,0,0,,
9,2,1,0,2016-07-13,2022-04-01


In [34]:
listings[['instant_bookable', 'has_availability']].head()

Unnamed: 0,instant_bookable,has_availability
0,t,t
1,f,t
2,t,t
3,t,t
4,t,t


In [12]:
listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,645048906856594097,https://www.airbnb.com/rooms/645048906856594097,20220620202126,2022-06-20,Furnished 1 bedroom condo with beautiful view,🏢Condo For rent: Lumpini Ville Nakhon In-River...,Its in Lumpini building,https://a0.muscache.com/pictures/miso/Hosting-...,4663219,https://www.airbnb.com/users/show/4663219,...,,,,,t,1,1,0,0,
1,15636062,https://www.airbnb.com/rooms/15636062,20220620202126,2022-06-20,Akesin place ngamwongwan,ที่ของฉันใกล้กับร้านอาหารและของกิน และ ขนส่งสา...,,https://a0.muscache.com/pictures/f68098b9-3ff2...,100840975,https://www.airbnb.com/users/show/100840975,...,,,,,f,1,0,1,0,
2,39544047,https://www.airbnb.com/rooms/39544047,20220620202126,2022-06-20,IMPACT Thunder Dome Muangthong ND condo ( C1 ),,IMPACT ศูนย์แสดงสินค้า 550 m<br />IMPACT Chal...,https://a0.muscache.com/pictures/cc410e7d-922f...,303573998,https://www.airbnb.com/users/show/303573998,...,5.0,4.67,4.67,,t,2,0,2,0,0.19
3,34002624,https://www.airbnb.com/rooms/34002624,20220620202126,2022-06-20,Popular Condo Near IMPACT,คอนโดเมืองทองอยู่ในบริเวณเดียวกับศูนย์แสดงสินค...,คอนโดเมืองทองอยู่ในบริเวณเดียวกับศูนย์แสดงสินค...,https://a0.muscache.com/pictures/e4811699-0153...,256747704,https://www.airbnb.com/users/show/256747704,...,4.0,4.5,4.0,,t,1,0,1,0,0.08
4,22678899,https://www.airbnb.com/rooms/22678899,20220620202126,2022-06-20,บ้านเบนซ์,เป็นหมู่บ้านขนาดใหญ่มีหลายเฟส บ้านเป็นทาวน์โฮม...,,https://a0.muscache.com/pictures/02e4ae53-dbe6...,167136634,https://www.airbnb.com/users/show/167136634,...,,,,,t,1,0,1,0,


In [13]:
listings.shape

(17074, 74)

In [14]:
listings.isnull().sum()[listings.isnull().sum() > 0]

name                                9
description                       905
neighborhood_overview            7615
host_name                           2
host_since                          2
host_location                      44
host_about                       7241
host_response_time               5629
host_response_rate               5629
host_acceptance_rate             6779
host_is_superhost                   2
host_thumbnail_url                  2
host_picture_url                    2
host_neighbourhood               7262
host_listings_count                 2
host_total_listings_count           2
host_has_profile_pic                2
host_identity_verified              2
neighbourhood                    7615
neighbourhood_group_cleansed    17074
bathrooms                       17074
bathrooms_text                    137
bedrooms                         1703
beds                              517
minimum_minimum_nights              1
maximum_minimum_nights              1
minimum_maxi

In [15]:
listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
       'neighborhood_overview', 'picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_upd

### Check those attributes which are all NaN

In [8]:
delete_list = []
for attr in listings.isnull().sum()[listings.isnull().sum() == listings.shape[0]].index:
    delete_list.append(attr)

In [9]:
delete_list

['neighbourhood_group_cleansed', 'bathrooms', 'calendar_updated', 'license']

In [10]:
listings.drop(delete_list, axis = 1, inplace = True)

In [11]:
listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,645048906856594097,https://www.airbnb.com/rooms/645048906856594097,20220620202126,2022-06-20,Furnished 1 bedroom condo with beautiful view,🏢Condo For rent: Lumpini Ville Nakhon In-River...,Its in Lumpini building,https://a0.muscache.com/pictures/miso/Hosting-...,4663219,https://www.airbnb.com/users/show/4663219,...,,,,,t,1,1,0,0,
1,15636062,https://www.airbnb.com/rooms/15636062,20220620202126,2022-06-20,Akesin place ngamwongwan,ที่ของฉันใกล้กับร้านอาหารและของกิน และ ขนส่งสา...,,https://a0.muscache.com/pictures/f68098b9-3ff2...,100840975,https://www.airbnb.com/users/show/100840975,...,,,,,f,1,0,1,0,
2,39544047,https://www.airbnb.com/rooms/39544047,20220620202126,2022-06-20,IMPACT Thunder Dome Muangthong ND condo ( C1 ),,IMPACT ศูนย์แสดงสินค้า 550 m<br />IMPACT Chal...,https://a0.muscache.com/pictures/cc410e7d-922f...,303573998,https://www.airbnb.com/users/show/303573998,...,5.0,5.0,4.67,4.67,t,2,0,2,0,0.19
3,34002624,https://www.airbnb.com/rooms/34002624,20220620202126,2022-06-20,Popular Condo Near IMPACT,คอนโดเมืองทองอยู่ในบริเวณเดียวกับศูนย์แสดงสินค...,คอนโดเมืองทองอยู่ในบริเวณเดียวกับศูนย์แสดงสินค...,https://a0.muscache.com/pictures/e4811699-0153...,256747704,https://www.airbnb.com/users/show/256747704,...,5.0,4.0,4.5,4.0,t,1,0,1,0,0.08
4,22678899,https://www.airbnb.com/rooms/22678899,20220620202126,2022-06-20,บ้านเบนซ์,เป็นหมู่บ้านขนาดใหญ่มีหลายเฟส บ้านเป็นทาวน์โฮม...,,https://a0.muscache.com/pictures/02e4ae53-dbe6...,167136634,https://www.airbnb.com/users/show/167136634,...,,,,,t,1,0,1,0,


## 1. Check those attributes which have few NaNs (in details)
- for those attributes, since the quantity is very small, we can simply drop them!

In [12]:
listings.isnull().sum()[(listings.isnull().sum() < 50) & (listings.isnull().sum() > 0)]

name                          9
host_name                     2
host_since                    2
host_location                44
host_is_superhost             2
host_thumbnail_url            2
host_picture_url              2
host_listings_count           2
host_total_listings_count     2
host_has_profile_pic          2
host_identity_verified        2
minimum_minimum_nights        1
maximum_minimum_nights        1
minimum_maximum_nights        1
maximum_maximum_nights        1
minimum_nights_avg_ntm        1
maximum_nights_avg_ntm        1
dtype: int64

- name, for this case, it is likely to be the case that, they only lose their title name for some reason
    - I think it is not a crucial problem, so I choose to still save these records and set null value for 'name'
    - for the simplicity, we kick them out

In [13]:
listings[listings.name.isnull() == True]

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
441,4245018,https://www.airbnb.com/rooms/4245018,20220620202126,2022-06-20,,<b>The space</b><br />The old shop house was b...,,https://a0.muscache.com/pictures/53314583/1591...,22030043,https://www.airbnb.com/users/show/22030043,...,,,,,f,1,0,1,0,
527,4720818,https://www.airbnb.com/rooms/4720818,20220620202126,2022-06-21,,Room for rent near MRT Huay Kwang station.<br ...,,https://a0.muscache.com/pictures/59268815/48c0...,24386225,https://www.airbnb.com/users/show/24386225,...,,,,,f,1,0,1,0,
637,4549768,https://www.airbnb.com/rooms/4549768,20220620202126,2022-06-21,,,,https://a0.muscache.com/pictures/57555128/251b...,18852579,https://www.airbnb.com/users/show/18852579,...,,,,,f,1,0,1,0,
1081,6148415,https://www.airbnb.com/rooms/6148415,20220620202126,2022-06-21,,<b>The space</b><br />Feature / description<br...,Location<br />Absolute convenience / Transport...,https://a0.muscache.com/pictures/76635501/2816...,31895202,https://www.airbnb.com/users/show/31895202,...,,,,,f,1,1,0,0,
1195,8055144,https://www.airbnb.com/rooms/8055144,20220620202126,2022-06-21,,"Luxury Condominium in CBD area of Bangkok, Suk...",,https://a0.muscache.com/pictures/102664030/c25...,42521288,https://www.airbnb.com/users/show/42521288,...,,,,,t,1,0,1,0,
1623,10000742,https://www.airbnb.com/rooms/10000742,20220620202126,2022-06-21,,Diamond Bangkok<br />Near BTS Ratchathewi Stat...,Diamond Bangkok<br />Near BTS Ratchathewi Stat...,https://a0.muscache.com/pictures/e69fd5f9-b619...,51374914,https://www.airbnb.com/users/show/51374914,...,3.0,3.0,4.17,3.83,t,1,0,1,0,0.08
1870,10710165,https://www.airbnb.com/rooms/10710165,20220620202126,2022-06-21,,"-Fully furnished with modern closet, dressing ...",,https://a0.muscache.com/pictures/fee3ff6b-a268...,55347997,https://www.airbnb.com/users/show/55347997,...,,,,,t,1,0,1,0,
2307,13142743,https://www.airbnb.com/rooms/13142743,20220620202126,2022-06-21,,"Errday Guest House for rent daily, monthly, co...",,https://a0.muscache.com/pictures/1887b079-baae...,73275200,https://www.airbnb.com/users/show/73275200,...,5.0,5.0,3.5,4.5,f,3,0,3,0,0.03
16438,5709288,https://www.airbnb.com/rooms/5709288,20220620202126,2022-06-20,,Private room for 2 persons with city view fro...,It has many favourite places around here.,https://a0.muscache.com/pictures/71049021/b057...,29602043,https://www.airbnb.com/users/show/29602043,...,,,,,f,1,0,1,0,


In [14]:
listings.drop(index = listings[listings.name.isnull() == True].index, axis = 0, inplace =True)

- host_name, it can be observed that, all host-related information is loss for this case
    - since there are not too many records, we choose to kick out them for the well-structured of DB

In [15]:
listings[listings.host_name.isnull() == True][['id', 'host_is_superhost', 'host_picture_url', 'host_name', 'host_location']]

Unnamed: 0,id,host_is_superhost,host_picture_url,host_name,host_location
7860,31809614,,,,
12775,42181585,,,,


In [16]:
listings.drop(index = listings[listings.host_name.isnull() == True][['id', 'host_is_superhost', 'host_picture_url', 'host_name', 'host_location']].index, axis = 0, inplace = True)

- host_location, drop it!

In [17]:
listings[listings.host_location.isnull() == True]

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
764,5727012,https://www.airbnb.com/rooms/5727012,20220620202126,2022-06-21,40 m2-1BR-condo#BTS#Long term rent,A 40 sqm. 1 bedroom type condo located on Sukh...,,https://a0.muscache.com/pictures/b3af9000-d568...,17996661,https://www.airbnb.com/users/show/17996661,...,4.86,5.0,4.57,4.57,f,1,1,0,0,0.08
1379,8855331,https://www.airbnb.com/rooms/8855331,20220620202126,2022-06-21,1 Bedroom Ideo Condo Sathorn-Thapra,One Bedroom with Balcony Fully Furnished: LED ...,,https://a0.muscache.com/pictures/5cc211e4-eac8...,42953784,https://www.airbnb.com/users/show/42953784,...,,,,,f,1,0,1,0,
1682,10324572,https://www.airbnb.com/rooms/10324572,20220620202126,2022-06-21,Zanthia Place on Sukhumvit road,The exclusive of your private living on Sukhum...,,https://a0.muscache.com/pictures/568c6a53-cf30...,51627960,https://www.airbnb.com/users/show/51627960,...,,,,,f,1,1,0,0,
1799,11288172,https://www.airbnb.com/rooms/11288172,20220620202126,2022-06-21,温馨整洁的联排别墅,"干净整洁,交通便利,出巷子就能出高速方便到达曼谷各大景点及购物场所,著名的易三仓大学皇马校区...",安静,https://a0.muscache.com/pictures/d6428331-8e81...,48866549,https://www.airbnb.com/users/show/48866549,...,,,,,f,1,0,1,0,
2224,13451554,https://www.airbnb.com/rooms/13451554,20220620202126,2022-06-21,Luxury1 bed/1min BTS/ fully equipped facilities,"On Sukhumvit Road, Only 100 m. away from BTS ...",,https://a0.muscache.com/pictures/e733ac5f-e565...,75129764,https://www.airbnb.com/users/show/75129764,...,5.0,5.0,5.0,5.0,f,1,1,0,0,0.05
2475,13982700,https://www.airbnb.com/rooms/13982700,20220620202126,2022-06-21,Luxury condo with cozy 1 bedroom,"Close to Bts, fitness, gorgeous swimming pools...",,https://a0.muscache.com/pictures/08fcfd9e-214c...,40601400,https://www.airbnb.com/users/show/40601400,...,,,,,f,1,1,0,0,
2590,14149434,https://www.airbnb.com/rooms/14149434,20220620202126,2022-06-21,泰国曼谷五星级酒店式公寓，和中国大使馆人员做邻居！,"此区域内为数不多的大户型公寓,夏威夷式游泳池、拥有热带风光、阳光与泳池刚刚好哟,近曼谷排名第...",,https://a0.muscache.com/pictures/5733d9ef-033a...,82822850,https://www.airbnb.com/users/show/82822850,...,,,,,t,2,2,0,0,
2592,14150329,https://www.airbnb.com/rooms/14150329,20220620202126,2022-06-21,泰国曼谷五星级酒店式公寓，和中国大使馆人员做邻居,"宽敞、明亮、大阳台!无敌曼谷夜景,位于泰国金融CBD区,联系人:小曾 (SENSITIVE ...",,https://a0.muscache.com/pictures/8d443351-1f5e...,82822850,https://www.airbnb.com/users/show/82822850,...,,,,,f,2,2,0,0,
2632,14903399,https://www.airbnb.com/rooms/14903399,20220620202126,2022-06-21,U,Uno Inn is a well furnished guesthouse with su...,,https://a0.muscache.com/pictures/0b1157a4-780b...,92902989,https://www.airbnb.com/users/show/92902989,...,,,,,t,1,0,1,0,
2672,14941205,https://www.airbnb.com/rooms/14941205,20220620202126,2022-06-20,15 minutes from Suvarnabhumi International Air...,A quiet riverside condominium. Just 15 minutes...,,https://a0.muscache.com/pictures/249a9bd6-4f16...,47181155,https://www.airbnb.com/users/show/47181155,...,,,,,t,1,0,1,0,


In [18]:
listings.drop(index = listings[listings.host_location.isnull() == True].index, axis = 0, inplace = True)

- minimum_minimum_nights, drop it

In [19]:
listings[listings.minimum_minimum_nights.isnull() == True]

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
13729,44563108,https://www.airbnb.com/rooms/44563108,20220620202126,2022-06-21,Somerset Maison Asoke Bangkok,,,https://a0.muscache.com/pictures/miso/Hosting-...,360620448,https://www.airbnb.com/users/show/360620448,...,,,,,f,1,0,0,0,


In [20]:
listings.drop(index = listings[listings.minimum_minimum_nights.isnull() == True].index, axis = 0, inplace =True)

### check the table after dealing with the FEW NULL value

In [21]:
listings.isnull().sum()[(listings.isnull().sum() < 50) & (listings.isnull().sum() > 0)]

Series([], dtype: int64)

## 2. Check those features which have many NULL value

In [22]:
listings.isnull().sum()[listings.isnull().sum() != 0]

description                     899
neighborhood_overview          7581
host_about                     7192
host_response_time             5585
host_response_rate             5585
host_acceptance_rate           6732
host_neighbourhood             7222
neighbourhood                  7581
bathrooms_text                  136
bedrooms                       1699
beds                            515
first_review                   7023
last_review                    7023
review_scores_rating           7023
review_scores_accuracy         7246
review_scores_cleanliness      7246
review_scores_checkin          7252
review_scores_communication    7247
review_scores_location         7253
review_scores_value            7254
reviews_per_month              7023
dtype: int64

- description (null imputation)

In [23]:
listings.loc[3, 'description']

'คอนโดเมืองทองอยู่ในบริเวณเดียวกับศูนย์แสดงสินค้าอิมแพค เมืองทองธานี ถ.แจ้งวัฒนะ มีสิ่งอำนวยความสะดวกครบครัน<br /><br />พร้อมดูแล ท่าน สนใจห้องพัก อื่นๆ หรืออยากเปลี่ยนห้องพัก หรือต้องการสิ่งใด โปรดติดต่อ ได้ตลอดเวลา<br /><br />Popular Condo สถานที่ตั้งของที่พักใกล้ - IMPACT , Challenger และ Hall 1-8 เดินทางสะดวกเพียงใช้เวลาเดินเพียง 5-10 นาที -  มหาวิทยาลัยสุโขทัยธรรมาธิราชเพียง 1.8 กิโลเมตร - สนามบินดอนเมืองเพียง 5 กิโลเมตร - เซ็นทรัลพลาซ่าแจ้งวัฒนะ เดินทางโดยรถยนต์ 10 นาที<br /><br /><b>The space</b><br />Popular Condo  สถานที่ตั้งของที่พักใกล้ - IMPACT , Challenger และ Hall 1-8 เดินทางสะดวกเพียงใช้เวลาเดินเพียง 5-10 นาที - ใกล้ Out let และห้างคอสโมสะดวกสบายในการหาของกิน - มหาวิทยาลัยสุโขทัยธรรมาธิราชเพียง 1.8 กิโลเมตร - สนามบินดอนเมืองเพียง 5 กิโลเมตร - เซ็นทรัลพลาซ่าแจ้งวัฒนะ เดินทางโดยรถยนต์ 10 นาที ที่พักมี Free Wifi ทีวีจอแบน ตู้เย็น ไมโครเวฟ เครื่องทำน้าอุ่น ฟรีที่จอดรถบริเวรใกล้เคียง<br /><br /><b>Guest access</b><br />สามารถใช้ได้ทั้งหมด<br /><br /><b>Other things to note</b

In [24]:
listings.loc[0, 'description']

'🏢Condo For rent: Lumpini Ville Nakhon In-Riverview Condo 🏢<br /><br /> 🚘 Only 1.3 km. from the Purple Line  🚄 Yaek Tiwanon Station, near Lotus and Makro, Nakhon In and BigC Tiwanon<br /><br /><b>The space</b><br />32 sqm<br /><br /><b>Guest access</b><br />You will get the entire condo access alont with elevator card'

- neighborhood_overview (null imputation)

In [25]:
listings.loc[0, 'neighborhood_overview']

'Its in Lumpini building'

In [26]:
listings.loc[2, 'neighborhood_overview']

'IMPACT ศูนย์แสดงสินค้า  550 m<br />IMPACT Challenger        650 m<br />IMPACT Arena.                 400 m<br />IMPACT Forum ,hall4.     850 m<br />IMPACT Hall5.                    450 m<br />Cosmo Bazaar.                  350 m<br />เซ็นทรัลพล่าซ่า แจ้งวัฒนะ.  2.6 km.<br />SCG Stadium.                      1.1 km.<br />Thunder Dome.                  1.0 km.'

In [27]:
listings.loc[1, 'neighborhood_overview']

nan

- host_about, sentences to introduce the host (null imputation)

In [28]:
listings.loc[8, 'host_about']

'Welcome to Familyroom Apartment.\r\n\r\nA Chic and stylish apartment situated close to the heart of the city. '

- host_response_time (null imputation)

In [29]:
listings.loc[6, 'host_response_time']

nan

- host_response_rate (**impute with mean**)
    - Idea: it is reasonable to use the mean to predict those hosts who do not have response

In [30]:
listings['host_response_rate'] = listings['host_response_rate'].fillna(-1)

In [31]:
listings['host_response_rate'] = listings['host_response_rate'].apply(lambda x : x if x == -1 else float(x[:-1]))

In [32]:
listings.host_response_rate

0        100.0
1         -1.0
2          0.0
3         -1.0
4         -1.0
         ...  
17069    100.0
17070      0.0
17071     50.0
17072     75.0
17073     75.0
Name: host_response_rate, Length: 17020, dtype: float64

In [33]:
# calculate the mean
np.mean(listings.host_response_rate[listings.host_response_rate != -1])

81.55574989068649

In [34]:
listings['host_response_rate'] = listings['host_response_rate'].apply(lambda x: 81 if x == -1 else x)

In [35]:
listings.host_response_rate.head()

0    100.0
1     81.0
2      0.0
3     81.0
4     81.0
Name: host_response_rate, dtype: float64

- host_acceptance_rate (impute with mean)
    - for the similar reason

In [36]:
listings['host_acceptance_rate'] = listings['host_acceptance_rate'].fillna(-1)
listings['host_acceptance_rate'] = listings['host_acceptance_rate'].apply(lambda x : x if x == -1 else float(x[:-1]))
listings['host_acceptance_rate']

0         -1.0
1         -1.0
2         -1.0
3        100.0
4         -1.0
         ...  
17069    100.0
17070     -1.0
17071     -1.0
17072     90.0
17073     90.0
Name: host_acceptance_rate, Length: 17020, dtype: float64

In [37]:
np.mean(listings.host_acceptance_rate[listings.host_acceptance_rate != -1])

85.7900466562986

In [38]:
listings['host_acceptance_rate'] = listings['host_acceptance_rate'].apply(lambda x: 86 if x == -1 else x)
listings.host_acceptance_rate.head()

0     86.0
1     86.0
2     86.0
3    100.0
4     86.0
Name: host_acceptance_rate, dtype: float64

- 'host_neighbourhood', 'neighbourhood', 'bathrooms_text', 'bedrooms', 'beds' (impute with null)
    - consider whether some attrs are really meaningful

In [39]:
listings[['host_neighbourhood', 'neighbourhood', 'bathrooms_text', 'bedrooms', 'beds']].head(10) 

Unnamed: 0,host_neighbourhood,neighbourhood,bathrooms_text,bedrooms,beds
0,,"Tambon Talat Kwan, Chang Wat Nonthaburi, Thailand",1 bath,1,1
1,,,1 bath,1,1
2,,"Tambon Ban Mai, จ.นนทบุรี, Thailand",1 bath,1,1
3,,"ตำบล บางพูด, Chang Wat Nonthaburi, Thailand",1 private bath,1,1
4,,,2 shared baths,1,1
5,,,3 shared baths,1,2
6,Victory Monument,"Samsen Nai, Bangkok, Thailand",1.5 baths,1,1
7,,,1 bath,1,2
8,Bang Kapi,"Bangkok, Thailand",1 bath,1,1
9,,"Bangkok, Thailand",1 private bath,1,1


- first_review, last_review (impute with null)

In [40]:
listings[['id', 'first_review', 'last_review']].head()

Unnamed: 0,id,first_review,last_review
0,645048906856594097,,
1,15636062,,
2,39544047,2019-11-15,2020-03-05
3,34002624,2019-06-01,2019-12-07
4,22678899,,


- review_scores_rating， review_scores_accurarcy, ..., review_scores_value (impute with mean)

In [41]:
reviews_list = list(listings.isnull().sum()[listings.isnull().sum() != 0].index[12:-1])
reviews_list

['review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value']

In [42]:
# showcase
listings[['id'] + reviews_list].head()

Unnamed: 0,id,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,645048906856594097,,,,,,
1,15636062,,,,,,
2,39544047,5.0,5.0,5.0,5.0,4.67,4.67
3,34002624,5.0,5.0,5.0,4.0,4.5,4.0
4,22678899,,,,,,


In [43]:
for attr in reviews_list:
    listings[attr] = listings[attr].astype(float)
    listings[attr] = listings[attr].fillna(np.mean(listings[attr]))

In [44]:
listings[['id'] + reviews_list].head()

Unnamed: 0,id,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,645048906856594097,4.710614,4.676348,4.754167,4.764909,4.600394,4.641145
1,15636062,4.710614,4.676348,4.754167,4.764909,4.600394,4.641145
2,39544047,5.0,5.0,5.0,5.0,4.67,4.67
3,34002624,5.0,5.0,5.0,4.0,4.5,4.0
4,22678899,4.710614,4.676348,4.754167,4.764909,4.600394,4.641145


- reviews_per_month (impute with null)

In [45]:
listings[['id', 'reviews_per_month']].head()

Unnamed: 0,id,reviews_per_month
0,645048906856594097,
1,15636062,
2,39544047,0.19
3,34002624,0.08
4,22678899,


### Appendix: While checking, I find that: (different attribute name)
    - listings : neighbourhood_cleansed corresponds to
    - listings_summary: neighbourhood

In [46]:
listings_summary[['neighbourhood', 'id']][listings_summary.id == '645048906856594097']

Unnamed: 0,neighbourhood,id
3,Bang Sue,645048906856594097


In [47]:
listings[['neighbourhood_cleansed', 'id']][listings.id == '645048906856594097']

Unnamed: 0,neighbourhood_cleansed,id
0,Bang Sue,645048906856594097


## Now, the null value is:

In [48]:
listings.isnull().sum()[listings.isnull().sum() != 0]

description               899
neighborhood_overview    7581
host_about               7192
host_response_time       5585
host_neighbourhood       7222
neighbourhood            7581
bathrooms_text            136
bedrooms                 1699
beds                      515
first_review             7023
last_review              7023
review_scores_rating     7023
reviews_per_month        7023
dtype: int64

## Data after cleaning

In [49]:
listings

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,645048906856594097,https://www.airbnb.com/rooms/645048906856594097,20220620202126,2022-06-20,Furnished 1 bedroom condo with beautiful view,🏢Condo For rent: Lumpini Ville Nakhon In-River...,Its in Lumpini building,https://a0.muscache.com/pictures/miso/Hosting-...,4663219,https://www.airbnb.com/users/show/4663219,...,4.754167,4.764909,4.600394,4.641145,t,1,1,0,0,
1,15636062,https://www.airbnb.com/rooms/15636062,20220620202126,2022-06-20,Akesin place ngamwongwan,ที่ของฉันใกล้กับร้านอาหารและของกิน และ ขนส่งสา...,,https://a0.muscache.com/pictures/f68098b9-3ff2...,100840975,https://www.airbnb.com/users/show/100840975,...,4.754167,4.764909,4.600394,4.641145,f,1,0,1,0,
2,39544047,https://www.airbnb.com/rooms/39544047,20220620202126,2022-06-20,IMPACT Thunder Dome Muangthong ND condo ( C1 ),,IMPACT ศูนย์แสดงสินค้า 550 m<br />IMPACT Chal...,https://a0.muscache.com/pictures/cc410e7d-922f...,303573998,https://www.airbnb.com/users/show/303573998,...,5.000000,5.000000,4.670000,4.670000,t,2,0,2,0,0.19
3,34002624,https://www.airbnb.com/rooms/34002624,20220620202126,2022-06-20,Popular Condo Near IMPACT,คอนโดเมืองทองอยู่ในบริเวณเดียวกับศูนย์แสดงสินค...,คอนโดเมืองทองอยู่ในบริเวณเดียวกับศูนย์แสดงสินค...,https://a0.muscache.com/pictures/e4811699-0153...,256747704,https://www.airbnb.com/users/show/256747704,...,5.000000,4.000000,4.500000,4.000000,t,1,0,1,0,0.08
4,22678899,https://www.airbnb.com/rooms/22678899,20220620202126,2022-06-20,บ้านเบนซ์,เป็นหมู่บ้านขนาดใหญ่มีหลายเฟส บ้านเป็นทาวน์โฮม...,,https://a0.muscache.com/pictures/02e4ae53-dbe6...,167136634,https://www.airbnb.com/users/show/167136634,...,4.754167,4.764909,4.600394,4.641145,t,1,0,1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17069,22947580,https://www.airbnb.com/rooms/22947580,20220620202126,2022-06-20,Apartment T9 Muang Thong Thani by KhunNutt2B7,ห้องพักส่วนตัวสวยหรูอยู่สบาย สะอาด ราคาประหยัด...,ที่พักอยู่ใกล้ห้างคอสโม แหล่งช้อปปิ้ง Sport Fa...,https://a0.muscache.com/pictures/2cbf3a10-76e8...,169985358,https://www.airbnb.com/users/show/169985358,...,4.754167,4.764909,4.600394,4.641145,t,1,1,0,0,
17070,5955949,https://www.airbnb.com/rooms/5955949,20220620202126,2022-06-20,Cozy Manor house with Lake view 520 square meters,Nice place for relax with wide lake at opposit...,There are many Thai restaurant near entrance o...,https://a0.muscache.com/pictures/93a7608f-f35c...,20002101,https://www.airbnb.com/users/show/20002101,...,4.754167,4.764909,4.600394,4.641145,f,2,1,1,0,
17071,32650300,https://www.airbnb.com/rooms/32650300,20220620202126,2022-06-20,Studio Superior (Double Bed) Room 1,- Room size 25 sq.m. or 322 sq.ft.<br />- Quee...,"Bangna Tower<br />40,000 square meters project...",https://a0.muscache.com/pictures/9a88370c-111c...,245251233,https://www.airbnb.com/users/show/245251233,...,4.754167,4.764909,4.600394,4.641145,t,29,0,29,0,
17072,46851152,https://www.airbnb.com/rooms/46851152,20220620202126,2022-06-20,Family Room Dmk Don mueang Airport 2 bedrooms,"Apartment in Pak Kret, 2 bedrooms, 1 bathroom,...",,https://a0.muscache.com/pictures/prohost-api/H...,201741826,https://www.airbnb.com/users/show/201741826,...,4.754167,4.764909,4.600394,4.641145,t,48,23,18,7,
