### Data Ingestion and Cleaning

In [2]:
# Importing required packages for dataframe manipulation and data ingestion

import pandas as pd
import zipfile
import numpy as np
import os
%matplotlib inline
import matplotlib.pyplot as plt
import requests

# Lift the column display limit so wide DataFrames are rendered with every column
pd.options.display.max_columns = None

In [3]:
# Import the Airbnb review and listing data for the state of Hawaii
# from http://insideairbnb.com/get-the-data.html
#
# Each archive and the CSV member inside it are opened in `with` blocks so the
# underlying file handles are closed as soon as the data has been read.

# Listing information for merging later
with zipfile.ZipFile("Hawaii_listings.csv.zip") as zf_listings:
    with zf_listings.open("Hawaii_listings.csv") as listings_csv:
        listings = pd.read_csv(listings_csv)

# Airbnb review data to perform NLP on
with zipfile.ZipFile("Hawaii_reviews.csv.zip") as zf_reviews:
    with zf_reviews.open("Hawaii_reviews.csv") as reviews_csv:
        reviews = pd.read_csv(reviews_csv)

In [4]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after each summary.
listings.info()
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21808 entries, 0 to 21807
Data columns (total 74 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            21808 non-null  int64  
 1   listing_url                                   21808 non-null  object 
 2   scrape_id                                     21808 non-null  int64  
 3   last_scraped                                  21808 non-null  object 
 4   name                                          21808 non-null  object 
 5   description                                   21630 non-null  object 
 6   neighborhood_overview                         12309 non-null  object 
 7   picture_url                                   21808 non-null  object 
 8   host_id                                       21808 non-null  int64  
 9   host_url                                      21808 non-null 

##### Explore the review data

In [5]:
# Count the missing values in each column of the review data
reviews.isna().sum()

listing_id         0
id                 0
date               0
reviewer_id        0
reviewer_name      0
comments         300
dtype: int64

In [6]:
# Remove all entries without comments (reviews).
# Only the "comments" column is checked, so a review is never discarded merely
# because some other field happens to be missing. The result is reassigned
# rather than using inplace=True, which avoids mutating the frame in place.
reviews = reviews.dropna(subset=["comments"])

In [7]:
# Preview the first 15 rows of the review data
reviews.head(15)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,5065,3578629,2013-02-18,4574728,Terry,The place was difficult to find and communicat...
1,5065,4412184,2013-05-03,3067352,Olivia,Wayne was very friendly and his place is sweet...
2,5065,55331648,2015-11-29,33781202,Elspeth And Adam Dobres,We loved our time at this BnB! Beautiful surro...
3,5065,57598810,2015-12-27,12288841,Lydia,"The organisation was very uncomplicated,\r<br/..."
4,5065,58905911,2016-01-05,41538214,Andrew,Place was great for what we wanted. Be ready t...
5,5065,59863454,2016-01-16,21422382,Dan,The place was nice and clean and the people ar...
6,5065,61711822,2016-02-07,21575744,Heidi,We spent 2 nights in the Mauka B&B. It was a l...
7,5065,62181322,2016-02-12,9094704,Kammaleathahh,This place was the perfect location for us whe...
8,5065,64963298,2016-03-09,13790006,Andrea,"Hi Wayne and Shani,\r<br/>\r<br/>I really enjo..."
9,5065,68559593,2016-04-05,59420862,Hillery,Very comfortable place to stay!! Had everythin...


##### Check for any non-English reviews.

In [8]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; fixing the seed before the first
# detection makes the language labels reproducible from run to run.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code that langdetect assigns to ``text``.

    Parameters
    ----------
    text : str
        Review text to classify.

    Returns
    -------
    str
        The code returned by ``langdetect.detect``, or ``'unknown'`` when
        ``text`` is not a string, is blank, or langdetect cannot classify it.
    """
    # Non-string or blank input has nothing to classify; skip the detector.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except LangDetectException:
        # Raised by langdetect when it cannot classify the text. Catching only
        # this type (instead of a bare except) lets KeyboardInterrupt and
        # genuine programming errors surface during the long-running apply.
        return 'unknown'


reviews["Language"] = reviews["comments"].apply(detect_language)

In [9]:
# Number of reviews per detected language code, most frequent first
reviews["Language"].value_counts()

en         571363
de           6434
fr           5440
ja           5183
ko           3741
zh-cn        1822
ro           1603
es           1328
af            818
unknown       650
it            639
so            435
nl            378
pt            352
ca            348
tl            307
cs            204
no            194
pl            193
sv            179
sw            168
da            129
ru            122
hr             98
hu             94
fi             87
vi             74
id             70
et             68
cy             56
sl             45
zh-tw          39
tr             35
sk             30
he              6
lt              5
sq              4
th              3
ar              2
lv              1
bg              1
Name: Language, dtype: int64

In [10]:
# Keep the rows whose detected language is English; all other rows are dropped
reviews = reviews.loc[reviews["Language"].eq("en")]

In [12]:
# Save current "reviews" data frame to pickle file for convenient import at later checkpoints
# NOTE(review): pickle files can execute arbitrary code when loaded, so only
# read this checkpoint back from a location you trust.

# NOTE(review): the pickle module is not referenced in this cell; the save
# below goes through the DataFrame's own to_pickle method.
import pickle 

reviews.to_pickle("reviews_pickle.p")

In [14]:
# Perform left merge on review data with the listing data.
# validate="many_to_one" makes pandas check that every listing id occurs only
# once in `listings`, so a duplicated listing cannot silently multiply review
# rows; a MergeError is raised instead.
listing_reviews = pd.merge(
    reviews,
    listings,
    left_on="listing_id",
    right_on="id",
    how="left",
    validate="many_to_one",
)

In [15]:
# Display the merged frame (review columns followed by the listing columns)
listing_reviews

Unnamed: 0,listing_id,id_x,date,reviewer_id,reviewer_name,comments,Language,id_y,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,5065,3578629,2013-02-18,4574728,Terry,The place was difficult to find and communicat...,en,5065,https://www.airbnb.com/rooms/5065,20210708132536,2021-07-09,MAUKA BB,"Perfect for your vacation, Staycation or just ...",Neighbors here are friendly but are not really...,https://a0.muscache.com/pictures/36718112/1f0e...,7257,https://www.airbnb.com/users/show/7257,Wayne,2009-01-31,hawaii,HI\r\nWE LIVE HERE IN HONOKAA ON \r\nFARM JUS...,within a few hours,100%,0%,f,https://a0.muscache.com/im/users/7257/profile_...,https://a0.muscache.com/im/users/7257/profile_...,Hamakua Coast,2.0,2.0,"['email', 'phone', 'reviews', 'offline_governm...",t,t,"Honokaa, Hawaii, United States",Hamakua,Hawaii,20.042660,-155.432590,Entire bed and breakfast,Entire home/apt,2,,1 bath,,1.0,"[""Free parking on premises"", ""Wifi"", ""Long ter...",$85.00,2,700,2,2,700,700,2.0,700.0,,t,0,0,0,250,2021-07-09,42,0,0,2013-02-18,2020-03-22,4.6,4.69,4.69,4.79,4.71,4.48,4.76,,f,1,1,0,0,0.41
1,5065,4412184,2013-05-03,3067352,Olivia,Wayne was very friendly and his place is sweet...,en,5065,https://www.airbnb.com/rooms/5065,20210708132536,2021-07-09,MAUKA BB,"Perfect for your vacation, Staycation or just ...",Neighbors here are friendly but are not really...,https://a0.muscache.com/pictures/36718112/1f0e...,7257,https://www.airbnb.com/users/show/7257,Wayne,2009-01-31,hawaii,HI\r\nWE LIVE HERE IN HONOKAA ON \r\nFARM JUS...,within a few hours,100%,0%,f,https://a0.muscache.com/im/users/7257/profile_...,https://a0.muscache.com/im/users/7257/profile_...,Hamakua Coast,2.0,2.0,"['email', 'phone', 'reviews', 'offline_governm...",t,t,"Honokaa, Hawaii, United States",Hamakua,Hawaii,20.042660,-155.432590,Entire bed and breakfast,Entire home/apt,2,,1 bath,,1.0,"[""Free parking on premises"", ""Wifi"", ""Long ter...",$85.00,2,700,2,2,700,700,2.0,700.0,,t,0,0,0,250,2021-07-09,42,0,0,2013-02-18,2020-03-22,4.6,4.69,4.69,4.79,4.71,4.48,4.76,,f,1,1,0,0,0.41
2,5065,55331648,2015-11-29,33781202,Elspeth And Adam Dobres,We loved our time at this BnB! Beautiful surro...,en,5065,https://www.airbnb.com/rooms/5065,20210708132536,2021-07-09,MAUKA BB,"Perfect for your vacation, Staycation or just ...",Neighbors here are friendly but are not really...,https://a0.muscache.com/pictures/36718112/1f0e...,7257,https://www.airbnb.com/users/show/7257,Wayne,2009-01-31,hawaii,HI\r\nWE LIVE HERE IN HONOKAA ON \r\nFARM JUS...,within a few hours,100%,0%,f,https://a0.muscache.com/im/users/7257/profile_...,https://a0.muscache.com/im/users/7257/profile_...,Hamakua Coast,2.0,2.0,"['email', 'phone', 'reviews', 'offline_governm...",t,t,"Honokaa, Hawaii, United States",Hamakua,Hawaii,20.042660,-155.432590,Entire bed and breakfast,Entire home/apt,2,,1 bath,,1.0,"[""Free parking on premises"", ""Wifi"", ""Long ter...",$85.00,2,700,2,2,700,700,2.0,700.0,,t,0,0,0,250,2021-07-09,42,0,0,2013-02-18,2020-03-22,4.6,4.69,4.69,4.79,4.71,4.48,4.76,,f,1,1,0,0,0.41
3,5065,57598810,2015-12-27,12288841,Lydia,"The organisation was very uncomplicated,\r<br/...",en,5065,https://www.airbnb.com/rooms/5065,20210708132536,2021-07-09,MAUKA BB,"Perfect for your vacation, Staycation or just ...",Neighbors here are friendly but are not really...,https://a0.muscache.com/pictures/36718112/1f0e...,7257,https://www.airbnb.com/users/show/7257,Wayne,2009-01-31,hawaii,HI\r\nWE LIVE HERE IN HONOKAA ON \r\nFARM JUS...,within a few hours,100%,0%,f,https://a0.muscache.com/im/users/7257/profile_...,https://a0.muscache.com/im/users/7257/profile_...,Hamakua Coast,2.0,2.0,"['email', 'phone', 'reviews', 'offline_governm...",t,t,"Honokaa, Hawaii, United States",Hamakua,Hawaii,20.042660,-155.432590,Entire bed and breakfast,Entire home/apt,2,,1 bath,,1.0,"[""Free parking on premises"", ""Wifi"", ""Long ter...",$85.00,2,700,2,2,700,700,2.0,700.0,,t,0,0,0,250,2021-07-09,42,0,0,2013-02-18,2020-03-22,4.6,4.69,4.69,4.79,4.71,4.48,4.76,,f,1,1,0,0,0.41
4,5065,58905911,2016-01-05,41538214,Andrew,Place was great for what we wanted. Be ready t...,en,5065,https://www.airbnb.com/rooms/5065,20210708132536,2021-07-09,MAUKA BB,"Perfect for your vacation, Staycation or just ...",Neighbors here are friendly but are not really...,https://a0.muscache.com/pictures/36718112/1f0e...,7257,https://www.airbnb.com/users/show/7257,Wayne,2009-01-31,hawaii,HI\r\nWE LIVE HERE IN HONOKAA ON \r\nFARM JUS...,within a few hours,100%,0%,f,https://a0.muscache.com/im/users/7257/profile_...,https://a0.muscache.com/im/users/7257/profile_...,Hamakua Coast,2.0,2.0,"['email', 'phone', 'reviews', 'offline_governm...",t,t,"Honokaa, Hawaii, United States",Hamakua,Hawaii,20.042660,-155.432590,Entire bed and breakfast,Entire home/apt,2,,1 bath,,1.0,"[""Free parking on premises"", ""Wifi"", ""Long ter...",$85.00,2,700,2,2,700,700,2.0,700.0,,t,0,0,0,250,2021-07-09,42,0,0,2013-02-18,2020-03-22,4.6,4.69,4.69,4.79,4.71,4.48,4.76,,f,1,1,0,0,0.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571358,50599343,402318999779516646,2021-07-08,203623421,Alaa,"Amazing views, very comfortable space. Conveni...",en,50599343,https://www.airbnb.com/rooms/50599343,20210708132536,2021-07-09,2 Bedroom Penthouse Condo With Ocean View And ...,Penthouse Condo with Ocean View of Holualoa Ba...,Next to the famous Banyan Surf Park. Best surf...,https://a0.muscache.com/pictures/miso/Hosting-...,408826682,https://www.airbnb.com/users/show/408826682,Kristina,2021-06-23,"Kailua-Kona, Hawaii, United States",,within an hour,100%,100%,f,https://a0.muscache.com/im/pictures/user/95902...,https://a0.muscache.com/im/pictures/user/95902...,,0.0,0.0,"['email', 'phone']",t,t,"Kailua-Kona, Hawaii, United States",North Kona,Hawaii,19.607210,-155.976120,Entire condominium,Entire home/apt,6,,2 baths,2.0,4.0,"[""Carbon monoxide alarm"", ""Stove"", ""Extra pill...",$257.00,3,27,3,7,1125,1125,3.3,1125.0,,t,2,20,31,235,2021-07-09,2,2,2,2021-07-05,2021-07-08,5.0,5.00,5.00,5.00,5.00,5.00,4.50,STVR-19-347315,t,1,1,0,0,2.00
571359,50682739,402321928223695709,2021-07-08,363127255,Theresa,"Karina was a fantastic host, super accommodati...",en,50682739,https://www.airbnb.com/rooms/50682739,20210708132536,2021-07-09,Waikiki studio balcony 5 min walk to the Beack,Value-packed studio with lots of amenities in ...,,https://a0.muscache.com/pictures/6f2846ad-283c...,149275171,https://www.airbnb.com/users/show/149275171,Karina,2017-09-05,"Honolulu, Hawaii, United States",Aloha! My family have opportunity to host peop...,within an hour,100%,100%,f,https://a0.muscache.com/im/pictures/user/b69d9...,https://a0.muscache.com/im/pictures/user/b69d9...,Waikiki,3.0,3.0,"['email', 'phone']",t,t,,Primary Urban Center,Honolulu,21.286110,-157.840150,Entire condominium,Entire home/apt,2,,1 bath,,1.0,"[""Mini fridge"", ""Carbon monoxide alarm"", ""Extr...",$98.00,1,365,1,1,365,365,1.0,365.0,,t,5,14,22,73,2021-07-09,1,1,1,2021-07-08,2021-07-08,5.0,5.00,5.00,5.00,5.00,5.00,5.00,"260110210028, 622, TA-233-455-4556-22",f,3,3,0,0,1.00
571360,50710529,402284071928698022,2021-07-08,254399956,Alfred & Ashley,Greta place to stay highly recommended,en,50710529,https://www.airbnb.com/rooms/50710529,20210708132536,2021-07-09,Puakenikeni Hilo Hale,Private unit in a lovely open oasis of a yard ...,"Wonderful district, centrally located from maj...",https://a0.muscache.com/pictures/miso/Hosting-...,408995110,https://www.airbnb.com/users/show/408995110,Tulpe,2021-06-24,"Hilo, Hawaii, United States","I live in Hilo, and enjoy the fresh smell foll...",within an hour,100%,100%,f,https://a0.muscache.com/im/pictures/user/3d823...,https://a0.muscache.com/im/pictures/user/3d823...,,0.0,0.0,"['email', 'phone']",t,t,"Hilo, Hawaii, United States",South Hilo,Hawaii,19.700066,-155.073502,Entire apartment,Entire home/apt,2,,1 bath,1.0,1.0,"[""Washer"", ""Shampoo"", ""Free parking on premise...",$150.00,1,365,1,1,365,365,1.0,365.0,,t,9,31,61,336,2021-07-09,1,1,1,2021-07-08,2021-07-08,5.0,5.00,5.00,5.00,5.00,5.00,5.00,065-102-7456-01,t,1,1,0,0,1.00
571361,50736557,398615833710369020,2021-07-03,156753224,Miku,Best Airbnb I’ve stayed at. The place is so ne...,en,50736557,https://www.airbnb.com/rooms/50736557,20210708132536,2021-07-09,Modern Villa,Aloha and welcome to our brand new home just b...,Our neighborhood is quite safe and relaxing. I...,https://a0.muscache.com/pictures/a2056364-9871...,299553576,https://www.airbnb.com/users/show/299553576,Roshani,2019-10-02,"Ewa Beach, Hawaii, United States",Aloha and Mahalo for looking into my profile. ...,within an hour,100%,100%,f,https://a0.muscache.com/im/pictures/user/daf8a...,https://a0.muscache.com/im/pictures/user/daf8a...,Ewa,1.0,1.0,['phone'],t,t,"Ewa Beach, Hawaii, United States",Ewa,Honolulu,21.360280,-158.048220,Entire guest suite,Entire home/apt,2,,1 bath,1.0,1.0,"[""Mini fridge"", ""Stove"", ""Air conditioning"", ""...",$100.00,2,30,2,2,30,30,2.0,30.0,,t,8,8,8,8,2021-07-09,1,1,1,2021-07-03,2021-07-03,5.0,5.00,5.00,5.00,5.00,5.00,5.00,"910171380086, 602, TA-108-110-3360-02",t,1,1,0,0,1.00


In [16]:
# Narrow the merged frame down to the columns that may prove to be useful
listing_reviews = listing_reviews.loc[
    :,
    [
        "listing_id",
        "name",
        "description",
        "date",
        "accommodates",
        "price",
        "reviewer_id",
        "listing_url",
        "latitude",
        "longitude",
        "review_scores_rating",
        "comments",
    ],
]

In [17]:
# Display the frame after the column selection
listing_reviews

Unnamed: 0,listing_id,name,description,date,accommodates,price,reviewer_id,listing_url,latitude,longitude,review_scores_rating,comments
0,5065,MAUKA BB,"Perfect for your vacation, Staycation or just ...",2013-02-18,2,$85.00,4574728,https://www.airbnb.com/rooms/5065,20.042660,-155.432590,4.6,The place was difficult to find and communicat...
1,5065,MAUKA BB,"Perfect for your vacation, Staycation or just ...",2013-05-03,2,$85.00,3067352,https://www.airbnb.com/rooms/5065,20.042660,-155.432590,4.6,Wayne was very friendly and his place is sweet...
2,5065,MAUKA BB,"Perfect for your vacation, Staycation or just ...",2015-11-29,2,$85.00,33781202,https://www.airbnb.com/rooms/5065,20.042660,-155.432590,4.6,We loved our time at this BnB! Beautiful surro...
3,5065,MAUKA BB,"Perfect for your vacation, Staycation or just ...",2015-12-27,2,$85.00,12288841,https://www.airbnb.com/rooms/5065,20.042660,-155.432590,4.6,"The organisation was very uncomplicated,\r<br/..."
4,5065,MAUKA BB,"Perfect for your vacation, Staycation or just ...",2016-01-05,2,$85.00,41538214,https://www.airbnb.com/rooms/5065,20.042660,-155.432590,4.6,Place was great for what we wanted. Be ready t...
...,...,...,...,...,...,...,...,...,...,...,...,...
571358,50599343,2 Bedroom Penthouse Condo With Ocean View And ...,Penthouse Condo with Ocean View of Holualoa Ba...,2021-07-08,6,$257.00,203623421,https://www.airbnb.com/rooms/50599343,19.607210,-155.976120,5.0,"Amazing views, very comfortable space. Conveni..."
571359,50682739,Waikiki studio balcony 5 min walk to the Beack,Value-packed studio with lots of amenities in ...,2021-07-08,2,$98.00,363127255,https://www.airbnb.com/rooms/50682739,21.286110,-157.840150,5.0,"Karina was a fantastic host, super accommodati..."
571360,50710529,Puakenikeni Hilo Hale,Private unit in a lovely open oasis of a yard ...,2021-07-08,2,$150.00,254399956,https://www.airbnb.com/rooms/50710529,19.700066,-155.073502,5.0,Greta place to stay highly recommended
571361,50736557,Modern Villa,Aloha and welcome to our brand new home just b...,2021-07-03,2,$100.00,156753224,https://www.airbnb.com/rooms/50736557,21.360280,-158.048220,5.0,Best Airbnb I’ve stayed at. The place is so ne...


In [28]:
# Save current "listing_reviews" data frame to pickle file for convenient import at later checkpoints

listing_reviews.to_pickle("listing_reviews_pickle.p")