In [1]:
import pandas as pd
import numpy as np
import re
import scipy.stats as stats
from sklearn import linear_model
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier

In [2]:
au = pd.read_csv('listings_AU.csv')
au.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,2265,https://www.airbnb.com/rooms/2265,20181018034013,2018-10-18,Zen-East in the Heart of Austin,Zen East is situated in a vibrant & diverse mu...,This colorful and clean 1923 house was complet...,Zen East is situated in a vibrant & diverse mu...,none,,...,f,,"{""Texas State""}",f,f,strict_14_with_grace_period,f,f,3,0.2
1,5245,https://www.airbnb.com/rooms/5245,20181018034013,2018-10-18,"Green, Colorful, Clean & Cozy home",,Situated in a vibrant & diverse multicultural ...,Situated in a vibrant & diverse multicultural ...,none,,...,f,,"{""Texas State""}",f,f,strict_14_with_grace_period,f,f,3,0.08
2,5456,https://www.airbnb.com/rooms/5456,20181018034013,2018-10-18,"Walk to 6th, Rainey St and Convention Ctr",Fabulous location for walking to Convention Ce...,Cute Private Studio apartment located in Willo...,Fabulous location for walking to Convention Ce...,none,My neighborhood is ideally located if you want...,...,f,,"{""Texas State""}",f,f,strict_14_with_grace_period,f,t,1,3.9
3,5769,https://www.airbnb.com/rooms/5769,20181018034013,2018-10-18,NW Austin Room,,Looking for a comfortable inexpensive room to ...,Looking for a comfortable inexpensive room to ...,none,Quiet neighborhood with lots of trees and good...,...,f,,"{""Texas State""}",f,f,moderate,t,t,1,2.28
4,6413,https://www.airbnb.com/rooms/6413,20181018034013,2018-10-18,Gem of a Studio near Downtown,"Great studio apartment, perfect for couples or...","(License #114332) Large, contemporary studio a...","Great studio apartment, perfect for couples or...",none,Travis Heights is one of the oldest neighborho...,...,f,32041657928.0,"{""Texas State""}",t,f,strict_14_with_grace_period,f,f,1,0.73


In [3]:
keep = ['host_is_superhost', 'host_total_listings_count', 'host_identity_verified',
        'zipcode','property_type','bathrooms', 'bedrooms', 'beds','price','cancellation_policy',
        'availability_30','availability_60','availability_90','availability_365',
        'reviews_per_month','review_scores_location']
au_sub = au[keep]

In [4]:
au_sub.isnull().sum()

host_is_superhost               2
host_total_listings_count       2
host_identity_verified          2
zipcode                       146
property_type                   0
bathrooms                      32
bedrooms                        6
beds                           23
price                           0
cancellation_policy             0
availability_30                 0
availability_60                 0
availability_90                 0
availability_365                0
reviews_per_month            3042
review_scores_location       3219
dtype: int64

#### Step 1, Impute `review_scores_location`

1. extract relevant col's from original dataset

In [5]:
listDtaSubset = au[['latitude','longitude',"neighbourhood_cleansed", "review_scores_location"]]

2. one-hot encode neighbourhood variable

In [6]:
neighbourhood = pd.get_dummies(listDtaSubset.neighbourhood_cleansed)  # a matrix with number of rows equal to the number of rows in the original dataset and number cols = number unique neighborhood values. For each row, only one element is 1, everything else are 0's.
listDtaSubset = listDtaSubset.join(neighbourhood)
listDtaSubset.drop(columns = "neighbourhood_cleansed", inplace= True)

3. use subset of data to train KNN (rows with all model variable and label non-missing)

In [7]:
# drop all observations with either x or y missing w.r.t. the model
# Note: I remember none of the latitude, longiture info is missing
data = listDtaSubset.dropna(axis = 0, subset = ['latitude','longitude',"review_scores_location"])
# data.shape

4. S4 Use KNN model to predict all location review scores

In [8]:
# split valid data set into training and testing data sets
tmp_train, tmp_test = model_selection.train_test_split(data, test_size = 0.3)
tmp_train_X = tmp_train.drop(columns="review_scores_location")
tmp_train_Y = tmp_train["review_scores_location"]

tmp_test_X = tmp_test.drop(columns="review_scores_location")
tmp_test_Y = tmp_test["review_scores_location"]

In [9]:
# Train KNN model
KNN = KNeighborsClassifier(n_neighbors = 11)  #  I tried k = 3-15. 11 is significantly better than 3, but similar to 12-15 or 10

KNN.fit(tmp_train_X, tmp_train_Y)
KNN_train_score = KNN.score(tmp_train_X,tmp_train_Y)
KNN_test_score = KNN.score(tmp_test_X, tmp_test_Y)

print("training accuracy: ", KNN_train_score, "\ntesting accuracy: ", KNN_test_score)

training accuracy:  0.788399222294232 
testing accuracy:  0.7532123960695389


5. fill missing location review score with predicted missing value

In [10]:
data2 = listDtaSubset.dropna(axis = 0, subset = ['latitude','longitude'])  # should not make any change since none of the latitude and longitude is missing in LA listing dataset
# data2.shape
data2 = data2.drop(columns="review_scores_location")

# predict the location score for ALL observations
review_scores_location_imputed = KNN.predict(data2)

In [11]:
# For observation with NA as review score, substitue in the predicted score
mask = au["review_scores_location"].isna()
au.loc[mask,"review_scores_location"] = review_scores_location_imputed[mask]

#### Step 1, clean up NaN values

In [12]:
au_sub = au[keep]
au_sub.isnull().sum()

host_is_superhost               2
host_total_listings_count       2
host_identity_verified          2
zipcode                       146
property_type                   0
bathrooms                      32
bedrooms                        6
beds                           23
price                           0
cancellation_policy             0
availability_30                 0
availability_60                 0
availability_90                 0
availability_365                0
reviews_per_month            3042
review_scores_location          0
dtype: int64

In [13]:
summary = pd.DataFrame(au_sub.isnull().sum())
man = ['Drop rows with NaN(2)','/','/','Drop rows with NaN(146)','/','Drop rows with NaN(32)','Drop rows with NaN(5)','Drop rows with NaN(6)','/','/','/','/','/','/','?','/']
summary['manipulation']=man
summary

Unnamed: 0,0,manipulation
host_is_superhost,2,Drop rows with NaN(2)
host_total_listings_count,2,/
host_identity_verified,2,/
zipcode,146,Drop rows with NaN(146)
property_type,0,/
bathrooms,32,Drop rows with NaN(32)
bedrooms,6,Drop rows with NaN(5)
beds,23,Drop rows with NaN(6)
price,0,/
cancellation_policy,0,/


In [14]:
#host_is_superhost #2 missing features
au[au['host_is_superhost'].isnull()][['host_id','host_is_superhost']]
au_sub = au_sub[pd.notnull(au_sub['host_is_superhost'])]
#host with missing values just do not have values for all of the listings
#Drop NaN values, then host_total_listings_count and host_identity_verified are cleaned up as well

In [15]:
#zipcode #drop the NaN values
au_sub = au_sub[pd.notnull(au_sub['zipcode'])]

In [16]:
au_sub = au_sub[pd.notnull(au_sub['bathrooms'])]
au_sub = au_sub[pd.notnull(au_sub['bedrooms'])]
au_sub = au_sub[pd.notnull(au_sub['beds'])].reset_index(drop=True)

In [17]:
au_sub.isnull().sum()

host_is_superhost               0
host_total_listings_count       0
host_identity_verified          0
zipcode                         0
property_type                   0
bathrooms                       0
bedrooms                        0
beds                            0
price                           0
cancellation_policy             0
availability_30                 0
availability_60                 0
availability_90                 0
availability_365                0
reviews_per_month            2954
review_scores_location          0
dtype: int64

#### Step 2, Features Transformation

In [18]:
##host_is_superhost: 0-False, 1-True
au_sub['host_is_superhost']=[0 if au_sub['host_is_superhost'][i]=='f' else 1 for i in range(au_sub.shape[0])]

In [19]:
##host_identity_verified: 0-False, 1-True
au_sub['host_identity_verified']= [0 if au_sub['host_identity_verified'][i]=='f' else 1 for i in range(au_sub.shape[0])]

In [20]:
##transform zipcode to be text string
au_sub['zipcode'] = [str(au_sub['zipcode'][i])[0:5] for i in range(au_sub.shape[0])]

In [21]:
##zipcode all cleaned
au_sub['len']=[len(au_sub['zipcode'][i]) for i in range(au_sub.shape[0])]
au_sub['len'].unique()

array([5])

In [22]:
##drop len column
au_sub = au_sub.drop(['len'],axis=1)

In [23]:
##cancellation_policy
au_sub['cancellation_policy'].unique()

array(['strict_14_with_grace_period', 'moderate', 'flexible',
       'super_strict_30', 'super_strict_60'], dtype=object)

In [24]:
au_sub['cancellation_policy'] = au_sub['cancellation_policy'].replace(['strict_14_with_grace_period','super_strict_30','strict','super_strict_60'],'strict')
binary_encoded = pd.get_dummies(au_sub.cancellation_policy)
newcols = binary_encoded.columns
au_sub[newcols] = binary_encoded

In [25]:
###remove $ from price
au_sub['price'] = (au_sub['price'].str[1:])
au_sub['price'] = au_sub['price'].str.replace(',', '')
au_sub['price'] = au_sub['price'].astype(float)

In [26]:
au_sub.head()

Unnamed: 0,host_is_superhost,host_total_listings_count,host_identity_verified,zipcode,property_type,bathrooms,bedrooms,beds,price,cancellation_policy,availability_30,availability_60,availability_90,availability_365,reviews_per_month,review_scores_location,flexible,moderate,strict
0,1,3.0,1,78702,House,2.0,2.0,2.0,225.0,strict,15,28,43,68,0.2,8.0,0,0,1
1,1,3.0,1,78702,House,1.0,1.0,1.0,125.0,strict,29,59,89,179,0.08,10.0,0,0,1
2,1,1.0,1,78702,Guesthouse,1.0,1.0,2.0,95.0,strict,12,38,63,318,3.9,9.0,0,0,1
3,1,1.0,1,78729,House,1.0,1.0,1.0,40.0,moderate,0,0,8,8,2.28,9.0,0,1,0
4,1,2.0,1,78704,Guesthouse,1.0,1.0,2.0,129.0,strict,15,31,35,180,2.18,10.0,0,0,1


In [27]:
au_sub['property_type'].value_counts()

House                 5882
Apartment             3036
Condominium            770
Guesthouse             435
Townhouse              395
Bungalow               348
Guest suite            223
Loft                   140
Serviced apartment     107
Camper/RV               85
Other                   69
Cottage                 47
Tiny house              40
Bed and breakfast       39
Cabin                   36
Boutique hotel          35
Villa                   24
Hostel                  22
Resort                  22
Tent                    16
Aparthotel              15
Campsite                11
Boat                     9
Farm stay                9
Yurt                     7
Tipi                     5
Barn                     4
Treehouse                3
Houseboat                3
Chalet                   3
Nature lodge             2
Dome house               2
Hotel                    1
Earth house              1
Name: property_type, dtype: int64

In [28]:
###binary-encoding for property_type
au_sub['property_type_cleaned'] = au_sub['property_type']
au_sub.loc[(au_sub['property_type'] != 'Apartment') & (au_sub['property_type'] != 'House') 
           & (au_sub['property_type'] != 'Condominium') & (au_sub['property_type'] != 'Guesthouse')
           & (au_sub['property_type'] != 'Townhouse') & (au_sub['property_type'] != 'Guest suite')
           & (au_sub['property_type'] != 'Bungalow') & (au_sub['property_type'] != 'Loft'), 
           'property_type_cleaned'] = 'Other'

In [29]:
au_sub['property_type_cleaned'].value_counts()

House          5882
Apartment      3036
Condominium     770
Other           617
Guesthouse      435
Townhouse       395
Bungalow        348
Guest suite     223
Loft            140
Name: property_type_cleaned, dtype: int64

In [30]:
#one-hot encoding 
binary_encoded1 = pd.get_dummies(au_sub.property_type_cleaned)
newcols1 = binary_encoded1.columns
au_sub[newcols1] = binary_encoded1

#### Step 3, fill missing value in `reviews_per_month` with median value

In [31]:
sub = au_sub["reviews_per_month"].median()
au_sub["reviews_per_month"] = au_sub["reviews_per_month"].fillna(value =sub)

In [32]:
au_sub.isnull().sum()

host_is_superhost            0
host_total_listings_count    0
host_identity_verified       0
zipcode                      0
property_type                0
bathrooms                    0
bedrooms                     0
beds                         0
price                        0
cancellation_policy          0
availability_30              0
availability_60              0
availability_90              0
availability_365             0
reviews_per_month            0
review_scores_location       0
flexible                     0
moderate                     0
strict                       0
property_type_cleaned        0
Apartment                    0
Bungalow                     0
Condominium                  0
Guest suite                  0
Guesthouse                   0
House                        0
Loft                         0
Other                        0
Townhouse                    0
dtype: int64

In [33]:
au_sub.to_csv("listings_au_cleaned.csv")