# Airbnb Price recommendations

In [139]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Data preparation

1. Remove all listings without a review in the last 12 months
2. Remove all listings where the price is $0 or greater than $500
3. Handle missing values in the data
4. Create train/test csv files

In [140]:
# Read the pre-cleaned listings file into a DataFrame and preview the first rows
airbnb = pd.read_csv(
    "Data/listings-cleaned.csv",
)
airbnb.head(5)

Unnamed: 0,id,hos2_is_superhos2,host_listings_count,neighbourhood_cleansed,neighbourhood_group_cleansed,city,zipcode,property_type,room_type,accommodates,...,maximum_nights_avg_ntm,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,2595,1.0,6.0,Midtown,Manhattan,New York,10018,Apartment,Entire home/apt,1,...,1125.0,48,7,94.0,9.0,9.0,10.0,10.0,10.0,9.0
1,3831,1.0,1.0,Clinton Hill,Brooklyn,Brooklyn,11238,Guest suite,Entire home/apt,3,...,730.0,295,75,90.0,9.0,9.0,10.0,9.0,10.0,9.0
2,5099,1.0,1.0,Murray Hill,Manhattan,New York,10016,Apartment,Entire home/apt,2,...,21.0,78,8,90.0,10.0,9.0,10.0,10.0,10.0,9.0
3,5121,1.0,1.0,Bedford-Stuyvesant,Brooklyn,Brooklyn,11216,Apartment,Private room,2,...,730.0,49,0,90.0,8.0,8.0,10.0,10.0,9.0,9.0
4,5178,1.0,1.0,Hell's Kitchen,Manhattan,New York,10019,Apartment,Private room,2,...,14.0,454,47,84.0,9.0,7.0,9.0,9.0,10.0,8.0


In [141]:
# Number of listings loaded (non-null values in the id column)
airbnb.loc[:, 'id'].count()

50599

In [167]:
# Inspect the dtype pandas assigned to each column. Columns reported as
# `object` hold non-numeric values and need conversion before numeric use.
airbnb.dtypes

id                                int64
hos2_is_superhos2               float64
host_listings_count             float64
neighbourhood_cleansed           object
neighbourhood_group_cleansed     object
city                             object
zipcode                          object
property_type                    object
room_type                        object
accommodates                      int64
bathrooms                       float64
bedrooms                        float64
beds                            float64
bed_type                         object
amenities                        object
price                           float64
security_deposit                 object
cleaning_fee                     object
guests_included                   int64
extra_people                     object
minimum_nights                    int64
maximum_nights                    int64
minimum_minimum_nights            int64
maximum_minimum_nights            int64
minimum_maximum_nights            int64


### Remove all listings without a review in the last 12 months (number_of_reviews_ltm)

In [142]:
# Non-null count of number_of_reviews_ltm, for comparison with the total
# number of listings
airbnb.loc[:, 'number_of_reviews_ltm'].count()

50599

In [143]:
# Treat a review count of 0 as missing so that dropna removes listings with
# no reviews in the last 12 months.
# np.nan is used directly: the pd.np alias was deprecated in pandas 1.0 and
# removed in pandas 2.0, where pd.np raises AttributeError.
airbnb['number_of_reviews_ltm'] = airbnb['number_of_reviews_ltm'].replace(0, np.nan)
airbnb = airbnb.dropna(axis=0, how='any', subset=['number_of_reviews_ltm'])

# remaining number of rows
airbnb['id'].count()

29839

### Remove listings with price = 0 or > $500

In [144]:
# Strip the $ sign and thousands separators from price, then convert to float.
# regex=False makes both replacements literal: '$' is otherwise a regex
# end-of-string anchor, and the default for `regex` differs between pandas
# versions.
airbnb['price'] = (
    airbnb['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype('float')
)
airbnb.price

0        225.0
1         89.0
2        200.0
4         79.0
6        150.0
         ...  
50473    100.0
50500    120.0
50567    150.0
50577     50.0
50580     60.0
Name: price, Length: 29839, dtype: float64

In [145]:
# check max price
# Series.max() skips NaN; the builtin max() compares element by element and
# can return NaN or a wrong value when missing prices are present.
airbnb['price'].max()

10000.0

In [146]:
# remove prices = 0
# Replace 0 with NaN, then drop every row whose price is NaN.
# np.nan is used directly: the pd.np alias was removed in pandas 2.0.
airbnb['price'] = airbnb['price'].replace(0, np.nan)
airbnb = airbnb.dropna(axis=0, how='any', subset=['price'])

# remaining number of rows
airbnb['price'].count()

29826

In [147]:
# Keep only listings priced at $500 or less.
airbnb = airbnb.loc[airbnb['price'] <= 500]

# max price (Series.max() returns NaN rather than raising if no rows remain,
# unlike the builtin max())
airbnb['price'].max()

500.0

In [148]:
# Rows left after the price filters (non-null price values)
airbnb.loc[:, 'price'].count()

29351

### Handle missing values

In [150]:
# Count missing values per column
airbnb.isnull().sum()

id                                  0
hos2_is_superhos2                 360
host_listings_count               360
neighbourhood_cleansed              0
neighbourhood_group_cleansed        0
city                               90
zipcode                           219
property_type                       0
room_type                           0
accommodates                        0
bathrooms                          17
bedrooms                           27
beds                               28
bed_type                            0
amenities                           0
square_feet                     29095
price                               0
security_deposit                 7088
cleaning_fee                     3213
guests_included                     0
extra_people                        0
minimum_nights                      0
maximum_nights                      0
minimum_minimum_nights              0
maximum_minimum_nights              0
minimum_maximum_nights              0
maximum_maxi

In [151]:
# Drop the square_feet column (mostly NaN values)
airbnb = airbnb.drop(columns=['square_feet'])

In [152]:
# Start from the full list of column names
cols = list(airbnb.columns)

# Security deposit and cleaning fee may stay missing, so take them out of
# the list of columns that is checked when dropping NaN values.
for optional_column in ('security_deposit', 'cleaning_fee'):
    cols.remove(optional_column)
cols

['id',
 'hos2_is_superhos2',
 'host_listings_count',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'zipcode',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',
 'price',
 'guests_included',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'number_of_reviews',
 'number_of_reviews_ltm',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value']

In [155]:
# Discard every row that is missing a value in any column listed in cols
# (row-wise, how='any' are the dropna defaults)
airbnb = airbnb.dropna(subset=cols)
airbnb.count()

id                              28268
hos2_is_superhos2               28268
host_listings_count             28268
neighbourhood_cleansed          28268
neighbourhood_group_cleansed    28268
city                            28268
zipcode                         28268
property_type                   28268
room_type                       28268
accommodates                    28268
bathrooms                       28268
bedrooms                        28268
beds                            28268
bed_type                        28268
amenities                       28268
price                           28268
security_deposit                21550
cleaning_fee                    25280
guests_included                 28268
extra_people                    28268
minimum_nights                  28268
maximum_nights                  28268
minimum_minimum_nights          28268
maximum_minimum_nights          28268
minimum_maximum_nights          28268
maximum_maximum_nights          28268
minimum_nigh

In [156]:
# Save the cleaned listings without the DataFrame index
airbnb.to_csv(
    'Data/data.csv',
    index=False,
)

### Create train/test splits

In [1]:
# Target: the listing price. Features: every other column.
y = airbnb['price']
# Name the axis through `columns=`: passing it positionally
# (`drop('price', 1)`) was deprecated in pandas 1.3 and raises TypeError in
# pandas 2.0.
X = airbnb.drop(columns='price')

NameError: name 'airbnb' is not defined

In [158]:
# 70/30 train/test split with a fixed seed (123) for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    train_size=0.7,
    random_state=123,
)

# Report the feature and target shapes for each partition
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

(19787, 36) (19787,)
(8481, 36) (8481,)


In [159]:
# Write the train set to CSV with the target (price) as the first column
train = pd.concat((y_train, X_train), axis='columns')
train.to_csv('Data/train.csv', index=False)

# Same layout for the test set
test = pd.concat((y_test, X_test), axis='columns')
test.to_csv('Data/test.csv', index=False)