In [115]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [46]:
# first step is to get familiar with your data 

In [47]:
airbnb_data_path = '../data/new-york-city-airbnb-open-data/AB_NYC_2019.csv'

airbnb = pd.read_csv(airbnb_data_path)

airbnb.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,19017140.0,67620010.0,40.728949,-73.95217,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,10983110.0,78610970.0,0.05453,0.046157,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,9471945.0,7822033.0,40.6901,-73.98307,69.0,1.0,1.0,0.19,1.0,0.0
50%,19677280.0,30793820.0,40.72307,-73.95568,106.0,3.0,5.0,0.72,1.0,45.0
75%,29152180.0,107434400.0,40.763115,-73.936275,175.0,5.0,24.0,2.02,2.0,227.0
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


In [57]:
airbnb = airbnb.dropna(axis=0)

airbnb.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,1,108,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,2,127,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,1,41,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,2,61,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,2,137,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129


In [58]:
# create a mapping for the neighbourhoods to turn them into integers

neighbourhood_map = {}
for i,x in enumerate(airbnb.groupby('neighbourhood').neighbourhood.unique()):
    neighbourhood_map[x[0]] = i
neighbourhood_map

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 11: 11,
 12: 12,
 13: 13,
 14: 14,
 15: 15,
 16: 16,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31,
 32: 32,
 33: 33,
 34: 34,
 35: 35,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 41,
 42: 42,
 43: 43,
 44: 44,
 45: 45,
 46: 46,
 47: 47,
 48: 48,
 49: 49,
 50: 50,
 51: 51,
 52: 52,
 53: 53,
 54: 54,
 55: 55,
 56: 56,
 57: 57,
 58: 58,
 59: 59,
 60: 60,
 61: 61,
 62: 62,
 63: 63,
 64: 64,
 65: 65,
 66: 66,
 67: 67,
 68: 68,
 69: 69,
 70: 70,
 71: 71,
 72: 72,
 73: 73,
 74: 74,
 75: 75,
 76: 76,
 77: 77,
 78: 78,
 79: 79,
 80: 80,
 81: 81,
 83: 82,
 84: 83,
 85: 84,
 86: 85,
 87: 86,
 88: 87,
 89: 88,
 90: 89,
 91: 90,
 92: 91,
 93: 92,
 94: 93,
 95: 94,
 96: 95,
 97: 96,
 98: 97,
 99: 98,
 100: 99,
 101: 100,
 102: 101,
 103: 102,
 104: 103,
 105: 104,
 106: 105,
 107: 106,
 108: 107,
 109: 108,
 110: 109,
 111: 110

In [59]:
# define a mapping for the buroughs to turn them into integers

neighbourhood_group_map = {}
for i,x in enumerate(airbnb.groupby('neighbourhood_group').neighbourhood_group.unique()):
    neighbourhood_group_map[x[0]] = i
neighbourhood_group_map

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4}

In [60]:
# apply the mappings made above to the data so we can make a model

def translate_neighbourhood_info(row):
    row.neighbourhood = neighbourhood_map[row.neighbourhood]
    row.neighbourhood_group = neighbourhood_group_map[row.neighbourhood_group]
    return row

# airbnb = airbnb.apply(translate_neighbourhood_info, axis='columns')
airbnb.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,1,108,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,2,127,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,1,41,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,2,61,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,2,137,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129


In [61]:
y = airbnb.price # prediction target

airbnb_features = ['neighbourhood_group','neighbourhood','latitude', 'longitude','reviews_per_month']
X = airbnb[airbnb_features]

In [62]:
X.head()

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,reviews_per_month
0,1,108,40.64749,-73.97237,0.21
1,2,127,40.75362,-73.98377,0.38
3,1,41,40.68514,-73.95976,4.64
4,2,61,40.79851,-73.94399,0.1
5,2,137,40.74767,-73.975,0.59


In [65]:
# Define model. Specify a number for random_state to ensure same results each run
airbnb_model = DecisionTreeRegressor(random_state=1)

# Fit model
airbnb_model.fit(X, y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=1, splitter='best')

In [67]:
print("Making predictions for the following 5 houses:")
print(X)
print("The predictions are")
print(airbnb_model.predict(X))

Making predictions for the following 5 houses:
       neighbourhood_group  neighbourhood  latitude  longitude  \
0                        1            108  40.64749  -73.97237   
1                        2            127  40.75362  -73.98377   
3                        1             41  40.68514  -73.95976   
4                        2             61  40.79851  -73.94399   
5                        2            137  40.74767  -73.97500   
...                    ...            ...       ...        ...   
48782                    2            201  40.78099  -73.95366   
48790                    3             77  40.75104  -73.81459   
48799                    4             90  40.54179  -74.14275   
48805                    0            134  40.80787  -73.92400   
48852                    1             28  40.69805  -73.92801   

       reviews_per_month  
0                   0.21  
1                   0.38  
3                   4.64  
4                   0.10  
5                   0.59 

In [69]:
airbnb.price

0        149
1        225
3         89
4         80
5        200
        ... 
48782    129
48790     45
48799    235
48805    100
48852     30
Name: price, Length: 38821, dtype: int64

In [70]:
# looking here, we seem to have perfect guesses??
# I think it is because we are training the model on this data and it did a good job training
# I'm guessing it wont be as good when we start to test it

In [71]:
# error=actual−predicted is the mean absolute error 

In [75]:
predictions = airbnb_model.predict(X)
mean_absolute_error(y, predictions)
# we get 0??

0.0

In [None]:
# when we 

In [103]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

print(len(X))
# Define model
airbnb_model = DecisionTreeRegressor()
# Fit model
airbnb_model.fit(train_X, train_y)

# get predicted prices on validation data
val_predictions = airbnb_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

38821
93.50236966824644


In [110]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [111]:
get_mae(17, train_X, val_X, train_y, val_y)

71.31452088878432

In [114]:
for max_leaf_nodes in [2, 10, 101, 1000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 2  		 Mean Absolute Error:  73
Max leaf nodes: 10  		 Mean Absolute Error:  71
Max leaf nodes: 101  		 Mean Absolute Error:  69
Max leaf nodes: 1000  		 Mean Absolute Error:  75


In [117]:
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
airbnb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, airbnb_preds))

73.23932309911393
