In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
import seaborn as sns
import geopandas as gp
import folium
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.preprocessing import MinMaxScaler as mms
from scipy.spatial.distance import cdist
from sklearn.linear_model import LinearRegression as LR
from sklearn import metrics

In [2]:
# Default figure size (width, height in inches) for the plots drawn below.
sns.set(rc = {'figure.figsize':(20,8)})

**Acquiring the data**

In [3]:
# Load the listings CSV; the path is relative to the notebook's working directory.
df=pd.read_csv("../input/new-york-city-airbnb-open-data/AB_NYC_2019.csv")
# Preview the first rows.
df.head()

In [4]:
# Summary statistics for the numeric columns.
df.describe()

In [5]:
# Summary (count / unique / top / freq) for the object-dtype columns.
df.describe(include=['O'])

In [6]:
# NOTE(review): hard-coded ratio; presumably a category frequency divided by the
# total row count, both read off the outputs above — confirm, and prefer deriving
# it from df so it stays correct if the data changes.
25409/48895

In [7]:
# Column dtypes and non-null counts.
df.info()

In [8]:
# Number of missing values in each column.
df.isnull().sum()

In [9]:
# Replace missing reviews_per_month values with 0.
# NOTE(review): presumably a missing value means the listing has no reviews — confirm.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

**NEAREST LANDMARK**

In [10]:
# Load the landmark table; later cells read its 'landmark', 'latitude' and
# 'longitude' columns.
nylm = pd.read_csv("../input/nylandmark/nylandmark.csv")
nylm

In [11]:
# Tag every listing with the row index of its closest landmark.
# Coordinates are selected by column name instead of by position, so the result
# does not silently change if either CSV gains or reorders columns.
# NOTE: this is a plain Euclidean distance on raw latitude/longitude degrees —
# good enough to rank landmarks, but not a true ground distance.
coord_cols = ['latitude', 'longitude']
ary = cdist(df[coord_cols], nylm[coord_cols], metric='euclidean')
# One row per listing (sharing df's index), one column per landmark.
test = pd.DataFrame(ary, index=df.index)
# Column label of the smallest distance in each row, i.e. the landmark's position in nylm.
testing = test.idxmin(axis=1)
results = testing.to_frame('nearest_landmark')
clustered = pd.concat([df, results], axis = 1)
clustered

In [12]:
# Translate the landmark index stored in 'nearest_landmark' into a readable name.
# The result is assigned back to the column: calling .replace(..., inplace=True)
# on a column selection is chained assignment, which pandas warns about and which
# leaves `clustered` untouched once Copy-on-Write is enabled.
# NOTE(review): the names are hard-coded and presumably follow the row order of
# nylandmark.csv — confirm against that file.
landmark_names = {
    "0": "The Metropolitan Museum of Art",
    "1": "Central Park",
    "2": "Statue of Liberty",
    "3": "Guggenheim Museum",
    "4": "Empire State Building",
    "5": "Times Square",
    "6": "American Museum of Natural History",
    "7": "Brooklyn Bridge",
    "8": "The High Line",
    "9": "Grand Central Station",
    "10": "9/11 Memorial Pools",
    "11": "Brooklyn Museum",
    "12": "Rockefeller",
    "13": "New York Botanical Garden",
    "14": "Coney Island",
    "15": "Chrysler Building",
    "16": "Bronx Zoo",
    "17": "Flatiron Building",
    "18": "St Patrick's Cathedral",
    "19": "Madison Square Garden",
    "20": "Yankee Stadium",
    "21": "UN Headquarters",
    "22": "New York City Hall",
    "23": "Wall Street",
    "24": "Dyson Store",
}
# Keys are strings, so cast the integer index first; values already holding a
# name pass through unchanged, which keeps the cell safe to re-run.
clustered['nearest_landmark'] = clustered['nearest_landmark'].astype(str).replace(landmark_names)
clustered

In [13]:
# Average listing price for each nearest landmark, cheapest first.
lm_mean = (
    clustered[['nearest_landmark', 'price']]
    .groupby('nearest_landmark', as_index=False)
    .mean()
    .sort_values('price')
)
lm_mean

In [14]:
# Bars run from the most to the least expensive landmark.
plot_order = lm_mean.sort_values('price', ascending=False)['nearest_landmark'].values
LandmarkMean = sns.barplot(data=lm_mean, x='nearest_landmark', y='price', order=plot_order)
# Landmark names are long; turn the tick labels so they do not overlap.
plt.xticks(rotation=270)
LandmarkMean.set_xlabel('Nearest Landmark')
LandmarkMean.set_ylabel('Mean Price')
LandmarkMean.set_title('Mean Price of Listings by Nearest Landmark')

In [15]:
# Borough outlines, used as the base layer of the maps below.
nybb = gp.read_file('../input/new-york-boroughs/geo_export_782dc836-1ce6-418b-bfe4-90af19c5617c.shp')
# Listings as point geometries (x = longitude, y = latitude).
# NOTE(review): a later cell drops a 'geometry' column from a frame derived from
# `clustered`, which suggests this constructor adds that column to `clustered`
# itself in the geopandas/pandas versions used — confirm before upgrading.
gdfcl = gp.GeoDataFrame(clustered, geometry=gp.points_from_xy(clustered.longitude, clustered.latitude))
# Landmarks as point geometries.
gdflm = gp.GeoDataFrame(nylm, geometry=gp.points_from_xy(nylm.longitude, nylm.latitude))

In [16]:
# Read the borough outlines and draw them as the base layer.
nybb = gp.read_file('../input/new-york-boroughs/geo_export_782dc836-1ce6-418b-bfe4-90af19c5617c.shp')
ax = nybb.plot(color='white', edgecolor='black')
# Listings on the same axes, coloured by their nearest landmark.
bhplt=gdfcl.plot(ax=ax, column = 'nearest_landmark', legend = True, cmap = 'tab20')
# Landmarks on top, drawn as gold stars.
lmplt = gdflm.plot(ax=ax, column = 'landmark', color='gold', marker = '*', markersize = 75)
plt.show()

**NEIGHBOURHOOD_GROUP**

In [17]:
# Histogram of all listing prices, using bins 20 price units wide.
sns.histplot(data = df, x = 'price', stat = "count", binwidth = 20)

In [18]:
# Compare Brooklyn and Manhattan price histograms, ignoring listings above 1000.
within_price_cap = df.price <= 1000
in_selected_boroughs = df.neighbourhood_group.isin(['Brooklyn', 'Manhattan'])
df2 = df[within_price_cap & in_selected_boroughs]
sns.histplot(data=df2, x='price', hue='neighbourhood_group', stat="count", binwidth=20)

In [19]:
# Mean and median price per borough in one table, highest mean first.
price_by_boro = df[['neighbourhood_group', 'price']].groupby('neighbourhood_group', as_index=False)
# groupby already returns the boroughs in ascending order, so both frames line up row by row.
mean = price_by_boro.mean().rename(columns={'price': 'mean_price'})
median = price_by_boro.median()[['price']].rename(columns={'price': 'median_price'})
df1 = pd.concat([mean, median], axis=1).sort_values('mean_price', ascending=False)
df1

In [20]:
# Two stacked bar charts: mean price per borough on top, median price below.
plt.subplot(2, 1, 1)
ax1 = sns.barplot(data=df1, x='neighbourhood_group', y='mean_price')
plt.subplot(2, 1, 2)
ax2 = sns.barplot(data=df1, x='neighbourhood_group', y='median_price')
ax1.set_ylabel('Mean Price')
# The top panel shares its categories with the bottom one, so it carries no x label.
ax1.set_xlabel('')
ax1.set_title('Mean Price of Listings by Borough')
# The bottom panel plots median_price, so its y axis is labelled accordingly.
ax2.set_ylabel('Median Price')
ax2.set_xlabel('Borough')
ax2.set_title('Median Price of Listings by Borough')

In [21]:
# Price distribution summary (min, selected percentiles, max) per borough,
# one column per borough and one row per statistic.
boros = ['Brooklyn', 'Bronx', 'Manhattan', 'Queens', 'Staten Island']
percentiles = [.25, .5, .75, .9, .95, .99]
# One single-column price frame per borough, in the same order as `boros`.
bbprice = [df.loc[df['neighbourhood_group'] == boro, ['price']] for boro in boros]
df6 = []
for boro, prices in zip(boros, bbprice):
    # describe() yields count, mean, std, min, the percentiles and max;
    # the first three rows are skipped.
    stats = prices.describe(percentiles = percentiles).iloc[3:]
    # Name the column after the borough and label the index 'Stats'.
    df6.append(stats.rename(columns = {'price' : boro}).rename_axis('Stats'))
# Join all borough columns on the shared 'Stats' index.
df6 = df6[0].join(df6[1:])
df6

In [22]:
# Cap prices at 550 before plotting (noted by the author as the 95% percentile for Manhattan).
df2 = df.loc[df['price'] <= 550]
viol1 = sns.violinplot(data=df2, x='neighbourhood_group', y='price')
viol1.set_xlabel('Borough')
viol1.set_ylabel('Price')
viol1.set_title('Distribution of Prices by Borough')

In [23]:
# Price distribution for each Manhattan neighbourhood, with the same 550 price cap.
df3 = df[(df.neighbourhood_group == 'Manhattan') & (df.price <= 550)]
sns.violinplot(data=df3, x='neighbourhood', y='price')
plt.xticks(rotation=45)

**ROOMS**

In [24]:
# Histogram of all listing prices, split by room type (bins 20 price units wide).
sns.histplot(data = df, x = 'price', stat = "count", hue = 'room_type', binwidth = 20)

In [25]:
# Same histogram restricted to listings priced at 1000 or less.
df2 = df[df.price <= 1000]
sns.histplot(data = df2, x = 'price', hue = 'room_type', stat = "count", binwidth = 20)

In [26]:
# Mean and median price per room type in one table, highest mean first.
price_by_room = df[['room_type', 'price']].groupby('room_type', as_index=False)
# groupby already returns the room types in ascending order, so both frames line up row by row.
mean2 = price_by_room.mean().rename(columns={'price': 'mean_price'})
median2 = price_by_room.median()[['price']].rename(columns={'price': 'median_price'})
df2 = pd.concat([mean2, median2], axis=1).sort_values('mean_price', ascending=False)
df2

In [27]:
# Two stacked bar charts: mean price per room type on top, median price below.
plt.subplot(2, 1, 1)
ax2 = sns.barplot(data=df2, x='room_type', y='mean_price')
plt.subplot(2, 1, 2)
ax3 = sns.barplot(data=df2, x='room_type', y='median_price')
# Label the two axes created in this cell (ax2 / ax3); labels must not be set on
# `viol1`, which belongs to the borough violin plot drawn in an earlier cell.
ax2.set_ylabel('Mean Price')
ax2.set_xlabel('')
ax2.set_title('Mean Price of Listings by Room Type')
ax3.set_ylabel('Median Price')
ax3.set_xlabel('Room Type')
ax3.set_title('Median Price of Listings by Room Type')

In [28]:
# Price distribution summary (min, selected percentiles, max) per room type,
# one column per room type and one row per statistic.
rt = ['Entire home/apt', 'Private room', 'Shared room']
percentiles = [.25, .5, .75, .9, .95, .99]
# One single-column price frame per room type, in the same order as `rt`.
rtprice = [df.loc[df['room_type'] == room_type, ['price']] for room_type in rt]
df7 = []
for room_type, prices in zip(rt, rtprice):
    # describe() yields count, mean, std, min, the percentiles and max;
    # the first three rows are skipped.
    stats = prices.describe(percentiles = percentiles).iloc[3:]
    # Name the column after the room type and label the index 'Stats'.
    df7.append(stats.rename(columns = {'price' : room_type}).rename_axis('Stats'))
# Join all room-type columns on the shared 'Stats' index.
df7 = df7[0].join(df7[1:])
df7

In [29]:
# Cap prices at 550 before plotting (noted by the author as the 95% percentile for Manhattan).
df2 = df.loc[df['price'] <= 550]
viol2 = sns.violinplot(data=df2, x='room_type', y='price')
viol2.set_xlabel('Room Type')
viol2.set_ylabel('Price')
viol2.set_title('Distribution of Prices by Room Type')

**Combining Room type and location**

In [30]:
# Combined "<room type> <borough>" label for every listing.
# assign() returns a new frame, so the raw `df` is left untouched; a plain
# `df4 = df` would only alias it and write the new column into `df` as well.
df4 = df.assign(room_neighbourhood = df['room_type'] + ' ' + df['neighbourhood_group'])

In [31]:
# Price distribution for each room-type/borough combination, capped at 450.
df5 = df4.loc[df4['price'] <= 450]
sns.violinplot(data=df5, x='room_neighbourhood', y='price')
plt.xticks(rotation=45)

**BUSIEST HOSTS**

In [32]:
# All listings, ordered by how many listings their host has (largest first).
bh = clustered.sort_values(by = 'calculated_host_listings_count', ascending = False)
# One row per host: the first listing of each host_id in `clustered` order is kept,
# then hosts are ordered by listing count.
bhd = clustered.drop_duplicates('host_id').sort_values(by = 'calculated_host_listings_count', ascending = False)
# The ten hosts with the most listings.
bh10 = bhd.head(10)
bh10

In [33]:
# Listing count of the ten busiest hosts, largest first.
plot_order = bh10.sort_values('calculated_host_listings_count', ascending=False)['host_id'].values
sns.barplot(data=bh10, x='host_id', y='calculated_host_listings_count', order=plot_order)

In [34]:
# Every listing that belongs to one of ten selected hosts.
# NOTE(review): the ids are hard-coded; presumably they are the hosts in bh10 — confirm.
top_host_ids = [
    219517861, 107434423, 30283594, 137358866, 16098958,
    12243051, 61391963, 22541573, 200380610, 7503643,
]
bht10 = bh[bh.host_id.isin(top_host_ids)]
# Columns that are not needed for the host map.
unused_for_map = [
    'id', 'name', 'host_name', 'neighbourhood_group', 'neighbourhood', 'room_type',
    'minimum_nights', 'number_of_reviews', 'reviews_per_month',
    'calculated_host_listings_count', 'availability_365', 'last_review',
]
bhgp = bht10.drop(columns=unused_for_map)
# host_id is stored as text so the map can use it as a category.
bhgp['host_id'] = bhgp['host_id'].astype(str)
bh_gdf = gp.GeoDataFrame(bhgp, geometry=gp.points_from_xy(bhgp.longitude, bhgp.latitude))
bh_gdf

In [35]:
# Mean and median listing price for each of the selected hosts.
bht10_0 = bht10.sort_values('calculated_host_listings_count', ascending=False)
price_by_host = bht10_0[['host_id', 'price']].groupby('host_id', as_index=False)
# Both aggregates come back in the same host_id order, so they line up row by row.
mean3 = price_by_host.mean().rename(columns={'price': 'mean_price'})
median3 = price_by_host.median()[['price']].rename(columns={'price': 'median_price'})
bht10_1 = pd.concat([mean3, median3], axis=1)
bht10_1

In [36]:
# Mean price (top) and median price (bottom) for the selected hosts.
plt.subplot(2, 1, 1)
ax2 = sns.barplot(data=bht10_1, x='host_id', y='mean_price')
plt.subplot(2, 1, 2)
ax1 = sns.barplot(data=bht10_1, x='host_id', y='median_price')
plt.show()

In [37]:
# Read the borough outlines and draw them as the base layer.
nybb = gp.read_file('../input/new-york-boroughs/geo_export_782dc836-1ce6-418b-bfe4-90af19c5617c.shp')
ax = nybb.plot(color='white', edgecolor='black')
# Listings of the selected hosts, one colour per host_id.
bhplt=bh_gdf.plot(ax=ax, column = 'host_id', legend = True, cmap = 'tab10')
bhplt.set_title('Map of the Busiest Hosts')

**MOST REVIEWED**

In [38]:
# Listings ordered by review count (largest first); keep the first 100.
mr = clustered.sort_values(by = 'number_of_reviews', ascending = False)
mr100 = mr.head(100)
mr100

In [39]:
# Review count of the 100 most-reviewed listings, largest first.
plot_order1 = mr100.sort_values('number_of_reviews', ascending=False)['id'].values
sns.barplot(data=mr100, x='id', y='number_of_reviews', order=plot_order1)
plt.xticks(rotation=90)

In [40]:
# Mean and median price of the 100 most-reviewed listings.
mean4 = mr100[['price']].mean()
median4 = mr100[['price']].median()
# Concatenating the two unnamed Series side by side yields columns labelled 0 and 1.
mr100_1 = pd.concat([mean4, median4], axis = 1)
mr100_1 # column 0 is the mean price, column 1 is the median price

In [41]:
# Map the 100 most-reviewed listings, coloured by room type.
# The string cast is applied to a derived frame: mr100 is a head() slice of `mr`,
# and assigning a column on it raises SettingWithCopyWarning and leaves mr100
# with string review counts for any cell that is re-run afterwards.
mr100_str = mr100.assign(number_of_reviews = mr100['number_of_reviews'].astype(str))
gpmr = gp.GeoDataFrame(mr100_str, geometry=gp.points_from_xy(mr100_str.longitude, mr100_str.latitude))
# Borough outlines as the base layer.
nybb = gp.read_file('../input/new-york-boroughs/geo_export_782dc836-1ce6-418b-bfe4-90af19c5617c.shp')
ax = nybb.plot(color='white', edgecolor='black')
mrplt=gpmr.plot(ax=ax, column = 'room_type', legend = True, cmap='Set1')
mrplt.set_title('Map of the Most Reviewed')

In [42]:
# Visitor shortlist: for every (nearest landmark, room type) pair keep the 20
# most-reviewed listings that have at least one review and cost at most 200.
landmarks = [
    'The Metropolitan Museum of Art',
    'Central Park',
    'Statue of Liberty',
    'Guggenheim Museum',
    'Empire State Building',
    'Times Square',
    'American Museum of Natural History',
    'Brooklyn Bridge',
    'The High Line',
    'Grand Central Station',
    '9/11 Memorial Pools',
    'Brooklyn Museum',
    'Rockefeller',
    'New York Botanical Garden',
    'Coney Island',
    'Chrysler Building',
    'Bronx Zoo',
    'Flatiron Building',
    "St Patrick's Cathedral",
    'Madison Square Garden',
    'Yankee Stadium',
    'UN Headquarters',
    'New York City Hall',
    'Wall Street',
    'Dyson Store',
]
room_types = ['Entire home/apt', 'Private room', 'Shared room']
top_n = 20

# Filter on clustered's own columns; a boolean mask taken from the separate `df`
# only works for as long as both frames happen to share the same index.
prep = clustered.loc[(clustered['number_of_reviews'] != 0) & (clustered['price'] <= 200)]
prep = prep.sort_values(by = 'number_of_reviews', ascending = False)

# Stacking order: last landmark first and, within a landmark, shared rooms,
# then private rooms, then entire homes. Pairs without listings contribute
# an empty frame.
top_listings = [
    prep[(prep.nearest_landmark == landmark) & (prep.room_type == room_type)].head(top_n)
    for landmark in reversed(landmarks)
    for room_type in reversed(room_types)
]
cust_use_df = pd.concat(top_listings, axis=0)
cust_use_df = cust_use_df.drop(columns=['host_id', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365'])
# 'geometry' is only present when an earlier GeoDataFrame(clustered, ...) call added
# it to `clustered` in place, which depends on the installed geopandas/pandas
# versions, so its absence is tolerated.
cust_use_df = cust_use_df.drop(columns=['geometry'], errors='ignore')
cust_use_df

In [43]:
# Interactive map: boroughs as the base layer, shortlisted listings and landmarks on top.
gdfcust = gp.GeoDataFrame(cust_use_df, geometry=gp.points_from_xy(cust_use_df.longitude, cust_use_df.latitude))
m = nybb.explore(
     column="boro_name", # colour each borough by its "boro_name" value
     popup=True, # show all values in popup (on click)
     tiles="CartoDB positron", # use "CartoDB positron" tiles
     cmap="Pastel1", # use the "Pastel1" matplotlib colormap
     style_kwds=dict(color="black")
)

# Shortlisted listings, coloured by nearest landmark, added to the same map.
gdfcust.explore(
     m=m, # pass the map object
     column = "nearest_landmark", 
     marker_kwds=dict(radius=10, fill=True),
     cmap = 'tab20'
)
# Landmarks as small black markers, added to the same map.
gdflm.explore(
     m=m, # pass the map object
     column = "landmark", 
     marker_kwds=dict(radius=2, fill=True),
     style_kwds=dict(color = "black")
)

folium.TileLayer('Stamen Toner', control=True).add_to(m)  # use folium to add alternative tiles
folium.LayerControl().add_to(m)  # use folium to add layer control
m

**PRICE PREDICTION**

In [44]:
# Modelling sample: listings with a price above 0 and at most 550.
# The mask is built from clustered's own price column; a mask taken from the
# separate `df` only works for as long as both frames share the same index.
in_price_range = (clustered['price'] > 0) & (clustered['price'] <= 550)
p = clustered.loc[in_price_range]
# One-hot encode the categorical predictors one at a time; each call appends its
# indicator columns to the right of the frame.
one_hot0=pd.get_dummies(p, columns = ['room_type'])
one_hot1=pd.get_dummies(one_hot0, columns = ['neighbourhood_group'])
one_hot=pd.get_dummies(one_hot1, columns = ['nearest_landmark'])
one_hot.columns

In [45]:
# Model log(price) instead of price.
# The log is taken from the untransformed prices in `p` (same rows and index as
# `one_hot`), so re-running this cell does not apply the log a second time.
one_hot['price'] = np.log(p['price'])
one_hot

In [46]:
# Predictors used by the regression, in the column order the model is fitted on.
feature_cols = [
    'latitude', 'longitude',
    'room_type_Entire home/apt', 'room_type_Private room',
    'neighbourhood_group_Manhattan', 'neighbourhood_group_Queens', 'neighbourhood_group_Brooklyn',
    'nearest_landmark_Brooklyn Museum', 'nearest_landmark_Yankee Stadium',
    'nearest_landmark_Times Square', 'nearest_landmark_The High Line',
    'nearest_landmark_Flatiron Building',
]
xprep = one_hot[feature_cols]
# Kept as a one-column frame so y is 2-D, which price_function relies on.
yprep = one_hot[['price']]
# Force a float matrix: indicator columns may be boolean, and mixing them with
# the float coordinates would otherwise yield an object array.
X = xprep.to_numpy(dtype=float)
y = yprep.to_numpy()
# Fixed seed so the split, and therefore the reported metrics, are reproducible.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)
yprep.describe()

In [47]:
# Fit an ordinary least-squares model and score it on the held-out split.
# The target was log-transformed in an earlier cell, so the errors below are in
# log-price units.
regtest = LR().fit(X_train, y_train)
y_hat = regtest.predict(X_test)
mae = metrics.mean_absolute_error(y_test, y_hat)
mse = metrics.mean_squared_error(y_test, y_hat)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print('R2 Score:', metrics.r2_score(y_test, y_hat))

In [48]:
def price_function(new_listing):
    """Print the predicted price for every listing in ``new_listing``.

    Parameters
    ----------
    new_listing : array-like of shape (n_listings, n_features)
        Feature rows in the column order the global ``regtest`` model was
        fitted on.

    Returns
    -------
    None
        Each prediction is printed on its own line as ``$ <price>``, truncated
        to a whole number.
    """
    # The model is fitted on log(price), so exp() brings predictions back to prices.
    logprice = regtest.predict(new_listing)
    price = np.exp(logprice).astype('int')
    # predict() returns a 1-D array when the model was fitted on a 1-D target;
    # view it as one column so both shapes are handled the same way.
    if price.ndim == 1:
        price = price[:, None]
    # One line per listing, not only the first one.
    for row in price:
        print("$", *row)

In [49]:
# Example prediction for a single listing. The array columns must follow the order the model was fitted on:
# 'latitude', 'longitude', 'room_type_Entire home/apt', 'room_type_Private room', 'neighbourhood_group_Manhattan', 'neighbourhood_group_Queens', 'neighbourhood_group_Brooklyn', 'nearest_landmark_Brooklyn Museum', 'nearest_landmark_Yankee Stadium', 'nearest_landmark_Times Square', 'nearest_landmark_The High Line', 'nearest_landmark_Flatiron Building'
new_listing = np.array([[40.7493, -73.9699, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]])
price_function(new_listing)