## Airbnb Explorer NYC

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from dash import Dash, html, dcc, Input, Output
import dash_bootstrap_components as dbc

In [2]:
# Read the NYC Airbnb listings from the local CSV file
csv_path = 'data/AB_NYC_2019.csv'
df = pd.read_csv(csv_path)

# Preview the first 5 rows of the dataframe
df.head(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [3]:
# Number of rows in the dataframe (first entry of its shape)
df.shape[0]

48895

In [4]:
# Show the dtype of every column. No dtypes were passed to read_csv, so these
# are the types pandas inferred while loading the file.
df.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [5]:
# Count the missing values in each column
df.isna().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [6]:
# Replace missing reviews_per_month values with 0.
# NOTE(review): these are presumably the listings without any review
# (last_review is missing for the same number of rows) - confirm.
# The result is assigned back to the column instead of using inplace=True,
# which makes the mutation of df explicit.
df["reviews_per_month"] = df["reviews_per_month"].fillna(0)

# Confirm that no missing values remain in the column
df["reviews_per_month"].isnull().sum()

0

In [7]:
# Distinct values found in the neighbourhood_group column
df.loc[:, "neighbourhood_group"].unique()

array(['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx'],
      dtype=object)

In [8]:
# How many distinct values the neighbourhood column contains
df.loc[:, "neighbourhood"].nunique()

221

In [9]:
# Summary statistics of the price column, rounded to one decimal place
price_summary = df["price"].describe()
price_summary.round(1)

count    48895.0
mean       152.7
std        240.2
min          0.0
25%         69.0
50%        106.0
75%        175.0
max      10000.0
Name: price, dtype: float64

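The summary shows that the price column contains extreme values: the minimum is 0 and the maximum is 10,000 dollars, far above the 75th percentile of 175 dollars. These outliers must be removed before visualization.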

In [10]:
# 98th percentile of the price column; used below as the upper price cutoff
max_price = df["price"].quantile(q=0.98)
max_price

550.0

In [11]:
# Keep only the rows whose price is above 0 and not greater than max_price,
# i.e. drop zero-priced listings and the extreme values of the price column
has_valid_price = (df["price"] > 0) & (df["price"] <= max_price)
df = df.loc[has_valid_price]

# Summary statistics of the price column after filtering
df["price"].describe().round(1)

count    47949.0
mean       132.5
std         90.1
min         10.0
25%         68.0
50%        102.0
75%        175.0
max        550.0
Name: price, dtype: float64

In [12]:
# Lowest and highest price (used as the slider bounds in the layout below),
# plus the mean price rounded to the nearest integer
price_column = df["price"]
min_price, max_price = price_column.min(), price_column.max()
mean_price = round(price_column.mean())

In [13]:
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Range slider that drives the map; it initially spans min_price..max_price
price_slider = dcc.RangeSlider(
    id="slider-price",
    min=min_price,
    max=max_price,
    step=1,
    value=[min_price, max_price],
    marks=None,
    allowCross=False,
    tooltip={"placement": "bottom", "always_visible": True},
)

# Caption and slider placed in the same row
price_filter_row = dbc.Row(
    [
        dbc.Col([html.P("Select the price range (in dollars):")], width="auto"),
        dbc.Col([price_slider]),
    ],
    align="center",
)

# Page: title, section heading, price filter, then the map
app.layout = dbc.Container(
    [
        html.H1("Airbnb Explorer NYC"),
        html.Hr(),
        html.H3("Visualize the geographic distribution of housing on a map:"),
        price_filter_row,
        dcc.Graph(id="price-map"),
        html.Hr(),
    ]
)

@app.callback(
    Output("price-map", "figure"),
    Input("slider-price", "value"),
)
def update_price_map(slider_price):
    """Build the listings map for the price range selected on the slider.

    Parameters
    ----------
    slider_price : sequence
        Current value of the "slider-price" component. Element 0 is used as
        the lower price bound and element 1 as the upper bound, both inclusive.

    Returns
    -------
    The plotly figure assigned to the "figure" property of "price-map".
    """
    lowest, highest = slider_price[0], slider_price[1]

    # Rows of the module-level df whose price lies inside the selected range
    in_range = df["price"].between(lowest, highest)
    listings = df[in_range]

    fig = px.scatter_mapbox(
        listings,
        lat="latitude",
        lon="longitude",
        color="price",
        color_continuous_scale=px.colors.sequential.Turbo,
        opacity=0.6,
        hover_name="name",
        hover_data=["price", "availability_365"],
        zoom=9,
        height=500,
    )

    # Base map style, zero margins and the colorbar title, set in a single call
    fig.update_layout(
        mapbox_style="carto-positron",
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        showlegend=True,
        coloraxis_colorbar={"title": "Price"},
    )
    return fig

if __name__ == "__main__":
    # Start the Dash server on port 8050 in debug mode, with the reloader disabled
    app.run(
        port=8050,
        debug=True,
        use_reloader=False,
    )

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app '__main__'
 * Debug mode: on
