# AirBnB NY Dataset

### Dataset
Source: [New York City Airbnb Open Data](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data/data) on Kaggle.

In [None]:
! pip install seaborn
! pip3 install seaborn

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import os

# Gather every file name found under the current directory (recursively),
# then print them one per line in the order os.walk yields them.
found_names = [name for _, _, names in os.walk(".") for name in names]
for name in found_names:
    print(name)

.DS_Store
AirBnB Data Analysis.ipynb
AB_NYC_2019.csv
AirBnB Data Analysis-checkpoint.ipynb


In [3]:
# Read the listings CSV from the working directory into a DataFrame
csv_path = "AB_NYC_2019.csv"
df = pd.read_csv(csv_path)

### Let's look at the data

In [4]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [5]:
# (number of rows, number of columns) of the loaded data
df.shape

(48895, 16)

In [6]:
# Column dtypes and non-null counts; reveals which columns contain missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
id                                48895 non-null int64
name                              48879 non-null object
host_id                           48895 non-null int64
host_name                         48874 non-null object
neighbourhood_group               48895 non-null object
neighbourhood                     48895 non-null object
latitude                          48895 non-null float64
longitude                         48895 non-null float64
room_type                         48895 non-null object
price                             48895 non-null int64
minimum_nights                    48895 non-null int64
number_of_reviews                 48895 non-null int64
last_review                       38843 non-null object
reviews_per_month                 38843 non-null float64
calculated_host_listings_count    48895 non-null int64
availability_365                  48895 non-null int64

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,19017140.0,67620010.0,40.728949,-73.95217,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,10983110.0,78610970.0,0.05453,0.046157,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,9471945.0,7822033.0,40.6901,-73.98307,69.0,1.0,1.0,0.19,1.0,0.0
50%,19677280.0,30793820.0,40.72307,-73.95568,106.0,3.0,5.0,0.72,1.0,45.0
75%,29152180.0,107434400.0,40.763115,-73.936275,175.0,5.0,24.0,2.02,2.0,227.0
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


# Data Processing:

### Remove Nulls

In [None]:
# Missing values per column, before any cleaning
df.isna().sum()

In [None]:
# Replace missing `reviews_per_month` with 0 (in the preview above it is empty
# for a listing with zero reviews -- presumably the case for all missing rows).
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas 2.x and
# has no effect on `df` under Copy-on-Write.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

In [None]:
# Missing values per column, re-checked after filling `reviews_per_month`
df.isna().sum()

In [None]:
# Fill missing listing names and host names with placeholder markers.
# Assign back rather than using fillna(..., inplace=True) on a selected
# column, which is deprecated in pandas 2.x and has no effect on `df` under
# Copy-on-Write.
df['name'] = df['name'].fillna("$")
df['host_name'] = df['host_name'].fillna("#")

In [None]:
# Missing values per column, re-checked after filling `name` and `host_name`
df.isna().sum()

In [None]:
# Drop the `last_review` column. errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns=['last_review'], errors='ignore')

In [None]:
# Final check of missing values per column, after dropping `last_review`
df.isna().sum()

## Understanding Data

In [None]:
# Preview the first three rows
df.head(n=3)

In [None]:
# Distinct room types present in the data
df['room_type'].unique()

In [None]:
# Number of listings for each room type
df['room_type'].value_counts()

In [None]:
# Number of listings in each neighbourhood group
df_neighbourhood_group_count = df.neighbourhood_group.value_counts()
df_neighbourhood_group_count

In [None]:
# Count plot of the number of listings in each neighbourhood group.
f, ax = plt.subplots(figsize=(15, 6))
# Pass the column by keyword (newer seaborn treats the first positional
# argument as `data`, not `x`) and draw explicitly on the axes created above.
sns.countplot(x=df['neighbourhood_group'], palette="muted", ax=ax)
# The chart counts listings per neighbourhood group, so title it accordingly.
ax.set_title('Number of listings per neighbourhood group')
plt.show()

In [None]:
# Bar chart of the listing counts per neighbourhood group computed earlier
viz_1 = df_neighbourhood_group_count.plot.bar()
viz_1.set_title('Neighbourhood wise listing')

In [None]:
# Create a sub-dataframe without extreme values: keep only listings priced below 400
df_price_below_400 = df[df.price < 400]
# Use a violin plot to show the density and distribution of prices per neighbourhood group
viz_2 = sns.violinplot(data=df_price_below_400, x='neighbourhood_group', y='price')
viz_2.set_title('Density and distribution of prices for each neighbourhood group')

### Let's explore **Manhattan**

In [None]:
# Keep only the listings whose neighbourhood group is Manhattan
is_manhattan = df['neighbourhood_group'].eq("Manhattan")
df_manhattan = df[is_manhattan]
df_manhattan

In [None]:
# Price column of the Manhattan listings
df_manhattan.loc[:, 'price']

In [None]:
# Summary statistics of Manhattan listing prices
df_manhattan.price.describe()

In [None]:
# Distribution of Manhattan listing prices, restricted to prices below 250.
f, ax = plt.subplots(figsize=(15, 4))
# Select the filtered price column in one step (the earlier unfiltered
# assignment to df1 was immediately overwritten and has been removed).
df1 = df_manhattan.loc[df_manhattan['price'] < 250, 'price']
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot; it is kept here so the cell also runs on older versions.
# Draw on the axes created above instead of relying on the implicit current axes.
sns.distplot(df1, ax=ax)
ax.set_title('Manhattan listing prices below 250')
plt.show()

In [None]:
# Distinct neighbourhoods that appear among the Manhattan listings
df_manhattan['neighbourhood'].unique()

In [None]:
# Which host IDs have the most Manhattan listings?
# Show the 15 hosts with the highest listing counts.
df_manhattan.host_id.value_counts().iloc[:15]

In [None]:
# Cross-check the finding against the existing column
# 'calculated_host_listings_count'. Note this maximum is taken over the
# full dataset `df`, not only the Manhattan subset.
df['calculated_host_listings_count'].max()

In [None]:
# Per-neighbourhood summary of the Manhattan listings: mean price and number
# of rows (counted through host_id), ordered by ascending mean price.
neighbourhood_stats = {'price': 'mean', 'host_id': 'count'}
df_top_prices_by_neighbourhood = (
    df_manhattan
    .groupby('neighbourhood')
    .agg(neighbourhood_stats)
    .sort_values(by='price')
)

In [None]:
# Display the per-neighbourhood table (mean price and host_id count, sorted by price)
df_top_prices_by_neighbourhood

In [None]:
# Number of Manhattan listings in each neighbourhood
df_manhattan['neighbourhood'].value_counts()

# References:

+ https://www.kaggle.com/scsaurabh/complete-analysis-of-airbnb-data-new-york-city
+ https://www.kaggle.com/bee1693/ny-airbnb-eda
+ https://www.kaggle.com/bavalpreet26/eda-air-bnb-26