# EDA on Air BnB NYC data

## This is part 1 of the exercise where primarily data wrangling has been done
## Output of this exercise is a clean version of the Airbnb data, which will be uploaded on kaggle
## The next part will be about data visualization

In [2]:
# Importing the essential libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import plotly_express as px
import os
import gc

In [5]:
# Setting up display parameters

pd.set_option('display.max_columns', None)
%matplotlib inline

In [6]:
%%HTML
<style type=""text/css"">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [4]:
# Looking for the dataset in local folder

# os.listdir('/home/sandeep/Development/Datasets/Hospitality/airbnb')

In [5]:
# Reading the first line of the apparent dataset file

# with open('/home/sandeep/Development/Datasets/Hospitality/airbnb/Airbnb_Open_Data.csv', 'r') as f:
#     for line in f:
#         print(line)
#         break

In [8]:
# Reading the csv file in pandas dataframe

df = pd.read_csv('../input/airbnbopendata/Airbnb_Open_Data.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../input/airbnbopendata/Airbnb_Open_Data.csv'

In [7]:
df.shape

(102599, 26)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102599 entries, 0 to 102598
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              102599 non-null  int64  
 1   NAME                            102349 non-null  object 
 2   host id                         102599 non-null  int64  
 3   host_identity_verified          102310 non-null  object 
 4   host name                       102193 non-null  object 
 5   neighbourhood group             102570 non-null  object 
 6   neighbourhood                   102583 non-null  object 
 7   lat                             102591 non-null  float64
 8   long                            102591 non-null  float64
 9   country                         102067 non-null  object 
 10  country code                    102468 non-null  object 
 11  instant_bookable                102494 non-null  object 
 12  cancellation_pol

In [9]:
# Many null values are present, need to act upon them, let's check the nulls

df.isnull().sum().sort_values(ascending=False)

license                           102597
house_rules                        52131
last review                        15893
reviews per month                  15879
country                              532
availability 365                     448
minimum nights                       409
host name                            406
review rate number                   326
calculated host listings count       319
host_identity_verified               289
service fee                          273
NAME                                 250
price                                247
Construction year                    214
number of reviews                    183
country code                         131
instant_bookable                     105
cancellation_policy                   76
neighbourhood group                   29
neighbourhood                         16
long                                   8
lat                                    8
id                                     0
host id         

In [10]:
# What on earth is this license column, where just 2 entries are present?

df.loc[~df.license.isnull()]

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,country code,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules,license
11114,7139598,"Cozy 1 BR on Bedford Avenue, Wburg",73023181304,verified,Christina,Brooklyn,Williamsburg,40.71764,-73.95689,United States,US,True,strict,Private room,2010.0,$702,$140,1.0,1.0,1/3/2016,0.02,1.0,1.0,191.0,"Dear Guest, Thank you for appreciating that I ...",41662/AL
72947,41289964,"Cozy 1 BR on Bedford Avenue, Wburg",25804773951,unconfirmed,Christina,Brooklyn,Williamsburg,40.71764,-73.95689,United States,US,True,flexible,Private room,2010.0,$702,$140,1.0,1.0,1/3/2016,0.02,1.0,1.0,0.0,,41662/AL


In [11]:
# O boy! The same person seems to have created 2 IDs for the same listing. Which means, we need to 
# look for duplicate rows and delete them. Also, this last column 'license' is useless, let's drop it

df.drop(df.index[72947], inplace=True)

In [12]:
# drop the 'license' column

df.drop('license', axis=1, inplace=True)

In [13]:
# Next, let's impute the missing 'house_rules' column with the word 'blank'

df.loc[df.house_rules.isnull(), 'house_rules'] = 'blank'

In [14]:
# Let's check the column 'last review'

df['last review']

0         10/19/2021
1          5/21/2022
2                NaN
3           7/5/2019
4         11/19/2018
             ...    
102594           NaN
102595      7/6/2015
102596           NaN
102597    10/11/2015
102598           NaN
Name: last review, Length: 102598, dtype: object

In [15]:
# So 'last review' is a date column, we'll fix that. Also, we'll impute the null values with the
# very first date in this listing, denoting that the listing has almost never been reviewed

df['last review'] = pd.to_datetime(df['last review'])

In [16]:
# Let's check the min and max timestamps

df['last review'].min(), df['last review'].max()

(Timestamp('2012-07-11 00:00:00'), Timestamp('2058-06-16 00:00:00'))

In [17]:
# O boy! the max date of review is the year 2058. Let's find out how many such bogus dates are there, and fix them

df[df['last review'].apply(lambda x: x.year) > 2022]

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,country code,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules
127,1071478,Garden studio in the Upper East Sid,77172555024,unconfirmed,Miller,Manhattan,Upper East Side,40.778,-73.94822,United States,US,False,strict,Entire home/apt,2007.0,$571,$114,5.0,21.0,2024-08-15,0.19,4.0,,395.0,"Dear Guests, Welcome to 62 Cornwall St! I hope..."
191,1106825,LUX APT IN TIMES SQUARE NEW BUILDING,93725364475,unconfirmed,Aiden,Manhattan,Hell's Kitchen,40.76307,-73.99665,United States,US,False,moderate,Entire home/apt,,$539,$108,,41.0,2025-06-26,0.38,2.0,1.0,,"You will be sharing a bathroom, so you must be..."
255,1142173,Beautiful Landmarked Duplex,87944779917,,Baker,Brooklyn,Greenpoint,40.72945,-73.95511,United States,US,True,moderate,Entire home/apt,2008.0,$842,$168,3.0,124.0,2058-06-16,1.22,4.0,3.0,230.0,No rules per say. I just ask that you respect ...
318,1176967,,70084472212,verified,Barnes,Brooklyn,Greenpoint,40.72488,-73.95018,United States,US,True,flexible,Private room,2018.0,$920,$184,2.0,1.0,2026-03-28,0.01,3.0,1.0,73.0,Check-In is 3pm. Check-Out is 12 Noon. In or...
483,1268097,Modern Space in Charming Pre-war,13746585241,verified,Adelaide,Manhattan,Harlem,40.82411,-73.94934,United States,US,False,flexible,Private room,,$721,$144,2.0,41.0,2040-06-16,0.43,3.0,2.0,47.0,blank


In [18]:
# Okay, so let's change these wrong review dates to the median review date, giving benefit of doubt to the host

df.loc[df[df['last review'].apply(lambda x: x.year) > 2022].index, 'last review'] = df['last review'].median()

In [19]:
# Now let's impute the null values to the minimum date in the dataset

df.loc[df['last review'].isnull(), 'last review'] = df['last review'].median()

In [20]:
# Let's find the duplicate listings if any (like the one we deleted earlier)

df[df.duplicated()]

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,country code,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules
102058,35506831,Master Bedroom with private Bathroom & Balcony,55110690425,unconfirmed,UZeyir,Queens,Maspeth,40.74056,-73.90635,United States,US,True,strict,Private room,2016.0,$706,$141,1.0,1.0,2021-11-14,0.27,3.0,1.0,339.0,blank
102059,35507383,Cozy 2 br in sunny Fort Greene apt,80193772189,verified,Sally,Brooklyn,Fort Greene,40.68701,-73.97555,United States,US,False,flexible,Private room,2020.0,$651,$130,3.0,38.0,2021-11-13,0.27,3.0,1.0,0.0,blank
102060,35507935,Duplex w/ Terrace @ Box House Hotel,72991962259,verified,The Box House Hotel,Brooklyn,Greenpoint,40.73756,-73.95350,United States,US,False,strict,Hotel room,2016.0,$907,$181,3.0,10.0,2021-11-13,0.08,3.0,30.0,32.0,blank
102061,35508488,"Cozy, clean Greenpoint room with yard access",74975156081,verified,Dawn,Brooklyn,Greenpoint,40.72516,-73.95004,United States,US,False,strict,Private room,2013.0,$589,$118,30.0,38.0,2021-11-13,0.34,5.0,2.0,324.0,blank
102062,35509040,2BR XL Loft: Cleaning CDC guidelines implemented,85844415221,unconfirmed,Vida,Brooklyn,Greenpoint,40.72732,-73.94185,United States,US,False,flexible,Entire home/apt,2015.0,$356,$71,30.0,13.0,2021-11-13,0.14,4.0,28.0,336.0,blank
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102594,6092437,Spare room in Williamsburg,12312296767,verified,Krik,Brooklyn,Williamsburg,40.70862,-73.94651,United States,US,False,flexible,Private room,2003.0,$844,$169,1.0,0.0,2019-06-14,,3.0,1.0,227.0,No Smoking No Parties or Events of any kind Pl...
102595,6092990,Best Location near Columbia U,77864383453,unconfirmed,Mifan,Manhattan,Morningside Heights,40.80460,-73.96545,United States,US,True,moderate,Private room,2016.0,$837,$167,1.0,1.0,2015-07-06,0.02,2.0,2.0,395.0,House rules: Guests agree to the following ter...
102596,6093542,"Comfy, bright room in Brooklyn",69050334417,unconfirmed,Megan,Brooklyn,Park Slope,40.67505,-73.98045,United States,US,True,moderate,Private room,2009.0,$988,$198,3.0,0.0,2019-06-14,,5.0,1.0,342.0,blank
102597,6094094,Big Studio-One Stop from Midtown,11160591270,unconfirmed,Christopher,Queens,Long Island City,40.74989,-73.93777,United States,US,True,strict,Entire home/apt,2015.0,$546,$109,2.0,5.0,2015-10-11,0.10,3.0,1.0,386.0,blank


In [21]:
# Wow! 541 duplicate rows. Now let's get rid of them

df.drop_duplicates(keep='first', inplace=True)

In [22]:
# Well, there could be some more duplicates like the first case we saw. Let's use the latitude, longitude,
# host name, and price combination to find such cases

df.duplicated(subset=['host name', 'lat', 'long', 'price']).sum()

32752

In [23]:
# OMG! that's almost 35% of the dataset which appears duplicated. Let's be sure of this before taking any action
# Let's find these duplicate entries and manually confirm our hunch

temp = df.loc[df.duplicated(subset=['host name', 'lat', 'long', 'price'], keep=False)].copy()
temp = temp.groupby(['host name', 'lat', 'long', 'price'])
for key, subdf in temp:
    print(key)
    print(pd.DataFrame(subdf), '\n')
    break

('(Ari) HENRY LEE', 40.78938, -73.94679, '$1,179 ')
             id               NAME      host id host_identity_verified  \
10316   6698863  MANHATTAN LIVING!  47503557541               verified   
72149  40849229  MANHATTAN LIVING!  29667285224               verified   

             host name neighbourhood group neighbourhood       lat      long  \
10316  (Ari) HENRY LEE           Manhattan   East Harlem  40.78938 -73.94679   
72149  (Ari) HENRY LEE           Manhattan   East Harlem  40.78938 -73.94679   

             country country code instant_bookable cancellation_policy  \
10316  United States           US             True            moderate   
72149  United States           US             True            moderate   

             room type  Construction year    price service fee  \
10316  Entire home/apt             2020.0  $1,179        $236    
72149  Entire home/apt             2020.0  $1,179        $236    

       minimum nights  number of reviews last review  reviews 

In [24]:
# Wow! this clearly reveals that there are plenty of duplicate entries, and we would be better off in dropping them

del temp, subdf
df.drop_duplicates(subset=['host name', 'lat', 'long', 'price'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69305 entries, 0 to 102050
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              69305 non-null  int64         
 1   NAME                            69097 non-null  object        
 2   host id                         69305 non-null  int64         
 3   host_identity_verified          69088 non-null  object        
 4   host name                       68992 non-null  object        
 5   neighbourhood group             69276 non-null  object        
 6   neighbourhood                   69289 non-null  object        
 7   lat                             69297 non-null  float64       
 8   long                            69297 non-null  float64       
 9   country                         68982 non-null  object        
 10  country code                    69219 non-null  object        
 11  i

In [25]:
# Let's check the null counts once again

df.isnull().sum().sort_values(ascending=False)

reviews per month                 10814
availability 365                    438
minimum nights                      363
country                             323
host name                           313
price                               245
service fee                         241
review rate number                  239
host_identity_verified              217
NAME                                208
Construction year                   190
number of reviews                   152
calculated host listings count      144
country code                         86
instant_bookable                     86
cancellation_policy                  57
neighbourhood group                  29
neighbourhood                        16
long                                  8
lat                                   8
id                                    0
last review                           0
room type                             0
host id                               0
house_rules                           0


In [26]:
df.head(2)

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,country code,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,US,False,strict,Private room,2020.0,$966,$193,10.0,9.0,2021-10-19,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,US,False,moderate,Entire home/apt,2007.0,$142,$28,30.0,45.0,2022-05-21,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...


In [27]:
# Next, let's explore the 'reviews per month' column

fig = px.histogram(df, x='reviews per month', log_y=True, )
fig.show()

In [28]:
df['reviews per month'].min(), df['reviews per month'].max()

(0.01, 90.0)

In [29]:
fig = px.box(df, y='reviews per month', log_y=True, )
fig.show()

In [30]:
# Since this data is severely right skewed, we'll use the median to impute the column

df.loc[df['reviews per month'].isnull(), 'reviews per month'] = 0.79

In [31]:
# Let's check the variability in the columns of dataframe

for col in df.columns:
    print(f'Column {col} \t has {df[col].nunique()} unique values')

Column id 	 has 69305 unique values
Column NAME 	 has 61233 unique values
Column host id 	 has 69304 unique values
Column host_identity_verified 	 has 2 unique values
Column host name 	 has 13190 unique values
Column neighbourhood group 	 has 7 unique values
Column neighbourhood 	 has 224 unique values
Column lat 	 has 21991 unique values
Column long 	 has 17774 unique values
Column country 	 has 1 unique values
Column country code 	 has 1 unique values
Column instant_bookable 	 has 2 unique values
Column cancellation_policy 	 has 3 unique values
Column room type 	 has 4 unique values
Column Construction year 	 has 20 unique values
Column price 	 has 1151 unique values
Column service fee 	 has 231 unique values
Column minimum nights 	 has 148 unique values
Column number of reviews 	 has 476 unique values
Column last review 	 has 2472 unique values
Column reviews per month 	 has 1016 unique values
Column review rate number 	 has 5 unique values
Column calculated host listings count 	 ha

In [32]:
# Let's check the host_identity_verified value count

df.host_identity_verified.value_counts()

unconfirmed    34562
verified       34526
Name: host_identity_verified, dtype: int64

In [33]:
# Let's check the neighbourhood group values

df['neighbourhood group'].value_counts()

Manhattan        29573
Brooklyn         27881
Queens            9260
Bronx             1911
Staten Island      649
brookln              1
manhatan             1
Name: neighbourhood group, dtype: int64

In [34]:
df.neighbourhood.value_counts().sort_values(ascending=False)[:10]

Bedford-Stuyvesant    5331
Williamsburg          5079
Harlem                3659
Bushwick              3270
Hell's Kitchen        2817
Upper West Side       2601
Upper East Side       2460
Midtown               2317
East Village          2285
Crown Heights         2173
Name: neighbourhood, dtype: int64

In [35]:
# These are the cumulative actions we can take right now:
# 1. Impute NAME column with 'blank'
# 2. Impute host id with 0
# 3. Impute host_identity_verified with 'unconfirmed'
# 4. Impute host name with 'blank'
# 5. Fix the spellings of manhattan and brooklyn in column 'neighbourhood group' and impute missing using lat/long
# 6. Let's try to impute 'neighbourhood' using lat/long
# 7. Let's try to impute lat/long using neighbourhood group and neighbourhood
# 8. Let's drop 'country' and 'country code' because they have zero variability

# For now, let's handle this much and then we'll check again

In [36]:
# Impute NAME column with 'blank'
df.loc[df['NAME'].isnull(), 'NAME'] = 'blank'    # or use df['NAME'] =  df['NAME'].fillna('blank')

# Impute host id with 0
df.loc[df['host id'].isnull(), 'host id'] = 0    # or use df['host id'] = df['host id'].fillna(0)

# Impute host_identity_verified with 'unconfirmed'
df.loc[df['host_identity_verified'].isnull(), 'host_identity_verified'] = 'unconfirmed'

# Impute host name with 'blank'
df.loc[df['host name'].isnull(), 'host name'] = 'blank'

In [37]:
# Fix the spellings of manhattan and brooklyn in column 'neighbourhood group' and impute missing using lat/long
df.loc[df['neighbourhood group']=='manhatan', 'neighbourhood group'] = 'Manhattan'
df.loc[df['neighbourhood group']=='brookln', 'neighbourhood group'] = 'Brooklyn'

In [38]:
# Let's drop 'country' and 'country code' because they have zero variability
df.drop(['country', 'country code'], axis=1, inplace=True)

In [39]:
# Let's try to impute 'neighbourhood' using lat/long
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="MyApp")

# Check sample location
location = geolocator.geocode("Manhattan")
location

Location(Manhattan, New York County, City of New York, New York, United States, (40.7896239, -73.9598939, 0.0))

In [40]:
# Check the missing neighbourhood rows

df.loc[df.neighbourhood.isnull()]

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules
517,1286875,"""The Oasis"" on Bedford Williamsburg",30603782652,unconfirmed,Lucas,Brooklyn,,40.7158,-73.95803,,,Entire home/apt,2005.0,,$130,6.0,1.0,2016-01-01,0.02,5.0,1.0,266.0,1. No parties. 2. Respect the neighbors. Nois...
547,1303444,STYLISH EAST VILLAGE FLAT,6825076306,verified,Arnold,Manhattan,,40.73089,-73.98195,True,strict,Entire home/apt,2013.0,$749,$150,30.0,25.0,2018-04-30,0.26,4.0,1.0,67.0,blank
553,1306758,Comfortable. Spacious. Private Room.,45534966158,unconfirmed,Ned,Manhattan,,40.713,-73.99752,True,flexible,Private room,2019.0,$613,$123,,172.0,2019-07-02,1.84,5.0,2.0,63.0,Please be gentle with the furniture and applia...
575,1318909,"2 Bed, 2 Bath Apartment on Central Park West",86633710982,unconfirmed,Arthur,Manhattan,,40.79816,-73.9619,True,strict,Entire home/apt,2014.0,$892,$178,2.0,45.0,2019-04-22,0.47,4.0,3.0,4.0,blank
589,1326641,CBG Helps Haiti Rm #3,33952991254,verified,Jacob,Brooklyn,,40.68012,-73.97847,False,strict,Private room,2003.0,$793,$159,2.0,23.0,2018-09-15,0.24,1.0,6.0,,Please remember that this is a residential bui...
613,1339896,Wonderfully inviting East Village,84560895335,verified,Carter,Manhattan,,40.72709,-73.98274,True,moderate,Private room,2004.0,$486,$97,1.0,109.0,2015-10-02,1.15,2.0,1.0,116.0,#NAME?
624,1345971,Stylish Studio with exclusive Terrace,21547822804,unconfirmed,Cole,Manhattan,,40.75348,-73.97065,False,moderate,Entire home/apt,2019.0,$417,$83,30.0,136.0,2019-06-19,1.45,4.0,1.0,272.0,Please remember that this is a residential bui...
633,1350942,Lower East Side/Chinatown 1 Bedroom,58408401042,verified,Higgins,Manhattan,,40.71693,-73.98948,False,flexible,Entire home/apt,2017.0,"$1,148",$230,5.0,8.0,2018-05-18,0.11,3.0,1.0,42.0,blank
643,1356465,2BR Apt - 20min to Soho,21192497123,verified,Barrett,Brooklyn,,40.68016,-73.94878,False,moderate,Entire home/apt,2015.0,$938,$188,3.0,5.0,2017-07-09,0.05,2.0,1.0,408.0,Smoking is allowed on the patio only. No pets ...
670,1371377,Loft Suite @ The Box House Hotel,75427375884,verified,Gray,Brooklyn,,40.73641,-73.9533,False,strict,Entire home/apt,2006.0,$104,$21,3.0,24.0,2018-11-06,0.32,4.0,28.0,419.0,blank


In [41]:
# Let's define a function to accept coordinates and return suburb name
def loc_from_coord(lat, long):
    location = geolocator.reverse(str(lat)+","+str(long))
    return location.raw['address'].get('road', '')

# Let's test the function
temp = df.loc[df.neighbourhood.isnull()].copy()
print(loc_from_coord(temp.iloc[0].lat, temp.iloc[0].long))

Driggs Avenue


In [42]:
# So the sample worked, now we impute all the missing neighbourhood data

idx = df.loc[df.neighbourhood.isnull()].index
df.loc[idx, 'neighbourhood'] = df.loc[idx].apply(lambda x: \
                                                loc_from_coord(x.lat, x.long), axis=1)

In [43]:
del temp

In [44]:
# Let's check whether the imputation worked or not
df.loc[idx].head()

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules
517,1286875,"""The Oasis"" on Bedford Williamsburg",30603782652,unconfirmed,Lucas,Brooklyn,Driggs Avenue,40.7158,-73.95803,,,Entire home/apt,2005.0,,$130,6.0,1.0,2016-01-01,0.02,5.0,1.0,266.0,1. No parties. 2. Respect the neighbors. Nois...
547,1303444,STYLISH EAST VILLAGE FLAT,6825076306,verified,Arnold,Manhattan,East 14th Street,40.73089,-73.98195,True,strict,Entire home/apt,2013.0,$749,$150,30.0,25.0,2018-04-30,0.26,4.0,1.0,67.0,blank
553,1306758,Comfortable. Spacious. Private Room.,45534966158,unconfirmed,Ned,Manhattan,Henry Street,40.713,-73.99752,True,flexible,Private room,2019.0,$613,$123,,172.0,2019-07-02,1.84,5.0,2.0,63.0,Please be gentle with the furniture and applia...
575,1318909,"2 Bed, 2 Bath Apartment on Central Park West",86633710982,unconfirmed,Arthur,Manhattan,Manhattan Avenue,40.79816,-73.9619,True,strict,Entire home/apt,2014.0,$892,$178,2.0,45.0,2019-04-22,0.47,4.0,3.0,4.0,blank
589,1326641,CBG Helps Haiti Rm #3,33952991254,verified,Jacob,Brooklyn,5th Avenue,40.68012,-73.97847,False,strict,Private room,2003.0,$793,$159,2.0,23.0,2018-09-15,0.24,1.0,6.0,,Please remember that this is a residential bui...


In [45]:
# Okay, so that worked. Now we'll impute 'neighbourhood group'
# Let's check the rows
df.loc[df['neighbourhood group'].isnull()]

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules
74,1042206,"HARLEM, NEW YORK WELCOMES YOU!!",98195975718,unconfirmed,Violet,,Washington Heights,40.83139,-73.94095,True,moderate,Private room,2011.0,$571,$114,2.0,49.0,2019-06-18,1.6,2.0,2.0,,The usual courtesies apply: - No smoking - No ...
75,1042759,BLUE TRIM GUEST HOUSE,4726877402,unconfirmed,Audrey,,Clinton Hill,40.68346,-73.96374,True,strict,Private room,2014.0,$398,$80,2.0,105.0,2019-06-26,0.92,1.0,1.0,,Shoes off please Cat can go in or out as he de...
76,1043311,Charming East Village One Bedroom Flat,74322993447,verified,Violet,,East Village,40.72828,-73.98801,False,strict,Entire home/apt,2018.0,$618,$124,5.0,21.0,2019-01-02,0.2,4.0,1.0,,no smoking quiet
77,1043863,Manhattan Room,11468499446,verified,Sofia,,Upper East Side,40.76865,-73.95058,False,strict,Private room,2007.0,$116,$23,1.0,142.0,2019-07-06,1.5,4.0,1.0,,I'm a semi kosher vegetarian which means that ...
78,1044415,Little King of Queens,68599531533,unconfirmed,Melanie,,Woodside,40.75038,-73.90334,True,flexible,Private room,2012.0,$54,$11,30.0,25.0,2019-06-14,0.22,2.0,1.0,,No Street Shoes allowed in House. No cooking K...
90,1051043,Cozy Bedroom in Williamsburg 3 BR,14067827221,unconfirmed,Lilianna,,Williamsburg,40.71156,-73.96218,False,moderate,Private room,2015.0,$266,$53,3.0,174.0,2019-06-22,1.54,5.0,4.0,,Dryer and Washing Machine are in Basement (1.0...
91,1051595,Sunny room+Pvte office in huge loft,12884105458,unconfirmed,Albert,,Bushwick,40.70032,-73.9383,False,moderate,Private room,2012.0,$728,$146,4.0,24.0,2019-06-14,0.28,5.0,1.0,,"To enjoy, relax, feel safe and cozy. Also, kee..."
92,1052148,Spacious Prospect Heights Apartment,63218812094,unconfirmed,Sarah,,Prospect Heights,40.68233,-73.97261,False,flexible,Entire home/apt,2021.0,$583,$117,4.0,166.0,2019-06-27,3.4,2.0,1.0,,We ask that guests be respectful as there are ...
148,1083076,NYC Zen,83696952551,verified,Amelia,,East Village,40.72354,-73.98295,False,strict,Entire home/apt,2003.0,,$119,3.0,30.0,2019-06-17,0.28,5.0,1.0,344.0,No Smoking No Pets No Parties
161,1090256,Indie-Chic Share In Williamsburg,1595619477,unconfirmed,Darcy,,Williamsburg,40.71088,-73.95055,,,Private room,2022.0,"$1,020",$204,4.0,202.0,2019-05-28,1.86,5.0,2.0,377.0,"No smoking in the apartment, even with the win..."


In [46]:
# It's a long list. Let's make a function to convert the coordinates to neighbourhood group

def neigh_from_coord(lat,long):
    location = geolocator.reverse(str(lat)+","+str(long))
    return location.raw['address'].get('suburb', '')

In [47]:
# Let's check a sample
idx = df.loc[df['neighbourhood group'].isnull()].index
print(neigh_from_coord(df.loc[idx].iloc[0].lat, df.loc[idx].iloc[0].long))

Manhattan


In [48]:
# So the sample worked, now we impute all the missing neighbourhood group data

df.loc[idx, 'neighbourhood group'] = df.loc[idx].apply(lambda x: neigh_from_coord(x.lat, x.long), 
                                                       axis=1)

In [49]:
# Let's check whether the imputation worked or not
df.loc[idx]

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules
74,1042206,"HARLEM, NEW YORK WELCOMES YOU!!",98195975718,unconfirmed,Violet,Manhattan,Washington Heights,40.83139,-73.94095,True,moderate,Private room,2011.0,$571,$114,2.0,49.0,2019-06-18,1.6,2.0,2.0,,The usual courtesies apply: - No smoking - No ...
75,1042759,BLUE TRIM GUEST HOUSE,4726877402,unconfirmed,Audrey,Brooklyn,Clinton Hill,40.68346,-73.96374,True,strict,Private room,2014.0,$398,$80,2.0,105.0,2019-06-26,0.92,1.0,1.0,,Shoes off please Cat can go in or out as he de...
76,1043311,Charming East Village One Bedroom Flat,74322993447,verified,Violet,Manhattan,East Village,40.72828,-73.98801,False,strict,Entire home/apt,2018.0,$618,$124,5.0,21.0,2019-01-02,0.2,4.0,1.0,,no smoking quiet
77,1043863,Manhattan Room,11468499446,verified,Sofia,Manhattan,Upper East Side,40.76865,-73.95058,False,strict,Private room,2007.0,$116,$23,1.0,142.0,2019-07-06,1.5,4.0,1.0,,I'm a semi kosher vegetarian which means that ...
78,1044415,Little King of Queens,68599531533,unconfirmed,Melanie,Queens,Woodside,40.75038,-73.90334,True,flexible,Private room,2012.0,$54,$11,30.0,25.0,2019-06-14,0.22,2.0,1.0,,No Street Shoes allowed in House. No cooking K...
90,1051043,Cozy Bedroom in Williamsburg 3 BR,14067827221,unconfirmed,Lilianna,Brooklyn,Williamsburg,40.71156,-73.96218,False,moderate,Private room,2015.0,$266,$53,3.0,174.0,2019-06-22,1.54,5.0,4.0,,Dryer and Washing Machine are in Basement (1.0...
91,1051595,Sunny room+Pvte office in huge loft,12884105458,unconfirmed,Albert,Brooklyn,Bushwick,40.70032,-73.9383,False,moderate,Private room,2012.0,$728,$146,4.0,24.0,2019-06-14,0.28,5.0,1.0,,"To enjoy, relax, feel safe and cozy. Also, kee..."
92,1052148,Spacious Prospect Heights Apartment,63218812094,unconfirmed,Sarah,Brooklyn,Prospect Heights,40.68233,-73.97261,False,flexible,Entire home/apt,2021.0,$583,$117,4.0,166.0,2019-06-27,3.4,2.0,1.0,,We ask that guests be respectful as there are ...
148,1083076,NYC Zen,83696952551,verified,Amelia,Manhattan,East Village,40.72354,-73.98295,False,strict,Entire home/apt,2003.0,,$119,3.0,30.0,2019-06-17,0.28,5.0,1.0,344.0,No Smoking No Pets No Parties
161,1090256,Indie-Chic Share In Williamsburg,1595619477,unconfirmed,Darcy,Brooklyn,Williamsburg,40.71088,-73.95055,,,Private room,2022.0,"$1,020",$204,4.0,202.0,2019-05-28,1.86,5.0,2.0,377.0,"No smoking in the apartment, even with the win..."


In [50]:
# Okay, so that worked. Let's now try to impute lat/long using neighbourhood group and neighbourhood
df.loc[df.lat.isnull()]

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules
779,1431578,"Large, furnished room in a 2 bedroom!",20368956893,unconfirmed,Gibson,Brooklyn,Crown Heights,,,False,strict,Private room,,$539,$108,1.0,1.0,2017-03-18,0.04,2.0,1.0,41.0,- Weekly and monthly prices are much lower - P...
785,1434892,Authentic NY Charming Artist Loft,66486085219,unconfirmed,Bailey,Brooklyn,Greenpoint,,,False,strict,Entire home/apt,2021.0,"$1,058",$212,5.0,14.0,2019-06-19,0.16,5.0,1.0,226.0,We live and let live - hoping that you'd be re...
799,1442624,Huge room with private balcony,69386945815,verified,Hunt,Manhattan,East Village,,,False,flexible,Private room,2010.0,$506,$101,6.0,1.0,2013-05-06,0.01,1.0,1.0,240.0,Expect respect for the family and the space--t...
814,1450908,Decorators 5-Star Flat West Village,33280739304,verified,Watson,Manhattan,West Village,,,True,strict,Entire home/apt,2003.0,$381,$76,20.0,157.0,2016-08-11,1.71,4.0,1.0,61.0,"Please keep it clean, thats all we really ask ..."
843,1466925,Nice Private Room Beauty in Queens,15305733205,verified,Roberts,Queens,Elmhurst,,,True,strict,Private room,2005.0,$224,$45,1.0,63.0,2019-05-18,0.89,3.0,2.0,70.0,blank
885,1490122,Cute Room in Historic Loft!,42267829819,unconfirmed,Jones,Brooklyn,Greenpoint,,,True,flexible,Private room,2019.0,$524,$105,14.0,22.0,2019-05-02,0.25,1.0,1.0,266.0,"Pets are cool (just clean up after them!), smo..."
926,1512766,21 day Chelsea Apartment rental,10876728736,unconfirmed,Owens,Manhattan,Flatiron District,,,False,strict,Private room,2020.0,$623,$125,21.0,0.0,2019-06-14,0.79,2.0,1.0,104.0,blank
986,1545904,New York City for All Seasons!,26437872336,unconfirmed,Douglas,Manhattan,Upper West Side,,,True,flexible,Private room,2014.0,$413,$83,1.0,25.0,2013-06-22,0.28,2.0,1.0,259.0,No Smoking No Pets


In [51]:
# Let's collect the indices as earlier (lat and long are missing together)
idx = df.loc[df.lat.isnull()].index

# Now we define a function to accept the location and return latitude and longitude
def lat_from_loc(loc):
    location = geolocator.geocode(loc)
    return location.latitude

def long_from_loc(loc):
    location = geolocator.geocode(loc)
    return location.longitude

# Let's test a sample case
print(lat_from_loc(df.loc[idx].iloc[0].neighbourhood), long_from_loc(df.loc[idx].iloc[0].neighbourhood))

40.667471 -73.9435662


In [52]:
# Now that looks pretty good, let's impute all the missing coordinates (used both neighbourhood and 
# neighbourhood group because there can be multiple suburbs with same name, such as, Elmhurst is also in IL)

df.loc[idx, 'lat'] = df.loc[idx].apply(lambda x: lat_from_loc(x.neighbourhood+', '+x['neighbourhood group']), axis=1)
df.loc[idx, 'long'] = df.loc[idx].apply(lambda x: long_from_loc(x.neighbourhood+', '+x['neighbourhood group']), axis=1)

In [53]:
df.loc[idx]

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules
779,1431578,"Large, furnished room in a 2 bedroom!",20368956893,unconfirmed,Gibson,Brooklyn,Crown Heights,40.667471,-73.943566,False,strict,Private room,,$539,$108,1.0,1.0,2017-03-18,0.04,2.0,1.0,41.0,- Weekly and monthly prices are much lower - P...
785,1434892,Authentic NY Charming Artist Loft,66486085219,unconfirmed,Bailey,Brooklyn,Greenpoint,40.723713,-73.950971,False,strict,Entire home/apt,2021.0,"$1,058",$212,5.0,14.0,2019-06-19,0.16,5.0,1.0,226.0,We live and let live - hoping that you'd be re...
799,1442624,Huge room with private balcony,69386945815,verified,Hunt,Manhattan,East Village,40.729269,-73.987361,False,flexible,Private room,2010.0,$506,$101,6.0,1.0,2013-05-06,0.01,1.0,1.0,240.0,Expect respect for the family and the space--t...
814,1450908,Decorators 5-Star Flat West Village,33280739304,verified,Watson,Manhattan,West Village,40.734186,-74.00558,True,strict,Entire home/apt,2003.0,$381,$76,20.0,157.0,2016-08-11,1.71,4.0,1.0,61.0,"Please keep it clean, thats all we really ask ..."
843,1466925,Nice Private Room Beauty in Queens,15305733205,verified,Roberts,Queens,Elmhurst,40.73658,-73.878393,True,strict,Private room,2005.0,$224,$45,1.0,63.0,2019-05-18,0.89,3.0,2.0,70.0,blank
885,1490122,Cute Room in Historic Loft!,42267829819,unconfirmed,Jones,Brooklyn,Greenpoint,40.723713,-73.950971,True,flexible,Private room,2019.0,$524,$105,14.0,22.0,2019-05-02,0.25,1.0,1.0,266.0,"Pets are cool (just clean up after them!), smo..."
926,1512766,21 day Chelsea Apartment rental,10876728736,unconfirmed,Owens,Manhattan,Flatiron District,40.741072,-73.989653,False,strict,Private room,2020.0,$623,$125,21.0,0.0,2019-06-14,0.79,2.0,1.0,104.0,blank
986,1545904,New York City for All Seasons!,26437872336,unconfirmed,Douglas,Manhattan,Upper West Side,40.787045,-73.975416,True,flexible,Private room,2014.0,$413,$83,1.0,25.0,2013-06-22,0.28,2.0,1.0,259.0,No Smoking No Pets


In [54]:
# So now we'll check for null values again, data cleaning is always a long process, and I'm not the most efficient

df.isnull().sum().sort_values(ascending=False)

availability 365                  438
minimum nights                    363
price                             245
service fee                       241
review rate number                239
Construction year                 190
number of reviews                 152
calculated host listings count    144
instant_bookable                   86
cancellation_policy                57
id                                  0
reviews per month                   0
last review                         0
room type                           0
NAME                                0
long                                0
lat                                 0
neighbourhood                       0
neighbourhood group                 0
host name                           0
host_identity_verified              0
host id                             0
house_rules                         0
dtype: int64

In [55]:
df.head(1)

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,False,strict,Private room,2020.0,$966,$193,10.0,9.0,2021-10-19,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...


In [56]:
# Check 'availability 365'
fig = px.histogram(df, x='availability 365')
fig.show()

In [57]:
fig = px.box(df, y='availability 365')
fig.show()

In [58]:
# We impute availability 365 with median value 127
df['availability 365'] = df['availability 365'].fillna(127)

In [59]:
# Check minimum nights
fig = px.histogram(df, x='minimum nights')
fig.show()

In [60]:
# This is weird, there are negative numbers and also large positive numbers
# Let us clip the data between 0 and 4th quartile
df['minimum nights'].min(), df['minimum nights'].max()

(-1223.0, 5645.0)

In [61]:
# We'll take log normal
fig = px.histogram(df, x='minimum nights', log_y=True)
fig.show()

In [62]:
fig = px.box(df, y='minimum nights', log_y=True)
fig.show()

In [63]:
# Let's clip the data between 0 and 13, the upper fence (Q3 + 1.5 * IQR)
df['minimum nights'].clip(lower=0, upper=13, inplace=True)
fig = px.histogram(df, x='minimum nights', log_y=True)
fig.show()

In [64]:
fig = px.box(df, y='minimum nights', log_y=True)
fig.show()

In [65]:
# Let's impute the 'minimum nights' feature with the median 3

df['minimum nights'] = df['minimum nights'].fillna(3)

In [66]:
# Check the price feature
# First we'll convert price from object to numeric
import re

idx = df.loc[~df.price.isnull()].index
df.loc[idx, 'price'] = df.loc[idx].apply(lambda x: re.sub(r'\D', '', x.price), axis=1)
df.loc[idx, 'price'] = pd.to_numeric(df['price'])

In [67]:
type(df.price[0])

float

In [68]:
fig = px.histogram(df, x='price')
fig.show()

In [69]:
# The distribution of price indicates that the mean and median will be close
fig = px.box(df, y='price')
fig.show()

In [70]:
df.price.mean()

624.7360700839849

In [71]:
# Impute the price with mean
df.price.fillna(df.price.mean(), inplace=True)

In [72]:
df.isnull().sum().sort_values(ascending=False)

service fee                       241
review rate number                239
Construction year                 190
number of reviews                 152
calculated host listings count    144
instant_bookable                   86
cancellation_policy                57
id                                  0
price                               0
availability 365                    0
reviews per month                   0
last review                         0
minimum nights                      0
room type                           0
NAME                                0
long                                0
lat                                 0
neighbourhood                       0
neighbourhood group                 0
host name                           0
host_identity_verified              0
host id                             0
house_rules                         0
dtype: int64

In [73]:
# Check service fee
df['service fee'].dtype

dtype('O')

In [74]:
df['service fee']

0         $193 
1          $28 
2         $124 
3          $74 
4          $41 
          ...  
102046      NaN
102047      NaN
102048      NaN
102049      NaN
102050      NaN
Name: service fee, Length: 69305, dtype: object

In [75]:
# We'll give same treatment to service fee as price
idx = df.loc[~df['service fee'].isnull()].index
df.loc[idx, 'service fee'] = df.loc[idx].apply(lambda x: re.sub(r'\D', '', x['service fee']), axis=1)
df.loc[idx, 'service fee'] = pd.to_numeric(df['service fee'])

In [76]:
type(df['service fee'][0])

float

In [77]:
fig = px.histogram(df, x='service fee')
fig.show()

In [78]:
# The distribution of service fee indicates that the mean and median will be close
fig = px.box(df, y='service fee')
fig.show()

In [79]:
df['service fee'].mean()

124.89402583111317

In [80]:
# Let's impute the service fee with mean
df['service fee'].fillna(df['service fee'].mean(), inplace=True)

In [81]:
df.dtypes

id                                         int64
NAME                                      object
host id                                    int64
host_identity_verified                    object
host name                                 object
neighbourhood group                       object
neighbourhood                             object
lat                                      float64
long                                     float64
instant_bookable                          object
cancellation_policy                       object
room type                                 object
Construction year                        float64
price                                    float64
service fee                              float64
minimum nights                           float64
number of reviews                        float64
last review                       datetime64[ns]
reviews per month                        float64
review rate number                       float64
calculated host list

In [82]:
# Let us take the following cumulative actions:
# 1. Check the distribution and impute review rate number
# 2. Check the distribution and impute Construction year
# 3. Check the distribution and impute number of reviews
# 4. Check the distribution and impute calculated host listings count
# 5. Check the unique values and impute instant_bookable
# 6. Check the unique values and impute cancellation_policy

In [83]:
# review rate number
fig = px.box(df, y='review rate number')
fig.show()

In [84]:
# It's a modest distribution, we'll impute with median
df['review rate number'].fillna(df['review rate number'].median(), inplace=True)

In [85]:
# Construction year
fig = px.box(df, y='Construction year')
fig.show()

In [86]:
# Impute with median
df['Construction year'].fillna(df['Construction year'].median(), inplace=True)

In [87]:
# number of reviews
fig = px.histogram(df, x='number of reviews')
fig.show()

In [88]:
# It has a heavy right skew, let's check log transform
fig = px.histogram(df, x='number of reviews', log_y=True)
fig.show()

In [89]:
fig = px.box(df, y='number of reviews', log_y=True)
fig.show()

In [90]:
# Impute with median
df['number of reviews'].fillna(df['number of reviews'].median(), inplace=True)

In [91]:
# calculated host listings count
fig = px.histogram(df, x='calculated host listings count')
fig.show()

In [92]:
# It has a heavy right skew, let's check log transform
fig = px.histogram(df, x='calculated host listings count', log_y=True)
fig.show()

In [93]:
fig = px.box(df, y='calculated host listings count', log_y=True)
fig.show()

In [94]:
# Impute with median
df['calculated host listings count'].fillna(df['calculated host listings count'].median(), inplace=True)

In [95]:
# Check the unique values and impute instant_bookable
df.instant_bookable.value_counts()

False    34867
True     34352
Name: instant_bookable, dtype: int64

In [96]:
# Giving the host benefit of doubt, impute the column with True
df.instant_bookable.fillna(True, inplace=True)

In [97]:
# Check the unique values and impute cancellation_policy
df.cancellation_policy.value_counts(0)

moderate    23187
flexible    23126
strict      22935
Name: cancellation_policy, dtype: int64

In [98]:
# Again,giving host benefit of doubt, impute with 'moderate'
df.cancellation_policy.fillna('moderate', inplace=True)

In [99]:
# Final check for null values
df.isnull().sum()

id                                0
NAME                              0
host id                           0
host_identity_verified            0
host name                         0
neighbourhood group               0
neighbourhood                     0
lat                               0
long                              0
instant_bookable                  0
cancellation_policy               0
room type                         0
Construction year                 0
price                             0
service fee                       0
minimum nights                    0
number of reviews                 0
last review                       0
reviews per month                 0
review rate number                0
calculated host listings count    0
availability 365                  0
house_rules                       0
dtype: int64

In [100]:
gc.collect()

6406

In [101]:
# Found anothor opportunity to clean the data during Part 2 of the exercise:
# 1. The neighbourhood group feature has a single entry 'The Bronx', which needs to be converted
# 2. All column headers to be converted to lowercase with underscore separators (should have done this first in this exercise too)

In [102]:
df['neighbourhood group'].value_counts()

Manhattan        29587
Brooklyn         27895
Queens            9262
Bronx             1911
Staten Island      649
The Bronx            1
Name: neighbourhood group, dtype: int64

In [103]:
# Convert the single occurance of 'The Bronx'
df.loc[df['neighbourhood group']=='The Bronx', 'neighbourhood group']='Bronx'

In [104]:
# Convert the column headers for later ease of usage
df.columns = df.columns.str.lower().str.replace(' ','_')
df.head(1)

Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,instant_bookable,cancellation_policy,room_type,construction_year,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,house_rules
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,False,strict,Private room,2020.0,966.0,193.0,10.0,9.0,2021-10-19,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...


In [105]:
df.to_csv('airbnb_nyc_clean.csv', index=False)

In [106]:
os.listdir()

['__notebook__.ipynb', 'airbnb_nyc_clean.csv']

In [107]:
df.head(1)

Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,instant_bookable,cancellation_policy,room_type,construction_year,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,house_rules
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,False,strict,Private room,2020.0,966.0,193.0,10.0,9.0,2021-10-19,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...


In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69305 entries, 0 to 102050
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              69305 non-null  int64         
 1   name                            69305 non-null  object        
 2   host_id                         69305 non-null  int64         
 3   host_identity_verified          69305 non-null  object        
 4   host_name                       69305 non-null  object        
 5   neighbourhood_group             69305 non-null  object        
 6   neighbourhood                   69305 non-null  object        
 7   lat                             69305 non-null  float64       
 8   long                            69305 non-null  float64       
 9   instant_bookable                69305 non-null  bool          
 10  cancellation_policy             69305 non-null  object        
 11  r

# This dataset has been cleaned, and the final version has been saved for later use during part 2, Data Visualization