In [1]:
import os
from bs4 import BeautifulSoup
from urllib.request import urlopen, urlretrieve
import pandas as pd
import numpy as np

# Show full cell contents (e.g. long download URLs) without truncation.
# None is the supported "no limit" value; the legacy -1 sentinel is rejected
# by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [2]:
# URL where data is obtained, and its output folder
URL = 'http://insideairbnb.com/get-the-data.html'
OUTPUT_DIR = '../Asset'

# Fetch the index page; the context manager closes the connection even if
# reading or decoding raises.
with urlopen(URL) as u:
    html = u.read().decode('utf-8')

soup = BeautifulSoup(html, "html.parser")

date = []
href = []

# Walk the matching table(s) row by row and record the date stamp (first <td>
# of the row) together with every download link found in that same row.
# Collecting both values per row keeps `date` and `href` the same length and
# correctly paired, instead of zipping two independently built lists.
for table in soup.find_all("table", {"class": "table table-hover table-striped new-york-city"}):
    for tr in table.find_all('tr'):
        tdTag = tr.find('td')
        if tdTag is None:
            # No <td> in this row (presumably a header row): no date to record
            continue
        for a in tr.find_all('a', href=True):
            date.append(tdTag.text)  # Save date
            href.append(a['href'])   # Save file download link

# Save into dataframe
href_df = pd.DataFrame({'Date': date, 'Href': href})

In [3]:
# To replace missing date values with preceding date values:
# turn the literal 'N/A' placeholder into NaN, then forward-fill so each such
# row takes the date of the nearest row above it.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
href_df['Date'] = href_df['Date'].replace({'N/A': np.nan}).ffill()

In [4]:
# Convert date column to datetime
href_df['Date'] = pd.to_datetime(href_df['Date'])

# NOTE: 'Date' intentionally stays a regular column. The former
# `href_df.set_index(['Date']);` call discarded its result, so it never
# changed the frame, and later cells read href_df['Date'] as a column.

In [5]:
# Create YearMonth column for file nomenclature later.
# Each date is encoded as the number YYYYMM (year * 100 + month).
href_df['YearMonth'] = [stamp.year * 100 + stamp.month for stamp in href_df['Date']]

In [6]:
# Create file column holding the file name of each download link,
# i.e. the part of the URL after its last '/'
file = [link.rsplit('/', 1)[-1] for link in href_df.Href]

href_df['file'] = file

In [7]:
# Remove unnecessary file download links from dataframe:
# the plain 'listings.csv', 'reviews.csv' and 'neighbourhoods.csv' entries.
unwanted_files = ['listings.csv', 'reviews.csv', 'neighbourhoods.csv']
href_df = href_df[~href_df['file'].isin(unwanted_files)]

# Renumber rows from 0; the previous row labels are kept in an 'index' column
href_df = href_df.reset_index()

In [8]:
# Filter for 1 year (to be downloaded): keep the first 48 rows,
# i.e. 12 months x 4 files per month
href_1yr = href_df.iloc[:48]

In [9]:
# Preview the first 8 rows of the download list
href_1yr.head(8)

Unnamed: 0,index,Date,Href,YearMonth,file
0,0,2019-09-12,http://data.insideairbnb.com/united-states/ny/new-york-city/2019-09-12/data/listings.csv.gz,201909,listings.csv.gz
1,1,2019-09-12,http://data.insideairbnb.com/united-states/ny/new-york-city/2019-09-12/data/calendar.csv.gz,201909,calendar.csv.gz
2,2,2019-09-12,http://data.insideairbnb.com/united-states/ny/new-york-city/2019-09-12/data/reviews.csv.gz,201909,reviews.csv.gz
3,6,2019-09-12,http://data.insideairbnb.com/united-states/ny/new-york-city/2019-09-12/visualisations/neighbourhoods.geojson,201909,neighbourhoods.geojson
4,7,2019-08-06,http://data.insideairbnb.com/united-states/ny/new-york-city/2019-08-06/data/listings.csv.gz,201908,listings.csv.gz
5,8,2019-08-06,http://data.insideairbnb.com/united-states/ny/new-york-city/2019-08-06/data/calendar.csv.gz,201908,calendar.csv.gz
6,9,2019-08-06,http://data.insideairbnb.com/united-states/ny/new-york-city/2019-08-06/data/reviews.csv.gz,201908,reviews.csv.gz
7,13,2019-08-06,http://data.insideairbnb.com/united-states/ny/new-york-city/2019-08-06/visualisations/neighbourhoods.geojson,201908,neighbourhoods.geojson


Data is collected in the middle of each month. For each month, over a period of 1 year, 4 files will be downloaded:
<br>1) Full listing details of current listings - listings.csv.gz
<br>2) 1 year forward calendar availability     - calendar.csv.gz
<br>3) Reviews to date for each listing         - reviews.csv.gz
<br>4) Geojson file for the listings            - neighbourhoods.geojson

In [10]:
# Downloading of latest 12 months files
# Make sure the target folder exists first; urlretrieve does not create it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Walk the three columns in lockstep by position, so the loop does not depend
# on href_1yr's index labels being exactly 0..n-1.
for link, year_month, file_name in zip(href_1yr['Href'], href_1yr['YearMonth'], href_1yr['file']):
    # Local name: <YearMonth>_<original file name>, e.g. 201909_listings.csv.gz
    filename = os.path.join(OUTPUT_DIR, str(year_month) + '_' + str(file_name))

    print("Downloading %s to %s..." % (link, filename) )
    urlretrieve(link, filename)
    print("Done.")

In [12]:
# Creating a merged listing dataframe across the 12 months, and removing duplicates.
# For every month only the rows whose 'id' has not appeared in an earlier file
# are kept (first occurrence wins), and the pieces are concatenated once at
# the end instead of re-concatenating and re-deduplicating the growing frame
# on every iteration.
seen_listing_ids = set()
listing_frames = []

for year_month in href_1yr['YearMonth'].unique().tolist():
    # Files were saved by the download step as <OUTPUT_DIR>/<YearMonth>_<file name>
    listings_path = os.path.join(OUTPUT_DIR, '{}_listings.csv.gz'.format(year_month))
    listed = pd.read_csv(listings_path, compression='gzip', header=0, sep=',', quotechar='"')

    listed = listed.drop_duplicates(subset='id')            # duplicates within this month
    listed = listed[~listed['id'].isin(seen_listing_ids)]   # ids already taken from earlier files
    seen_listing_ids.update(listed['id'])
    listing_frames.append(listed)

# pd.concat rejects an empty list, so fall back to an empty frame in that case
if listing_frames:
    listings = pd.concat(listing_frames, ignore_index=True, sort=False)
else:
    listings = pd.DataFrame()

In [13]:
# Creating a merged reviews dataframe across the 12 months, and removing duplicates.
# For every month only the rows whose 'id' has not appeared in an earlier file
# are kept (first occurrence wins), and the pieces are concatenated once at
# the end instead of re-concatenating and re-deduplicating the growing frame
# on every iteration.
seen_review_ids = set()
review_frames = []

for year_month in href_1yr['YearMonth'].unique().tolist():
    # Files were saved by the download step as <OUTPUT_DIR>/<YearMonth>_<file name>
    reviews_path = os.path.join(OUTPUT_DIR, '{}_reviews.csv.gz'.format(year_month))
    rev = pd.read_csv(reviews_path, compression='gzip', header=0, sep=',', quotechar='"')

    rev = rev.drop_duplicates(subset='id')          # duplicates within this month
    rev = rev[~rev['id'].isin(seen_review_ids)]     # ids already taken from earlier files
    seen_review_ids.update(rev['id'])
    review_frames.append(rev)

# pd.concat rejects an empty list, so fall back to an empty frame in that case
if review_frames:
    reviews = pd.concat(review_frames, ignore_index=True, sort=False)
else:
    reviews = pd.DataFrame()

In [14]:
# Export out merged files for modelling in separate notebook.
# Paths are built from OUTPUT_DIR so the exports land in the same folder the
# monthly files are downloaded to; the row index is not written.
listings.to_csv(os.path.join(OUTPUT_DIR, 'listing_merged.csv'), index=False)
reviews.to_csv(os.path.join(OUTPUT_DIR, 'reviews_merged.csv'), index=False)