# Part I. Procuring Datasets
***
Import the dependencies and set the basic URL variables. Each dataset URL will look like:
<br/> 
*`http://data.insideairbnb.com/united-states/il/chicago/2019-07-15/visualisations/listings.csv`*

In [1]:
import os
import wget
import pandas as pd
from urllib.request import urlopen, URLError
from pprint import pprint

# Root of every Chicago snapshot URL; a snapshot date is appended to it.
base = 'http://data.insideairbnb.com/united-states/il/chicago/'

# Path segment placed between the snapshot date and the file name.
v = '/visualisations'

# File-name suffixes for the individual datasets.
l, r, geo = '/listings.csv', '/reviews.csv', '/neighbourhoods.geojson'

# Sanity check: assemble one full snapshot URL (shown as the cell output).
''.join([base, '2019-', '07-', '15', v, l])

'http://data.insideairbnb.com/united-states/il/chicago/2019-07-15/visualisations/listings.csv'

Create year, month, and day lists to be used as lookups.
<br/> 
Convert the integers into zero-padded strings so they can be concatenated into `YYYY-MM-DD` dates.

In [2]:
# Years 2018-2019, each with a trailing dash so the parts concatenate directly.
yr = [f'{year}-' for year in range(2018, 2020)]

# Months 01-12, zero-padded, each with a trailing dash.
mo = [f'{month:02d}-' for month in range(1, 13)]

# Days 01-31, zero-padded; no trailing dash because the day ends the date.
dy = [f'{day:02d}' for day in range(1, 32)]

print(f'Year:\n{yr}\n\nMonth:\n{mo}\n\nDay:\n{dy}')

Year:
['2018-', '2019-']

Month:
['01-', '02-', '03-', '04-', '05-', '06-', '07-', '08-', '09-', '10-', '11-', '12-']

Day:
['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31']


In [3]:
# Smoke test: the first entry of each lookup list combined into one date string.
temp = ''.join((yr[0], mo[0], dy[0]))
temp

'2018-01-01'

In [4]:
# Every year/month/day combination, day-major: the day varies slowest and the
# year fastest. Day/month pairs that do not exist on the calendar (e.g. 02-31)
# are included as well.
ymd_full = [year + month + day for day in dy for month in mo for year in yr]

ymd_full[:10]

['2018-01-01',
 '2019-01-01',
 '2018-02-01',
 '2019-02-01',
 '2018-03-01',
 '2019-03-01',
 '2018-04-01',
 '2019-04-01',
 '2018-05-01',
 '2019-05-01']

Creating a function to generate the full, unfiltered list of candidate links
***

In [6]:
def download_list_csv(kind=None):
    """Build one candidate download URL for every date in ``ymd_full``.

    Args:
        kind: Which dataset to build links for. A value starting with
            ``'li'`` selects the listings file and a value starting with
            ``'re'`` selects the reviews file. When omitted, the user is
            prompted for it.

    Returns:
        A list of URL strings, one per entry of ``ymd_full`` and in the same
        order. If ``kind`` matches neither dataset, the message string
        ``"go home, you're drunk"`` is returned instead; that string return
        is kept so existing callers keep seeing the same value.
    """
    if kind is None:
        kind = input('type of review? ')

    # Resolve the file suffix once: it does not depend on the date, and
    # checking it up front also rejects bad input when ymd_full is empty.
    if kind.startswith('li'):
        suffix = l
    elif kind.startswith('re'):
        suffix = r
    else:
        return ("go home, you're drunk")

    return [base + date + v + suffix for date in ymd_full]

# Each call below prompts for a dataset type. Preview the first ten links of
# the first two results.
listings_list = download_list_csv()
pprint(listings_list[:10])
print('\n')
reviews_list = download_list_csv()
pprint(reviews_list[:10])
print('\n')
# The third result is printed in full, whatever download_list_csv returned.
temp = download_list_csv()
pprint(temp)

type of review? listing
['http://data.insideairbnb.com/united-states/il/chicago/2018-01-01/visualisations/listings.csv',
 'http://data.insideairbnb.com/united-states/il/chicago/2019-01-01/visualisations/listings.csv',
 'http://data.insideairbnb.com/united-states/il/chicago/2018-02-01/visualisations/listings.csv',
 'http://data.insideairbnb.com/united-states/il/chicago/2019-02-01/visualisations/listings.csv',
 'http://data.insideairbnb.com/united-states/il/chicago/2018-03-01/visualisations/listings.csv',
 'http://data.insideairbnb.com/united-states/il/chicago/2019-03-01/visualisations/listings.csv',
 'http://data.insideairbnb.com/united-states/il/chicago/2018-04-01/visualisations/listings.csv',
 'http://data.insideairbnb.com/united-states/il/chicago/2019-04-01/visualisations/listings.csv',
 'http://data.insideairbnb.com/united-states/il/chicago/2018-05-01/visualisations/listings.csv',
 'http://data.insideairbnb.com/united-states/il/chicago/2019-05-01/visualisations/listings.csv']


type

Using the `urllib` library to validate the URLs in our newly generated lists
***

In [7]:
def validate_url(url, timeout=None):
    """Return True if ``url`` can be opened, False otherwise.

    Args:
        url: The URL string to probe.
        timeout: Optional timeout in seconds passed to ``urlopen``. When
            omitted, ``urlopen`` is called without a timeout argument, as
            before.

    Returns:
        True when ``urlopen`` succeeds. False when it raises ``URLError``
        (which includes HTTP error responses) or ``ValueError`` (raised for
        strings that are not well-formed URLs).
    """
    options = {} if timeout is None else {'timeout': timeout}
    try:
        # The response is only needed as proof the URL opens; close it right
        # away so probing many links does not leave connections open.
        with urlopen(url, **options):
            return True
    except (URLError, ValueError):
        return False

# Probe one link with an impossible date segment and one known snapshot link.
bogus_url = "http://data.insideairbnb.com/united-states/il/chicago/8888888888888888/visualisations/reviews.csv"
known_url = "http://data.insideairbnb.com/united-states/il/chicago/2019-07-15/visualisations/reviews.csv"

print(validate_url(bogus_url))
validate_url(known_url)

False


True

Creating a loop utilizing the newly created `validate_url` function
***

In [16]:
# temp = download_list_csv()

def download_query():
    """Build the candidate links for one dataset and keep those that open.

    The links come from ``download_list_csv`` (which prompts for the dataset
    type). Each one is probed with ``validate_url``, and every link that
    passes is printed with a running number.

    Returns:
        The list of links for which ``validate_url`` returned a truthy
        value, in the order they were generated. An empty list is returned
        when the dataset type was not recognised.
    """
    dl_list = download_list_csv()

    # download_list_csv reports an unrecognised dataset type by returning a
    # message string instead of a list. Looping over that string would probe
    # every single character as if it were a URL, so show it and stop.
    if isinstance(dl_list, str):
        print(dl_list)
        return []

    valid = []

    print(f'{"%"*6} Ready. Set. Go. {"%"*6}\n{"-"*29}')

    for link in dl_list:
        if not validate_url(link):
            continue
        valid.append(link)
        # The running number is simply how many valid links exist so far.
        print(f'{len(valid)}. {link}\n{"-"*29}')

    print(f'{"%"*5} Ready. Set. Stop. {"%"*5}')
    return valid
          
# Prompts for the dataset type, then probes every candidate link; temp ends
# up holding only the links that validated.
temp = download_query()

type of review? listing
%%%%%% Ready. Set. Go. %%%%%%
-----------------------------
1. http://data.insideairbnb.com/united-states/il/chicago/2019-02-09/visualisations/listings.csv
-----------------------------
2. http://data.insideairbnb.com/united-states/il/chicago/2018-10-11/visualisations/listings.csv
-----------------------------
3. http://data.insideairbnb.com/united-states/il/chicago/2018-02-12/visualisations/listings.csv
-----------------------------
4. http://data.insideairbnb.com/united-states/il/chicago/2019-03-12/visualisations/listings.csv
-----------------------------
5. http://data.insideairbnb.com/united-states/il/chicago/2018-12-13/visualisations/listings.csv
-----------------------------
6. http://data.insideairbnb.com/united-states/il/chicago/2019-06-14/visualisations/listings.csv
-----------------------------
7. http://data.insideairbnb.com/united-states/il/chicago/2018-09-14/visualisations/listings.csv
-----------------------------
8. http://data.insideairbnb.com/un

In [None]:
def bar_custom(current, total, width=80):
    """Print a one-line download progress report.

    Args:
        current: Number of bytes received so far.
        total: Expected total number of bytes. Zero or a negative value is
            treated as "size unknown".
        width: Unused; kept so callers that pass it keep working.
            NOTE(review): presumably present to match the progress-callback
            signature expected by wget -- confirm.
    """
    if total <= 0:
        # No usable total: skip the percentage instead of dividing by zero.
        print("Downloading: [%d / unknown] bytes" % current)
        return
    print("Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total))

In [27]:
temp = ['http://data.insideairbnb.com/united-states/il/chicago/2019-02-09/visualisations/listings.csv']

def download_csv():
    """Download every link in the module-level ``temp`` list into ``data/``.

    The files are stored as ``data/listings1.csv``, ``data/listings2.csv``,
    ... following the order of ``temp``.
    """
    # Make sure the output directory exists so the download lands inside it.
    os.makedirs('data', exist_ok=True)

    for counter, link in enumerate(temp, start=1):
        target = 'data/listings' + str(counter) + '.csv'
        # Rename the path wget reports it saved to rather than assuming it
        # is always 'data/listings.csv' (wget may choose another name when
        # that one is already taken -- TODO confirm).
        saved = wget.download(link, out="data")
        os.rename(saved, target)

download_csv()

100% [..........................................................................] 1057022 / 1057022

Combining everything into one function
***

In [27]:
def download_csv():
    """Validate the candidate links, then download each valid CSV to ``data/``.

    The links come from ``download_query`` (which prompts for the dataset
    type). Each file is stored as ``data/listings<n>.csv`` or
    ``data/reviews<n>.csv``, where ``n`` counts the downloads from 1.

    Returns:
        None.
    """
    csv_list = download_query()

    # Make sure the output directory exists so the downloads land inside it.
    os.makedirs('data', exist_ok=True)

    print(f'\n{"="*6} Download start. {"="*6}\n{"~"*29}')
    for counter, link in enumerate(csv_list, start=1):
        # Pick the prefix from the link being downloaded, not from the first
        # link in the list.
        prefix = 'listings' if link.endswith('listings.csv') else 'reviews'
        csv_name = 'data/' + prefix + str(counter) + '.csv'
        # Rename the path wget reports it saved to rather than assuming a
        # fixed file name (wget may choose another name when the default one
        # is already taken -- TODO confirm).
        saved = wget.download(link, out="data")
        os.rename(saved, csv_name)

    print(f'\n{"="*5} Download complete {"="*5}\n{"~"*29}')
    
# download_csv has no return statement, so temp is None afterwards; the call
# is made for its downloads.
temp = download_csv()

type of review? listing
%%%%%% Ready. Set. Go. %%%%%%
-----------------------------
1. http://data.insideairbnb.com/united-states/il/chicago/2019-02-09/visualisations/listings.csv
-----------------------------
2. http://data.insideairbnb.com/united-states/il/chicago/2018-10-11/visualisations/listings.csv
-----------------------------
3. http://data.insideairbnb.com/united-states/il/chicago/2018-02-12/visualisations/listings.csv
-----------------------------
4. http://data.insideairbnb.com/united-states/il/chicago/2019-03-12/visualisations/listings.csv
-----------------------------
5. http://data.insideairbnb.com/united-states/il/chicago/2018-12-13/visualisations/listings.csv
-----------------------------
6. http://data.insideairbnb.com/united-states/il/chicago/2019-06-14/visualisations/listings.csv
-----------------------------
7. http://data.insideairbnb.com/united-states/il/chicago/2018-09-14/visualisations/listings.csv
-----------------------------
8. http://data.insideairbnb.com/un

# Part II. Merging Datasets
***

In [None]:
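# TODO: merge the downloaded CSVs into one DataFrame with pandas.concat, e.g.
# (`files` would be the list of downloaded CSV paths; it is not defined yet):
# df = pd.concat([pd.read_csv(f, index_col=[0,1]) for f in files])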