In [1]:
## Import

%matplotlib inline
import datetime
import gzip
import os
import shutil
import urllib.error
import urllib.request

import numpy as np
import pandas as pd
import sklearn.preprocessing


## Read in the data

In [2]:
# Columns to load from each Inside Airbnb listings file; passed to
# pd.read_csv(usecols=...). Every column name is listed exactly once.
listing_col = ['id',
               'listing_url',
               'name',
               'price',
               'summary',
               'host_id',
               'host_name',
               'host_about',
               'host_since',
               'host_listings_count',
               'host_total_listings_count',
               'host_response_rate',
               'description',
               'neighbourhood_cleansed',
               'property_type',
               'room_type',
               'number_of_reviews',
               'instant_bookable',
               'review_scores_rating',
               'beds',
               'bedrooms',
               'bathrooms',
               'accommodates',
               'minimum_nights',
               'maximum_nights',
               'amenities',  # facilities offered by the listing (Dutch: "voorzieningen")
               'cancellation_policy',
               'reviews_per_month',
               'first_review',
               'last_review',
               'latitude',
               'longitude',
               'availability_30',
               'availability_60',
               'availability_90',
               'availability_365'
              ]

# Download Inside Airbnb "listings" snapshots for Amsterdam and stack them into a
# single DataFrame; every row is tagged with its snapshot date in 'publicatie'.

# First snapshot, whose publication date is known up front.
url = 'http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2018-01-10/data/listings.csv.gz'
filename = 'listings_2018-01-10.csv.gz'
urllib.request.urlretrieve(url, filename)
# The context manager closes the gzip handle as soon as the frame is parsed.
with gzip.open(filename) as unfilled:
    df_listings = pd.read_csv(unfilled, usecols=listing_col)
df_listings['publicatie'] = '2018-01-10'

# Candidate publication dates to probe; both bounds can be changed.
start = datetime.datetime.strptime("11-01-2018", "%d-%m-%Y")
end = datetime.datetime.strptime("31-12-2018", "%d-%m-%Y")
# "+ 1" makes the range inclusive of `end`; without it the last day was never probed.
date_generated = [start + datetime.timedelta(days=x) for x in range((end - start).days + 1)]
our_dates = [date.strftime("%Y-%m-%d") for date in date_generated]

item = "listings"  # per the original note, reviews and calendars can be fetched the same way

# Collect the frames in a list and concatenate once at the end. The previous
# `df_listings.append(temp)` discarded its return value, so no snapshot was ever added
# (and DataFrame.append no longer exists in pandas 2.x).
snapshots = [df_listings]

for date in our_dates:
    url = "http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/" + date + "/data/" + item + ".csv.gz"
    filename = item + "_" + date + ".csv.gz"
    try:
        urllib.request.urlretrieve(url, filename)
    except urllib.error.URLError as e:
        # The download for this date failed: report the reason and try the next date.
        print(e.reason)
        continue

    print("found : " + date)
    with gzip.open(filename) as unfilled:
        temp = pd.read_csv(unfilled, usecols=listing_col)
    temp['publicatie'] = date
    snapshots.append(temp)

df_listings = pd.concat(snapshots, ignore_index=True)
        

        


KeyboardInterrupt: 

In [3]:
# Load the 2019-01-13 listings snapshot. This rebinds df_listings, so the cells below
# work on this single snapshot; no 'publicatie' column is added to it.
url = 'http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2019-01-13/data/listings.csv.gz'
filename = 'listings_2019-01-13.csv.gz'
urllib.request.urlretrieve(url, filename)
# Close the gzip handle deterministically instead of relying on `del` alone.
with gzip.open(filename) as unfilled:
    df_listings = pd.read_csv(unfilled, usecols=listing_col)

del unfilled

## Making the dataframe

In [4]:
# Convert 'price' from text to a number by stripping the thousands separator and the
# dollar sign. The dtype guard keeps this cell re-runnable: once the column is numeric,
# the `.str` accessor would raise. `regex=False` makes '$' a literal character on every
# pandas version (as a regex it is the end-of-string anchor).
if not pd.api.types.is_numeric_dtype(df_listings['price']):
    df_listings['price'] = pd.to_numeric(
        df_listings['price']
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
    )

# Per-neighbourhood aggregates: mean nightly price, number of distinct listings and
# number of distinct host accounts.
per_area = df_listings.groupby('neighbourhood_cleansed')
price = per_area['price'].mean()
listings = per_area['id'].nunique()
accounts = per_area['host_id'].nunique()

# Combine the three series, give the columns their final names and fix the column order.
area_overview = (
    pd.concat((price, listings, accounts), axis=1, join='inner')
    .reset_index()
    .rename(columns={'neighbourhood_cleansed': 'name',
                     'price': 'avg_price_night',
                     'id': 'listings',
                     'host_id': 'accounts'})
    .reindex(columns=['name', 'accounts', 'listings', 'avg_price_night'])
)

# The availability, multiple-listing, duplicates and risk_rank columns are filled in by
# later cells from real data, replacing the temporary dummy values once sketched here.
# Column headings noted by the original author, apparently copied from the target table
# (TODO confirm): # accounts, # listings, price/night, % high availability,
# % multiple listings, # duplicates, risk rank.

## Show result
area_overview

Unnamed: 0,name,accounts,listings,avg_price_night
0,Bijlmer-Centrum,94,109,91.073394
1,Bijlmer-Oost,82,102,99.813725
2,Bos en Lommer,1036,1137,117.425682
3,Buitenveldert - Zuidas,224,237,150.940928
4,Centrum-Oost,1439,1775,188.936901
5,Centrum-West,1850,2317,189.206733
6,De Aker - Nieuw Sloten,112,130,126.423077
7,De Baarsjes - Oud-West,3173,3480,143.556609
8,De Pijp - Rivierenbuurt,2242,2494,157.072173
9,Gaasperdam - Driemond,102,119,95.336134


In [5]:
# Neighbourhood-level figures kept in a local workbook; the merge in the next cell joins
# on its 'neighbourhood' column. `error_bad_lines` was dropped: it is a read_csv option,
# not a read_excel parameter, and current pandas rejects it with a TypeError.
inside_airbnb = pd.read_excel("data_inside_airbnb.xlsx")

In [6]:
# Left-join the workbook figures onto the overview, matching the overview's 'name' to
# the workbook's 'neighbourhood'; overview rows without a match are kept.
area_overview = pd.merge(
    area_overview,
    inside_airbnb,
    how='left',
    left_on='name',
    right_on='neighbourhood',
)

In [7]:
# Preview the first five rows of the merged overview.
area_overview.head(5)

Unnamed: 0,name,accounts,listings,avg_price_night,neighbourhood,percentage_high_availability,percentage_host_with_multiple_listings
0,Bijlmer-Centrum,94,109,91.073394,Bijlmer-Centrum,26.9,25.0
1,Bijlmer-Oost,82,102,99.813725,Bijlmer-Oost,36.6,27.6
2,Bos en Lommer,1036,1137,117.425682,Bos en Lommer,18.3,19.8
3,Buitenveldert - Zuidas,224,237,150.940928,Buitenveldert - Zuidas,31.1,16.4
4,Centrum-Oost,1439,1775,188.936901,Centrum-Oost,36.9,30.3


In [8]:
# Location of the duplicate-case file. The default is the original author's local path;
# set the DUPLICATES_JSON environment variable to run the notebook on another machine.
duplicates_path = os.environ.get(
    'DUPLICATES_JSON',
    'C:\\Users\\Jeffr\\Downloads\\DataSystemProjects\\DataSystemProject\\Analyse_multiple_accounts_multiple_listings\\duplicates.js',
)
duplicates = pd.read_json(duplicates_path, orient='records')

# Enrich each duplicate case with the listing attributes of its 'case_A_id' listing
# ("verrijkt" is Dutch for "enriched"); cases without a matching listing are kept.
duplicates_verrijkt = duplicates.merge(df_listings, left_on='case_A_id', right_on='id', how='left')

# Number of duplicate cases per neighbourhood. size() counts rows per group directly,
# instead of counting the non-null values of whichever column happens to sit at
# position 1 after the merge.
agg_df = duplicates_verrijkt.groupby('neighbourhood_cleansed').size().rename('duplicates')

type(agg_df)

pandas.core.series.Series

In [9]:
# Index the overview by neighbourhood so the per-neighbourhood duplicate counts in
# agg_df line up by label, then attach them as the 'duplicates' column.
area_overview = (
    area_overview
    .set_index('neighbourhood')
    .assign(duplicates=agg_df)
)

In [11]:
# Display the overview, now indexed by neighbourhood and including the 'duplicates' column.
area_overview

Unnamed: 0_level_0,name,accounts,listings,avg_price_night,percentage_high_availability,percentage_host_with_multiple_listings,duplicates
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bijlmer-Centrum,Bijlmer-Centrum,94,109,91.073394,26.9,25.0,3
Bijlmer-Oost,Bijlmer-Oost,82,102,99.813725,36.6,27.6,6
Bos en Lommer,Bos en Lommer,1036,1137,117.425682,18.3,19.8,30
Buitenveldert - Zuidas,Buitenveldert - Zuidas,224,237,150.940928,31.1,16.4,5
Centrum-Oost,Centrum-Oost,1439,1775,188.936901,36.9,30.3,78
Centrum-West,Centrum-West,1850,2317,189.206733,39.2,33.1,133
De Aker - Nieuw Sloten,De Aker - Nieuw Sloten,112,130,126.423077,41.0,26.1,3
De Baarsjes - Oud-West,De Baarsjes - Oud-West,3173,3480,143.556609,20.2,17.4,83
De Pijp - Rivierenbuurt,De Pijp - Rivierenbuurt,2242,2494,157.072173,22.3,18.4,64
Gaasperdam - Driemond,Gaasperdam - Driemond,102,119,95.336134,46.3,30.1,6


In [15]:
# Listings per duplicate case for every neighbourhood, rounded to two decimals.
# NOTE(review): with this ratio a neighbourhood with relatively MANY duplicates gets a
# LOW value -- confirm that this is the intended direction for a risk score.
# NOTE(review): a neighbourhood missing from the duplicate counts has NaN here; how
# minmax_scale treats NaN depends on the installed scikit-learn version -- verify.
risk = (area_overview["listings"] / area_overview["duplicates"]).round(2)

# Rescale linearly to the [0, 1] range; the result is a NumPy array in row order.
# sklearn.preprocessing is imported in the notebook's import cell.
risk_min_max = sklearn.preprocessing.minmax_scale(risk, feature_range=(0, 1), axis=0)

risk_min_max


array([0.40731167, 0.0504062 , 0.43629985, 0.61170606, 0.15675775,
       0.058161  , 0.53655835, 0.51070901, 0.45605613, 0.10265879,
       0.49723043, 0.28175775, 0.32274742, 0.33290251, 1.        ,
       0.        , 0.49095273, 0.78895864, 0.25221566, 0.99907681,
       0.58364106, 0.62093796])

In [18]:
# Store the scaled score as 'risk_rank', rounded to two decimals. 'neighbourhood' is the
# index at this point, so there is no such column to drop.
area_overview["risk_rank"] = np.round(risk_min_max, 2)

area_overview



Unnamed: 0_level_0,name,accounts,listings,avg_price_night,percentage_high_availability,percentage_host_with_multiple_listings,duplicates,risk_rank
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bijlmer-Centrum,Bijlmer-Centrum,94,109,91.073394,26.9,25.0,3,0.41
Bijlmer-Oost,Bijlmer-Oost,82,102,99.813725,36.6,27.6,6,0.05
Bos en Lommer,Bos en Lommer,1036,1137,117.425682,18.3,19.8,30,0.44
Buitenveldert - Zuidas,Buitenveldert - Zuidas,224,237,150.940928,31.1,16.4,5,0.61
Centrum-Oost,Centrum-Oost,1439,1775,188.936901,36.9,30.3,78,0.16
Centrum-West,Centrum-West,1850,2317,189.206733,39.2,33.1,133,0.06
De Aker - Nieuw Sloten,De Aker - Nieuw Sloten,112,130,126.423077,41.0,26.1,3,0.54
De Baarsjes - Oud-West,De Baarsjes - Oud-West,3173,3480,143.556609,20.2,17.4,83,0.51
De Pijp - Rivierenbuurt,De Pijp - Rivierenbuurt,2242,2494,157.072173,22.3,18.4,64,0.46
Gaasperdam - Driemond,Gaasperdam - Driemond,102,119,95.336134,46.3,30.1,6,0.1


In [19]:

# Write the overview to disk as a JSON array with one object per row.
area_overview.to_json(path_or_buf='area_overview.json', orient='records')
