In [1]:
## Import

%matplotlib inline
import numpy as np
import pandas as pd
import os
import urllib.request
import datetime
import gzip
import shutil


## Read in the data

In [2]:
listing_col = ['id',
               'listing_url',
               'name',
               'price',
               'summary',
               'host_id',
               'host_name',
               'host_about',
               'host_since',
               'host_listings_count',
               'host_total_listings_count',
               'host_response_rate',
               'description',
               'neighbourhood_cleansed',
               'property_type',
               'room_type',
               'price',
               'number_of_reviews',
               'instant_bookable',
               'review_scores_rating',
               'beds',
               'bedrooms',
               'bathrooms',
               'accommodates',
               'minimum_nights',
               'maximum_nights',
               'amenities', # Betekent voorzieningen
               'cancellation_policy',
               'reviews_per_month',
               'first_review',
               'last_review',
               'reviews_per_month',
               'review_scores_rating',
               'latitude',
               'longitude',
               'availability_30',
               'availability_60',
               'availability_90',
               'availability_365'              
              ]

url = 'http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2018-01-10/data/listings.csv.gz'
filename = 'listings_2018-01-10.csv.gz'
urllib.request.urlretrieve(url, filename)
unfilled = gzip.open(filename)
df_listings = pd.read_csv(unfilled, usecols=listing_col)
df_listings['publicatie'] = '2018-01-10'
        

our_dates= []
start = datetime.datetime.strptime("11-01-2018", "%d-%m-%Y")  # we can change this
end = datetime.datetime.strptime("31-12-2018", "%d-%m-%Y")    # we can change this
date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days)]

for date in date_generated:
    our_dates.append(date.strftime("%Y-%m-%d"))

    
item = "listings" ## We can also extract reviews and calanders
    
for date in our_dates:
    url = "http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/" +date+ "/data/"+item+".csv.gz"
    filename = item +"_"+ date +".csv.gz" 
    file_python = item +"_"+ date
    try:
        urllib.request.urlretrieve(url, filename)
        print("found : " + date)
        unfilled = gzip.open(filename)
#         df_reviews[file_python] = pd.read_csv(unfilled)
        temp = pd.read_csv(unfilled, usecols=listing_col)
        temp['publicatie'] = date
        
        df_listings.append(temp)
        
        
    except urllib.error.URLError as e:
        print(e.reason) 
        

        


KeyboardInterrupt: 

In [3]:
url = 'http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2019-01-13/data/listings.csv.gz'
filename = 'listings_2019-01-13.csv.gz'
urllib.request.urlretrieve(url, filename)
unfilled = gzip.open(filename)
df_listings = pd.read_csv(unfilled, usecols=listing_col)
# df_listings['publicatie'] = '2019-01-13'

del unfilled

## Making the dataframe

In [4]:
df_listings['price'] = pd.to_numeric(df_listings['price'].str.replace(',','').str.replace('$',''))

# Group by columns
price = df_listings.groupby(df_listings.neighbourhood_cleansed).price.mean()
listings =df_listings.groupby(df_listings.neighbourhood_cleansed).id.nunique()
accounts = df_listings.groupby(df_listings.neighbourhood_cleansed).host_id.nunique()

area_overview = pd.concat((price,listings,accounts), axis=1, join='inner').reset_index()
area_overview.rename(columns={'neighbourhood_cleansed': 'name', 'price': 'avg_price_night', 'id':'listings', 'host_id':'accounts'}, inplace=True)

## Tijdelijk dummy values voor volgende kolommen
# area_overview["listings_per_ha"]  = area_overview["listings"] /3
# area_overview["percentage_high_availability"] = area_overview["listings"] / 4
# area_overview["percentage_host_with_multiple_listings"] = area_overview["listings"] /5
# area_overview["duplicates"] = area_overview["listings"] / 6
# area_overview["risk_rank"] = area_overview["listings"] / 7

## reorder the names
area_overview = area_overview.reindex(columns=['name','accounts','listings','avg_price_night'])

## Show result
area_overview


# accounts unfold_more	# listings unfold_more	price/night unfold_more	% high availability unfold_more	% multiple listings unfold_more	# duplicates unfold_more

# risico_rank

Unnamed: 0,name,accounts,listings,avg_price_night
0,Bijlmer-Centrum,94,109,91.073394
1,Bijlmer-Oost,82,102,99.813725
2,Bos en Lommer,1036,1137,117.425682
3,Buitenveldert - Zuidas,224,237,150.940928
4,Centrum-Oost,1439,1775,188.936901
5,Centrum-West,1850,2317,189.206733
6,De Aker - Nieuw Sloten,112,130,126.423077
7,De Baarsjes - Oud-West,3173,3480,143.556609
8,De Pijp - Rivierenbuurt,2242,2494,157.072173
9,Gaasperdam - Driemond,102,119,95.336134


In [5]:
inside_airbnb = pd.read_excel("data_inside_airbnb.xlsx", error_bad_lines=False)

In [6]:
area_overview = area_overview.merge(inside_airbnb, left_on='name', right_on='neighbourhood', how='left')

In [26]:
df_listings.head()

Unnamed: 0,id,listing_url,name,summary,description,host_id,host_name,host_since,host_about,host_response_rate,...,availability_60,availability_90,availability_365,number_of_reviews,first_review,last_review,review_scores_rating,instant_bookable,cancellation_policy,reviews_per_month
0,2818,https://www.airbnb.com/rooms/2818,Quiet Garden View Room & Super Fast WiFi,Quiet Garden View Room & Super Fast WiFi,Quiet Garden View Room & Super Fast WiFi I'm r...,3159,Daniel,2008-09-24,"Upon arriving in Amsterdam, one can imagine as...",,...,18,18,18,249,2009-03-30,2018-12-30,97.0,t,strict_14_with_grace_period,2.09
1,20168,https://www.airbnb.com/rooms/20168,100%Centre-Studio 1 Private Floor/Bathroom,"Cozy studio on your own private floor, 100% in...","Cozy studio on your own private floor, 100% in...",59484,Alexander,2009-12-02,Secondary phone nr. + (Phone number hidden by ...,100%,...,14,35,184,240,2010-03-02,2019-01-08,87.0,f,strict_14_with_grace_period,2.22
2,25428,https://www.airbnb.com/rooms/25428,Lovely apt in City Centre (Jordaan),,"This nicely furnished, newly renovated apt is...",56142,Joan,2009-11-20,"We are a retired couple who live in NYC, and h...",100%,...,31,31,155,1,2018-01-21,2018-01-21,100.0,f,strict_14_with_grace_period,0.08
3,27886,https://www.airbnb.com/rooms/27886,"Romantic, stylish B&B houseboat in canal district",Stylish and romantic houseboat on fantastic hi...,Stylish and romantic houseboat on fantastic hi...,97647,Flip,2010-03-23,"Marjan works in ""eye"" the dutch filmmuseum, an...",,...,35,52,164,175,2012-01-09,2018-12-30,99.0,t,strict_14_with_grace_period,2.05
4,28658,https://www.airbnb.com/rooms/28658,Cosy guest room near city centre -1,2 beds guest room in Amsterdam West near Erasm...,2 beds guest room in Amsterdam West near Erasm...,123414,Michele,2010-05-12,"I'm Italian (from Sardinia) , I live in Amster...",,...,30,39,265,438,2010-05-16,2018-12-26,93.0,f,moderate,4.15


In [7]:
duplicates = pd.read_json('C:\\Users\\Jeffr\\Downloads\\DataSystemProjects\\DataSystemProject\\Analyse_multiple_accounts_multiple_listings\\duplicates.js' ,orient='records')

duplicates_verrijkt = duplicates.merge(df_listings, left_on = 'case_A_id', right_on = 'id', how='left')

agg_df = duplicates_verrijkt.groupby('neighbourhood_cleansed').count()


agg_df = agg_df.iloc[:,1]

type(agg_df)

pandas.core.series.Series

In [8]:
area_overview = area_overview.set_index('name')
area_overview['duplicates'] = agg_df

In [9]:
area_overview["risk_rank"] = round(area_overview["listings"] /  area_overview["duplicates"] ,2)
area_overview = area_overview.drop(columns = 'neighbourhood').reset_index
area_overview



Unnamed: 0_level_0,accounts,listings,avg_price_night,percentage_high_availability,percentage_host_with_multiple_listings,duplicates,risk_rank
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bijlmer-Centrum,94,109,91.073394,26.9,25.0,3,36.33
Bijlmer-Oost,82,102,99.813725,36.6,27.6,6,17.0
Bos en Lommer,1036,1137,117.425682,18.3,19.8,30,37.9
Buitenveldert - Zuidas,224,237,150.940928,31.1,16.4,5,47.4
Centrum-Oost,1439,1775,188.936901,36.9,30.3,78,22.76
Centrum-West,1850,2317,189.206733,39.2,33.1,133,17.42
De Aker - Nieuw Sloten,112,130,126.423077,41.0,26.1,3,43.33
De Baarsjes - Oud-West,3173,3480,143.556609,20.2,17.4,83,41.93
De Pijp - Rivierenbuurt,2242,2494,157.072173,22.3,18.4,64,38.97
Gaasperdam - Driemond,102,119,95.336134,46.3,30.1,6,19.83


In [45]:

# area_overview.to_json('area_overview.json' ,orient='records')
