In [77]:
import numpy as np
import pandas as pd
import datetime
from dateutil.parser import parse
import requests, re, time
import pandas_datareader
import pickle
import math

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.lines import Line2D
import matplotlib.patches as mpatches
import seaborn as sns 

import statsmodels.api as sm
import json
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup
from selenium import webdriver
import time

import geopy.distance

In [2]:
### Define our Connector

import requests,os,time
def ratelimit(dt):
    "A function that handles the rate of your calls."
    time.sleep(dt) # sleep one second.

class Connector():
  def __init__(self,logfile,overwrite_log=False,connector_type='requests',session=False,path2selenium='',n_tries = 5,timeout=30,waiting_time=0.5):
    """This Class implements a method for reliable connection to the internet and monitoring. 
    It handles simple errors due to connection problems, and logs a range of information for basic quality assessments
    
    Keyword arguments:
    logfile -- path to the logfile
    overwrite_log -- bool, defining if logfile should be cleared (rarely the case). 
    connector_type -- use the 'requests' module or the 'selenium'. Will have different since the selenium webdriver does not have a similar response object when using the get method, and monitoring the behavior cannot be automated in the same way.
    session -- requests.session object. For defining custom headers and proxies.
    path2selenium -- str, sets the path to the geckodriver needed when using selenium.
    n_tries -- int, defines the number of retries the *get* method will try to avoid random connection errors.
    timeout -- int, seconds the get request will wait for the server to respond, again to avoid connection errors.
    """
    
    ## Initialization function defining parameters. 
    self.n_tries = n_tries # For avoiding triviel error e.g. connection errors, this defines how many times it will retry.
    self.timeout = timeout # Defining the maximum time to wait for a server to response.
    self.waiting_time = waiting_time # define simple rate_limit parameter.
    ## not implemented here, if you use selenium.
    if connector_type=='selenium':
      assert path2selenium!='', "You need to specify the path to you geckodriver if you want to use Selenium"
      from selenium import webdriver 
      ## HIN download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases

      assert os.path.isfile(path2selenium),'You need to insert a valid path2selenium the path to your geckodriver. You can download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases'
      self.browser = webdriver.Firefox(executable_path=path2selenium) # start the browser with a path to the geckodriver.

    self.connector_type = connector_type # set the connector_type
    
    if session: # set the custom session
      self.session = session
    else:
      self.session = requests.session()
    self.logfilename = logfile # set the logfile path
    ## define header for the logfile
    header = ['id','project','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']
    if os.path.isfile(logfile):        
      if overwrite_log==True:
        self.log = open(logfile,'w')
        self.log.write(';'.join(header))
      else:
        self.log = open(logfile,'a')
    else:
      self.log = open(logfile,'w')
      self.log.write(';'.join(header))
    ## load log 
    with open(logfile,'r') as f: # open file
        
      l = f.read().split('\n') # read and split file by newlines.
      ## set id
      if len(l)<=1:
        self.id = 0
      else:
        self.id = int(l[-1][0])+1
            
  def get(self,url,project_name):
    """Method for connector reliably to the internet, with multiple tries and simple error handling, as well as default logging function.
    Input url and the project name for the log (i.e. is it part of mapping the domain, or is it the part of the final stage in the data collection).
    
    Keyword arguments:
    url -- str, url
    project_name -- str, Name used for analyzing the log. Use case could be the 'Mapping of domain','Meta_data_collection','main data collection'. 
    """
     
    project_name = project_name.replace(';','-') # make sure the default csv seperator is not in the project_name.
    if self.connector_type=='requests': # Determine connector method.
      for _ in range(self.n_tries): # for loop defining number of retries with the requests method.
        ratelimit(self.waiting_time)
        t = time.time()
        try: # error handling 
          response = self.session.get(url,timeout = self.timeout) # make get call

          err = '' # define python error variable as empty assumming success.
          success = True # define success variable
          redirect_url = response.url # log current url, after potential redirects 
          dt = t - time.time() # define delta-time waiting for the server and downloading content.
          size = len(response.text) # define variable for size of html content of the response.
          response_code = response.status_code # log status code.
          ## log...
          call_id = self.id # get current unique identifier for the call
          self.id+=1 # increment call id
          #['id','project_name','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']
          row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err] # define row to be written in the log.
          self.log.write('\n'+';'.join(map(str,row))) # write log.
          self.log.flush()
          return response,call_id # return response and unique identifier.

        except Exception as e: # define error condition
          err = str(e) # python error
          response_code = '' # blank response code 
          success = False # call success = False
          size = 0 # content is empty.
          redirect_url = '' # redirect url empty 
          dt = t - time.time() # define delta t

          ## log...
          call_id = self.id # define unique identifier
          self.id+=1 # increment call_id

          row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err] # define row
          self.log.write('\n'+';'.join(map(str,row))) # write row to log.
          self.log.flush()
    else:
      t = time.time()
      ratelimit(self.waiting_time)
      self.browser.get(url) # use selenium get method
      ## log
      call_id = self.id # define unique identifier for the call. 
      self.id+=1 # increment the call_id
      err = '' # blank error message
      success = '' # success blank
      redirect_url = self.browser.current_url # redirect url.
      dt = t - time.time() # get time for get method ... NOTE: not necessarily the complete load time.
      size = len(self.browser.page_source) # get size of content ... NOTE: not necessarily correct, since selenium works in the background, and could still be loading.
      response_code = '' # empty response code.
      row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err] # define row 
      self.log.write('\n'+';'.join(map(str,row))) # write row to log file.
      self.log.flush()
    # Using selenium it will not return a response object, instead you should call the browser object of the connector.
    ## connector.browser.page_source will give you the html.
      return None,call_id

In [3]:
path2gecko = '/Users/holger/Documents/Python/Harmsen_Repo/Metro-Study/Gecko/geckodriver'
connector = Connector('Boliga Scrape.csv',overwrite_log=True,path2selenium=path2gecko,connector_type='requests')

In [4]:
url = 'https://api.boliga.dk/api/v2/sold/search/results?propertyType=3&municipality=101&salesDateMin=1992&salesDateMax=today&sort=date-d&page=1&street='

r, _ = connector.get(url, 'Boliga Scrape')

In [5]:
data = r.json()

In [6]:
data.keys()

dict_keys(['meta', 'results'])

In [7]:
data['meta']

{'pageIndex': 1,
 'pageSize': 50,
 'totalCount': 55540,
 'totalPages': 1111,
 'minPage': 1,
 'maxPage': 6,
 'countFrom': 1,
 'countTo': 50}

In [8]:
data['results'][1].keys()

dict_keys(['estateId', 'address', 'zipCode', 'price', 'soldDate', 'propertyType', 'saleType', 'sqmPrice', 'rooms', 'size', 'buildYear', 'change', 'guid', 'latitude', 'longitude', 'municipalityCode', 'estateCode', 'city', 'groupKey', 'canGetVR'])

In [26]:
def ApartmentScraper(pages,municipality):
    Apartmentlist = [] 
    start_time = time.time()

    for i in range(1,pages+1):
        if i % 50 == 0:
            print(f'Iteration: {i}')
        url = f'https://api.boliga.dk/api/v2/sold/search/results?propertyType=3&municipality={municipality}&salesDateMin=1992&salesDateMax=today&sort=date-d&page={i}&street='

        try: 
            r, _ = connector.get(url, 'Boliga Scrape')
            Apartmentlist.append(r.json())
        except Exception as e:
            print(e)
        
    time.sleep(0.2)
    print(f'Time elapsed: {time.time()-start_time: .2f}')
    return Apartmentlist

In [31]:
CPH_Apartments = ApartmentScraper(1110,101) # Copenhagen: Municipality 101
CPH_Apartments_df = pd.concat([pd.DataFrame(data['results']) for data in CPH_Apartments])

Iteration: 50
Iteration: 100
Iteration: 150
Iteration: 200
Iteration: 250
Iteration: 300
Iteration: 350
Iteration: 400
Iteration: 450
Iteration: 500
Iteration: 550
Iteration: 600
Iteration: 650
Iteration: 700
Iteration: 750
Iteration: 800
Iteration: 850
Iteration: 900
Iteration: 950
Iteration: 1000
Iteration: 1050
Iteration: 1100
Time elapsed:  994.20


In [32]:
FRB_Apartments = ApartmentScraper(247,147) # Copenhagen: Municipality 147
FRB_Apartments_df = pd.concat([pd.DataFrame(data['results']) for data in FRB_Apartments])

Iteration: 50
Iteration: 100
Iteration: 150
Iteration: 200
Time elapsed:  216.59


In [33]:
Apartments = pd.concat([CPH_Apartments_df,FRB_Apartments_df],ignore_index=True)
Apartments.to_pickle('/Users/holger/Documents/Python/Harmsen_Repo/Metro-Study/Pickles/Apartment Data API.pkl')

## Working with the data

In [91]:
Apartments = pd.read_pickle('/Users/holger/Documents/Python/Harmsen_Repo/Metro-Study/Pickles/Apartment Data API.pkl')

In [92]:
Apartments.head(n=2)

Unnamed: 0,estateId,address,zipCode,price,soldDate,propertyType,saleType,sqmPrice,rooms,size,buildYear,change,guid,latitude,longitude,municipalityCode,estateCode,city,groupKey,canGetVR
0,1677218,"Brofogedvej 14, 3. th",2400,3195000,2020-08-05T22:00:00.000Z,3,Alm. Salg,48409.09,3.0,66,1899,3.231018,4B11F5D0-1CEC-4D45-8CA0-DB28989FCF66,55.70392,12.53198,101,73963,København NV,,True
1,1670734,"Nordre Frihavnsgade 55, st. tv",2100,4000000,2020-08-05T22:00:00.000Z,3,Alm. Salg,50000.0,3.0,80,1893,2.695764,4300D256-CF8C-4573-ACDA-98F771473FEA,55.7027,12.583746,101,396657,København Ø,,True


In [99]:
Apartments.rename(columns={'address':'Address','price':'Price','sqmPrice':'Price_sq_m','rooms':'Rooms','size':'sq_m'},inplace=True)

Apartments['Address'] = Apartments['Address'] + ', ' + Apartments['city'] #Including area and city in address, so it resembles the scraper.
Apartments['Address transformed'] = [re.sub(',[^>]+,', '',i)
for i in Apartments['Address']
    ]

Apartments['Price_mio'] = Apartments['Price']/10**6
Apartments['Price_sq_m_1000'] = Apartments['Price_sq_m']/10**3

# Date sold
dates = [i[:10] for i in Apartments['soldDate']]
Apartments['Date_sold'] = [datetime.datetime(year=int(i[:4]),month=int(i[5:7]),day=int(i[8:]),)
    for i in dates
]

Apartments['Coordinates'] = [(Apartments['latitude'][i],Apartments['longitude'][i]) for i in range(0,len(Apartments))]

Apartments['Rooms'] = [int(i) for i in Apartments['Rooms']]

In [100]:
Apartments

Unnamed: 0,estateId,Address,zipCode,Price,soldDate,propertyType,saleType,Price_sq_m,Rooms,sq_m,...,estateCode,city,groupKey,canGetVR,Address transformed,Price_mio,Price_sq_m_1000,Date_sold,Coordinates,Area
0,1677218,"Brofogedvej 14, 3. th, København NV, København...",2400,3195000,2020-08-05T22:00:00.000Z,3,Alm. Salg,48409.0900,3,66,...,73963,København NV,,True,Brofogedvej 14 København NV,3.1950,48.409090,2020-08-05,"(55.70392, 12.53198)",NV
1,1670734,"Nordre Frihavnsgade 55, st. tv, København Ø, K...",2100,4000000,2020-08-05T22:00:00.000Z,3,Alm. Salg,50000.0000,3,80,...,396657,København Ø,,True,Nordre Frihavnsgade 55 København Ø,4.0000,50.000000,2020-08-05,"(55.7027, 12.583746)",Ø
2,0,"A.C. Meyers Vænge 5, st. tv, København SV, Køb...",2450,2650000,2020-08-04T22:00:00.000Z,3,Fam. Salg,18794.3260,5,141,...,771610,København SV,,True,A.C. Meyers Vænge 5 København SV,2.6500,18.794326,2020-08-04,"(55.652477, 12.543737)",
3,1663437,"Bremensgade 25, 1. th, København S, København ...",2300,3270000,2020-08-04T22:00:00.000Z,3,Alm. Salg,40875.0000,4,80,...,70522,København S,,True,Bremensgade 25 København S,3.2700,40.875000,2020-08-04,"(55.66606, 12.617712)",S
4,0,"Nørre Søgade 19, 5. tv, København K, København...",1370,3162500,2020-08-03T22:00:00.000Z,3,Fam. Salg,28750.0000,4,110,...,793451,København K,,True,Nørre Søgade 19 København K,3.1625,28.750000,2020-08-03,"(55.6845, 12.563928)",K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67845,0,"Moltkesvej 61, st. th, Frederiksberg, Frederik...",2000,690000,1992-01-25T23:00:00.000Z,3,Alm. Salg,7840.9090,4,88,...,80573,Frederiksberg,,True,Moltkesvej 61 Frederiksberg,0.6900,7.840909,1992-01-25,"(55.68591, 12.508274)",FRB
67846,0,"Sankt Thomas Alle 11, 4. th, Frederiksberg C, ...",1824,725800,1992-01-23T23:00:00.000Z,3,Alm. Salg,6598.1816,4,110,...,198310,Frederiksberg C,,True,Sankt Thomas Alle 11 Frederiksberg C,0.7258,6.598182,1992-01-23,"(55.674374, 12.546298)",FRB
67847,0,"Wilkensvej 7, 5. tv, Frederiksberg, Frederiksb...",2000,535000,1992-01-19T23:00:00.000Z,3,Alm. Salg,8492.0630,2,63,...,137834,Frederiksberg,,True,Wilkensvej 7 Frederiksberg,0.5350,8.492063,1992-01-19,"(55.68207, 12.516475)",FRB
67848,0,"Carl Bernhards Vej 3A, st, Frederiksberg C, Fr...",1817,750000,1992-01-16T23:00:00.000Z,3,Alm. Salg,3260.8696,7,230,...,199430,Frederiksberg C,,True,Carl Bernhards Vej 3A Frederiksberg C,0.7500,3.260870,1992-01-16,"(55.672497, 12.544128)",FRB


In [95]:
# Inferring the area from the last part of the address:
Apartments['Area'] = 'NaN'
for i in range(0,len(Apartments)):
    if Apartments['city'][i][-1] == 'N':
        Apartments['Area'][i] = 'N'
        # Solving issue with differentiating between NV and V:
    elif Apartments['city'][i][-2:] == 'NV':
        if Apartments['city'][i][-2] == ' V':
            Apartments['Area'][i] = 'V'
        else: 
            Apartments['Area'][i] = 'NV'
    elif Apartments['city'][i][-1] == 'Ø':
        Apartments['Area'][i] = 'Ø'
    elif Apartments['city'][i][-1] == 'K':
        Apartments['Area'][i] = 'K'
    elif Apartments['city'][i][-1] == 'S':
        Apartments['Area'][i] = 'S'
    elif Apartments['city'][i][-5:] == 'Valby':
        Apartments['Area'][i] = 'Valby'
    elif Apartments['city'][i][-13:] == 'Frederiksberg':
        Apartments['Area'][i] = 'FRB'
    elif Apartments['city'][i][-1:] == 'C':
        Apartments['Area'][i] = 'FRB'

In [96]:
Apartments

Unnamed: 0,estateId,Address,zipCode,Price,soldDate,propertyType,saleType,Price_sq_m,Rooms,sq_m,...,estateCode,city,groupKey,canGetVR,Address transformed,Price_mio,Price_sq_m_1000,Date_sold,Coordinates,Area
0,1677218,"Brofogedvej 14, 3. th, København NV",2400,3195000,2020-08-05T22:00:00.000Z,3,Alm. Salg,48409.0900,3.0,66,...,73963,København NV,,True,Brofogedvej 14 København NV,3.1950,48.409090,2020-08-05,"(55.70392, 12.53198)",NV
1,1670734,"Nordre Frihavnsgade 55, st. tv, København Ø",2100,4000000,2020-08-05T22:00:00.000Z,3,Alm. Salg,50000.0000,3.0,80,...,396657,København Ø,,True,Nordre Frihavnsgade 55 København Ø,4.0000,50.000000,2020-08-05,"(55.7027, 12.583746)",Ø
2,0,"A.C. Meyers Vænge 5, st. tv, København SV",2450,2650000,2020-08-04T22:00:00.000Z,3,Fam. Salg,18794.3260,5.0,141,...,771610,København SV,,True,A.C. Meyers Vænge 5 København SV,2.6500,18.794326,2020-08-04,"(55.652477, 12.543737)",
3,1663437,"Bremensgade 25, 1. th, København S",2300,3270000,2020-08-04T22:00:00.000Z,3,Alm. Salg,40875.0000,4.0,80,...,70522,København S,,True,Bremensgade 25 København S,3.2700,40.875000,2020-08-04,"(55.66606, 12.617712)",S
4,0,"Nørre Søgade 19, 5. tv, København K",1370,3162500,2020-08-03T22:00:00.000Z,3,Fam. Salg,28750.0000,4.0,110,...,793451,København K,,True,Nørre Søgade 19 København K,3.1625,28.750000,2020-08-03,"(55.6845, 12.563928)",K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67845,0,"Moltkesvej 61, st. th, Frederiksberg",2000,690000,1992-01-25T23:00:00.000Z,3,Alm. Salg,7840.9090,4.0,88,...,80573,Frederiksberg,,True,Moltkesvej 61 Frederiksberg,0.6900,7.840909,1992-01-25,"(55.68591, 12.508274)",FRB
67846,0,"Sankt Thomas Alle 11, 4. th, Frederiksberg C",1824,725800,1992-01-23T23:00:00.000Z,3,Alm. Salg,6598.1816,4.0,110,...,198310,Frederiksberg C,,True,Sankt Thomas Alle 11 Frederiksberg C,0.7258,6.598182,1992-01-23,"(55.674374, 12.546298)",FRB
67847,0,"Wilkensvej 7, 5. tv, Frederiksberg",2000,535000,1992-01-19T23:00:00.000Z,3,Alm. Salg,8492.0630,2.0,63,...,137834,Frederiksberg,,True,Wilkensvej 7 Frederiksberg,0.5350,8.492063,1992-01-19,"(55.68207, 12.516475)",FRB
67848,0,"Carl Bernhards Vej 3A, st, Frederiksberg C",1817,750000,1992-01-16T23:00:00.000Z,3,Alm. Salg,3260.8696,7.0,230,...,199430,Frederiksberg C,,True,Carl Bernhards Vej 3A Frederiksberg C,0.7500,3.260870,1992-01-16,"(55.672497, 12.544128)",FRB
