# Imports

In [1]:
import os
import numpy as np
import pandas as pd

import httplib2

import urllib
import requests
import re
from bs4 import BeautifulSoup

# Config

In [2]:
class Config():
    def __init__(self, data_path='/Users/guillaumecorda/Desktop/UvA/Information Retrieval/Project/', url=None):
        self.data_path = data_path
        self.url = url

In [3]:
cfg_ams = Config(url='https://www.huurda.com/Housing_offer.html?searchPhotos=1&searchKeywords=amsterdam&searchRentStart=0&searchSurface=0&searchRentEnd=0&searchAvailable=&searchFurnished=&searchBedrooms=&searchFormSubmit=Search+directly+in+6.510+rental+houses')
cfg_rot = Config(url='https://www.huurda.com/Housing_offer.html?searchPhotos=1&searchKeywords=rotterdam&searchRentStart=0&searchSurface=0&searchRentEnd=0&searchAvailable=&searchFurnished=&searchBedrooms=&searchFormSubmit=Search+directly+in+6.518+rental+houses')
cfg_haag = Config(url='https://www.huurda.com/Housing_offer.html?searchPhotos=1&searchKeywords=den+haag&searchRentStart=0&searchSurface=0&searchRentEnd=0&searchAvailable=&searchFurnished=&searchBedrooms=&searchFormSubmit=Search+directly+in+6.518+rental+houses')
cfg_gro = Config(url='https://www.huurda.com/Housing_offer.html?searchPhotos=1&searchKeywords=groningen&searchRentStart=0&searchSurface=0&searchRentEnd=0&searchAvailable=&searchFurnished=&searchBedrooms=&searchFormSubmit=Search+directly+in+6.517+rental+houses')
cfg_utr = Config(url='https://www.huurda.com/Housing_offer.html?searchPhotos=1&searchKeywords=utrecht&searchRentStart=0&searchSurface=0&searchRentEnd=0&searchAvailable=&searchFurnished=&searchBedrooms=&searchFormSubmit=Search+directly+in+6.517+rental+houses')
cfg_ein = Config(url='https://www.huurda.com/Housing_offer.html?searchPhotos=1&searchKeywords=eindhoven&searchRentStart=0&searchSurface=0&searchRentEnd=0&searchAvailable=&searchFurnished=&searchBedrooms=&searchFormSubmit=Search+directly+in+6.517+rental+houses')

# Helper functions

## Get content

In [4]:
def get_html(url):
    if 'http' not in url:
        raise ValueError('Please add protocol to url.')
    html_page = urllib.request.urlopen(url)
    soup = BeautifulSoup(html_page, features='html.parser')
    return soup

## Remove html tags

In [5]:
def remove_html_tags(text):
    p = re.compile(r'<.*?>')
    return p.sub('', text)

## Extract outgoing links

In [6]:
def extract_outgoing_links(url):
    soup = get_html(url)
    content = soup.find_all('div', class_ = 'newsBox')
    links = list()
    for flat in content:
        tag_news = str(flat.find('a', class_='news'))
        url = re.findall('"([^"]*)"', tag_news)[1]
        links.append(url)
    return links

## Url editor

In [7]:
def edit_url(url, page):
    if 'page' not in url :
        url = url+ '&page='+str(page)
    else:
        url = url.replace(url[-1], str(page))
    return url

## Details Scraper

In [8]:
def get_rent_details(url):
    soup = get_html(url)
    details_rent = soup.find_all('div', class_ = 'detailBox')

    infos = remove_html_tags(str(details_rent[0]))
    infos = infos.replace('\n', ' ')
    infos = infos.replace('\t', ' ')
    infos = infos.replace('\n2', ' ')
    infos = infos.split('   ')
    infos = [el for el in infos if el != '']
    
    infos = infos[1:8]
    
    #uncomment lines for new features
    infos[0] = infos[0].replace('Rent per month: € ','')
    infos[0] = infos[0].replace(',-', '')
    infos[1] = infos[1].replace('Available from: ', '')
    #infos[2] = infos[2].replace('Offered since: ', '')
    #infos[3] = infos[3].replace('Offered by: ', '')
    infos[4] = infos[4].replace('Surface (m2): ', '')
    infos[4] = infos[4].replace(' m²', '')
    infos[5] = infos[5].replace('Interior:  ', '')
    infos[6] = infos[6].replace('Number of bedrooms ', '')

    #remove line for new features
    del infos[2:3]
    del infos[2]
    
    return infos

## Location Scraper

In [9]:
def get_location(url, city):
    
    soup = get_html(url)
    content = soup.find_all('div', class_ = 'brownBox')

    infos = remove_html_tags(str(content[0]))
    infos = infos.replace('\n', ' ')
    infos = infos.replace('\t', ' ')
    infos = infos.replace('\n2', ' ')

    indice_start = infos.find('Rental house')
    infos = infos[indice_start:]
    
    if city == 'Amsterdam':
        infos = infos.replace('Rental house: Apartment renting in Amsterdam                 ', '')
    elif city == 'Rotterdam':
        infos = infos.replace('Rental house: Apartment renting in Rotterdam                 ', '')
    elif city == 'Den Haag':
        infos = infos.replace('Rental house: Apartment renting in Den Haag                 ', '')
    elif city == 'Utrecht':
        infos = infos.replace('Rental house: Apartment renting in Utrecht                 ', '')
    elif city == 'Groningen':
        infos = infos.replace('Rental house: Apartment renting in Groningen                 ', '')
    elif city == 'Eindhoven':
        infos = infos.replace('Rental house: Apartment renting in Eindhoven                 ', '')
    
    indice_end = infos.find('€')
    infos = infos[:indice_end]
    location = " ".join(infos.split())
    
    return location

# Scraper

## Scrap one link

In [10]:
links_ams = extract_outgoing_links(cfg_ams.url)

In [11]:
data = get_rent_details(links_ams[10])

In [12]:
data.append(get_location(links_ams[0], city='Amsterdam'))

UnicodeEncodeError: 'ascii' codec can't encode characters in position 52-53: ordinal not in range(128)

In [None]:
data

## Automation for one page

In [15]:
def mini_scraper(url, city):
    
    links = extract_outgoing_links(url)
    df = pd.DataFrame(columns=['Rent', 'Available from', 'Surface', 'Interior', 'Number of bedrooms',  'Location'])
    
    for i, link in enumerate(links):
        try:
            data = get_rent_details(link)
            data.append(get_location(link, city))
            df.loc[i] = data
            print('Scraping url {}/{}'.format(i+1, len(links)), end='\r')
        except:
            print(link)
    return df

In [16]:
df = mini_scraper(cfg_rot.url, city='Rotterdam')
df.head()

Scraping url 20/20

Unnamed: 0,Rent,Available from,Surface,Interior,Number of bedrooms,Location
0,720,right away,52,upholstered,1,Keucheniusstraat 3038SK Rotterdam
1,595,right away,55,upholstered,2,Pleinweg 3081JR Rotterdam
2,595,right away,49,upholstered,1,Frans Bekkerstraat 3082TP Rotterdam
3,710,right away,56,upholstered,2,Fazantstraat 3083ZE Rotterdam
4,675,01-03-2019,20,upholstered,1,Rental house: Room renting in Rotterdam Mathen...


## Main Scraper

In [19]:
def main_scraper(url, city, nb_page):
    print('Scraping page: 1/{}'.format(nb_page))
    df = mini_scraper(url, city)
    df_list = [df]
    for i in range(1, nb_page):
        print('Scraping page: {}/{}'.format(i+1, nb_page))
        url = edit_url(url, i)
        df_list.append(mini_scraper(url, city))
    df = pd.concat(df_list)
    return df

### Amsterdam

In [20]:
df_ams = main_scraper(cfg_ams.url, city='Amsterdam', nb_page=8)

Scraping page: 1/8
https://www.huurda.com/properties_for_rent_in_Amsterdam/Apartment/CuraÃ§aostraat--404633/
https://www.huurda.com/properties_for_rent_in_Amsterdam/Apartment/BethaniÃ«ndwarsstraat--403731/
Scraping page: 2/8
https://www.huurda.com/properties_for_rent_in_Amsterdam/Apartment/CuraÃ§aostraat--404633/
https://www.huurda.com/properties_for_rent_in_Amsterdam/Apartment/BethaniÃ«ndwarsstraat--403731/
Scraping page: 3/8
Scraping page: 4/8
Scraping page: 5/8
Scraping page: 6/8
Scraping page: 7/8
Scraping page: 8/8
https://www.huurda.com/properties_for_rent_in_Amsterdam/Apartment/Sint_LuciÃ«nsteeg--278166/
Scraping url 20/20

In [21]:
df_ams.shape

(155, 6)

In [41]:
df_ams.tail()

Unnamed: 0,Rent,Available from,Surface,Interior,Number of bedrooms,Location
15,1.9,right away,80,furnished,2,Plantage Kerklaan 1018SX Amsterdam
16,1.75,right away,45,furnished,2,Rental house: Apartment renting in amsterdam w...
17,5.75,right away,335,furnished possible,n.a.,"Rental house renting in Amsterdam, Van der Mad..."
18,3.5,right away,130,furnished,2,Staalkade 1011JN Amsterdam
19,1.95,right away,90,furnished,1,Paardenstraat 1017CX Amsterdam


In [52]:
test  = df_ams.loc[19, 'Location'].iloc[3]

In [53]:
len(test.split())

16

In [54]:
test

'Rental house renting in Amsterdam, Kloveniersburgwal Rental house: Rental house renting in Amsterdam Kloveniersburgwal 1012CT Amsterdam'

### Rotterdam

In [22]:
df_rot = main_scraper(cfg_rot.url, city='Rotterdam', nb_page=13)

Scraping page: 1/13
Scraping page: 2/13
Scraping page: 3/13
Scraping page: 4/13
Scraping page: 5/13
Scraping page: 6/13
Scraping page: 7/13
Scraping page: 8/13
Scraping page: 9/13
Scraping page: 10/13
Scraping page: 11/13
Scraping page: 12/13
Scraping page: 13/13


In [23]:
df_rot.shape

(220, 6)

In [24]:
df_rot.tail()

Unnamed: 0,Rent,Available from,Surface,Interior,Number of bedrooms,Location
15,0.0,right away,70,upholstered,4,Rental house: Apartment renting in Rotterdam-z...
16,1.25,right away,65,furnished,2,Schiedamsedijk 3011EN Rotterdam
17,1.5,right away,85,furnished,2,Wijnbrugstraat 3011XW Rotterdam
18,0.0,right away,20,upholstered,1,Rental house: Room renting in Rotterdam-centru...
19,1.1,right away,55,upholstered,1,Schiedamsesingel 3012BA Rotterdam


### Den Haag

In [25]:
df_haag = main_scraper(cfg_haag.url, city='Den Haag', nb_page=12)

Scraping page: 1/12
Scraping page: 2/12
Scraping page: 3/12
https://www.huurda.com/properties_for_rent_in_Den_Haag/Upstairs_apartment/LandrÃ©straat--391694/
Scraping page: 4/12
Scraping page: 5/12
Scraping page: 6/12
Scraping page: 7/12
Scraping page: 8/12
Scraping page: 9/12
Scraping page: 10/12
Scraping page: 11/12
Scraping page: 12/12


In [26]:
df_haag.shape

(219, 6)

In [27]:
df_haag.head()

Unnamed: 0,Rent,Available from,Surface,Interior,Number of bedrooms,Location
0,835.0,right away,50,upholstered,1,Van de Spiegelstraat 2518 ES Den Haag
1,960.0,right away,50,upholstered,2,Nicolaas Tulpstraat 2563 XL Den Haag
2,995.0,right away,70,furnished possible,2,Veenendaalkade 2547 AR Den Haag
3,1.85,25-04-2019,185,furnished,4,Rental house: Family house renting in Den Haag...
4,2.195,01-03-2019,90,furnished,2,Turfhaven 2511 DK Den Haag


### Groningen

In [28]:
df_gro = main_scraper(cfg_gro.url, city='Groningen', nb_page=6)

Scraping page: 1/6
Scraping page: 2/6
Scraping page: 3/6
Scraping page: 4/6
Scraping page: 5/6
Scraping page: 6/6
Scraping url 20/20

In [29]:
df_gro.shape

(120, 6)

In [30]:
df_gro.head()

Unnamed: 0,Rent,Available from,Surface,Interior,Number of bedrooms,Location
0,975.0,01-03-2019,190,upholstered,5,Rental house: Family house renting in Stitswer...
1,850.0,15-04-2019,55,furnished,1,Witte de Withstraat 9726EC Groningen
2,1.155,01-03-2019,85,furnished,2,Rental house: Family house renting in Groninge...
3,700.0,right away,120,upholstered,4,Rental house: Family house renting in Nieuw Be...
4,1.295,right away,118,upholstered,4,Rental house: Family house renting in Groninge...


### Utrecht

In [31]:
df_utr = main_scraper(cfg_utr.url, city='Utrecht', nb_page=10)

Scraping page: 1/10
https://www.huurda.com/properties_for_rent_in_Utrecht/Upstairs_apartment/Aziëlaan--404464/
Scraping page: 2/10
https://www.huurda.com/properties_for_rent_in_Utrecht/Upstairs_apartment/Aziëlaan--404464/
Scraping page: 3/10
Scraping page: 4/10
Scraping page: 5/10
Scraping page: 6/10
Scraping page: 7/10
Scraping page: 8/10
Scraping page: 9/10
Scraping page: 10/10
Scraping url 20/20

In [32]:
df_utr.shape

(198, 6)

In [33]:
df_utr.head()

Unnamed: 0,Rent,Available from,Surface,Interior,Number of bedrooms,Location
0,1.000,01-03-2019,40,upholstered,2,Lucas Bolwerk 3512EH Utrecht
1,1.375,01-03-2019,80,upholstered,2,Kintgenshaven 3512GA Utrecht
2,1.250,01-03-2019,100,furnished,2,Rental house: Family house renting in Soesterb...
3,2.750,right away,125,upholstered,3,Voorstraat 3512 AH Utrecht
4,1.500,right away,104,furnished possible,2,Rental house: Family house renting in Breukele...
5,3.750,right away,220,furnished,7,Rental house: Family house renting in Utrecht ...
6,1.150,right away,50,upholstered,1,Weerdsingel 3513 BE Utrecht
8,1.195,01-05-2019,75,furnished,2,Rental house: Apartment renting in Bilthoven M...
9,1.100,right away,65,upholstered,1,Rental house: Apartment renting in De Bilt Hes...
10,1.745,right away,91,upholstered,2,Rental house: Family house renting in Utrecht ...


### Eindhoven

In [34]:
df_ein = main_scraper(cfg_utr.url, city='Eindhoven', nb_page=9)

Scraping page: 1/9
https://www.huurda.com/properties_for_rent_in_Utrecht/Upstairs_apartment/Aziëlaan--404464/
Scraping page: 2/9
https://www.huurda.com/properties_for_rent_in_Utrecht/Upstairs_apartment/Aziëlaan--404464/
Scraping page: 3/9
Scraping page: 4/9
Scraping page: 5/9
Scraping page: 6/9
Scraping page: 7/9
Scraping page: 8/9
Scraping page: 9/9
Scraping url 20/20

In [35]:
df_ein.shape

(178, 6)

In [36]:
df_ein.head()

Unnamed: 0,Rent,Available from,Surface,Interior,Number of bedrooms,Location
0,1.0,01-03-2019,40,upholstered,2,Rental house: Apartment renting in Utrecht Luc...
1,1.375,01-03-2019,80,upholstered,2,Rental house: Apartment renting in Utrecht Kin...
2,1.25,01-03-2019,100,furnished,2,Rental house: Family house renting in Soesterb...
3,2.75,right away,125,upholstered,3,Rental house: Apartment renting in Utrecht Voo...
4,1.5,right away,104,furnished possible,2,Rental house: Family house renting in Breukele...


## Clean output

## Merge all data

In [37]:
df_list = [df_ams, df_rot, df_haag, df_gro, df_utr, df_ein]
df = pd.concat(df_list)

In [38]:
df.shape

(1090, 6)

In [39]:
df.head()

Unnamed: 0,Rent,Available from,Surface,Interior,Number of bedrooms,Location
1,1.995,right away,60,furnished,1,Herengracht 1015 BT Amsterdam
2,1.675,11-03-2019,60,furnished,1,Schinkelkade 1075 VH Amsterdam
3,1.45,right away,86,furnished,2,Meeuwenlaan 1021 JC Amsterdam
4,1.7,right away,68,furnished,2,Legmeerplein 1058 NL Amsterdam
5,1.6,right away,80,furnished,2,Loenermark 1025 TS Amsterdam
