# Imports

In [59]:
import os
import numpy as np
import pandas as pd

import httplib2

import urllib
import requests
import re
from bs4 import BeautifulSoup

# Config

In [60]:
class Config():
    def __init__(self, data_path='/Users/guillaumecorda/Desktop/UvA/Information Retrieval/Project/', url=None):
        self.data_path = data_path
        self.url = url

In [61]:
cfg = Config(url='https://www.huurda.com/Housing_offer.html?searchPhotos=1&searchKeywords=amsterdam&searchRentStart=0&searchSurface=0&searchRentEnd=0&searchAvailable=&searchFurnished=&searchBedrooms=&searchFormSubmit=Search+directly+in+6.510+rental+houses')

# Helper functions

## Get content

In [62]:
def get_html(url):
    if 'http' not in url:
        raise ValueError('Please add protocol to url.')
    html_page = urllib.request.urlopen(url)
    soup = BeautifulSoup(html_page, features='html.parser')
    return soup

## Remove html tags

In [63]:
def remove_html_tags(text):
    p = re.compile(r'<.*?>')
    return p.sub('', text)

## Extract outgoing links

In [64]:
def extract_outgoing_links(url):
    soup = get_html(url)
    content = soup.find_all('div', class_ = 'newsBox')
    links = list()
    for flat in content:
        tag_news = str(flat.find('a', class_='news'))
        url = re.findall('"([^"]*)"', tag_news)[1]
        links.append(url)
    return links

## Url editor

In [65]:
def edit_url(url, page):
    if 'page' not in url :
        url = url+ '&page='+str(page)
    else:
        url = url.replace(url[-1], str(page))
    return url

## Details Scraper

In [66]:
def get_rent_details(url):
    soup = get_html(url)
    details_rent = soup.find_all('div', class_ = 'detailBox')

    infos = remove_html_tags(str(details_rent[0]))
    infos = infos.replace('\n', ' ')
    infos = infos.replace('\t', ' ')
    infos = infos.replace('\n2', ' ')
    infos = infos.split('   ')
    infos = [el for el in infos if el != '']
    
    infos = infos[1:8]
    
    #uncomment lines for new features
    infos[0] = infos[0].replace('Rent per month: € ','')
    infos[0] = infos[0].replace(',-', '')
    infos[1] = infos[1].replace('Available from: ', '')
    #infos[2] = infos[2].replace('Offered since: ', '')
    #infos[3] = infos[3].replace('Offered by: ', '')
    infos[4] = infos[4].replace('Surface (m2): ', '')
    infos[4] = infos[4].replace(' m²', '')
    infos[5] = infos[5].replace('Interior:  ', '')
    infos[6] = infos[6].replace('Number of bedrooms ', '')

    #remove line for new features
    del infos[2:3]
    del infos[2]
    
    return infos

## Location Scraper

In [78]:
def get_location(url):
    
    soup = get_html(url)
    content = soup.find_all('div', class_ = 'brownBox')

    infos = remove_html_tags(str(content[0]))
    infos = infos.replace('\n', ' ')
    infos = infos.replace('\t', ' ')
    infos = infos.replace('\n2', ' ')

    indice_start = infos.find('Rental house')
    infos = infos[indice_start:]
    infos = infos.replace('Rental house: Apartment renting in Amsterdam                 ', '')
    indice_end = infos.find('€')
    infos = infos[:indice_end]
    location = " ".join(infos.split())
    
    return location

# Scraper

## Scrap one link

In [81]:
links = extract_outgoing_links(cfg.url)

In [82]:
data = get_rent_details(links[10])

In [12]:
data.append(get_location(links[0]))

In [13]:
data

['1.500',
 'right away',
 '58',
 'furnished',
 '1',
 'Herengracht 1015 BT Amsterdam']

## Automation for one page

In [14]:
def mini_scraper(url, ):
    
    links = extract_outgoing_links(url)
    df = pd.DataFrame(columns=['Rent', 'Available from', 'Surface', 'Interior', 'Number of bedrooms',  'Location'])
    
    for i, link in enumerate(links):
        try:
            data = get_rent_details(link)
            data.append(get_location(link))
            df.loc[i] = data
            print('Scraping url {}/{}'.format(i+1, len(links)), end='\r')
        except:
            print(link)
    return df

In [15]:
df = mini_scraper(cfg.url)
df.head()

https://www.huurda.com/properties_for_rent_in_Amsterdam/Apartment/BethaniÃ«ndwarsstraat--403731/
Scraping url 20/20

Unnamed: 0,Rent,Available from,Surface,Interior,Number of bedrooms,Location
0,1.995,right away,60,furnished,1,Herengracht 1015 BT Amsterdam
1,1.675,right away,60,furnished,1,"Apartment renting in Amsterdam, Schinkelkade R..."
2,1.45,right away,86,furnished,2,"Apartment renting in Amsterdam, Meeuwenlaan Re..."
3,1.7,right away,68,furnished,2,"Apartment renting in Amsterdam, Legmeerplein R..."
4,1.6,right away,80,furnished,2,"Apartment renting in Amsterdam, Loenermark Ren..."


## Main Scraper

In [22]:
def main_scraper(url, nb_page):
    print('Scraping page: 1/{}'.format(nb_page))
    df = mini_scraper(url)
    df_list = [df]
    for i in range(1, nb_page):
        print('Scraping page: {}/{}'.format(i+1, nb_page))
        url = edit_url(url, i)
        df_list.append(mini_scraper(url))
    df = pd.concat(df_list)
    return df

In [84]:
df = main_scraper(cfg.url, nb_page=8)

Scraping page: 1/8
https://www.huurda.com/properties_for_rent_in_Amsterdam/Apartment/BethaniÃ«ndwarsstraat--403731/
Scraping page: 2/8
https://www.huurda.com/properties_for_rent_in_Amsterdam/Apartment/BethaniÃ«ndwarsstraat--403731/
Scraping page: 3/8
Scraping page: 4/8
Scraping page: 5/8
Scraping page: 6/8
Scraping page: 7/8
Scraping page: 8/8
https://www.huurda.com/properties_for_rent_in_Amsterdam/Apartment/Sint_LuciÃ«nsteeg--278166/
Scraping url 20/20

In [85]:
df.shape

(157, 6)

In [86]:
df.reset_index(inplace=True)

In [87]:
df

Unnamed: 0,index,Rent,Available from,Surface,Interior,Number of bedrooms,Location
0,0,1.995,right away,60,furnished,1,Herengracht 1015 BT Amsterdam
1,1,1.675,right away,60,furnished,1,Schinkelkade 1075 VH Amsterdam
2,2,1.450,right away,86,furnished,2,Meeuwenlaan 1021 JC Amsterdam
3,3,1.700,right away,68,furnished,2,Legmeerplein 1058 NL Amsterdam
4,4,1.600,right away,80,furnished,2,Loenermark 1025 TS Amsterdam
5,5,2.950,01-03-2019,125,upholstered,3,Rental house: Family house renting in Amsterda...
6,6,1.350,28-02-2019,32,furnished,1,Rental house: Studio renting in Amsterdam Twee...
7,7,1.850,right away,73,upholstered,2,Hectorstraat 1076 PN Amsterdam
8,8,1.875,01-04-2019,69,upholstered,2,Cornelis Schuytstraat 1071JE Amsterdam
9,9,1.650,01-03-2019,53,furnished,1,Amstelvlietstraat 1096GG Amsterdam
