# Imports

In [None]:
import os
import numpy as np
import pandas as pd

import httplib2

import urllib
import requests
import re
from bs4 import BeautifulSoup

# Config

In [None]:
class Config:
    """Settings holder for one scraping target.

    Attributes:
        data_path: Directory path stored on the instance; defaults to the
            author's local project folder.
        url: Search-results URL to scrape, or ``None`` when not provided.
    """

    def __init__(self, data_path='/Users/guillaumecorda/Desktop/UvA/Information Retrieval/Project/', url=None):
        # Plain attribute storage; no validation is performed here.
        self.data_path, self.url = data_path, url

In [None]:
# Search-results URL shared by all cities. Only the search keyword and the
# listing count embedded in the submit-button label differ between them.
_SEARCH_URL = (
    'https://www.huurda.com/Housing_offer.html?searchPhotos=1'
    '&searchKeywords={keywords}'
    '&searchRentStart=0&searchSurface=0&searchRentEnd=0'
    '&searchAvailable=&searchFurnished=&searchBedrooms='
    '&searchFormSubmit=Search+directly+in+{listings}+rental+houses'
)

# One Config per city, each carrying that city's search URL.
cfg_ams = Config(url=_SEARCH_URL.format(keywords='amsterdam', listings='6.510'))
cfg_rot = Config(url=_SEARCH_URL.format(keywords='rotterdam', listings='6.518'))
cfg_haag = Config(url=_SEARCH_URL.format(keywords='den+haag', listings='6.518'))
cfg_gro = Config(url=_SEARCH_URL.format(keywords='groningen', listings='6.517'))
cfg_utr = Config(url=_SEARCH_URL.format(keywords='utrecht', listings='6.517'))
cfg_ein = Config(url=_SEARCH_URL.format(keywords='eindhoven', listings='6.517'))

# Helper functions

## Get content

In [None]:
def get_html(url):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Absolute URL of the page to fetch. It must start with
            ``http://`` or ``https://`` (case-insensitive).

    Returns:
        The page parsed with the ``html.parser`` backend.

    Raises:
        ValueError: If ``url`` does not start with an HTTP(S) scheme.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    # Check the scheme at the start of the URL; a substring test would also
    # accept inputs such as "example.com/http-guide".
    if not url.lower().startswith(('http://', 'https://')):
        raise ValueError('Please add protocol to url.')

    # Read the body inside the context manager so the connection is closed
    # as soon as the download finishes.
    with urllib.request.urlopen(url) as html_page:
        raw_html = html_page.read()

    soup = BeautifulSoup(raw_html, features='html.parser')
    return soup

## Remove html tags

In [None]:
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy and ``.`` does not match newlines, so a tag
    that spans several lines is left untouched.

    Args:
        text: String that may contain HTML markup.

    Returns:
        The input string without the matched tags.
    """
    return re.sub(r'<.*?>', '', text)

## Extract outgoing links

In [None]:
def extract_outgoing_links(url):
    """Collect the listing links found on a search-results page.

    Every ``<div class="newsBox">`` is inspected for an ``<a class="news">``
    anchor. The link is taken as the second double-quoted value in the
    anchor's rendered markup (presumably the ``href`` -- TODO confirm
    against the live page). Boxes without such an anchor, or with fewer
    than two quoted values, are skipped instead of aborting the whole page.

    Args:
        url: URL of the search-results page.

    Returns:
        list[str]: The extracted links, in page order.
    """
    soup = get_html(url)
    links = []
    for box in soup.find_all('div', class_='newsBox'):
        anchor = box.find('a', class_='news')
        if anchor is None:
            # Box without a listing anchor: nothing to extract.
            continue
        quoted_values = re.findall('"([^"]*)"', str(anchor))
        if len(quoted_values) < 2:
            # Anchor markup does not have the expected two quoted attributes.
            continue
        links.append(quoted_values[1])
    return links

## Url editor

In [None]:
def edit_url(url, page):
    """Return ``url`` with its ``page`` query parameter set to ``page``.

    If the URL already carries a ``page=`` parameter (introduced by ``?`` or
    ``&``), only that parameter's value is replaced. Otherwise
    ``&page=<page>`` is appended.

    Args:
        url: Search URL, with or without an existing ``page`` parameter.
        page: Page number to request; converted with ``str``.

    Returns:
        str: The URL pointing at the requested page.
    """
    # Match the actual query parameter, not any occurrence of the word
    # "page", and capture its prefix so only the value is rewritten.
    page_param = re.compile(r'([?&]page=)[^&#]*')
    if page_param.search(url):
        # A callable replacement keeps digits in ``page`` from being read as
        # a group reference.
        return page_param.sub(lambda m: m.group(1) + str(page), url, count=1)
    return url + '&page=' + str(page)

## Details Scraper

In [None]:
def get_rent_details(url):
    """Scrape the rent details of a single listing page.

    The text of the first ``<div class="detailBox">`` is flattened, split
    on runs of three spaces, and the pieces at positions 1..7 are read
    positionally. Positions 2 and 3 of that slice (the commented-out code in
    the original names them "Offered since" and "Offered by") are dropped.

    Args:
        url: URL of the listing page.

    Returns:
        list[str]: ``[rent, available_from, surface, interior, bedrooms]``
        with their labels stripped.

    Raises:
        IndexError: If the page has no ``detailBox`` or fewer fields than
            expected.
    """
    page = get_html(url)
    first_box = page.find_all('div', class_='detailBox')[0]

    text = remove_html_tags(str(first_box))
    # Flatten line breaks and tabs to spaces. The '\n2' pass never matches,
    # because newlines are already gone by then; it is kept for parity.
    for old in ('\n', '\t', '\n2'):
        text = text.replace(old, ' ')

    # Fields are separated by three-space runs; drop the empty pieces and the
    # leading piece, keeping the seven that follow.
    fields = [piece for piece in text.split('   ') if piece != ''][1:8]

    rent = fields[0].replace('Rent per month: € ', '').replace(',-', '')
    available_from = fields[1].replace('Available from: ', '')
    surface = fields[4].replace('Surface (m2): ', '').replace(' m²', '')
    interior = fields[5].replace('Interior:  ', '')
    bedrooms = fields[6].replace('Number of bedrooms ', '')

    return [rent, available_from, surface, interior, bedrooms]

## Location Scraper

In [None]:
def get_location(url, city):
    """Scrape the location text of a single listing page.

    The text of the first ``<div class="brownBox">`` is flattened, cut to
    start at the ``'Rental house'`` marker, stripped of the
    ``'Rental house: Apartment renting in <city>'`` heading, cut before the
    first ``'€'``, and whitespace-normalized.

    Args:
        url: URL of the listing page.
        city: City name as it appears in the listing heading (for example
            ``'Amsterdam'`` or ``'Den Haag'``). Any city is accepted.

    Returns:
        str: The location text with single spaces between words.

    Raises:
        IndexError: If the page has no ``brownBox``.
    """
    soup = get_html(url)
    content = soup.find_all('div', class_='brownBox')

    text = remove_html_tags(str(content[0]))
    text = text.replace('\n', ' ').replace('\t', ' ')

    # str.find returns -1 when the marker is missing. Slicing with -1 would
    # keep only the last character, so slice only on a real match.
    start = text.find('Rental house')
    if start != -1:
        text = text[start:]

    # Same heading pattern for every city, including its trailing padding.
    heading = 'Rental house: Apartment renting in {}                 '.format(city)
    text = text.replace(heading, '')

    # Same -1 guard: without a euro sign, keep the text instead of silently
    # dropping its last character.
    end = text.find('€')
    if end != -1:
        text = text[:end]

    location = " ".join(text.split())

    return location

# Scraper

## Scrape one link

In [None]:
links_ams = extract_outgoing_links(cfg_ams.url)

In [None]:
data = get_rent_details(links_ams[10])

In [None]:
data.append(get_location(links_ams[0], city='Amsterdam'))

In [None]:
data

## Automation for one page

In [None]:
def mini_scraper(url, city):
    """Scrape every listing linked from one search-results page.

    Args:
        url: URL of the search-results page.
        city: City name forwarded to ``get_location``.

    Returns:
        pandas.DataFrame: One row per successfully scraped listing, indexed
        by the listing's position on the page. Failed listings leave a gap
        in the index.
    """
    links = extract_outgoing_links(url)
    df = pd.DataFrame(columns=['Rent', 'Available from', 'Surface', 'Interior', 'Number of bedrooms',  'Location'])

    for i, link in enumerate(links):
        try:
            data = get_rent_details(link)
            data.append(get_location(link, city))
            df.loc[i] = data
        except Exception as exc:
            # Best effort: report the failing listing and keep going.
            # ``Exception`` (not a bare except) lets Ctrl-C stop a long scrape.
            print('Failed to scrape {}: {!r}'.format(link, exc))
        else:
            print('Scraping url {}/{}'.format(i+1, len(links)), end='\r')
    return df

In [None]:
# Smoke test: scrape only the first Rotterdam results page and preview it.
# NOTE: the name `df` is rebound later by the "Merge all data" cell.
df = mini_scraper(cfg_rot.url, city='Rotterdam')
df.head()

## Main Scraper

In [None]:
def main_scraper(url, city, nb_page):
    """Scrape several consecutive search-results pages for one city.

    The page at ``url`` is scraped first, followed by the pages obtained
    from ``edit_url(url, i)`` for ``i`` in ``1 .. nb_page - 1``.

    Args:
        url: URL of the first search-results page.
        city: City name forwarded to ``mini_scraper``.
        nb_page: Total number of pages to scrape. The first page is always
            scraped, even when this is less than 1.

    Returns:
        pandas.DataFrame: The per-page frames concatenated in page order.
        Index labels are per-page positions and therefore repeat across
        pages.
    """
    print('Scraping page: 1/{}'.format(nb_page))
    frames = [mini_scraper(url, city)]
    for page in range(1, nb_page):
        print('Scraping page: {}/{}'.format(page + 1, nb_page))
        # Build every page URL from the caller's URL rather than from the
        # previous page's URL, so pagination never relies on rewriting an
        # already-edited URL.
        frames.append(mini_scraper(edit_url(url, page), city))
    return pd.concat(frames)

### Amsterdam

In [None]:
# Scrape 8 result pages for Amsterdam.
df_ams = main_scraper(cfg_ams.url, city='Amsterdam', nb_page=8)

In [None]:
df_ams.shape

In [None]:
df_ams.tail()

In [None]:
# Index labels repeat across pages (main_scraper concatenates without
# resetting the index), so .loc[19, ...] presumably yields one value per
# page that has a row 19; .iloc[3] then picks the fourth of them.
test  = df_ams.loc[19, 'Location'].iloc[3]

In [None]:
# Number of whitespace-separated words in the sampled location.
len(test.split())

In [None]:
test

### Rotterdam

In [None]:
# Scrape 13 result pages for Rotterdam.
df_rot = main_scraper(cfg_rot.url, city='Rotterdam', nb_page=13)

In [None]:
df_rot.shape

In [None]:
df_rot.tail()

### Den Haag

In [None]:
# Scrape 12 result pages for Den Haag.
df_haag = main_scraper(cfg_haag.url, city='Den Haag', nb_page=12)

In [None]:
df_haag.shape

In [None]:
df_haag.head()

### Groningen

In [None]:
# Scrape 6 result pages for Groningen.
df_gro = main_scraper(cfg_gro.url, city='Groningen', nb_page=6)

In [None]:
df_gro.shape

In [None]:
df_gro.head()

### Utrecht

In [None]:
# Scrape 10 result pages for Utrecht.
df_utr = main_scraper(cfg_utr.url, city='Utrecht', nb_page=10)

In [None]:
df_utr.shape

In [None]:
df_utr.head()

### Eindhoven

In [None]:
df_ein = main_scraper(cfg_utr.url, city='Eindhoven', nb_page=9)

In [None]:
df_ein.shape

In [None]:
df_ein.head()

## Clean output

## Merge all data

In [None]:
# Stack the per-city frames into one dataset. The index is not reset, so
# row labels from the individual frames are kept and may repeat.
df_list = [df_ams, df_rot, df_haag, df_gro, df_utr, df_ein]
df = pd.concat(df_list)

In [None]:
df.shape

In [None]:
df.head()