# Scraping the bottom of the barrel

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import time
import urllib
import re
import os

from providers import ParariusProvider
from utils import Entry
import webbrowser

import pickle

In [3]:
# Search locations keyed by a human-readable label.
# The Funda value is an absolute URL; the Pararius values are site-relative
# search paths.
urls = {
  "Funda": "https://www.funda.nl/zoeken/huur?selected_area=%5B%22den-haag%22%5D&price=%22-1750%22",
  "Pararius": "/apartments/den-haag/0-1750/2-bedrooms",
}
# One Pararius search per municipality; only the city slug differs in the path.
urls.update(
  (f"Pararius {city}", f"/apartments/{slug}/0-1750/2-bedrooms")
  for city, slug in (
    ("Den Haag", "den-haag"),
    ("Leidschendam", "leidschendam"),
    ("Rijswijk", "rijswijk"),
    ("Voorburg", "voorburg"),
  )
)

# "Too good to be true" cut-off on price per area: listings at or below this
# value are treated as probably fake.
tgtbt_area_price_threshold = 13
# "Tiny room" cut-off on area per room: listings at or below this value have
# rooms that are just too small.
tiny_room_threshold = 15

In [4]:
# One provider per municipality. The first argument is the search path looked
# up in `urls`; the second is a short name for the provider
# (NOTE(review): how ParariusProvider uses that name is not visible here).
pps = [
  ParariusProvider(urls[f'Pararius {city}'], name)
  for city, name in (
    ('Den Haag', 'DenHaag'),
    ('Leidschendam', 'LeidschenDam'),
    ('Rijswijk', 'Rijswijk'),
    ('Voorburg', 'Voorburg'),
  )
]

# Run both scraping steps for each provider, in order; the collected listings
# are read from `provider.entries` afterwards.
for provider in pps:
  provider.query_entries()
  provider.get_detailed_results()

100%|██████████| 3/3 [00:01<00:00,  2.75it/s]
100%|██████████| 84/84 [00:14<00:00,  5.85it/s]
100%|██████████| 3/3 [00:01<00:00,  2.83it/s]
100%|██████████| 8/8 [00:01<00:00,  4.43it/s]
100%|██████████| 5/5 [00:01<00:00,  3.44it/s]


In [5]:
# Flatten the per-provider results into one list, keeping provider order.
entries = [entry for provider in pps for entry in provider.entries]

# Total number of scraped listings across all providers.
len(entries)

100

In [6]:
# One row per scraped listing.
df = pd.DataFrame(entries)

# Derived ratios used by the eligibility filters further down.
df = df.assign(**{
  'Price per area': df['price'].div(df['area']),
  'Price per room': df['price'].div(df['rooms']),
  'Area per room': df['area'].div(df['rooms']),
})

# Row count before any filtering, reported next to the eligible count later.
initial_rows = len(df)

df.head()

Unnamed: 0,title,link,location,price,area,rooms,furnished,description,address,rented,Price per area,Price per room,Area per room
0,Flat Joan Maetsuyckerstraat,https://www.pararius.com/apartment-for-rent/de...,2593 ZP Den Haag (Bezuidenhout-Oost),1375,82,3,Furnished,Description\nFully furnished apartment in Bezu...,"{'street': 'Joan Maetsuyckerstraat', 'city': '...",,16.768293,458.333333,27.333333
1,Flat Korte Houtstraat,https://www.pararius.com/apartment-for-rent/de...,2511 DA Den Haag (Uilebomen),1395,66,3,Part-furnished,"Description\nKORTE HOUTSTRAAT, CENTRE, THE HAG...","{'street': 'Korte Houtstraat', 'city': 'Den Ha...",Rented under option,21.136364,465.0,22.0
2,Flat Korte Houtstraat,https://www.pararius.com/apartment-for-rent/de...,2511 DA Den Haag (Uilebomen),1395,67,3,Part-furnished,"Description\nKORTE HOUTSTRAAT, CENTRE, THE HAG...","{'street': 'Korte Houtstraat', 'city': 'Den Ha...",,20.820896,465.0,22.333333
3,Flat Gevers Deynootweg,https://www.pararius.com/apartment-for-rent/de...,2586 HJ Den Haag (Belgisch Park),1700,90,4,Part-furnished,Description\nWe offer a very bright and fully ...,"{'street': 'Gevers Deynootweg', 'city': 'Den H...",,18.888889,425.0,22.5
4,Flat Schoolstraat,https://www.pararius.com/apartment-for-rent/de...,2511 AX Den Haag (Kortenbos),1700,90,4,Part-furnished,Description\nThis upholstered three bedroom ap...,"{'street': 'Schoolstraat', 'city': 'Den Haag',...",,18.888889,425.0,22.5


In [7]:
# Keep only listings that pass every eligibility check. All masks are computed
# on the same frame and combined, so the result does not depend on filter order.
eligible = (
  # Price per area above the "too good to be true" cut-off.
  (df['Price per area'] > tgtbt_area_price_threshold)
  # Area per room above the "tiny room" cut-off.
  & (df['Area per room'] > tiny_room_threshold)
  # Description does not mention "permit" (case-insensitive).
  # NOTE(review): presumably targets housing-permit requirements - confirm.
  # na=False makes a missing description count as "no match"; without it the
  # mask contains NaN and the `~` negation raises instead of filtering.
  & ~df["description"].str.contains('permit', case=False, na=False)
  # No "rented" status recorded (e.g. not "Rented under option").
  & df["rented"].isnull()
)
df = df[eligible]

print(f"{df.shape[0]}/{initial_rows} rows eligible")

35/100 rows eligible


In [8]:
# File remembering every listing link reported by earlier runs. It holds the
# str() of a Python list of links, e.g. "['https://a', 'https://b']".
output_file_name = "output.dat"

# Links already seen; stays empty on the very first run (no file yet).
array = []

if os.path.isfile(output_file_name):
  with open(output_file_name, "r") as f:
    # Drop the surrounding brackets and the quotes, then split on the list
    # separator. Empty fragments (empty file or "[]") are discarded so that no
    # bogus "" entry ends up in the stored list.
    stored = f.read().strip()[1:-1].replace("'", "").strip()
    array = [link.strip() for link in stored.split(', ') if link.strip()]

# Computed unconditionally: on a first run every eligible listing is new.
# (Previously df2 was only assigned when the file existed, so a first run
# failed with NameError after already truncating the output file.)
df2 = df[~df["link"].isin(array)]
print(f"{df2.shape[0]} new Properties!")

# Build the full list before opening the file for writing, so a failure above
# can never leave a truncated file behind.
array += list(df2["link"])
with open(output_file_name, "w") as f:
  f.write(str(array))

df2.head()

11 new Properties!


Unnamed: 0,title,link,location,price,area,rooms,furnished,description,address,rented,Price per area,Price per room,Area per room
4,Flat Schoolstraat,https://www.pararius.com/apartment-for-rent/de...,2511 AX Den Haag (Kortenbos),1700,90,4,Part-furnished,Description\nThis upholstered three bedroom ap...,"{'street': 'Schoolstraat', 'city': 'Den Haag',...",,18.888889,425.0,22.5
33,Flat Spaarnestraat 29,https://www.pararius.com/apartment-for-rent/de...,2515 VL Den Haag (Rivierenbuurt-Zuid),1400,65,3,Part-furnished,Description\nBeautiful part-furnished apartmen...,"{'street': 'Spaarnestraat 29', 'city': 'Den Ha...",,21.538462,466.666667,21.666667
41,Flat Van Musschenbroekstraat 81,https://www.pararius.com/apartment-for-rent/de...,2522 AJ Den Haag (Laakkwartier-Oost),1450,66,4,,Description\nBeautiful apartment for rent in D...,"{'street': 'Van Musschenbroekstraat 81', 'city...",,21.969697,362.5,16.5
56,Flat Til Brugmanplantsoen,https://www.pararius.com/apartment-for-rent/de...,2525 ZZ Den Haag (Groente- en Fruitmarkt),1175,84,3,,Description\nYou can make a request for a view...,"{'street': 'Til Brugmanplantsoen', 'city': 'De...",,13.988095,391.666667,28.0
58,Flat Nicolaïstraat 80 A,https://www.pararius.com/apartment-for-rent/de...,2517 TD Den Haag (Stadhoudersplantsoen),1395,70,3,Furnished,"Description\nA charming, spacious and well mai...","{'street': 'Nicolaïstraat 80 A', 'city': 'Den ...",,19.928571,465.0,23.333333


In [9]:
# Open every newly found listing in its own browser tab.
links = df2["link"].tolist()
for url in links:
  webbrowser.open_new_tab(url)