## Introduction

The aim of this notebook is to scrape together data from a real estate selling site, www.kv.ee

## Initial setup

In [1]:
!pip install requests
!pip install beautifulsoup4
!pip install html5lib



In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import html5lib

In [3]:
cookies = {
    'saved_searches': 'ae1b0536cef346eebdded88a3f6632b4',
    'OptanonConsent': 'isIABGlobal=false&datestamp=Sun+Nov+28+2021+14%3A16%3A13+GMT%2B0200+(Eastern+European+Standard+Time)&version=6.16.0&hosts=&landingPath=NotLandingPage&groups=C0003%3A0%2CC0001%3A1%2CC0004%3A0%2CC0002%3A0%2CSTACK42%3A0&geolocation=EE%3B79&AwaitingReconsent=false',
    'OptanonAlertBoxClosed': '2021-09-29T11:27:21.911Z',
    'eupubconsent-v2': 'CPNSwfVPNSwfVAcABBENB3CgAAAAAH_AAChQIJtf_X__b3_j-_59f_t0eY1P9_7_v-0zjhfdt-8N2f_X_L8X42M7vF36pq4KuR4Eu3LBIQdlHOHcTUmw6okVrzPsbk2Mr7NKJ7PEmnMbO2dYGH9_n93TuZKY7__8___z_v-v_v____f_r-3_3__5_X---_e_V399zLv9____39nN___9ggmASYal5AF2JY4Mm0aVQogRhWEh0AoAKKAYWiKwgZXBTsrgI9QQsAEJqAjAiBBiCjBgEAAgEASERASAHggEQBEAgABACpAQgAI2AQWAFgYBAAKAaFiBFAEIEhBkcFRymBARItFBPZWAJQd7GmEIZZYAUCj-iowEShBAsDISFg5jgCQEuAAA.YAAAD_gAAAAA',
    'gallery_view': '0',
    'kv_web': '22a46a03b5604519524e8f81a28da69d',
    'page_size': '50',
    'LVO': '3332078%7C2151270%7C3331457',
    'lang': 'en',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    # 'Referer': 'https://www.kv.ee/?act=search.simple&last_deal_type=3&deal_type=3&dt_select=3&search_type=old',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Cache-Control': 'max-age=0',
    'TE': 'trailers',
}

In [4]:
def get_listing_urls(base_url, n_pages, cookies, headers, existing_links=[], verbose=False):
    # Converted using https://curlconverter.com/.

    links = []

    for i in range(1, n_pages + 1):
        response = requests.get(base_url + str(i), headers=headers, cookies=cookies)
        # print(response)
        if response.status_code != 200:
            print(f"Bad request, status code {response.status_code}.")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        advertisement_links = soup.findAll('a', attrs={"class":"object-title-a text-truncate"})
        # print(advertisement_links)
        for content in advertisement_links:
            link = content["href"]
            # print(link)

            if not link in existing_links:
                links.append(link)
                if verbose:
                    print(f"Link added! {link}")
            else:
                if verbose:
                    print(f"Link already exists! {link}")

        print(f"Links so far: {len(links)}.")

    return links

### Houses

In [5]:
# We do not want to parse already existing listings.
try:
    existing_links_houses = pd.read_csv("dataset_houses.csv").Link.values
except Exception as e:
    print(e)
    existing_links_houses = []
base_url_houses = "https://www.kv.ee/?act=search.simple&last_deal_type=3&orderby=cdwl&page_size=100&deal_type=3&dt_select=3&search_type=old&page="
house_links = get_listing_urls(base_url_houses, 25, cookies, headers, existing_links_houses)

[Errno 2] No such file or directory: 'dataset_houses.csv'
Links so far: 100.
Links so far: 200.
Links so far: 300.
Links so far: 400.
Links so far: 500.
Links so far: 600.
Links so far: 700.
Links so far: 800.
Links so far: 900.
Links so far: 1000.
Links so far: 1100.
Links so far: 1200.
Links so far: 1300.
Links so far: 1400.
Links so far: 1500.
Links so far: 1600.
Links so far: 1700.
Links so far: 1800.
Links so far: 1900.
Links so far: 2000.
Links so far: 2100.
Links so far: 2200.
Links so far: 2300.
Links so far: 2349.
Links so far: 2349.


### Apartments

In [6]:
try:
    existing_links_apartments = pd.read_csv("dataset_apartments.csv").Link.values
except Exception as e:
    print(e)
    existing_links_apartments = []
base_url_apartments = "https://www.kv.ee/?act=search.simple&last_deal_type=1&orderby=cdwl&page_size=100&deal_type=1&search_type=old&page="
apartment_links = get_listing_urls(base_url_apartments, 52, cookies, headers, existing_links_apartments)

[Errno 2] No such file or directory: 'dataset_apartments.csv'
Links so far: 100.
Links so far: 200.
Links so far: 300.
Links so far: 400.
Links so far: 500.
Links so far: 600.
Links so far: 700.
Links so far: 800.
Links so far: 900.
Links so far: 1000.
Links so far: 1100.
Links so far: 1200.
Links so far: 1300.
Links so far: 1400.
Links so far: 1500.
Links so far: 1600.
Links so far: 1700.
Links so far: 1800.
Links so far: 1900.
Links so far: 2000.
Links so far: 2100.
Links so far: 2200.
Links so far: 2300.
Links so far: 2400.
Links so far: 2500.
Links so far: 2600.
Links so far: 2700.
Links so far: 2800.
Links so far: 2900.
Links so far: 3000.
Links so far: 3100.
Links so far: 3200.
Links so far: 3300.
Links so far: 3400.
Links so far: 3500.
Links so far: 3600.
Links so far: 3700.
Links so far: 3800.
Links so far: 3900.
Links so far: 4000.
Links so far: 4100.
Links so far: 4200.
Links so far: 4300.
Links so far: 4400.
Links so far: 4500.
Links so far: 4600.
Links so far: 4700.
Links s

### Sharehouses

In [7]:
try:
    existing_links_sharehouses = pd.read_csv("dataset_sharehouses.csv").Link.values
except Exception as e:
    print(e)
    existing_links_sharehouses = []
base_url_sharehouses = "https://www.kv.ee/?act=search.simple&last_deal_type=11&page_size=100&deal_type=11&dt_select=11&search_type=old&page="
sharehouse_links = get_listing_urls(base_url_sharehouses, 6, cookies, headers, existing_links_sharehouses)

[Errno 2] No such file or directory: 'dataset_sharehouses.csv'
Links so far: 100.
Links so far: 200.
Links so far: 300.
Links so far: 400.
Links so far: 500.
Links so far: 501.


In [8]:
def extract_fields(text):
    results = {}

    try:
        split_text = text.split("<br/>")

        # print(split_text)
        results["Description"] = split_text[0][3:]

        for i in range(2, len(split_text) - 1):
            field = split_text[i].split("</strong> ")
            # print(field)
            results[field[0][8:len(field[0]) - 1]] = field[1].strip()
    finally:
        return results

In [25]:
def parse_listings(links, cookies, headers, save=True, filename="dataset_.csv", verbose=False):
    data = pd.DataFrame(columns=["Title", "Description", "Link", "Location", "Price"]) #, "Rooms", "Bedrooms", "Üldpind", "Korruseid", "Ehitusaasta", "Omandivorm", "Krundi pind", "Energiamärgis"])

    try:
        for i, link in enumerate(links):
            # print(link)   
            response = requests.get(link, headers=headers, cookies=cookies)

            soup = BeautifulSoup(response.text, 'html.parser')

            # Get the short description (title).
            title = soup.title.string.split(" - Kinnisvaraportaal KV.EE ")[0]

            df = pd.read_html(response.text, attrs={"class":"table-lined object-data-meta"}, flavor="bs4", skiprows=[9, 10], index_col=0)[0]
            # Rename the column to the index of the advertisement.
            df.rename(columns={df.columns.values[0]: i}, inplace=True)
            # Add the short description
            df.loc["Title"] = title
            df.loc["Link"] = link
            if verbose:
                print(title, link)
            
            # Since we cannot be sure these fields always exists, use error handling.
            try:
                # Add the additional fields.
                fields = soup.find("div", attrs={"class":"object-article-body"})
                fields = fields.find("p")
                # fields = fields.findAll("strong")
                # for field in fields:
                #     print(field)

                extracted_fields = extract_fields(str(fields))
                for field in extracted_fields:
                    # print(field)
                    df.loc[field] = extracted_fields[field]

                # Add the location info.
                location = soup.find("a", attrs={"class":"gtm-object-map"})
                df.loc["Location"] = location["href"].split("query=")[1]

                # Add the price info.
                price = soup.find("input", attrs={"name":"price"})
                df.loc["Price"] = int(price["value"])

            except Exception as e:
                print(e)

            data = data.append(df.T)

            if (i + 1) % 50 == 0:
                print(f"Parsed so far: {i + 1}")
                # break
            
            # break
    # Just print out the exceptions in order not to lose all progress.
    except Exception as e:
        print(e)
    finally:
        if save:
            data.to_csv(filename, index=False)
        return data

In [26]:
parsed_houses = parse_listings(house_links, cookies, headers, filename="dataset_houses.csv", verbose=False)

'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
Parsed so far: 50
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
Parsed so far: 100
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
Parsed so far: 150
Parsed so far: 200
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
Parsed so far: 250
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
Parsed so far: 300
Parsed so far: 350
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
Parsed so far: 400
'NoneType' object is not subscriptable
'NoneType' object is not subsc

In [27]:
parsed_apartments = parse_listings(apartment_links, cookies, headers, filename="dataset_apartments.csv")

Parsed so far: 50
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
Parsed so far: 100
Parsed so far: 150
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
Parsed so far: 200
Parsed so far: 250
'NoneType' object is not subscriptable
Parsed so far: 300
'NoneType' object is not subscriptable
Parsed so far: 350
'NoneType' object is not subscriptable
Parsed so far: 400
Parsed so far: 450
Parsed so far: 500
'NoneType' object is not subscriptable
Parsed so far: 550
'NoneType' object is not subscriptable
Parsed so far: 600
Parsed so far: 650
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
Parsed so far: 700
Parsed so far: 750
Parsed so far: 800
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
Parsed so far: 850
Parsed so far: 900
Parsed so far: 950
'NoneType' objec

In [28]:
parsed_sharehouses = parse_listings(sharehouse_links, cookies, headers, filename="dataset_sharehouses.csv")

'NoneType' object is not subscriptable
Parsed so far: 50
'NoneType' object is not subscriptable
Parsed so far: 100
'NoneType' object is not subscriptable
Parsed so far: 150
'NoneType' object is not subscriptable
Parsed so far: 200
'NoneType' object is not subscriptable
Parsed so far: 250
Parsed so far: 300
'NoneType' object is not subscriptable
Parsed so far: 350
Parsed so far: 400
'NoneType' object is not subscriptable
Parsed so far: 450
'NoneType' object is not subscriptable
Parsed so far: 500


In [29]:
print(parsed_houses.columns)
display(parsed_houses)

Index(['Title', 'Description', 'Link', 'Location', 'Price', 'Rooms',
       'Total area', 'Number of floors', 'Condition', 'Readiness',
       'Ground area', 'Cadastre no.', 'Energy mark', 'Lisainfo',
       'Neighbourhood', 'Bedrooms', 'Built in year', 'Ownership',
       'Notify about incorrect advertisement', 'Kitchen',
       'Sanitary arrangements', 'Heating and ventilation',
       'Communications and security', 'Data from realestate book',
       'Register number', 'Additional information',
       'This floor/Number of floors', 'Kulud suvel/talvel'],
      dtype='object')


Unnamed: 0,Title,Description,Link,Location,Price,Rooms,Total area,Number of floors,Condition,Readiness,...,Notify about incorrect advertisement,Kitchen,Sanitary arrangements,Heating and ventilation,Communications and security,Data from realestate book,Register number,Additional information,This floor/Number of floors,Kulud suvel/talvel
0,"House for sale, Parkali 42, Paide linn, Paide,...",Total area 81.5 m²,https://www.kv.ee/muua-paide-linnas-valguskull...,"58.8837293,25.5777054",69000,5,81.5 m²,2,needs renovating,ready,...,,,,,,,,,,
1,"House for sale, 3 bedrooms, Rookse, Kastre val...","Private ownership, wooden house, total area 16...",https://www.kv.ee/kinnisturebase-kinnistu-suur...,"58.240051969621,27.017403252405",240000,4,160.3 m²,1,all brand-new,,...,Notify about incorrect advertisement,"ceramic shove, refridgerator","bath, water boiler, sauna, shower, local water",geothermic heating,"steel door, video cameras, fenced with garden",,,,,
2,"House for sale, 4 bedrooms, Käänu tn, Haaberst...","Private ownership, stone house, total area 267...",https://www.kv.ee/muua-vaga-heas-seisukorras-k...,"59.4394199,24.5724249",550000,6,267.3 m²,2,Good condition,ready,...,Notify about incorrect advertisement,"ceramic shove, refridgerator, kitchen furniture","toilet room and bathroom separate, bath, showe...","conditioner, geothermic heating",fenced with garden,Data from realestate book,,,,
3,"House for sale, 3 bedrooms, Välja tee 6, Suuru...","Private ownership, house area 122 m², total ar...",https://www.kv.ee/hubane-maja-tallinna-lahedal...,"59.4641212,24.3978224",280000,4,122 m²,2,Good condition,ready,...,Notify about incorrect advertisement,,,,,Data from realestate book,8606802,,,
4,"House for sale, 1 bedrooms, Silmakese, Aakre, ...","Private ownership, log house, total area 81 m²",https://www.kv.ee/muua-audentsete-taluhooneteg...,"58.101587177111,26.187740990491",95000,3,81 m²,1,ready,,...,Notify about incorrect advertisement,,"sauna, water boiler, bath","stove heating, õhksoojuspump, fireplace",,Data from realestate book,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2344,"House for sale, Oa, Voorepera, Lüganuse vald,...","Private ownership, panel house, total area 95....",https://www.kv.ee/maja-vajab-renoveerimist-pak...,"59.406675,27.127107",27900,3,95.8 m²,1,needs renovating,,...,Notify about incorrect advertisement,,local water,stove heating,,,,"ventilation, water, roof tin roof",,
2345,"House for sale, Viljandi mnt 54, Rapla linn, R...","Private ownership, panel house, total area 181...",https://www.kv.ee/muua-1979-aastal-valminud-ka...,"58.999085,24.805968",158000,4,181.7 m²,2,ready,,...,Notify about incorrect advertisement,"electric stove, kitchen furniture","water boiler, toilet room and bathroom separat...",stove heating,,,,,,
2346,"House for sale, Mureli, Suure-Ahli, Haapsalu,...","Private ownership, panel house, total area 145...",https://www.kv.ee/kaunilt-haljastatud-krundil-...,"58.867395,23.527873",195000,4,145.8 m²,3,ready,,...,,electric stove,shower,solid fuel,,,1538032,,,
2347,"Farmhouse for sale, Kundihüti, Luhametsa, Ants...","Private ownership, log house, total area 73 m²",https://www.kv.ee/vana-talukoht-vorumaa-kuplit...,"57.767080520473,26.71936923858",25000,,73 m²,2,needs renovating,roofed box,...,Notify about incorrect advertisement,,,,,,2235341,,,


In [30]:
test_data = pd.read_csv("dataset_houses.csv")
test_data

Unnamed: 0,Title,Description,Link,Location,Price,Rooms,Total area,Number of floors,Condition,Readiness,...,Notify about incorrect advertisement,Kitchen,Sanitary arrangements,Heating and ventilation,Communications and security,Data from realestate book,Register number,Additional information,This floor/Number of floors,Kulud suvel/talvel
0,"House for sale, Parkali 42, Paide linn, Paide,...",Total area 81.5 m²,https://www.kv.ee/muua-paide-linnas-valguskull...,"58.8837293,25.5777054",69000.0,5.0,81.5 m²,2.0,needs renovating,ready,...,,,,,,,,,,
1,"House for sale, 3 bedrooms, Rookse, Kastre val...","Private ownership, wooden house, total area 16...",https://www.kv.ee/kinnisturebase-kinnistu-suur...,"58.240051969621,27.017403252405",240000.0,4.0,160.3 m²,1.0,all brand-new,,...,Notify about incorrect advertisement,"ceramic shove, refridgerator","bath, water boiler, sauna, shower, local water",geothermic heating,"steel door, video cameras, fenced with garden",,,,,
2,"House for sale, 4 bedrooms, Käänu tn, Haaberst...","Private ownership, stone house, total area 267...",https://www.kv.ee/muua-vaga-heas-seisukorras-k...,"59.4394199,24.5724249",550000.0,6.0,267.3 m²,2.0,Good condition,ready,...,Notify about incorrect advertisement,"ceramic shove, refridgerator, kitchen furniture","toilet room and bathroom separate, bath, showe...","conditioner, geothermic heating",fenced with garden,Data from realestate book,,,,
3,"House for sale, 3 bedrooms, Välja tee 6, Suuru...","Private ownership, house area 122 m², total ar...",https://www.kv.ee/hubane-maja-tallinna-lahedal...,"59.4641212,24.3978224",280000.0,4.0,122 m²,2.0,Good condition,ready,...,Notify about incorrect advertisement,,,,,Data from realestate book,8606802.0,,,
4,"House for sale, 1 bedrooms, Silmakese, Aakre, ...","Private ownership, log house, total area 81 m²",https://www.kv.ee/muua-audentsete-taluhooneteg...,"58.101587177111,26.187740990491",95000.0,3.0,81 m²,1.0,ready,,...,Notify about incorrect advertisement,,"sauna, water boiler, bath","stove heating, õhksoojuspump, fireplace",,Data from realestate book,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2344,"House for sale, Oa, Voorepera, Lüganuse vald,...","Private ownership, panel house, total area 95....",https://www.kv.ee/maja-vajab-renoveerimist-pak...,"59.406675,27.127107",27900.0,3.0,95.8 m²,1.0,needs renovating,,...,Notify about incorrect advertisement,,local water,stove heating,,,,"ventilation, water, roof tin roof",,
2345,"House for sale, Viljandi mnt 54, Rapla linn, R...","Private ownership, panel house, total area 181...",https://www.kv.ee/muua-1979-aastal-valminud-ka...,"58.999085,24.805968",158000.0,4.0,181.7 m²,2.0,ready,,...,Notify about incorrect advertisement,"electric stove, kitchen furniture","water boiler, toilet room and bathroom separat...",stove heating,,,,,,
2346,"House for sale, Mureli, Suure-Ahli, Haapsalu,...","Private ownership, panel house, total area 145...",https://www.kv.ee/kaunilt-haljastatud-krundil-...,"58.867395,23.527873",195000.0,4.0,145.8 m²,3.0,ready,,...,,electric stove,shower,solid fuel,,,1538032.0,,,
2347,"Farmhouse for sale, Kundihüti, Luhametsa, Ants...","Private ownership, log house, total area 73 m²",https://www.kv.ee/vana-talukoht-vorumaa-kuplit...,"57.767080520473,26.71936923858",25000.0,,73 m²,2.0,needs renovating,roofed box,...,Notify about incorrect advertisement,,,,,,2235341.0,,,
