# Rental price analysis of german cities
__Download Data__ \
__by Maximilian Hilbert__

In [21]:
import requests
import re
import os
from tqdm import tqdm
from requests.exceptions import HTTPError

__Gather links and download historical rental prices for each city contained in http://www.wohnung.com/mietpreise__

__First of all we get the links with the requests framework and save them to a list__

In [22]:
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

def get_links_mietpreise():
    url="https://www.wohnung.com/mietpreise"
    try:
        response = requests.get(url,headers=hdr)
        # If the response was successful, no Exception will be raised
        response.raise_for_status()
    except HTTPError as http_err:
        #This happens especially when links are dead, then a 404 error gets raised. Unfortunately
        #this happens in approx. 50% of all cases, so the resulting dataset contains only approx. 2400 cities
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        #Any other weird error by the server
        print(f'Other error occurred: {err}')
    else:
        #For debugging
        print('Successful connection')
    text=response.text
    #collect links that start with the domain and are continued by any subpage (this also contains links to pages e.g.
    #[^assets][^icon][^ratgeber][^vergleich] we don't like, but the following regex will take care of that)
    link_list=re.findall("https://www.wohnung.com/[\w]+", text)
    return link_list

__Save links in list__

In [23]:
links=get_links_mietpreise()

Successful connection


__For each link in the link-list above, we download the corresponding .html file__

In [24]:
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

def download_html_files(link_lst):
    #for every link in the link-list
    for i in tqdm(range(len(link_lst))):
        url=link_lst[i]
        try:
            #get a response by the url
            response=requests.get(url,headers=hdr)
            text=response.text
            filename=url.replace("https://www.wohnung.com/", "")
            #save the text received by the server in a .html file, named like the subdomain, what is conveniently
            #the cityname
            with open("html_files_wohnung_com\\"+str(filename)+".html", "w", encoding="utf-8") as file:
                file.write(text)
                file.close()
                response.raise_for_status()
        except HTTPError as http_err:
            #work through exceptions like dead links (404) and delete thos .html files (because they contain an
            # answer by the server only containing "404")
            print(f'HTTP error occurred: {http_err}')
            os.remove("html_files_wohnung_com\\"+filename+".html")
    return "Done."

__Download files in subdirectory shown above__

In [25]:
download_html_files(links)

  0%|          | 2/5267 [00:00<14:00,  6.26it/s]

HTTP error occurred: 403 Client Error: Forbidden for url: https://www.wohnung.com/assets/
HTTP error occurred: 403 Client Error: Forbidden for url: https://www.wohnung.com/assets/


  0%|          | 3/5267 [00:00<15:06,  5.81it/s]

HTTP error occurred: 404 Client Error: Not Found for url: https://www.wohnung.com/icon


  0%|          | 14/5267 [00:03<19:06,  4.58it/s]

HTTP error occurred: 404 Client Error: Not Found for url: https://www.wohnung.com/vergleich


  0%|          | 23/5267 [00:05<20:29,  4.27it/s]

HTTP error occurred: 404 Client Error: Not Found for url: https://www.wohnung.com/alle


  1%|          | 35/5267 [00:12<32:27,  2.69it/s]

HTTP error occurred: 404 Client Error: Not Found for url: https://www.wohnung.com/alle
HTTP error occurred: 404 Client Error: Not Found for url: https://www.wohnung.com/alle


  1%|          | 42/5267 [00:16<34:26,  2.53it/s]


KeyboardInterrupt: 

__List those files for later use__

In [None]:
files=os.listdir("html_files_wohnung_com/")

__Download cities for each state by hand and save them to .txt files from http://www.wohnung.com/alle-staedte in subdirectoy 'cities_by_state'__