In [15]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import logging
import time
import requests
from bs4 import BeautifulSoup
import time
import random
import re

In [17]:
# Logging setup.
# os.name is 'posix' on UNIX-like systems; only there are ANSI colour codes
# added to the record layout.
if os.name == 'posix':
    # coloring on linux
    cyellow = '\033[93m'
    cblue = '\033[94m'
    coff = '\033[0m'
    log_format = (
        '[' + cblue + '%(asctime)s' + coff
        + '|' + cblue + '%(filename)-18s' + coff
        + '|' + cyellow + '%(levelname)-8s' + coff
        + ']: %(message)s'
    )
else:
    # else without color
    # (no trailing comma here: a comma would turn the format into a 1-tuple)
    log_format = '[%(asctime)s|%(filename)-18s|%(levelname)-8s]: %(message)s'

# Configure logging on every platform. This used to sit inside the ``else``
# branch, so on posix systems logging was never configured and ``__log__``
# was never defined. The layout is stored in ``log_format`` rather than
# ``format`` so the builtin ``format`` is not shadowed.
logging.basicConfig(
    filename='log.txt',
    format=log_format,
    filemode='w',  # start with a fresh log file on every run
    datefmt='%Y/%m/%d %H:%M:%S',
    level=logging.DEBUG)
__log__ = logging.getLogger(__name__)

 # Helper functions

In [18]:
def process_string(string):
    """Squash a text snippet onto one whitespace-normalised line.

    Line feeds are deleted outright (they are not turned into blanks), then
    every run of whitespace is reduced to a single blank and the result
    carries no leading or trailing whitespace.
    """
    without_newlines = string.replace("\n", "")
    # split() without arguments already discards leading/trailing whitespace
    # and treats any whitespace run as one separator.
    return " ".join(without_newlines.split())

def extract_urls(string):
    """Find all http/https/ftp URLs inside ``string``.

    Returns a list with one ``(scheme, host, path)`` tuple per URL found;
    ``path`` is an empty string when the URL has nothing after the host.
    """
    url_pattern = re.compile(
        r'(http|ftp|https):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?'
    )
    return url_pattern.findall(string)

def change_url_after_last_slash(url, new_end):
    """Replace the part of ``url`` from its last "/" onwards with ``new_end``.

    ``new_end`` needs to contain a starting "/", because the last "/" of
    ``url`` is removed together with everything that follows it.

    If ``url`` contains no "/" at all there is nothing to replace, so
    ``new_end`` is appended to the unchanged ``url``. (Slicing with the -1
    that ``rfind`` returns would silently drop the last character instead.)
    """
    head, slash, _tail = url.rpartition("/")
    if not slash:
        return url + new_end
    return head + new_end

def count_announcements(soup):
    """Return the number of ``h3`` elements that ``soup.find_all`` reports."""
    headings = soup.find_all("h3")
    return len(headings)

def get_announcements_from_page(soup, base_url=None):
    """Collect ``[title, link]`` pairs for the announcements on one result page.

    Args:
        soup: Parsed result page. Only ``find_all("h3")`` is called on it;
            each heading must offer ``.text`` and ``.find("a")``.
        base_url: URL of the result listing. The ``href`` of each announcement
            replaces everything after the last "/" of this URL. When omitted,
            the function falls back to a notebook-level variable named
            ``url`` (the old behaviour) and raises ``NameError`` if no such
            variable exists.

    Returns:
        A list of ``[title, link]`` lists. Headings without a link are skipped.
    """
    if base_url is None:
        # Legacy fallback: this function used to read the global ``url``.
        base_url = url  # noqa: F821

    flat_list = []
    headings = soup.find_all("h3")
    # The first and the last h3 objects are no announcements
    for heading in headings[1:-1]:
        anchor = heading.find("a")
        relative_link = anchor.get("href") if anchor is not None else None
        if not relative_link:
            # A heading without a usable link cannot be followed later on.
            continue
        title = process_string(heading.text)
        link = change_url_after_last_slash(base_url, relative_link)
        flat_list.append([title, link])
    return flat_list


def iterate_through_pages(url, max_no = 1000):
    """Walk the numbered result pages of a listing and gather all announcements.

    The page URL is built by replacing the second-to-last dot-separated part
    of ``url`` with the page number and ending it in ".html".

    Args:
        url: URL of one result page of the listing.
        max_no: Upper bound for the number of pages that are requested.

    Returns:
        A list of ``[title, link]`` lists from all pages read. Scanning stops
        early at the first page without announcements, or at the first error
        (which is printed); whatever was collected so far is returned.
    """
    flat_list = []
    # Everything in front of the page number stays the same for every page.
    url_prefix = ".".join(url.split(".")[:-2])
    for page_no in range(max_no):
        try:
            new_url = url_prefix + "." + str(page_no) + ".html"
            resp = get_request(new_url)
            soup = BeautifulSoup(resp.content, 'lxml')
            page_flats = get_announcements_from_page(soup, base_url=url)
        except Exception as error:
            # Best effort: keep what was collected, but say what went wrong.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print(f"{page_no} pages successfully scanned. Script ended because of error: {error!r}")
            return flat_list
        if not page_flats:
            # A page without announcements is treated as the end of the
            # listing, so the remaining page numbers are not requested.
            break
        flat_list += page_flats
        # Random pause between requests to avoid hammering the server.
        time.sleep(random.uniform(0, 2))
    return flat_list

def extract_dates(string):
    """Return every ``DD.MM.YYYY``-shaped date in ``string``, in order.

    The separators are matched as literal dots. An unescaped "." would match
    any character, so a plain run of ten digits would be reported as a date.
    The digits are not checked for being a valid calendar date.
    """
    return re.findall(r'\d{2}\.\d{2}\.\d{4}', string)

def get_request(url, timeout=30):
    """Send a GET request to ``url`` with a desktop Chrome User-Agent header.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        The ``requests`` response object; the HTTP status is not checked.
    """
    # http://www.useragentstring.com/pages/useragentstring.php?name=Chrome
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get is ``params``, which would put the User-Agent into the
    # query string instead of the request headers.
    return requests.get(url, headers=headers, timeout=timeout)


# Flat class

In [None]:
class Flat:
    """Plain record holding the scraped details of one flat listing."""

    def __init__(self, title, url, address, size, price, room_no, date_of_availability):
        # Headline of the listing and the page it belongs to.
        self.title, self.url = title, url
        self.address = address
        # Key facts, stored exactly as they were passed in.
        self.size, self.price, self.room_no = size, price, room_no
        self.date_of_availability = date_of_availability

    def __str__(self):
        """Render all fields as an indented, multi-line text block."""
        labelled_values = (
            ("Title", self.title),
            ("URL", self.url),
            ("Address", self.address),
            ("Size", self.size),
            ("Price", self.price),
            ("Number of rooms", self.room_no),
            ("Date of availability", self.date_of_availability),
        )
        indent = " " * 8
        rows = "".join(f"{indent}{label}: {value}\n" for label, value in labelled_values)
        # Starts with a line break and ends with one indented, empty line.
        return "\n" + rows + indent

# Individual class for each platform

In [43]:
class Flat_portal:
    """Base class for one flat-listing website.

    Creating an instance immediately scrapes the portal: the result pages are
    walked with ``iterate_through_pages`` and every announcement page found is
    downloaded and handed to ``extract_flat_data``, which subclasses provide.

    Attributes:
        url: The listing URL the portal was created with.
        flat_pages: ``[title, link]`` pairs returned by ``iterate_through_pages``.
        flats: One ``extract_flat_data`` result per entry of ``flat_pages``.
    """

    def __init__(self, url, max_no = 1000):
        """Scrape the listing at ``url``, reading at most ``max_no`` result pages.

        ``max_no`` defaults to 1000, the default of ``iterate_through_pages``.
        """
        self.url = url
        self.flat_pages = iterate_through_pages(url, max_no = max_no)
        self.flats = []
        for title, link in self.flat_pages:
            flat_page = get_request(link)
            flat_soup = BeautifulSoup(flat_page.content, 'lxml')
            self.flats.append(self.extract_flat_data(flat_soup, title, link))

    def extract_flat_data(self, flat_soup, title, url):
        """Turn one parsed announcement page into a flat record.

        Subclasses must override this; the base class has no portal-specific
        parsing and raises ``NotImplementedError``.
        """
        raise NotImplementedError(
            f"{type(self).__name__} does not implement extract_flat_data()"
        )

    def __str__(self):
        # Uses self.url; a bare ``url`` is not defined inside this method.
        return f'''
        Url: {self.url}
        '''

In [39]:
class WG_gesucht(Flat_portal):
    """Portal class that reads flat details from wg-gesucht.de announcement pages.

    Every field is extracted best-effort: when the expected markup is missing
    or shaped differently, that field becomes "unknown" and the remaining
    fields are still read.
    """

    @staticmethod
    def _headline_fact(headline_items, index):
        """Return the cleaned text of the key-facts headline at ``index``, or "unknown"."""
        try:
            return process_string(headline_items[index].text)
        except Exception:
            # e.g. fewer headlines than expected; Ctrl-C is not swallowed.
            return "unknown"

    def extract_flat_data(self, flat_soup, title, url):
        """Build a ``Flat`` from one parsed announcement page.

        Args:
            flat_soup: Parsed announcement page.
            title: Title of the announcement, passed through unchanged.
            url: URL of the announcement, passed through unchanged.

        Returns:
            A ``Flat`` whose address, size, price, room number and date of
            availability are taken from the page, or "unknown" where a value
            could not be read.
        """
        # The key-facts headlines are read in the order size, price, rooms.
        headline_items = flat_soup.find_all("h2", class_ = "headline headline-key-facts")
        size = self._headline_fact(headline_items, 0)
        price = self._headline_fact(headline_items, 1)
        room_no = self._headline_fact(headline_items, 2)

        # The address link is identified by its exact inline style.
        try:
            address_links = flat_soup.find_all("a", attrs = { "style" : "line-height: 1.5em;font-weight: normal; color: #555; margin-bottom: 23px;"})
            address = process_string(address_links[0].text)
        except Exception:
            address = "unknown"

        # Extract the date of availability: the first date in the first
        # matching paragraph of the "col-sm-3" columns.
        try:
            data = []
            for nested_soup in flat_soup.find_all("div", class_ = "col-sm-3"):
                data += nested_soup.find_all("p", attrs = { "style" : "line-height: 2em;"})
            date_of_availability = extract_dates(data[0].text)[0]
        except Exception:
            date_of_availability = "unknown"

        return Flat(
            title = title,
            url = url,
            address = address,
            size = size,
            price = price,
            room_no = room_no,
            date_of_availability = date_of_availability,
        )

In [None]:
class EbayKleinanzeigen(Flat_portal):
    """Placeholder portal class; it adds nothing to Flat_portal yet.

    URL noted for this portal:
    https://www.ebay-kleinanzeigen.de/s-auf-zeit-wg/95444/c199l7494r10
    """

In [None]:
class Immowelt(Flat_portal):
    """Placeholder portal class; it adds nothing to Flat_portal yet."""

In [42]:
class Immobilienscout24(Flat_portal):
    """Placeholder portal class; it adds nothing to Flat_portal yet."""

In [None]:
class Wohnungsboerse(Flat_portal):
    """Placeholder portal class; it adds nothing to Flat_portal yet.

    URL noted for this portal:
    https://www.wohnungsboerse.net/Bayreuth/wohnen-auf-zeit
    """

In [None]:
class MeineStadt(Flat_portal):
    """Placeholder portal class; it adds nothing to Flat_portal yet.

    URL noted for this portal:
    https://www.meinestadt.de/bayreuth/immobilien/wohnen-auf-zeit
    """

In [None]:
class Immosuchmaschine(Flat_portal):
    """Placeholder portal class; it adds nothing to Flat_portal yet.

    URL noted for this portal:
    https://www.immosuchmaschine.de/k/bayreuth/wohnen-auf-zeit
    """

In [None]:
class Immonet(Flat_portal):
    """Placeholder portal class; it adds nothing to Flat_portal yet.

    URL noted for this portal:
    https://www.immonet.de/bayern/bayreuth-moebliertes-wohnen.html
    """

# Running area

In [40]:
# Scrape the wg-gesucht.de listing for Bayreuth, reading at most three result pages.
wg_gesucht = WG_gesucht(
    url="https://www.wg-gesucht.de/1-zimmer-wohnungen-und-wohnungen-und-haeuser-in-Bayreuth.6.1+2+3.1.0.html",
    max_no=3,
)

In [None]:
# max_no is passed explicitly so the call matches the (url, max_no) constructor
# signature of Flat_portal; 3 mirrors the wg-gesucht cell.
# NOTE(review): Immowelt has no parsing logic of its own yet, and the
# page-number URL scheme used by iterate_through_pages may not fit this
# site - confirm before relying on the result.
immowelt = Immowelt(
    url="https://www.immowelt.de/liste/bayreuth/wohnungen/mieten",
    max_no=3,
)

In [None]:
# max_no is passed explicitly so the call matches the (url, max_no) constructor
# signature of Flat_portal; 3 mirrors the wg-gesucht cell.
# NOTE(review): Immobilienscout24 has no parsing logic of its own yet, and the
# page-number URL scheme used by iterate_through_pages may not fit this
# site - confirm before relying on the result.
immoscout = Immobilienscout24(
    url="https://www.immobilienscout24.de/Suche/radius/wohnung-mieten?centerofsearchaddress=Bayreuth;95444;;;;&geocoordinates=49.94349;11.57631;5.0",
    max_no=3,
)

In [41]:
# Print the text block of every flat scraped from wg-gesucht.
for flat in wg_gesucht.flats:
    print(flat)


        Title: Nach Sanierung: WG-geeignete 2-Zimmer-DG Whg. mit Gemeinschaftsgarten
        URL: https://www.wg-gesucht.de/wohnungen-in-Bayreuth-Birken.9001917.html
        Address: Pottensteinerstraße 00 95447 Bayreuth Birken
        Size: 32m²
        Price: 460€
        Number of rooms: 2
        Date of availability: 01.02.2022
        

        Title: Nach Sanierung: WG-geeignete 2-Zimmer-Whg. mit Gemeinschaftsgarten
        URL: https://www.wg-gesucht.de/wohnungen-in-Bayreuth-Birken.8922122.html
        Address: Pottensteinerstraße 00 95447 Bayreuth Birken
        Size: 50m²
        Price: 730€
        Number of rooms: 2
        Date of availability: 01.02.2022
        

        Title: KURZFRISTIGE 2,5 Zimmerwohnung (WG-geeignet)
        URL: https://www.wg-gesucht.de/wohnungen-in-Bayreuth-Altstadt.9097855.html
        Address: Hohenzollernring 69 95444 Bayreuth Altstadt
        Size: 95m²
        Price: 850€
        Number of rooms: 2
        Date of availability: 01.02.2022
 