In [29]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import random
import re
import sys
import time

In [30]:
def listings_urls(pagelimit=3, timeout=30):
    """Collect expose URLs from the immobilienscout24 search result pages.

    Requests result pages ``1 .. pagelimit - 1`` of the hard-coded search URL
    and builds one expose URL for every ``<article>`` element that carries a
    ``data-obid`` attribute.

    Parameters
    ----------
    pagelimit : int, optional
        Exclusive upper bound of the page numbers to request. The default of
        3 requests pages 1 and 2.
    timeout : float, optional
        Seconds to wait for each HTTP response before requests gives up.

    Returns
    -------
    list of str
        Expose URLs in the order they were found; duplicates are not removed.
    """
    search_url = 'https://www.immobilienscout24.de/Suche/S-T/P-{}/Wohnung-Miete/Umkreissuche/Berlin/-/229459/2511140/-/-/50?enteredFrom=result_list'
    url_list = []

    for page in range(1, pagelimit):
        r = requests.get(search_url.format(page), timeout=timeout)
        # Name the parser explicitly (the same one scrape_complete_page uses)
        # so the result does not depend on which parsers happen to be installed.
        soup = BeautifulSoup(r.text, 'html.parser')

        # Only consider articles that actually carry an object id; an article
        # without data-obid would otherwise abort the crawl with a KeyError.
        for expose in soup.find_all('article', attrs={'data-obid': True}):
            new_url = 'https://www.immobilienscout24.de/expose/' + str(expose['data-obid'])
            url_list.append(new_url)

    return url_list


def scrape_complete_page(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the HTTP response. Without a timeout a stalled
        connection would block the crawl indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``html.parser``.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')


def extract_single_element(soup, html_class):
    """Return the text of the first element in ``soup`` matching ``html_class``.

    ``html_class`` is passed to ``soup.find_all`` as the ``class_`` filter.
    Raises ``IndexError`` when nothing matches.
    """
    matches = soup.find_all(class_=html_class)
    first_match = matches[0]
    return first_match.text


def save_images(url, images_list, timeout=30):
    """Download the images of one expose into ``./images/<expose id>/``.

    The expose id is the fifth ``/``-separated component of ``url``. Every
    image is stored as ``<index>.jpg``, where the index is the position of
    its URL in ``images_list``.

    Parameters
    ----------
    url : str
        Expose URL the images belong to.
    images_list : iterable of str
        URLs of the images to download.
    timeout : float, optional
        Seconds to wait for each HTTP response.
    """
    # get expose id from the URL
    expose = url.split('/')[4]
    print("crawling pictures for expose #: " + str(expose))

    # exist_ok replaces the separate "does it exist?" check before creating.
    os.makedirs("./images/", exist_ok=True)
    target_dir = os.path.join("./images/", expose)

    for i, image_url in enumerate(images_list):
        sys.stdout.write('\r' + "downloading image # " + str(i))

        r = requests.get(image_url, timeout=timeout)
        if not r.ok:
            # An error response body is not image data; do not store it as a
            # .jpg. The index is still consumed so file names keep matching
            # the position in images_list.
            print("\nskipping image # " + str(i) + " (HTTP " + str(r.status_code) + ")")
            continue

        # Created only once there is something to write, so an expose without
        # usable images leaves no empty directory behind.
        os.makedirs(target_dir, exist_ok=True)
        with open(os.path.join(target_dir, str(i) + ".jpg"), "wb") as f:
            f.write(r.content)


def scrape_elements(urls, html_classes_list, column_names=None):
    """Scrape the given expose pages and write the results to ``./rent_data.csv``.

    For every URL the page is requested once, one text value is extracted per
    entry of ``html_classes_list``, and the expose images are downloaded via
    ``save_images``. A page on which any of the elements cannot be extracted
    is skipped and reported. A failure while downloading images is reported
    but does not discard the data row that was already extracted.

    Parameters
    ----------
    urls : iterable of str
        Expose URLs to scrape.
    html_classes_list : iterable of str
        Class strings handed to ``extract_single_element``, one per column.
    column_names : list of str, optional
        CSV column names: one for the URL followed by one per entry of
        ``html_classes_list``. Defaults to ``['url', 'rent', 'rooms', 'area']``.

    Raises
    ------
    ValueError
        If the number of column names does not equal
        ``len(html_classes_list) + 1``.
    """
    html_classes_list = list(html_classes_list)
    if column_names is None:
        column_names = ['url', 'rent', 'rooms', 'area']
    # Fail before crawling rather than when building the DataFrame at the
    # very end, after all the requests have already been made.
    if len(column_names) != len(html_classes_list) + 1:
        raise ValueError(
            'expected ' + str(len(html_classes_list) + 1)
            + ' column names, got ' + str(len(column_names)))

    # create an empty list where all the data is stored
    data_all = []

    for url in urls:
        # a short random break between requests is very important to not be a
        # bother to the web service provider
        time.sleep(random.uniform(0.3, 2))

        # pages that do not follow the regular layout are skipped, but the
        # reason is reported instead of being silently discarded
        try:
            # web page gets requested only once
            soup = scrape_complete_page(url)
            print('\n')
            print('====================================================')
            print('url: ' + str(url))
            # one data set per page: the url followed by the extracted values
            data_set = [url]
            for html_class in html_classes_list:
                data_set.append(extract_single_element(soup, html_class))
        except Exception as e:
            print('skipping ' + str(url) + ': ' + repr(e))
            continue

        # add the data set to data_all (a list of lists)
        data_all.append(data_set)

        # save the images of the expose as well; a failure here must not
        # affect the row stored above
        try:
            # NOTE(review): the class string ends with a space -- confirm it
            # still matches with the installed bs4 version.
            images = soup.find_all(class_='sp-image ')
            images_urls = [image['data-src'].split('/ORIG')[0] for image in images]
            save_images(url, images_urls)
        except Exception as e:
            print('could not save images for ' + str(url) + ': ' + repr(e))

    print(data_all)
    # create a pandas dataframe to easily store the data as a .csv-file
    df = pd.DataFrame(data_all, columns=column_names)
    df.to_csv('./rent_data.csv', sep=';')

In [33]:
# Collect the expose URLs from the search result pages.
urls = listings_urls()

# Class strings of the values to extract from each expose, in the order of
# the rent / rooms / area columns that scrape_elements writes to the CSV.
html_classes = [
    'is24qa-kaltmiete is24-value font-semibold',
    'is24qa-zi is24-value font-semibold',
    'is24qa-flaeche is24-value font-semibold',
]

# Only the first ten exposes are scraped here.
scrape_elements(urls[:10], html_classes)



url: 
https://www.immobilienscout24.de/expose/110553018
is24qa-kaltmiete is24-value font-semibold
 1.300 € 
is24qa-zi is24-value font-semibold
 2 
is24qa-flaeche is24-value font-semibold
 49,6 m² 
crawling pictures for expose #: 110553018
downloading image # 21

url: 
https://www.immobilienscout24.de/expose/110548217
is24qa-kaltmiete is24-value font-semibold
 962,73 € 
is24qa-zi is24-value font-semibold
 2 
is24qa-flaeche is24-value font-semibold
 60,18 m² 
crawling pictures for expose #: 110548217
downloading image # 9

url: 
https://www.immobilienscout24.de/expose/110313216
is24qa-kaltmiete is24-value font-semibold
 810 € 
is24qa-zi is24-value font-semibold
 1 
is24qa-flaeche is24-value font-semibold
 12 m² 
crawling pictures for expose #: 110313216
downloading image # 7

url: 
https://www.immobilienscout24.de/expose/109808199
is24qa-kaltmiete is24-value font-semibold
 1.700 € 
is24qa-zi is24-value font-semibold
 4 
is24qa-flaeche is24-value font-semibold
 122,45 m² 
crawling pictu