In [1]:
# import of important packages
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import statsmodels.api as sm

  from pandas.core import datetools


In [2]:
### building the Crawler
# building Spider - an automated bot which can trawl web pages and scrape them
def estate_spider(max_pages):
    """ crawls through the search pages of available rent buildings
    
    args:
    max_pages:
        the amoung of search pages to be crawled through
        
    returns:
        a list of links directing to single appartment
        
    """
    page = 1
    link_list = []
    previous_page = " " * 5001
    while page <= max_pages:
        url = "https://www.homegate.ch/rent/real-estate/city-zurich/matching-list?tab=list&o=sortToplisting-desc&ep=" \
            + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
            
        soup = bs(plain_text, "lxml")
        for link in soup.findAll("a", {"class":"detail-page-link"}):
            href = link.get("href")
            href = "https://www.homegate.ch" + href
            link_list.append(href)
            #print(href)
        previous_page = plain_text
        page+=1
        
    return link_list

In [3]:
# define which information should be crawled
def get_information_from_page(url):
    """ 
    A function which scrapes information from a listing page 
    and returns it as dictionary
    
    args: 
    url: The url of the page to be scraped
    
    returns:
        a dictionary with information on the price, location, zip code, 
         main features (like space, floor etc.) as dictionary, 
         and additional attributes of the appartment (e.g. parking 
         place) as list.
    
    """
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = bs(plain_text, "lxml")
    
    main_features_dict = {}
    tmp_features = ["not","empty"]
    features_and_furnishings_list = []

    # address
    for data in soup.findAll("a", {"class": "detail-address-link"}):
        for tag in data.findAll("span"):
            zip_and_location = tag.text
            zip_and_location = zip_and_location.split(" ")
            zip_code = zip_and_location[0]
            location = zip_and_location[1]
            
    # rent
    for data in soup.findAll("span", {"itemprop": "price"}):
        price = data.text
        price = int(price.replace(".","").replace("–","").replace(",",""))
        #print(price)
    
        
    # main features
    for data in soup.findAll("div", {"class": "detail-key-data"}):
        #print(data)
        for subtag_lvl_1 in data.findAll("ul", {"class": "list--plain"}):
            for subtag_lvl_2 in subtag_lvl_1.findAll("li"):
                counter = 0
                for subtag_lvl_3 in subtag_lvl_2.findAll("span"):
                    try:
                        tmp_features[counter] = subtag_lvl_3.text 
                        counter += 1
                    except:
                        pass
                main_features_dict[tmp_features[0]] = tmp_features[1]
            
    # features and furnishings
    for data in soup.findAll("div", {"class": "detail-configuration"}):
        for subtag_lvl_1 in data.findAll("ul", {"class": "list--plain"}):
            for subtag_lvl_2 in subtag_lvl_1.findAll("li"):
                features_and_furnishings_list.append(subtag_lvl_2.text)
                #print(subtag_lvl_2)
    
    
    result_dict = {"location":location, 
                   "zip_code":zip_code, 
                   "price": price, 
                   "main_features_dict":main_features_dict, 
                   "features_and_furnishings_list": features_and_furnishings_list 
                  }
    
    return(result_dict)

In [4]:
# get a list of the first 2 pages
subpages = estate_spider(2)
# create an empty list which will be filled with the 
#     information on each flat 
page_information_list = []

In [5]:
len(subpages)

0

In [6]:
# conduct this process for the subpages
for subpage in subpages:
    try:
        info = get_information_from_page(subpage)
        page_information_list.append(info)
    except:
        print("the page {} didn't work".format(subpage))

In [7]:
# get additional Information which can be crawled
unique_main_features_keys = []
unique_features_and_furnishings_list = []

for appartment in page_information_list:
    for key in appartment["main_features_dict"].keys():
        if key not in unique_main_features_keys:
            unique_main_features_keys.append(key)
            
    for list_element in appartment["features_and_furnishings_list"]:
        if list_element not in unique_features_and_furnishings_list:
            unique_features_and_furnishings_list.append(list_element)
    
            
print(unique_main_features_keys)
print(unique_features_and_furnishings_list)

[]
[]


In [8]:
# get the information from a real estate to consider the output
page_information_list[0]

IndexError: list index out of range

In [301]:
# define the dataframe
rent_df = pd.DataFrame()

In [302]:
# get the information of all considered real estates and print it
for i,appartment in enumerate(page_information_list):
    rent_df.loc[i,"Location"] =  appartment["location"]
    rent_df.loc[i,"Zip"] =  appartment["zip_code"]
    rent_df.loc[i,"Price"] =  appartment["price"]
    
    for element in unique_main_features_keys:
        if element in appartment["main_features_dict"].keys():
            rent_df.loc[i,element] =  appartment["main_features_dict"][element]
        else:
            rent_df.loc[i,element] =  None
            
    for element in unique_features_and_furnishings_list:
        if element in appartment["features_and_furnishings_list"]:
            rent_df.loc[i,element] =  1
        else:
            # Setting the variabel to 0 might be a bad idea
            # It might create the impression that the attribute is non existent
            # this would be false. It is simply not confirmed
            rent_df.loc[i,element] =  0
    
display(rent_df)

Unnamed: 0,Location,Zip,Price,Type,Rooms,Floor,Living space,Last renovation,Available,Floor space,...,New building,Parking place,Pets allowed,Has wheelchair access,Old building,Fireplace,Railway connection,Garage,Water connection,Lavatory connection
0,Zürich,8050,2130.0,Apartment,3.0,4.,70 m2,2018.0,by agreement,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Zürich,8048,2130.0,Apartment,3.0,GF,62 m2,,by agreement,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Zürich,8006,3180.0,Apartment,2.5,5.,69 m2,,01.07.2018,158 m2,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Zürich,8038,9100.0,Duplex,4.5,,210 m2,2018.0,01.07.2018,,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Zürich,8002,7935.0,Attic flat,4.5,3.,172 m2,,01.07.2018,,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Zürich,8038,3390.0,Apartment,3.5,3.,,,by agreement,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Zürich,8001,4730.0,Apartment,2.5,5.,88 m2,2018.0,immediately,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Zürich,8001,1040.0,Apartment,1.0,4.,,,01.08.2018,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Zürich,8001,1130.0,Apartment,1.0,2.,16 m2,2018.0,immediately,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,Zürich,8001,1490.0,Apartment,1.5,3.,28 m2,,by agreement,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [303]:
# define a function to apply to the columsn where the extracted element contains " m2"
# we want the value as numeric value for further analysis
def strip_m2(string, to_be_stripped):
    if string is not None:
        if type(string) is type("test"):
            try:
                string = float(string.strip(to_be_stripped))
            except: 
                print("The element {} could not be processed".format(string))
    return(string)

rent_df["Living space"] = rent_df["Living space"].apply(strip_m2, args = (" m2",))
rent_df["Floor space"] = rent_df["Floor space"].apply(strip_m2, args = (" m2",))
rent_df["Floor"] = rent_df["Floor"].apply(strip_m2, args = (".",))

The element GF could not be processed
The element GF could not be processed
The element GF could not be processed


In [304]:
# convert all values where possible to numbers
for column in rent_df.columns:
    rent_df[column] = pd.to_numeric(rent_df[column], errors='ignore')

In [305]:
# final result
rent_df = rent_df[pd.notnull(rent_df['Living space'])]
display(rent_df)

Unnamed: 0,Location,Zip,Price,Type,Rooms,Floor,Living space,Last renovation,Available,Floor space,...,New building,Parking place,Pets allowed,Has wheelchair access,Old building,Fireplace,Railway connection,Garage,Water connection,Lavatory connection
0,Zürich,8050,2130.0,Apartment,3.0,4,70.0,2018.0,by agreement,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Zürich,8048,2130.0,Apartment,3.0,GF,6.0,,by agreement,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Zürich,8006,3180.0,Apartment,2.5,5,69.0,,01.07.2018,158.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Zürich,8038,9100.0,Duplex,4.5,,10.0,2018.0,01.07.2018,,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Zürich,8002,7935.0,Attic flat,4.5,3,17.0,,01.07.2018,,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Zürich,8001,4730.0,Apartment,2.5,5,88.0,2018.0,immediately,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Zürich,8001,1130.0,Apartment,1.0,2,16.0,2018.0,immediately,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,Zürich,8001,1490.0,Apartment,1.5,3,8.0,,by agreement,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,Zürich,8001,1600.0,Apartment,1.0,3,53.0,,01.07.2018,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
11,Zürich,8001,1680.0,Apartment,2.0,,35.0,,01.10.2018,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [306]:
# create a set of all variables in the column of the zip code
unique_zip_list = set(rent_df["Zip"])
unique_zip_list


{8001, 8002, 8006, 8038, 8048, 8050}

In [307]:
# create a new column for each zip code as a dummy variable for the regression
for element in unique_zip_list:
    rent_df[str(element)] = np.where(rent_df["Zip"] == element, 1,0)

In [308]:
# result with column for each zip code
rent_df

Unnamed: 0,Location,Zip,Price,Type,Rooms,Floor,Living space,Last renovation,Available,Floor space,...,Railway connection,Garage,Water connection,Lavatory connection,8001,8002,8038,8006,8048,8050
0,Zürich,8050,2130.0,Apartment,3.0,4,70.0,2018.0,by agreement,,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
1,Zürich,8048,2130.0,Apartment,3.0,GF,6.0,,by agreement,,...,0.0,0.0,0.0,0.0,0,0,0,0,1,0
2,Zürich,8006,3180.0,Apartment,2.5,5,69.0,,01.07.2018,158.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0
3,Zürich,8038,9100.0,Duplex,4.5,,10.0,2018.0,01.07.2018,,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
4,Zürich,8002,7935.0,Attic flat,4.5,3,17.0,,01.07.2018,,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
6,Zürich,8001,4730.0,Apartment,2.5,5,88.0,2018.0,immediately,,...,0.0,0.0,0.0,0.0,1,0,0,0,0,0
8,Zürich,8001,1130.0,Apartment,1.0,2,16.0,2018.0,immediately,,...,0.0,0.0,0.0,0.0,1,0,0,0,0,0
9,Zürich,8001,1490.0,Apartment,1.5,3,8.0,,by agreement,,...,0.0,0.0,0.0,0.0,1,0,0,0,0,0
10,Zürich,8001,1600.0,Apartment,1.0,3,53.0,,01.07.2018,,...,0.0,0.0,0.0,0.0,1,0,0,0,0,0
11,Zürich,8001,1680.0,Apartment,2.0,,35.0,,01.10.2018,,...,0.0,0.0,0.0,0.0,1,0,0,0,0,0


In [309]:
# All variables and their datatyp 
rent_df.columns

Index(['Location', 'Zip', 'Price', 'Type', 'Rooms', 'Floor', 'Living space',
       'Last renovation', 'Available', 'Floor space', 'Year built',
       'No. of floors', 'Room height', 'Cable TV', 'ISDN connection',
       'Elevator', 'View', 'Child-friendly', 'Balcony / Patio', 'New building',
       'Parking place', 'Pets allowed', 'Has wheelchair access',
       'Old building', 'Fireplace', 'Railway connection', 'Garage',
       'Water connection', 'Lavatory connection', '8001', '8002', '8038',
       '8006', '8048', '8050'],
      dtype='object')

In [310]:
rent_df.dtypes

Location                  object
Zip                        int64
Price                    float64
Type                      object
Rooms                    float64
Floor                     object
Living space             float64
Last renovation          float64
Available                 object
Floor space              float64
Year built               float64
No. of floors            float64
Room height               object
Cable TV                 float64
ISDN connection          float64
Elevator                 float64
View                     float64
Child-friendly           float64
Balcony / Patio          float64
New building             float64
Parking place            float64
Pets allowed             float64
Has wheelchair access    float64
Old building             float64
Fireplace                float64
Railway connection       float64
Garage                   float64
Water connection         float64
Lavatory connection      float64
8001                       int64
8002      

In [311]:
zip_list

['8001', '8002', '8038', '8006', '8048', '8050']

In [313]:
# create a list of the x-variables run as independent in the regression later 
len_unique_zips = len(unique_zip_list)
zip_list = list(rent_df.columns[-len_unique_zips:])

regression_cols1 = ['Living space', 'Garage', 'Rooms']
regression_cols = regression_cols1 + zip_list
regression_cols

['Living space',
 'Garage',
 'Rooms',
 '8001',
 '8002',
 '8038',
 '8006',
 '8048',
 '8050']

In [314]:
# create a mask to hide NaN values. These obstruct the regression
index_mask = rent_df[regression_cols].dropna().index

In [382]:
## Regression
rent_df = rent_df[np.isfinite(rent_df['Living space'])]


X = rent_df[regression_cols].loc[index_mask,:]

y = rent_df['Price'].loc[index_mask]

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print(est.summary())

X_1 = rent_df[regression_cols1].loc[index_mask,:]

y_1 = rent_df['Price'].loc[index_mask]

X_1 = sm.add_constant(X_1)
est2 = sm.OLS(y_1, X_1).fit()
print(est2.summary())



                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.867
Model:                            OLS   Adj. R-squared:                  0.825
Method:                 Least Squares   F-statistic:                     20.45
Date:                Thu, 07 Jun 2018   Prob (F-statistic):           3.66e-09
Time:                        00:59:46   Log-Likelihood:                -264.61
No. Observations:                  34   AIC:                             547.2
Df Residuals:                      25   BIC:                             561.0
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         2655.1061    452.468      5.868   

In [610]:
## customer interface and output
print("Hello Customer!")

import time
time.sleep(1)

while True:
    q = input('Enter the number of square meters of your real estate: ')
    try:
        val = int(q)
        if val < 1:  
            print("I think you are trying to trick me.")
            continue
        break
    except ValueError:
        print("We both know that's not a number.")     
print("Thank you very much for the information!")
    

garagecustomer = None
while garagecustomer not in ("Yes", "No"):
    garagecustomer = input('Do you have a garage (please answer with "Yes" or "No"): ')
    if garagecustomer != "Yes" or "No":
        print("I can only accept a Yes or No.")
else:
    print("Thank you very much for the information!")
        
if garagecustomer == "Yes":
    garagecustomer = 1
else:
    garagecustomer = 0


while True:
    roomscustomer = input('Enter the number of rooms: ')
    try:
        val = int(roomscustomer)
        if val < 1:  
            print("I think you are trying to trick me.")
            continue
        break
    except ValueError:
        print("We both know that's not a number.")     
print("Thank you very much for the information!")

b0 = est2.params[0]
b1 = est2.params[1]
b2 = est2.params[2]
b3 = est2.params[3]
forcast1 = int(b0) + int(q) * int(b1) + int(garagecustomer) * int(b2) + int(roomscustomer) * int(b3)

zipcustomer = input('Enter the zip code the real estate is located in: ')
if zipcustomer not in (zip_list):
    print ("It looks like the record doesn't consider your zip code. Therefore we calculate the rent without taking the postcode into account. This is", forcast1, "CHF.")
else:
    for i in regression_cols:
        if i == zipcustomer:
            i
    for i, j in enumerate(regression_cols):
        if j == zipcustomer:
            parameterzipcode = (i+1)
    c0 = est.params[0]
    c1 = est.params[1]
    c2 = est.params[2]
    c3 = est.params[3]
    c4 = est.params[parameterzipcode]
    forcast2 = int(c0) + int(q) * int(c1) + int(garagecustomer) * int(c2) + int(roomscustomer) * int(c3) + int(c4)
    print('Thank you for your information! We forcast the price of your real estate as', forcast2, "CHF.")

Hello Customer!
Enter the number of square meters of your real estate: 20
Thank you very much for the information!
Do you have a garage (please answer with "Yes" or "No"): No
Only a Yes or No are accepted as answers.
Thank you very much for the information!
Enter the number of rooms: 3
Thank you very much for the information!
Enter the zip code the real estate is located in: 9
It looks like the record doesn't consider your zip code. Therefore we calculate the rent without taking the postcode into account. This is 3973 CHF.
