# Setup

In [1]:
import pandas as pd
import re

# Boliga data 
Read scraped data

In [2]:
df =  pd.read_csv('Data_Boliga_20220208.csv')

df.head()

Unnamed: 0,Type,StreetAddress,Area,Price,Date,SellType,Size,Price_m2,Rooms_YearBuilt_PriceChange
0,E,"Amerika Plads 6, 3. th",2100 København Ø,5.000.000 kr.,17-01-2022,Alm. Salg,93 m²,53.763 kr/m²,3 2007 -5% Aktuel værdi
1,E,"Strandvejen 8, 3. tv",2100 København Ø,8.495.000 kr.,14-01-2022,Alm. Salg,170 m²,49.971 kr/m²,6 1929 Aktuel værdi
2,E,"Nørrebrogade 9F, 1. th",2200 København N,5.400.000 kr.,14-01-2022,Alm. Salg,105 m²,51.429 kr/m²,3 1863 -2% Aktuel værdi
3,E,"Badensgade 48, 2",2300 København S,7.200.000 kr.,14-01-2022,Alm. Salg,65 m²,110.769 kr/m²,3 1899 Aktuel værdi
4,E,"Badensgade 48, 1",2300 København S,7.200.000 kr.,14-01-2022,Alm. Salg,79 m²,91.139 kr/m²,3 1899 Aktuel værdi


Keep only observations for apartments which was soled as "almindelig salg".

In [3]:
print(df.shape)

df = df[(df['Type'] == 'E') & (df['SellType'] == 'Alm. Salg')]

print(df.shape)

(54793, 9)
(40673, 9)


Clean text data

In [4]:
df['Price'] = df['Price'].str.replace('.', '', regex = True).str.replace('kr', '', regex = True)
df['Size'] = df['Size'].str.extract('(\d+)')
df['Price_m2']  = df['Price_m2'].str.replace('.', '', regex = True).str.extract('(\d+)')
df[['Rooms', 'YearBuilt', 'PriceChange']] =  df['Rooms_YearBuilt_PriceChange'].str.split(' ', expand = True).iloc[:,0:3]
df.loc[df['PriceChange'] == 'Aktuel', ['PriceChange']] = '0'
df['PriceChange'] = df['PriceChange'].str.replace('%', '', regex  =True)

df['PostalCode'] = df['Area'].str.extract('(\d{4})')
df['AreaName'] = df['Area'].str.extract('(\D+)')

df.head()


Unnamed: 0,Type,StreetAddress,Area,Price,Date,SellType,Size,Price_m2,Rooms_YearBuilt_PriceChange,Rooms,YearBuilt,PriceChange,PostalCode,AreaName
0,E,"Amerika Plads 6, 3. th",2100 København Ø,5000000,17-01-2022,Alm. Salg,93,53763,3 2007 -5% Aktuel værdi,3,2007,-5,2100,København Ø
1,E,"Strandvejen 8, 3. tv",2100 København Ø,8495000,14-01-2022,Alm. Salg,170,49971,6 1929 Aktuel værdi,6,1929,0,2100,København Ø
2,E,"Nørrebrogade 9F, 1. th",2200 København N,5400000,14-01-2022,Alm. Salg,105,51429,3 1863 -2% Aktuel værdi,3,1863,-2,2200,København N
3,E,"Badensgade 48, 2",2300 København S,7200000,14-01-2022,Alm. Salg,65,110769,3 1899 Aktuel værdi,3,1899,0,2300,København S
4,E,"Badensgade 48, 1",2300 København S,7200000,14-01-2022,Alm. Salg,79,91139,3 1899 Aktuel værdi,3,1899,0,2300,København S


Create variable for the floor number of the apartment by extracting the number from street address column, "st" stands for ground floor.

In [5]:
df['Floor'] = df['StreetAddress'].str.extract(', (\d+|st)')
df.loc[df['Floor'] == 'st', 'Floor'] = 0

Check if any null values appeared for the floor number.

In [6]:
df_floor_null = df[df['Floor'].isnull()]

print(df_floor_null.shape)

df_floor_null.head(20)

(433, 15)


Unnamed: 0,Type,StreetAddress,Area,Price,Date,SellType,Size,Price_m2,Rooms_YearBuilt_PriceChange,Rooms,YearBuilt,PriceChange,PostalCode,AreaName,Floor
48,E,Overbys Allé 1A,2500 Valby,7550000,05-01-2022,Alm. Salg,121,62397,6 1900 -6% Aktuel værdi,6,1900,-6,2500,Valby,
398,E,Strandgade 10A,1401 København K,11200000,01-12-2021,Alm. Salg,216,51852,5 1720 Aktuel værdi,5,1720,0,1401,København K,
647,E,Øresundsvej 132D,2300 København S,6995000,13-11-2021,Alm. Salg,132,52992,4 1934 Aktuel værdi,4,1934,0,2300,København S,
724,E,Norgesmindevej 35,2900 Hellerup,11350000,10-11-2021,Alm. Salg,223,50897,5 1908 Aktuel værdi,5,1908,0,2900,Hellerup,
777,E,Kirkebjerg Allé 38,2720 Vanløse,2555000,07-11-2021,Alm. Salg,59,43305,2 1903 -2% Aktuel værdi,2,1903,-2,2720,Vanløse,
862,E,Øresundsvej 132C,2300 København S,5995000,02-11-2021,Alm. Salg,132,45417,4 1934 Aktuel værdi,4,1934,0,2300,København S,
992,E,Prøvestens Allé 3A,2300 København S,2870000,27-10-2021,Alm. Salg,74,38784,2 1900 -4% Aktuel værdi,2,1900,-4,2300,København S,
1586,E,Rosenvængets Allé 5A,2100 København Ø,3750000,28-09-2021,Alm. Salg,59,63559,2 1875 Aktuel værdi,2,1875,0,2100,København Ø,
1812,E,Øresundsvej 126A,2300 København S,2895000,21-09-2021,Alm. Salg,54,53611,2 1920 -3% Aktuel værdi,2,1920,-3,2300,København S,
1865,E,Constantin Hansens Gade 8C,1799 København V,3120000,19-09-2021,Alm. Salg,42,74286,1 2019 -2% Aktuel værdi,1,2019,-2,1799,København V,


There are 433 observations where a floor number can not be extracted. A quick check on some of the addresses shows that the type column is either coded incorrectly or the observations is an apartment located in a villa. Keep the obersvations for now, setting the floor variable to -1 for easy identification later.

In [7]:
df.loc[df['Floor'].isnull(), 'Floor'] = -1

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40673 entries, 0 to 54792
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Type                         40673 non-null  object
 1   StreetAddress                40673 non-null  object
 2   Area                         40673 non-null  object
 3   Price                        40673 non-null  object
 4   Date                         40673 non-null  object
 5   SellType                     40673 non-null  object
 6   Size                         40673 non-null  object
 7   Price_m2                     40673 non-null  object
 8   Rooms_YearBuilt_PriceChange  40673 non-null  object
 9   Rooms                        40673 non-null  object
 10  YearBuilt                    40673 non-null  object
 11  PriceChange                  40673 non-null  object
 12  PostalCode                   40673 non-null  object
 13  AreaName                     40

Drop redundant columns

In [8]:
df.drop(['Rooms_YearBuilt_PriceChange', 'Type', 'SellType', 'Area'], inplace = True, axis = 1)

In [9]:
df.loc[df['PriceChange'] == '299.499.900', 'PriceChange'] = 0

In [10]:
num_cols = ['Price', 'Size', 'Price_m2', 'Rooms', 'YearBuilt', 'PriceChange', 'PostalCode', 'Floor']

for col in num_cols:
    
    df[col] = df[col].astype(int)
    
print(df.dtypes)

StreetAddress    object
Price             int32
Date             object
Size              int32
Price_m2          int32
Rooms             int32
YearBuilt         int32
PriceChange       int32
PostalCode        int32
AreaName         object
Floor             int32
dtype: object


Check summary statistics

In [11]:
df.describe()

Unnamed: 0,Price,Size,Price_m2,Rooms,YearBuilt,PriceChange,PostalCode,Floor
count,40673.0,40673.0,40673.0,40673.0,40673.0,40673.0,40673.0,40673.0
mean,6110597.0,84.081528,83751.71,2.828879,1909.852212,-1.215696,2201.134463,2.179136
std,8870159.0,36.692984,179807.1,1.142108,247.538151,4.237192,379.48545,1.933333
min,51199.0,12.0,457.0,1.0,0.0,-99.0,1050.0,-1.0
25%,2225000.0,58.0,34239.0,2.0,1904.0,-2.0,2100.0,1.0
50%,3400000.0,76.0,42105.0,3.0,1935.0,0.0,2300.0,2.0
75%,5275000.0,102.0,52536.0,3.0,1975.0,0.0,2450.0,3.0
max,85000000.0,857.0,4083333.0,13.0,2021.0,249.0,2900.0,29.0


# Geographical data
To enable geographical analysis, e.g., distance to public transport, waterfront area etc., I need the geolocation of the addresses. This is possible by using DAWA (Danmarks Adressers Web API). By using the package pydawa, which is a wrapper for the API, this is quite easy.

## Geolocations for apartment addresses
Import the package and try one address

In [33]:
# Import package
import pydawa
import time

# Search for address
adresse = pydawa.Adressesoeg(q = 'Amerika Plads 6')

# Save response
response = adresse.info()

# Get coordinates (in CRS:25832)
adresse.get_koordinater(response[0])

(725829.55, 6178382.86)

Geocode all the addresses in the data frame by creating a function and applying it to a column only containing the street name and street number. To reduce the risk of any maximum number of requests per minute or hour, add a delay. (It took ca. 1-1.5 hour to run the code and geolocate all the addresses)

In [39]:
# Creat  column
df['Street'] = df['StreetAddress'].str.split(',').str[0]

# Create function
def get_lon_lat(add):
    
    adresse = pydawa.Adressesoeg(q = add)
    
    response = adresse.info()
    
    lon_lat = adresse.get_koordinater(response[0])
    
    time.sleep(0.1)
    
    return lon_lat



# Run geocoding for all addresses
lon_lat = df['Street'].apply(get_lon_lat)

Insert longitude and latitude in data frame and inspect the results

In [59]:
df[['Lon', 'Lat']] = lon_lat.values.tolist()

df.head()

Unnamed: 0,StreetAddress,Price,Date,Size,Price_m2,Rooms,YearBuilt,PriceChange,PostalCode,AreaName,Floor,Street,Lon_lat,Lon,Lat
0,"Amerika Plads 6, 3. th",5000000,17-01-2022,93,53763,3,2007,-5,2100,København Ø,3,Amerika Plads 6,"(725829.55, 6178382.86)",725829.55,6178382.86
1,"Strandvejen 8, 3. tv",8495000,14-01-2022,170,49971,6,1929,0,2100,København Ø,3,Strandvejen 8,"(701140.38, 6149790.6)",701140.38,6149790.6
2,"Nørrebrogade 9F, 1. th",5400000,14-01-2022,105,51429,3,1863,-2,2200,København N,1,Nørrebrogade 9F,"(723816.79, 6177062.36)",723816.79,6177062.36
3,"Badensgade 48, 2",7200000,14-01-2022,65,110769,3,1899,0,2300,København S,2,Badensgade 48,"(727190.71, 6174651.68)",727190.71,6174651.68
4,"Badensgade 48, 1",7200000,14-01-2022,79,91139,3,1899,0,2300,København S,1,Badensgade 48,"(727190.71, 6174651.68)",727190.71,6174651.68


Write data to new file

In [None]:
df.to_csv('Data_Boliga_20220208_cleaned_20220313.csv', index  = False)