In [85]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None, "display.max_rows", 100)

df = pd.read_csv("../data/immoscout_cleaned_lat_lon_fixed_v9.csv")

col_names = df.columns.array
col_names[0:2] = ["Index1", "Index2"]
df.columns = col_names

numeric_columns = ["Living space", "Plot area", "Floor space", "Floor", "detail_responsive#surface_living", "detail_responsive#floor", "Wohnfläche", "Stockwerk", "Nutzfläche", "Grundstücksfläche", "detail_responsive#surface_property", "detail_responsive#surface_usable", "Surface habitable", "Surface du terrain", "Surface utile", "Étage", "Superficie abitabile", "Piano", "Superficie del terreno", "Superficie utile", "Floor_merged", "Living_space_merged", "Floor_space_merged", "Plot_area_merged"]
df_numeric = df[numeric_columns]

df_numeric.shape

  df = pd.read_csv("../data/immoscout_cleaned_lat_lon_fixed_v9.csv")


(13378, 24)

In this part of the data analysis we focus on the numeric variables. For each one of them we find a "_merged" column in the dataset. How this column has been merged however is unclear, therefore we cannot assume that this column contains all the data.  
Therefore we split by values containing information on "Living Space", "Plot Area", "Floor Space" and "Floor".

## Living Space

In [86]:
living_space = ["Living space", "Wohnfläche", "Surface habitable", "Superficie abitabile", "detail_responsive#surface_living", "Living_space_merged"]
df_living_space = df_numeric[living_space]

To get a sense of how the "merged" column was created, we inspect the cumulative sum of the value counts per column and check it against the "merged"-count. 

In [87]:
df_living_space.iloc[:,0:5].count().cumsum() == df_living_space[living_space[5]].count()

Living space                        False
Wohnfläche                          False
Surface habitable                   False
Superficie abitabile                 True
detail_responsive#surface_living    False
dtype: bool

This suggests, that the column "Living_space_merged" contains the information of all the columns excluding "detail_responsive#surface_living". 

Let's see, if this is actually true.

First, we merge the relevant columns into one

In [88]:
df_living_space['living_space'] = df_living_space[living_space[0]].fillna('') + \
  (df_living_space[living_space[1]]).fillna('') + \
  (df_living_space[living_space[2]]).fillna('') + \
  (df_living_space[living_space[3]]).fillna('')

(df_living_space['living_space'] == df_living_space[living_space[5]].fillna('')).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_living_space['living_space'] = df_living_space[living_space[0]].fillna('') + \


13378

Counting the True values of the above condition yields the same count as the column count of the dataset meaning that the columns are identical.

The suggestion from the cumulative sum has therefore been proven correct!
Now to get one column with the full information, we merge the "Living_space_merged" column with "detail_responsive#surface_living".

In [89]:
df_living_space['living_space'] = df_living_space[living_space[5]].fillna('') + \
  (df_living_space[living_space[4]]).fillna('')

df_living_space['living_space'] = df_living_space['living_space'].replace('', np.nan)

df_living_space['living_space'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_living_space['living_space'] = df_living_space[living_space[5]].fillna('') + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_living_space['living_space'] = df_living_space['living_space'].replace('', np.nan)


12304

We now have 12304 observations for living space in our dataset. Nice! Let's transfer this back into the numeric subset of the dataframe and drop the unused columns.

In [90]:
df_numeric = df_numeric.drop(living_space, axis=1) # drops 6 columns -> 18 remain
df_numeric["living_space"] = df_living_space['living_space'] # adds 1 column -> 19
df_numeric.shape

(13378, 19)

In [91]:
df_numeric["Space extracted"] = df["Space extracted"]
df_numeric[df_numeric["Space extracted"].notna() & df_numeric["living_space"].isna()]


Unnamed: 0,Plot area,Floor space,Floor,detail_responsive#floor,Stockwerk,Nutzfläche,Grundstücksfläche,detail_responsive#surface_property,detail_responsive#surface_usable,Surface du terrain,Surface utile,Étage,Piano,Superficie del terreno,Superficie utile,Floor_merged,Floor_space_merged,Plot_area_merged,living_space,Space extracted
786,,214 m²,3. floor,,,,,,,,,,,,,3. floor,214 m²,,,200.0
3380,,,3. floor,,,,,,,,,,,,,3. floor,,,,210.0
3696,,,1. floor,,,,,,,,,,,,,1. floor,,,,228.0
6506,,,,,,,,,,,,,,,,,,,,200.0


In [92]:
df_numeric["living_space"] = df_numeric.living_space.str.extract('(\d+)').fillna(0).astype(int)
df_numeric[df_numeric["Space extracted"].notna() & df_numeric["living_space"].isna()]["living_space"] = df_numeric[df_numeric["Space extracted"].notna() & df_numeric["living_space"].isna()]["Space extracted"].astype("int")

(df_numeric["living_space"] == df_numeric["Space extracted"].fillna(0).astype("int")).count()
df_numeric = df_numeric.drop("Space extracted", axis=1)

We then rinse and repeat this process for all data with a "_merged" column present in the dataset. 

## Plot Area

In [93]:
plot_area = ["Plot area", "Grundstücksfläche", "Surface du terrain", "Superficie del terreno", "detail_responsive#surface_property", "Plot_area_merged"]
df_plot_area = df_numeric[plot_area]

df_plot_area[plot_area[0:5]].count().cumsum() == df_plot_area[plot_area[5]].count()

Plot area                             False
Grundstücksfläche                     False
Surface du terrain                    False
Superficie del terreno                 True
detail_responsive#surface_property    False
dtype: bool

In [94]:
df_plot_area['plot_area'] = df_plot_area[plot_area[0]].fillna('') + \
  (df_plot_area[plot_area[1]]).fillna('') + \
  (df_plot_area[plot_area[2]]).fillna('') + \
  (df_plot_area[plot_area[3]]).fillna('')

(df_plot_area['plot_area'] == df_plot_area[plot_area[5]].fillna('')).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_plot_area['plot_area'] = df_plot_area[plot_area[0]].fillna('') + \


13378

In [95]:
df_plot_area['plot_area'] = df_plot_area[plot_area[5]].fillna('') + \
  (df_plot_area[plot_area[4]]).fillna('')

df_plot_area['plot_area'] = df_plot_area['plot_area'].replace('', np.nan)

df_plot_area['plot_area'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_plot_area['plot_area'] = df_plot_area[plot_area[5]].fillna('') + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_plot_area['plot_area'] = df_plot_area['plot_area'].replace('', np.nan)


4953

In [96]:
df_numeric = df_numeric.drop(plot_area, axis=1) # drops 6 columns -> 13 remain
df_numeric["plot_area"] = df_plot_area['plot_area'] # adds 1 column -> 14
df_numeric.shape

(13378, 14)

In [97]:
df_numeric["plot_area"] = df_numeric.plot_area.str.extract('(\d+)').fillna(0).astype(int)

## Floor Space

In [98]:
floor_space = ["Floor space", "Nutzfläche", "Surface utile", "Superficie utile", "detail_responsive#surface_usable", "Floor_space_merged"]
df_floor_space = df_numeric[floor_space]

df_floor_space[floor_space[0:5]].count().cumsum() == df_floor_space[floor_space[5]].count()

Floor space                         False
Nutzfläche                          False
Surface utile                       False
Superficie utile                     True
detail_responsive#surface_usable    False
dtype: bool

In [99]:
df_floor_space['floor_space'] = df_floor_space[floor_space[0]].fillna('') + \
  (df_floor_space[floor_space[1]]).fillna('') + \
  (df_floor_space[floor_space[2]]).fillna('') + \
  (df_floor_space[floor_space[3]]).fillna('')

(df_floor_space['floor_space'] == df_floor_space[floor_space[5]].fillna('')).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor_space['floor_space'] = df_floor_space[floor_space[0]].fillna('') + \


13378

In [100]:
df_floor_space['floor_space'] = df_floor_space[floor_space[0]].fillna('') + \
  (df_floor_space[floor_space[1]]).fillna('') + \
  (df_floor_space[floor_space[2]]).fillna('') + \
  (df_floor_space[floor_space[3]]).fillna('')

df_floor_space[df_floor_space['floor_space'] != '']['floor_space'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor_space['floor_space'] = df_floor_space[floor_space[0]].fillna('') + \


2842

In [101]:
df_floor_space['floor_space'] = df_floor_space[floor_space[5]].fillna('') + \
  (df_floor_space[floor_space[4]]).fillna('')

df_floor_space['floor_space'] = df_floor_space['floor_space'].replace('', np.nan)

df_floor_space['floor_space'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor_space['floor_space'] = df_floor_space[floor_space[5]].fillna('') + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor_space['floor_space'] = df_floor_space['floor_space'].replace('', np.nan)


2953

In [102]:
df_numeric = df_numeric.drop(floor_space, axis=1) # drops 6 columns -> 8 remain
df_numeric["floor_space"] = df_floor_space['floor_space'] # adds 1 column -> 9
df_numeric.shape

(13378, 9)

In [103]:
df_numeric["floor_space"] = df_numeric.floor_space.str.extract('(\d+)').fillna(0).astype(int)

## Floor

In [104]:
floor = ["Floor", "Stockwerk", "Étage", "Piano", "detail_responsive#floor", "Floor_merged"]
df_floor = df_numeric[floor]

df_floor[floor[0:5]].count().cumsum() == df_floor[floor[5]].count()

Floor                      False
Stockwerk                  False
Étage                      False
Piano                       True
detail_responsive#floor    False
dtype: bool

In [105]:
df_floor['floor'] = df_floor[floor[0]].fillna('') + \
  (df_floor[floor[1]]).fillna('') + \
  (df_floor[floor[2]]).fillna('') + \
  (df_floor[floor[3]]).fillna('')

(df_floor['floor'] == df_floor[floor[5]].fillna('')).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor['floor'] = df_floor[floor[0]].fillna('') + \


13378

In [106]:
df_floor['floor'] = df_floor[floor[0]].fillna('') + \
  (df_floor[floor[1]]).fillna('') + \
  (df_floor[floor[2]]).fillna('') + \
  (df_floor[floor[3]]).fillna('')

df_floor[df_floor['floor'] != '']['floor'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor['floor'] = df_floor[floor[0]].fillna('') + \


5414

In [107]:
df_floor['floor'] = df_floor[floor[5]].fillna('') + \
  (df_floor[floor[4]]).fillna('')

df_floor['floor'] = df_floor['floor'].replace('', np.nan)

df_floor['floor'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor['floor'] = df_floor[floor[5]].fillna('') + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor['floor'] = df_floor['floor'].replace('', np.nan)


5620

In [108]:
df_numeric = df_numeric.drop(floor, axis=1) # drops 6 columns -> 3 remain
df_numeric["floor"] = df_floor['floor'] # adds 1 column -> 4
df_numeric.shape

(13378, 4)

In [109]:
df_numeric

Unnamed: 0,living_space,plot_area,floor_space,floor
0,100,0,0,4. floor
1,156,222,242,
2,93,0,0,2. floor
3,154,370,257,
4,142,0,0,Ground floor
...,...,...,...,...
13373,70,0,0,
13374,0,284,0,
13375,150,160,0,
13376,145,853,140,


# Cleaning and Parsing
## Floor

In [110]:
df_numeric["floor"].unique()

array(['4. floor', nan, '2. floor', 'Ground floor', '3. floor',
       '6. floor', '1. floor', '5. floor', '14. floor', '20. floor',
       '8. floor', '2. Basement', '7. floor', '15. floor', '10. floor',
       '11. floor', '4. Basement', '100. floor', '12. floor',
       '1. Basement', '21. floor', '9. floor', '3. Basement',
       '999. floor', '23. floor'], dtype=object)

In [111]:
import re

def parse_floor(x):
  if x != x:
    return np.nan
  elif x == "Ground floor":
    return 0
  elif re.search('\. floor', x):
    return re.search('\d+', x).group()
  elif re.search('Basement', x):
    return "-" + re.search('\d+', x).group()


df_numeric["floor"] = df_numeric["floor"].apply(parse_floor)
df_numeric["floor"].unique()

array(['4', nan, '2', 0, '3', '6', '1', '5', '14', '20', '8', '-2', '7',
       '15', '10', '11', '-4', '100', '12', '-1', '21', '9', '-3', '999',
       '23'], dtype=object)

In [112]:
df_numeric["floor"] = df_numeric["floor"].astype(float)
df_numeric["floor"].unique()

array([  4.,  nan,   2.,   0.,   3.,   6.,   1.,   5.,  14.,  20.,   8.,
        -2.,   7.,  15.,  10.,  11.,  -4., 100.,  12.,  -1.,  21.,   9.,
        -3., 999.,  23.])

In [113]:
df_numeric.std()

living_space    123.475275
plot_area       845.261158
floor_space     153.571314
floor            29.809057
dtype: float64

# Availability

In [114]:
availability = ["Availability", "Verfügbarkeit", "Disponibilité", "Disponibilità", "detail_responsive#available_from",  "Availability_merged"]
df_availability = df[availability]

df_availability.iloc[:, 0:5].count().cumsum() == df_availability.iloc[:, 5].count()

Availability                        False
Verfügbarkeit                       False
Disponibilité                       False
Disponibilità                        True
detail_responsive#available_from    False
dtype: bool

In [115]:
df_availability['availability'] = df_availability[availability[0]].fillna('') + \
  (df_availability[availability[1]]).fillna('') + \
  (df_availability[availability[2]]).fillna('') + \
  (df_availability[availability[3]]).fillna('')

(df_availability['availability'] == df_availability[availability[5]].fillna('')).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_availability['availability'] = df_availability[availability[0]].fillna('') + \


13378

In [116]:
df_availability['availability'] = df_availability[availability[5]].fillna('') + \
  (df_availability[availability[4]]).fillna('')

df_availability['availability'] = df_availability['availability'].replace('', np.nan)

df_availability['availability'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_availability['availability'] = df_availability[availability[5]].fillna('') + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_availability['availability'] = df_availability['availability'].replace('', np.nan)


13378

In [117]:
df_availability["availability"].unique()

array(['On request', 'Immediately', '30.12.2022', '01.12.2022',
       '01.04.2023', '01.08.2023', '01.10.2022', '01.11.2022',
       '01.09.2023', '01.07.2023', '07.07.2023', '22.10.2022',
       '01.02.2023', '01.06.2023', '31.10.2023', '01.01.2023',
       '01.05.2023', '01.12.2023', '01.10.2023', '30.11.2022',
       '31.12.2023', '20.03.2023', '19.01.2024', '01.03.2023',
       '01.05.2024', '15.08.2023', '31.12.2022', '31.03.2023',
       '30.06.2024', '01.02.2024', '31.07.2023', '02.01.2023',
       '15.10.2022', '11.11.2022', '30.11.2023', '01.04.2024',
       '01.12.2024', '30.09.2022', '01.04.2025', '01.10.2024',
       '01.07.2024', '01.11.2024', '15.12.2022', '01.06.2024',
       '01.01.2024', '01.11.2023', '25.01.2024', '24.06.2023',
       '26.10.2022', '28.02.2023', '15.09.2022', '30.09.2023',
       '30.01.2024', '03.04.2023', '15.02.2024', '01.04.2030',
       '30.04.2023', '05.09.2022', '03.10.2022', '31.05.2024',
       '31.05.2023', '31.03.2024', '30.12.2023', '16.1

In [118]:
df_numeric["availability"] = df_availability["availability"]

# Gross return

In [119]:
df["Gross return"].unique()

array([nan, '0.00 %', '4.5 %'], dtype=object)

In [120]:
df["Gross return"].count()

6

This column does not contain a whole lot of information, therefore we will not consider it for our analysis. 

# Rooms

In [121]:
def parse_rooms(x):
  pattern = '(\d+\.\d) rooms'
  match = re.search(pattern, x)
  if match is not None:
    result = match.group(1)
  else:
    result = np.NaN
  return result

df_numeric["rooms"] = df["details_structured"].apply(parse_rooms).astype("float")

# Price

In [122]:
df_numeric["price"] = df["price_cleaned"]

In [123]:
df_numeric

Unnamed: 0,living_space,plot_area,floor_space,floor,availability,rooms,price
0,100,0,0,4.0,On request,3.5,1150000.0
1,156,222,242,,On request,4.5,1420000.0
2,93,0,0,2.0,Immediately,2.5,720000.0
3,154,370,257,,On request,4.5,1430000.0
4,142,0,0,0.0,On request,4.5,995000.0
...,...,...,...,...,...,...,...
13373,70,0,0,,On request,2.5,1101000.0
13374,0,284,0,,On request,,1750000.0
13375,150,160,0,,On request,6.5,1415000.0
13376,145,853,140,,Immediately,,1465000.0
