In [237]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None, "display.max_rows", 100)

df = pd.read_csv("../data/immoscout_cleaned_lat_lon_fixed_v9.csv")

col_names = df.columns.array
col_names[0:2] = ["Index1", "Index2"]
df.columns = col_names

numeric_columns = ["Living space", "Plot area", "Floor space", "Floor", "detail_responsive#surface_living", "detail_responsive#floor", "Wohnfläche", "Stockwerk", "Nutzfläche", "Grundstücksfläche", "detail_responsive#surface_property", "detail_responsive#surface_usable", "Surface habitable", "Surface du terrain", "Surface utile", "Étage", "Superficie abitabile", "Piano", "Superficie del terreno", "Superficie utile", "Floor_merged", "Living_space_merged", "Floor_space_merged", "Plot_area_merged"]
df_numeric = df[numeric_columns]

df_numeric.shape

  df = pd.read_csv("../data/immoscout_cleaned_lat_lon_fixed_v9.csv")


(13378, 24)

In this part of the data analysis we focus on the numeric variables. For each one of them we find a "_merged" column in the dataset. How this column has been merged however is unclear, therefore we cannot assume that this column contains all the data.  
Therefore we split by values containing information on "Living Space", "Plot Area", "Floor Space" and "Floor".

## Living Space

In [238]:
living_space = ["Living space", "Wohnfläche", "Surface habitable", "Superficie abitabile", "detail_responsive#surface_living", "Living_space_merged"]
df_living_space = df_numeric[living_space]

To get a sense of how the "merged" column was created, we inspect the cumulative sum of the value counts per column and check it against the "merged"-count. 

In [239]:
df_living_space.iloc[:,0:5].count().cumsum() == df_living_space[living_space[5]].count()

Living space                        False
Wohnfläche                          False
Surface habitable                   False
Superficie abitabile                 True
detail_responsive#surface_living    False
dtype: bool

This suggests, that the column "Living_space_merged" contains the information of all the columns excluding "detail_responsive#surface_living". 

Let's see, if this is actually true.

First, we merge the relevant columns into one

In [240]:
df_living_space['living_space'] = df_living_space[living_space[0]].fillna('') + \
  (df_living_space[living_space[1]]).fillna('') + \
  (df_living_space[living_space[2]]).fillna('') + \
  (df_living_space[living_space[3]]).fillna('')

(df_living_space['living_space'] == df_living_space[living_space[5]].fillna('')).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_living_space['living_space'] = df_living_space[living_space[0]].fillna('') + \


13378

Counting the True values of the above condition yields the same count as the column count of the dataset meaning that the columns are identical.

The suggestion from the cumulative sum has therefore been proven correct!
Now to get one column with the full information, we merge the "Living_space_merged" column with "detail_responsive#surface_living".

In [241]:
df_living_space['living_space'] = df_living_space[living_space[5]].fillna('') + \
  (df_living_space[living_space[4]]).fillna('')

df_living_space['living_space'] = df_living_space['living_space'].replace('', np.nan)

df_living_space['living_space'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_living_space['living_space'] = df_living_space[living_space[5]].fillna('') + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_living_space['living_space'] = df_living_space['living_space'].replace('', np.nan)


12304

We now have 12304 observations for living space in our dataset. Nice! Let's transfer this back into the numeric subset of the dataframe and drop the unused columns.

In [242]:
df_numeric = df_numeric.drop(living_space, axis=1) # drops 6 columns -> 18 remain
df_numeric["living_space"] = df_living_space['living_space'] # adds 1 column -> 19
df_numeric.shape

(13378, 19)

We then rinse and repeat this process for all data with a "_merged" column present in the dataset. 

## Plot Area

In [243]:
plot_area = ["Plot area", "Grundstücksfläche", "Surface du terrain", "Superficie del terreno", "detail_responsive#surface_property", "Plot_area_merged"]
df_plot_area = df_numeric[plot_area]

df_plot_area[plot_area[0:5]].count().cumsum() == df_plot_area[plot_area[5]].count()

Plot area                             False
Grundstücksfläche                     False
Surface du terrain                    False
Superficie del terreno                 True
detail_responsive#surface_property    False
dtype: bool

In [244]:
df_plot_area['plot_area'] = df_plot_area[plot_area[0]].fillna('') + \
  (df_plot_area[plot_area[1]]).fillna('') + \
  (df_plot_area[plot_area[2]]).fillna('') + \
  (df_plot_area[plot_area[3]]).fillna('')

(df_plot_area['plot_area'] == df_plot_area[plot_area[5]].fillna('')).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_plot_area['plot_area'] = df_plot_area[plot_area[0]].fillna('') + \


13378

In [245]:
df_plot_area['plot_area'] = df_plot_area[plot_area[5]].fillna('') + \
  (df_plot_area[plot_area[4]]).fillna('')

df_plot_area['plot_area'] = df_plot_area['plot_area'].replace('', np.nan)

df_plot_area['plot_area'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_plot_area['plot_area'] = df_plot_area[plot_area[5]].fillna('') + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_plot_area['plot_area'] = df_plot_area['plot_area'].replace('', np.nan)


4953

In [246]:
df_numeric = df_numeric.drop(plot_area, axis=1) # drops 6 columns -> 13 remain
df_numeric["plot_area"] = df_plot_area['plot_area'] # adds 1 column -> 14
df_numeric.shape

(13378, 14)

## Floor Space

In [247]:
floor_space = ["Floor space", "Nutzfläche", "Surface utile", "Superficie utile", "detail_responsive#surface_usable", "Floor_space_merged"]
df_floor_space = df_numeric[floor_space]

df_floor_space[floor_space[0:5]].count().cumsum() == df_floor_space[floor_space[5]].count()

Floor space                         False
Nutzfläche                          False
Surface utile                       False
Superficie utile                     True
detail_responsive#surface_usable    False
dtype: bool

In [248]:
df_floor_space['floor_space'] = df_floor_space[floor_space[0]].fillna('') + \
  (df_floor_space[floor_space[1]]).fillna('') + \
  (df_floor_space[floor_space[2]]).fillna('') + \
  (df_floor_space[floor_space[3]]).fillna('')

(df_floor_space['floor_space'] == df_floor_space[floor_space[5]].fillna('')).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor_space['floor_space'] = df_floor_space[floor_space[0]].fillna('') + \


13378

In [249]:
df_floor_space['floor_space'] = df_floor_space[floor_space[0]].fillna('') + \
  (df_floor_space[floor_space[1]]).fillna('') + \
  (df_floor_space[floor_space[2]]).fillna('') + \
  (df_floor_space[floor_space[3]]).fillna('')

df_floor_space[df_floor_space['floor_space'] != '']['floor_space'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor_space['floor_space'] = df_floor_space[floor_space[0]].fillna('') + \


2842

In [250]:
df_floor_space['floor_space'] = df_floor_space[floor_space[5]].fillna('') + \
  (df_floor_space[floor_space[4]]).fillna('')

df_floor_space['floor_space'] = df_floor_space['floor_space'].replace('', np.nan)

df_floor_space['floor_space'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor_space['floor_space'] = df_floor_space[floor_space[5]].fillna('') + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor_space['floor_space'] = df_floor_space['floor_space'].replace('', np.nan)


2953

In [251]:
df_numeric = df_numeric.drop(floor_space, axis=1) # drops 6 columns -> 8 remain
df_numeric["floor_space"] = df_floor_space['floor_space'] # adds 1 column -> 9
df_numeric.shape

(13378, 9)

## Floor

In [252]:
floor = ["Floor", "Stockwerk", "Étage", "Piano", "detail_responsive#floor", "Floor_merged"]
df_floor = df_numeric[floor]

df_floor[floor[0:5]].count().cumsum() == df_floor[floor[5]].count()

Floor                      False
Stockwerk                  False
Étage                      False
Piano                       True
detail_responsive#floor    False
dtype: bool

In [253]:
df_floor['floor'] = df_floor[floor[0]].fillna('') + \
  (df_floor[floor[1]]).fillna('') + \
  (df_floor[floor[2]]).fillna('') + \
  (df_floor[floor[3]]).fillna('')

(df_floor['floor'] == df_floor[floor[5]].fillna('')).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor['floor'] = df_floor[floor[0]].fillna('') + \


13378

In [254]:
df_floor['floor'] = df_floor[floor[0]].fillna('') + \
  (df_floor[floor[1]]).fillna('') + \
  (df_floor[floor[2]]).fillna('') + \
  (df_floor[floor[3]]).fillna('')

df_floor[df_floor['floor'] != '']['floor'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor['floor'] = df_floor[floor[0]].fillna('') + \


5414

In [255]:
df_floor['floor'] = df_floor[floor[5]].fillna('') + \
  (df_floor[floor[4]]).fillna('')

df_floor['floor'] = df_floor['floor'].replace('', np.nan)

df_floor['floor'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor['floor'] = df_floor[floor[5]].fillna('') + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_floor['floor'] = df_floor['floor'].replace('', np.nan)


5620

In [256]:
df_numeric = df_numeric.drop(floor, axis=1) # drops 6 columns -> 3 remain
df_numeric["floor"] = df_floor['floor'] # adds 1 column -> 4
df_numeric.shape

(13378, 4)

In [267]:
df_numeric

Unnamed: 0,living_space,plot_area,floor_space,floor
0,100 m²,,,4. floor
1,156 m²,222 m²,242 m²,
2,93 m²,,,2. floor
3,154 m²,370 m²,257 m²,
4,142 m²,,,Ground floor
...,...,...,...,...
13373,70 m²,,,
13374,,284 m²,,
13375,150 m²,160 m²,,
13376,145 m²,853 m²,140 m²,


# Cleaning and Parsing
Now that we have shrunken down the 24 columns to only 4, we can parse the values and assign the correct data types.

## Living Space, Plot Area and Floor Space

In [273]:
df_numeric["living_space"] = df_numeric.living_space.str.extract('(\d+)').fillna(0).astype(int)
df_numeric["plot_area"] = df_numeric.plot_area.str.extract('(\d+)').fillna(0).astype(int)
df_numeric["floor_space"] = df_numeric.floor_space.str.extract('(\d+)').fillna(0).astype(int)


In [276]:
df_numeric.head()

Unnamed: 0,living_space,plot_area,floor_space,floor
0,100,0,0,4. floor
1,156,222,242,
2,93,0,0,2. floor
3,154,370,257,
4,142,0,0,Ground floor


In [275]:
df_numeric.dtypes

living_space     int64
plot_area        int64
floor_space      int64
floor           object
dtype: object

In [321]:
df_numeric["floor"].unique()

array(['4. floor', nan, '2. floor', 'Ground floor', '3. floor',
       '6. floor', '1. floor', '5. floor', '14. floor', '20. floor',
       '8. floor', '2. Basement', '7. floor', '15. floor', '10. floor',
       '11. floor', '4. Basement', '100. floor', '12. floor',
       '1. Basement', '21. floor', '9. floor', '3. Basement',
       '999. floor', '23. floor'], dtype=object)

In [323]:
import re

def parse_floor(x):
  if x != x:
    return np.nan
  elif x == "Ground floor":
    return 0
  elif re.search('\. floor', x):
    return re.search('\d+', x).group()
  elif re.search('Basement', x):
    return "-" + re.search('\d+', x).group()


df_numeric["floor"] = df_numeric["floor"].apply(parse_floor)
df_numeric["floor"].unique()

  

array(['4', nan, '2', 0, '3', '6', '1', '5', '14', '20', '8', '-2', '7',
       '15', '10', '11', '-4', '100', '12', '-1', '21', '9', '-3', '999',
       '23'], dtype=object)

In [327]:
df_numeric["floor"] = df_numeric["floor"].astype(float)
df_numeric["floor"].unique()

array([  4.,  nan,   2.,   0.,   3.,   6.,   1.,   5.,  14.,  20.,   8.,
        -2.,   7.,  15.,  10.,  11.,  -4., 100.,  12.,  -1.,  21.,   9.,
        -3., 999.,  23.])

In [331]:
df_numeric.std()

living_space    123.475275
plot_area       845.261158
floor_space     153.571314
floor            29.809057
dtype: float64