In [25]:
import pandas as pd
import numpy as np

In [26]:
# Read the raw listings CSV: the first column becomes the index and the
# literal '-1' is additionally parsed as a missing value.
df = pd.read_csv(
    '../data/raw/otodom.csv',
    index_col=0,
    na_values='-1',
)
df.head()

Unnamed: 0,title,address,price,price_m2,area_m2,rooms,floor,outdoors,parking,market,build_yr,building_type,elevator,heating_type
0,"REZERWACJA Słoneczne, duże, 4 pokoje przy lesie","Gdynia, Pustki Cisowskie-Demptowo, Pustki Ciso...",530 000 zł,6 625 zł/m²,80,4,Piętro\n3/3,balkon,zapytaj,wtórny,1981,blok,nie,miejskie
1,Komfortowe Mieszkanie Port Deco M4,"Gdynia, Oksywie, ul. Benisławskiego",477 037 zł,10 982 zł/m²,4344,2,Piętro\nparter/11,balkon,garaż/miejsce parkingowe,pierwotny,2023,brak,tak,zapytaj
2,Apartament z ogródkiem w otulinie lasu,"Gdynia, Orłowo, Bernadowska",1 600 000 zł,24 375 zł/m²,6564,3,Piętro\nparter/2,zapytaj,zapytaj,wtórny,2018,brak,nie,zapytaj
3,Elegancki apartament z widokiem na las,"Gdynia, Orłowo, Bernadowska",3 399 000 zł,34 571 zł/m²,9832,4,Piętro\n1/2,balkon,garaż/miejsce parkingowe,wtórny,2020,brak,nie,zapytaj
4,Osiedle Beauforta - słoneczne mieszkanie,"Gdynia, Pogórze",522 950 zł,8 900 zł/m²,5876,3,Piętro\n1/3,balkon,garaż/miejsce parkingowe,wtórny,2022,apartamentowiec,tak,miejskie


In [27]:
# Drop the unnecessary 'title' column.
# Re-binding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second run no longer raises KeyError
# because 'title' has already been removed.
df = df.drop(columns='title', errors='ignore')

# Turn the strings 'zapytaj' and 'brak' into NaN, in a single pass over the
# frame, so that they are counted as missing values.
df = df.replace(['zapytaj', 'brak'], np.nan)

In [28]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1927 entries, 0 to 1926
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   address        1922 non-null   object
 1   price          1924 non-null   object
 2   price_m2       1753 non-null   object
 3   area_m2        1925 non-null   object
 4   rooms          1925 non-null   object
 5   floor          1925 non-null   object
 6   outdoors       1349 non-null   object
 7   parking        1086 non-null   object
 8   market         1842 non-null   object
 9   build_yr       1608 non-null   object
 10  building_type  1361 non-null   object
 11  elevator       1842 non-null   object
 12  heating_type   1303 non-null   object
dtypes: object(13)
memory usage: 210.8+ KB


### Clean columns and convert them to numeric types

In [29]:
# area: swap the decimal comma for a dot so the text can be cast to float
area_text = df["area_m2"].str.replace(',', '.')
df["area_m2"] = area_text.astype(float)

# rooms: keep only entries whose text form consists purely of decimal digits;
# everything else (e.g. the string "wiecej") becomes NaN before the float cast
rooms_is_number = df['rooms'].astype(str).str.isdecimal()
df["rooms"] = df['rooms'].where(rooms_is_number).astype(float)

All prices per square metre are given in zł/m².

In [30]:
# price per square metre -> float:
# cut off the last five characters, remove the spaces, then cast
price_m2_text = df['price_m2'].str.slice(stop=-5)
df['price_m2'] = price_m2_text.str.replace(' ', '').astype(float)

In [31]:
# total price: entries equal to 'Zapytaj o cenę' are replaced with NaN, then the
# text is normalised -- 'zł' becomes the three-letter 'pln', spaces are removed
# and decimal commas become dots
price_text = (
    df['price']
    .replace('Zapytaj o cenę', np.nan)
    .str.replace('zł', 'pln')
    .str.replace(' ', '')
    .str.replace(',', '.')
)

# helping column: the last three characters, lower-cased (NaN stays NaN)
df['currency'] = price_text.str[-3:].str.lower()
# the price keeps everything in front of those three characters (still text)
df['price'] = price_text.str[:-3]

In [32]:
# Listings priced in EUR: only 3 apartments in the dataset. For these rows we
# still have the price per square metre in PLN and the apartment area.
df.loc[df['currency'].eq('eur')]

Unnamed: 0,address,price,price_m2,area_m2,rooms,floor,outdoors,parking,market,build_yr,building_type,elevator,heating_type,currency
1055,"Gdynia, Śródmieście, A. Hryniewickiego",1000000,38070.0,121.0,4.0,Piętro\n> 10/28,,,wtórny,,,nie,,eur
1462,"Gdynia, Śródmieście, A. Hryniewickiego",1000000,38070.0,121.0,4.0,Piętro\n> 10/29,,,wtórny,2009.0,,nie,,eur
1698,"Gdynia, Orłowo",1000000,33996.0,135.5,4.0,Piętro\n1/2,balkon,garaż/miejsce parkingowe,pierwotny,2021.0,apartamentowiec,tak,gazowe,eur


In [33]:
# change price from EUR to PLN: total = price per square metre * area,
# written only into the rows whose currency is 'eur'
eur_rows = df['currency'] == 'eur'
df.loc[eur_rows, 'price'] = df.loc[eur_rows, 'price_m2'] * df.loc[eur_rows, 'area_m2']
df.drop(columns='currency', inplace=True)  # the helping column is no longer needed