In [1]:
import requests
import pandas as pd
import time

# Seminar - APIs, DBs and Live coding

## Task 1: Requesting API
### 1a. Create a function requesting data from sreality


```python
base_url = 'https://www.sreality.cz/api/cs/v2/estates?category_main_cb=1&category_type_cb=1&locality_region_id=10&per_page60&page={}'.format(i)

r = requests.get(base_url)
d = r.json()
```

* function should parametrize: 
    * `category_main_cb` - `{'flat':1, 'house':2, 'land':3 }`
    * `category_type_cb` - `{'sell':1,'rent':2}`
    * `locality_region_id` - use 10 as default value
    * `page` parameter
* use string inputs for `category_main_cb` and `category_type_cb`
* include try/except clause to handle errors
* function should return JSON data in python types
* do not forget to sleep each request at least 0.5s

In [2]:
%%time
print('Hi')

Hi
CPU times: total: 0 ns
Wall time: 0 ns


In [3]:
%%time
time.sleep(5)

CPU times: total: 0 ns
Wall time: 5 s


In [4]:
%%time
time.sleep(1)

CPU times: total: 0 ns
Wall time: 1 s


In [5]:
def request_sreality(page, category_main_str, category_type_str, locality_region_id=10):
    time.sleep(0.5)
    category_mains = {'flat':1, 'house':2, 'land':3 }
    category_types = {'sell':1,'rent':2}
    template_url = 'https://www.sreality.cz/api/cs/v2/estates?category_main_cb={category_main}&category_type_cb={category_type}&locality_region_id={locality_region_id}&per_page60&page={page}'
    try:
        request_url = template_url.format(
            category_main=category_mains[category_main_str],
            category_type=category_types[category_type_str],
            locality_region_id=locality_region_id,
            page=page
        )
        r = requests.get(request_url)
        return r.json()
    except Exception as e:
        print(e)
d = request_sreality(0, 'flat', 'sell', 10)

In [6]:
d.keys()

dict_keys(['meta_description', 'result_size', '_embedded', 'filterLabels', 'title', 'filter', '_links', 'locality', 'locality_dativ', 'logged_in', 'per_page', 'category_instrumental', 'page', 'filterLabels2'])

In [7]:
d['meta_description']

'4460 realit v nabídce prodej bytů Praha. Vyberte si novou nemovitost na sreality.cz s hledáním na mapě a velkými náhledy fotografií nabízených bytů.'

In [8]:
d['result_size']

4460

In [9]:
d['_embedded'].keys()

dict_keys(['estates', 'is_saved', 'not_precise_location_count'])

In [10]:
d

{'meta_description': '4460 realit v nabídce prodej bytů Praha. Vyberte si novou nemovitost na sreality.cz s hledáním na mapě a velkými náhledy fotografií nabízených bytů.',
 'result_size': 4460,
 '_embedded': {'estates': [{'labelsReleased': [['new_building',
      'parking_lots',
      'garage'],
     []],
    'has_panorama': 0,
    'labels': ['Novostavba', 'Parkování', 'Garáž'],
    'is_auction': False,
    'labelsAll': [['new_building',
      'personal',
      'balcony',
      'cellar',
      'elevator',
      'parking_lots',
      'garage',
      'partly_furnished'],
     ['candy_shop',
      'small_shop',
      'theater',
      'vet',
      'tavern',
      'movies',
      'playground',
      'sightseeing',
      'tram',
      'bus_public_transport',
      'post_office',
      'school',
      'metro',
      'shop',
      'atm',
      'kindergarten',
      'drugstore',
      'sports',
      'medic',
      'restaurant',
      'train']],
    'seo': {'category_main_cb': 1,
     'categor

### 1b. Create a function converting sreality json data into pandas dataframe

In [26]:
len(d['_embedded']['estates'])

21

In [37]:
len(d['_embedded']['estates'][4].keys())

27

In [11]:
data_lists = [{'a':44, 'b':443},{'a':14, 'b':4454},{'a':45554, 'b':55553}]
pd.DataFrame(data_lists)

Unnamed: 0,a,b
0,44,443
1,14,4454
2,45554,55553


In [12]:
def convert_sreality_data_to_df(sreality_data):
    return pd.DataFrame(sreality_data['_embedded']['estates'])

raw = convert_sreality_data_to_df(d)

In [13]:
raw.head()

Unnamed: 0,labelsReleased,has_panorama,labels,is_auction,labelsAll,seo,exclusively_at_rk,category,has_floor_plan,_embedded,...,hash_id,attractive_offer,price,price_czk,_links,rus,name,region_tip,gps,has_matterport_url
0,"[[new_building, parking_lots, garage], []]",0,"[Novostavba, Parkování, Garáž]",False,"[[new_building, personal, balcony, cellar, ele...","{'category_main_cb': 1, 'category_sub_cb': 4, ...",1,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,91751756,0,10047000,"{'value_raw': 10047000, 'unit': '', 'name': 'C...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 2+kk 81 m²,2826899,"{'lat': 50.07365718253041, 'lon': 14.456714817...",False
1,"[[parking_lots], [post_office, medic]]",0,"[Parkování, Pošta 6 min. pěšky, Lékař 11 min. ...",False,"[[personal, balcony, brick, parking_lots, furn...","{'category_main_cb': 1, 'category_sub_cb': 5, ...",1,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,1756878156,0,10823000,"{'value_raw': 10823000, 'unit': '', 'name': 'C...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 2+1 92 m²,0,"{'lat': 50.055205182530415, 'lon': 14.40737481...",True
2,"[[], []]",0,[],False,"[[personal, brick, cellar], [theater, vet, can...","{'category_main_cb': 1, 'category_sub_cb': 6, ...",1,1,0,"{'favourite': {'is_favourite': False, '_links'...",...,3097630028,0,9317000,"{'value_raw': 9317000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 3+kk 67 m²,0,"{'lat': 50.05478218253042, 'lon': 14.400988817...",False
3,"[[panel], [shop]]",0,"[Panelová, Obchod 7 min. pěšky]",False,"[[personal, panel, cellar, elevator], [vet, sm...","{'category_main_cb': 1, 'category_sub_cb': 7, ...",0,1,0,"{'favourite': {'is_favourite': False, '_links'...",...,506537292,0,8537000,"{'value_raw': 8537000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 3+1 79 m²,0,"{'lat': 50.11638918253041, 'lon': 14.488324817...",False
4,"[[new_building], [metro]]",0,"[Novostavba, Metro 7 min. pěšky]",False,"[[new_building, personal, balcony, brick, elev...","{'category_main_cb': 1, 'category_sub_cb': 2, ...",1,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,598561868,0,5406000,"{'value_raw': 5406000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 34 m²,0,"{'lat': 50.03676518253042, 'lon': 14.310620817...",False


### 1c. link function `1b` into function `1a`

In [14]:
def request_sreality(page, category_main_str, category_type_str, locality_region_id=10):
    category_mains = {'flat':1, 'house':2, 'land':3 }
    category_types = {'sell':1,'rent':2}
    template_url = 'https://www.sreality.cz/api/cs/v2/estates?category_main_cb={category_main}&category_type_cb={category_type}&locality_region_id={locality_region_id}&per_page60&page={page}'
    
    request_url = template_url.format(
        category_main=category_mains[category_main_str],
        category_type=category_types[category_type_str],
        locality_region_id=locality_region_id,
        page=page
    )
    
    try: 
        r = requests.get(request_url)
        return convert_sreality_data_to_df(r.json())
    except Exception as e:
        print(f'error requesting url {request_url}. Reason: {e.message}')
    
df = request_sreality(0, 'flat', 'sell', 10)
df.head()

Unnamed: 0,labelsReleased,has_panorama,labels,is_auction,labelsAll,seo,exclusively_at_rk,category,has_floor_plan,_embedded,...,hash_id,attractive_offer,price,price_czk,_links,rus,name,region_tip,gps,has_matterport_url
0,"[[loggia, cellar, parking_lots], [school]]",0,"[Lodžie, Sklep, Parkování, Škola 1 min. pěšky]",False,"[[personal, loggia, cellar, parking_lots, part...","{'category_main_cb': 1, 'category_sub_cb': 2, ...",1,1,0,"{'favourite': {'is_favourite': False, '_links'...",...,1815491916,0,4316000,"{'value_raw': 4316000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 29 m²,0,"{'lat': 50.038989182530415, 'lon': 14.47748981...",False
1,"[[furnished], [bus_public_transport]]",0,"[Vybavený, Bus 4 min. pěšky]",False,"[[new_building, personal, brick, elevator, fur...","{'category_main_cb': 1, 'category_sub_cb': 2, ...",1,1,0,"{'favourite': {'is_favourite': False, '_links'...",...,2420491596,0,8006000,"{'value_raw': 8006000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 69 m²,0,"{'lat': 50.08086718253041, 'lon': 14.309168817...",False
2,"[[balcony], []]",0,[Balkon],False,"[[personal, balcony, elevator, garage, not_fur...","{'category_main_cb': 1, 'category_sub_cb': 4, ...",0,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,4054193484,0,9632000,"{'value_raw': 9632000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 2+kk 50 m²,0,"{'lat': 50.08985818253041, 'lon': 14.459570817...",False
3,"[[], []]",0,[],False,"[[personal, elevator, garage, not_furnished], []]","{'category_main_cb': 1, 'category_sub_cb': 2, ...",0,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,2544244044,0,6165000,"{'value_raw': 6165000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 30 m²,0,"{'lat': 50.08985818253041, 'lon': 14.459570817...",False
4,"[[loggia], []]",0,[Lodžie],False,"[[personal, loggia, elevator, garage, not_furn...","{'category_main_cb': 1, 'category_sub_cb': 6, ...",0,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,2882934092,0,11907000,"{'value_raw': 11907000, 'unit': '', 'name': 'C...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 3+kk 72 m²,0,"{'lat': 50.08985818253041, 'lon': 14.459570817...",False


### 1c. Combining multiple requests into single df

* Function should parametrize:
    * `start_page` and `end_page`
    * request parameters
* construct a list of individual request dfs
* then feed it into `pd.concat` function

In [15]:
raw.shape

(21, 27)

In [16]:
request_sreality

<function __main__.request_sreality(page, category_main_str, category_type_str, locality_region_id=10)>

In [24]:
def request_multiply_sreality(start_page, end_page, category_main_str, category_type_str, locality_region_id=10):
    pages = range(start_page, end_page + 1)
    list_of_dfs = [request_sreality(page, category_main_str, category_type_str, locality_region_id) for page in pages]
    return pd.concat(list_of_dfs)

df = request_multiply_sreality(1, 5, 'flat', 'sell',10)
df.shape
df.head()

Unnamed: 0,labelsReleased,has_panorama,labels,is_auction,labelsAll,seo,exclusively_at_rk,category,has_floor_plan,_embedded,...,hash_id,attractive_offer,price,price_czk,_links,rus,name,region_tip,gps,has_matterport_url
0,"[[balcony, cellar, parking_lots], [school, res...",0,"[Balkon, Sklep, Parkování, Škola 2 min. pěšky,...",False,"[[new_building, personal, balcony, cellar, ele...","{'category_main_cb': 1, 'category_sub_cb': 2, ...",1,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,1433929036,0,7623000,"{'value_raw': 7623000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 46 m²,2826899,"{'lat': 50.073214182530414, 'lon': 14.45730481...",False
1,"[[loggia, cellar, parking_lots], [school]]",0,"[Lodžie, Sklep, Parkování, Škola 1 min. pěšky]",False,"[[personal, loggia, cellar, parking_lots, part...","{'category_main_cb': 1, 'category_sub_cb': 2, ...",1,1,0,"{'favourite': {'is_favourite': False, '_links'...",...,1815491916,0,4316000,"{'value_raw': 4316000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 29 m²,0,"{'lat': 50.038989182530415, 'lon': 14.47748981...",False
2,"[[furnished], [bus_public_transport]]",0,"[Vybavený, Bus 4 min. pěšky]",False,"[[new_building, personal, brick, elevator, fur...","{'category_main_cb': 1, 'category_sub_cb': 2, ...",1,1,0,"{'favourite': {'is_favourite': False, '_links'...",...,2420491596,0,8006000,"{'value_raw': 8006000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 69 m²,0,"{'lat': 50.08086718253041, 'lon': 14.309168817...",False
3,"[[balcony], []]",0,[Balkon],False,"[[personal, balcony, elevator, garage, not_fur...","{'category_main_cb': 1, 'category_sub_cb': 4, ...",0,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,4054193484,0,9632000,"{'value_raw': 9632000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 2+kk 50 m²,0,"{'lat': 50.08985818253041, 'lon': 14.459570817...",False
4,"[[], []]",0,[],False,"[[personal, elevator, garage, not_furnished], []]","{'category_main_cb': 1, 'category_sub_cb': 2, ...",0,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,2544244044,0,6165000,"{'value_raw': 6165000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 30 m²,0,"{'lat': 50.08985818253041, 'lon': 14.459570817...",False


In [25]:
df = df.reset_index().drop('index', axis=1)
df

Unnamed: 0,labelsReleased,has_panorama,labels,is_auction,labelsAll,seo,exclusively_at_rk,category,has_floor_plan,_embedded,...,hash_id,attractive_offer,price,price_czk,_links,rus,name,region_tip,gps,has_matterport_url
0,"[[balcony, cellar, parking_lots], [school, res...",0,"[Balkon, Sklep, Parkování, Škola 2 min. pěšky,...",False,"[[new_building, personal, balcony, cellar, ele...","{'category_main_cb': 1, 'category_sub_cb': 2, ...",1,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,1433929036,0,7623000,"{'value_raw': 7623000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 46 m²,2826899,"{'lat': 50.073214182530414, 'lon': 14.45730481...",False
1,"[[loggia, cellar, parking_lots], [school]]",0,"[Lodžie, Sklep, Parkování, Škola 1 min. pěšky]",False,"[[personal, loggia, cellar, parking_lots, part...","{'category_main_cb': 1, 'category_sub_cb': 2, ...",1,1,0,"{'favourite': {'is_favourite': False, '_links'...",...,1815491916,0,4316000,"{'value_raw': 4316000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 29 m²,0,"{'lat': 50.038989182530415, 'lon': 14.47748981...",False
2,"[[furnished], [bus_public_transport]]",0,"[Vybavený, Bus 4 min. pěšky]",False,"[[new_building, personal, brick, elevator, fur...","{'category_main_cb': 1, 'category_sub_cb': 2, ...",1,1,0,"{'favourite': {'is_favourite': False, '_links'...",...,2420491596,0,8006000,"{'value_raw': 8006000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 69 m²,0,"{'lat': 50.08086718253041, 'lon': 14.309168817...",False
3,"[[balcony], []]",0,[Balkon],False,"[[personal, balcony, elevator, garage, not_fur...","{'category_main_cb': 1, 'category_sub_cb': 4, ...",0,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,4054193484,0,9632000,"{'value_raw': 9632000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 2+kk 50 m²,0,"{'lat': 50.08985818253041, 'lon': 14.459570817...",False
4,"[[], []]",0,[],False,"[[personal, elevator, garage, not_furnished], []]","{'category_main_cb': 1, 'category_sub_cb': 2, ...",0,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,2544244044,0,6165000,"{'value_raw': 6165000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 30 m²,0,"{'lat': 50.08985818253041, 'lon': 14.459570817...",False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,"[[], []]",0,[],False,"[[personal, after_reconstruction, brick], [sma...","{'category_main_cb': 1, 'category_sub_cb': 4, ...",0,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,643286348,0,7308000,"{'value_raw': 7308000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 2+kk 58 m²,0,"{'lat': 50.08401418253042, 'lon': 14.359065817...",False
101,"[[terrace, not_furnished], []]",0,"[Terasa, Nevybavený]",False,"[[personal, terrace, brick, cellar, not_furnis...","{'category_main_cb': 1, 'category_sub_cb': 2, ...",0,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,1421329740,0,6074000,"{'value_raw': 6074000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 50 m²,0,"{'lat': 50.08401418253042, 'lon': 14.359065817...",False
102,"[[], []]",0,[],False,"[[personal, after_reconstruction, brick, cella...","{'category_main_cb': 1, 'category_sub_cb': 2, ...",0,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,2594686284,0,7601000,"{'value_raw': 7601000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 58 m²,0,"{'lat': 50.08401418253042, 'lon': 14.359065817...",False
103,"[[not_furnished], []]",0,[Nevybavený],False,"[[personal, after_reconstruction, brick, cella...","{'category_main_cb': 1, 'category_sub_cb': 4, ...",0,1,0,"{'favourite': {'is_favourite': False, '_links'...",...,2644981068,0,6890000,"{'alt': {'value_raw': 130000, 'unit': 'za m²'}...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 2+kk 57 m²,0,"{'lat': 50.08401418253042, 'lon': 14.359065817...",False


## Task 2: Cleaning data

### 2a. Filter columns
* filter only columns: `['locality', 'price', 'name', 'gps','hash_id','exclusively_at_rk']`
* use `.copy()` to avoid `SettingWithCopyWarning` later


In [26]:
clean = df[['locality', 'price', 'name', 'gps','hash_id','exclusively_at_rk']].copy()
clean

Unnamed: 0,locality,price,name,gps,hash_id,exclusively_at_rk
0,Praha 3 - Žižkov,7623000,Prodej bytu 1+kk 46 m²,"{'lat': 50.073214182530414, 'lon': 14.45730481...",1433929036,1
1,Praha 4 - Michle,4316000,Prodej bytu 1+kk 29 m²,"{'lat': 50.038989182530415, 'lon': 14.47748981...",1815491916,1
2,Praha 6 - Ruzyně,8006000,Prodej bytu 1+kk 69 m²,"{'lat': 50.08086718253041, 'lon': 14.309168817...",2420491596,1
3,Praha 7 - Holešovice,9632000,Prodej bytu 2+kk 50 m²,"{'lat': 50.08985818253041, 'lon': 14.459570817...",4054193484,0
4,Praha 7 - Holešovice,6165000,Prodej bytu 1+kk 30 m²,"{'lat': 50.08985818253041, 'lon': 14.459570817...",2544244044,0
...,...,...,...,...,...,...
100,Praha 6 - Veleslavín,7308000,Prodej bytu 2+kk 58 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",643286348,0
101,Praha 6 - Veleslavín,6074000,Prodej bytu 1+kk 50 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",1421329740,0
102,Praha 6 - Veleslavín,7601000,Prodej bytu 1+kk 58 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",2594686284,0
103,Praha 6 - Veleslavín,6890000,Prodej bytu 2+kk 57 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",2644981068,0


### 2b: GPS
* Convert dictionary in `gps` column into two columns - `lat` and `lon`
* use apply function on gps column
* Note apply can return multiple columns

In [28]:
clean[['lat', 'lon']] = clean.gps.apply(lambda x: pd.Series({'lat': x['lat'], 'lon': x['lon']}))

In [29]:
clean

Unnamed: 0,locality,price,name,gps,hash_id,exclusively_at_rk,lat,lon
0,Praha 3 - Žižkov,7623000,Prodej bytu 1+kk 46 m²,"{'lat': 50.073214182530414, 'lon': 14.45730481...",1433929036,1,50.073214,14.457305
1,Praha 4 - Michle,4316000,Prodej bytu 1+kk 29 m²,"{'lat': 50.038989182530415, 'lon': 14.47748981...",1815491916,1,50.038989,14.477490
2,Praha 6 - Ruzyně,8006000,Prodej bytu 1+kk 69 m²,"{'lat': 50.08086718253041, 'lon': 14.309168817...",2420491596,1,50.080867,14.309169
3,Praha 7 - Holešovice,9632000,Prodej bytu 2+kk 50 m²,"{'lat': 50.08985818253041, 'lon': 14.459570817...",4054193484,0,50.089858,14.459571
4,Praha 7 - Holešovice,6165000,Prodej bytu 1+kk 30 m²,"{'lat': 50.08985818253041, 'lon': 14.459570817...",2544244044,0,50.089858,14.459571
...,...,...,...,...,...,...,...,...
100,Praha 6 - Veleslavín,7308000,Prodej bytu 2+kk 58 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",643286348,0,50.084014,14.359066
101,Praha 6 - Veleslavín,6074000,Prodej bytu 1+kk 50 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",1421329740,0,50.084014,14.359066
102,Praha 6 - Veleslavín,7601000,Prodej bytu 1+kk 58 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",2594686284,0,50.084014,14.359066
103,Praha 6 - Veleslavín,6890000,Prodej bytu 2+kk 57 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",2644981068,0,50.084014,14.359066


### 2b. Get flat type from name
* Name is always represented by string `Prodej bytu [type of flat] [Area] m^2`
* try picking third word in string
* check meaningfulness using `.value_counts()`

In [30]:
clean['flat_type'] = clean.name.apply(lambda nm:nm.split()[2])

### 2c. Get area from name
* Naive: select the word before last word
* Then try navigating using the index of `'m²'`
* if this also fail, then you will need to use regex

In [31]:
clean['area_1'] = clean.name.apply(lambda nm:nm.split()[3])

In [32]:
n = 'Prodej bytu 4+kk 128 m²'
splited = n.split()
print(splited)
splited.index('m²')

['Prodej', 'bytu', '4+kk', '128', 'm²']


4

In [33]:
int(splited[3])

128

In [34]:
def name_to_area(nm):
    splitted= nm.split()
    m2_idx = splitted.index('m²')
    return int(splitted[m2_idx-1])

clean['area_2'] = clean.name.apply(name_to_area)
clean

Unnamed: 0,locality,price,name,gps,hash_id,exclusively_at_rk,lat,lon,flat_type,area_1,area_2
0,Praha 3 - Žižkov,7623000,Prodej bytu 1+kk 46 m²,"{'lat': 50.073214182530414, 'lon': 14.45730481...",1433929036,1,50.073214,14.457305,1+kk,46,46
1,Praha 4 - Michle,4316000,Prodej bytu 1+kk 29 m²,"{'lat': 50.038989182530415, 'lon': 14.47748981...",1815491916,1,50.038989,14.477490,1+kk,29,29
2,Praha 6 - Ruzyně,8006000,Prodej bytu 1+kk 69 m²,"{'lat': 50.08086718253041, 'lon': 14.309168817...",2420491596,1,50.080867,14.309169,1+kk,69,69
3,Praha 7 - Holešovice,9632000,Prodej bytu 2+kk 50 m²,"{'lat': 50.08985818253041, 'lon': 14.459570817...",4054193484,0,50.089858,14.459571,2+kk,50,50
4,Praha 7 - Holešovice,6165000,Prodej bytu 1+kk 30 m²,"{'lat': 50.08985818253041, 'lon': 14.459570817...",2544244044,0,50.089858,14.459571,1+kk,30,30
...,...,...,...,...,...,...,...,...,...,...,...
100,Praha 6 - Veleslavín,7308000,Prodej bytu 2+kk 58 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",643286348,0,50.084014,14.359066,2+kk,58,58
101,Praha 6 - Veleslavín,6074000,Prodej bytu 1+kk 50 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",1421329740,0,50.084014,14.359066,1+kk,50,50
102,Praha 6 - Veleslavín,7601000,Prodej bytu 1+kk 58 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",2594686284,0,50.084014,14.359066,1+kk,58,58
103,Praha 6 - Veleslavín,6890000,Prodej bytu 2+kk 57 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",2644981068,0,50.084014,14.359066,2+kk,57,57


In [35]:
clean[clean['area_1']==clean['area_2']]

Unnamed: 0,locality,price,name,gps,hash_id,exclusively_at_rk,lat,lon,flat_type,area_1,area_2


In [36]:
clean[clean['area_1'].astype(int)==clean['area_2']]

Unnamed: 0,locality,price,name,gps,hash_id,exclusively_at_rk,lat,lon,flat_type,area_1,area_2
0,Praha 3 - Žižkov,7623000,Prodej bytu 1+kk 46 m²,"{'lat': 50.073214182530414, 'lon': 14.45730481...",1433929036,1,50.073214,14.457305,1+kk,46,46
1,Praha 4 - Michle,4316000,Prodej bytu 1+kk 29 m²,"{'lat': 50.038989182530415, 'lon': 14.47748981...",1815491916,1,50.038989,14.477490,1+kk,29,29
2,Praha 6 - Ruzyně,8006000,Prodej bytu 1+kk 69 m²,"{'lat': 50.08086718253041, 'lon': 14.309168817...",2420491596,1,50.080867,14.309169,1+kk,69,69
3,Praha 7 - Holešovice,9632000,Prodej bytu 2+kk 50 m²,"{'lat': 50.08985818253041, 'lon': 14.459570817...",4054193484,0,50.089858,14.459571,2+kk,50,50
4,Praha 7 - Holešovice,6165000,Prodej bytu 1+kk 30 m²,"{'lat': 50.08985818253041, 'lon': 14.459570817...",2544244044,0,50.089858,14.459571,1+kk,30,30
...,...,...,...,...,...,...,...,...,...,...,...
100,Praha 6 - Veleslavín,7308000,Prodej bytu 2+kk 58 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",643286348,0,50.084014,14.359066,2+kk,58,58
101,Praha 6 - Veleslavín,6074000,Prodej bytu 1+kk 50 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",1421329740,0,50.084014,14.359066,1+kk,50,50
102,Praha 6 - Veleslavín,7601000,Prodej bytu 1+kk 58 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",2594686284,0,50.084014,14.359066,1+kk,58,58
103,Praha 6 - Veleslavín,6890000,Prodej bytu 2+kk 57 m²,"{'lat': 50.08401418253042, 'lon': 14.359065817...",2644981068,0,50.084014,14.359066,2+kk,57,57


## Bonus: Convert `labelsAll` into categorical variables

### Task 4a. Get all possible label names
* deal with nested-list structure
* Hint: try to sum the whole column
* Needed to Iterate through all labels in all rows and 

In [37]:
['d'] + ['c']

['d', 'c']

In [41]:
possible_labels = list(set([i for sublist in raw.labelsAll.sum() for i in sublist]))
possible_labels

['garage',
 'post_office',
 'collective',
 'natural_attraction',
 'tram',
 'playground',
 'movies',
 'terrace',
 'school',
 'atm',
 'sports',
 'furnished',
 'after_reconstruction',
 'not_furnished',
 'drugstore',
 'kindergarten',
 'new_building',
 'cellar',
 'shop',
 'bus_public_transport',
 'partly_furnished',
 'balcony',
 'vet',
 'metro',
 'train',
 'brick',
 'panel',
 'candy_shop',
 'elevator',
 'small_shop',
 'restaurant',
 'personal',
 'tavern',
 'sightseeing',
 'medic',
 'theater',
 'parking_lots']

### 4b. Test existence of label `cellar` for offers
* again deal with nested list of list structure
* write generic function `test_existence_of_label(offer_labels,label)`

In [42]:
def test_existence_of_label(offer_labels,label):
    return 'cellar' in [item for sublist in offer_labels for item in sublist]

raw.labelsAll.apply(lambda offer_labels: test_existence_of_label(offer_labels, 'cellar'))

0      True
1     False
2      True
3      True
4     False
5      True
6      True
7      True
8      True
9      True
10    False
11     True
12     True
13    False
14     True
15     True
16    False
17    False
18    False
19     True
20    False
Name: labelsAll, dtype: bool

### 4c. Test existence of all possible labels
* use apply returning series with all labels

In [43]:
def existence_of_all_labels(offer_labels, possible_labels):
    return pd.Series({
        label:test_existence_of_label(offer_labels,label)
        for label in possible_labels
    })

raw.labelsAll.apply(lambda offer_labels: existence_of_all_labels(offer_labels, possible_labels))

Unnamed: 0,garage,post_office,collective,natural_attraction,tram,playground,movies,terrace,school,atm,...,candy_shop,elevator,small_shop,restaurant,personal,tavern,sightseeing,medic,theater,parking_lots
0,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
9,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
