# 6.1 Sourcing Open Data: World Real Estate Transactions

## Content:

##### 01. Importing Libraries & Data
##### 02. Consistency Check & Data Cleaning
##### 03. Data Wrangling
##### 04. Export

### 01. Importing Libraries & Data

In [43]:
# import libraries

import pandas as pd
import numpy as np
import os

In [44]:
# folder path to main project folder
path = r'C:\Users\ThinkPad T570\Documents\05-2024 OpenData WorldRealEstate Analysis_LazerHF'

In [52]:
# importing original World Real Estate dataframe
df = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'original_world_real_estate_data.csv'))

In [53]:
df.head(25)

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_bedrooms,apartment_bathrooms,apartment_total_area,apartment_living_area,price_in_USD,image,url
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,2.0,2.0,120 m²,110 m²,315209.0,https://realting.com/uploads/bigSlider/ab3/888...,https://realting.com/property-for-sale/turkey/...
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,,,500 m²,480 m²,1108667.0,https://realting.com/uploads/bigSlider/87b/679...,https://realting.com/property-for-sale/turkey/...
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,1.0,1.0,65 m²,60 m²,173211.0,https://realting.com/uploads/bigSlider/030/a11...,https://realting.com/property-for-sale/turkey/...
3,"1 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2020.0,15.0,5.0,2.0,1.0,1.0,,40 m²,99900.0,https://realting.com/uploads/bigSlider/e9a/e06...,https://realting.com/property-for-sale/thailan...
4,"2 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2026.0,8.0,3.0,3.0,2.0,1.0,,36 m²,67000.0,https://realting.com/uploads/bigSlider/453/aa2...,https://realting.com/property-for-sale/thailan...
5,"1 room apartment 28 m² in Batumi, Georgia",Georgia,"Abkhazia, Batumi, Georgia",2026.0,,,1.0,,1.0,28 m²,,35622.0,https://realting.com/uploads/bigSlider/16f/406...,https://realting.com/property-for-sale/georgia...
6,"4 room apartment 245 m² in Yesiloez, Turkey",Turkey,"Yesiloez, Mediterranean Region, Alanya, Turkey",2007.0,2.0,3.0,5.0,4.0,4.0,245 m²,245 m²,274415.0,https://realting.com/uploads/bigSlider/9c8/6f1...,https://realting.com/property-for-sale/turkey/...
7,"1 room studio apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2026.0,8.0,2.0,1.0,1.0,1.0,,25 m²,44700.0,https://realting.com/uploads/bigSlider/b50/f4b...,https://realting.com/property-for-sale/thailan...
8,Apartment 1 bathroom 74 m² in Ratisevina-Susce...,Montenegro,"Ratisevina-Suscepan-Trebesin, Herceg Novi, Mon...",,,,,,1.0,74 m²,,168881.0,https://realting.com/uploads/bigSlider/dd6/fa8...,https://realting.com/property-for-sale/montene...
9,"1 room apartment 50 m² in Becici, Montenegro",Montenegro,"Becici, Sveti Stefan, Budva Municipality, Mont...",,16.0,12.0,2.0,1.0,1.0,50 m²,,254404.0,https://realting.com/uploads/bigSlider/820/d3f...,https://realting.com/property-for-sale/montene...


In [54]:
df.shape

(147536, 14)

In [55]:
print(df.columns)

Index(['title', 'country', 'location', 'building_construction_year',
       'building_total_floors', 'apartment_floor', 'apartment_rooms',
       'apartment_bedrooms', 'apartment_bathrooms', 'apartment_total_area',
       'apartment_living_area', 'price_in_USD', 'image', 'url'],
      dtype='object')


### 02. Consistency Check & Data Cleaning

In [56]:
# deleting columns 'image' and 'url' because no relevant data for the analysis
# deleting columns 'apartment_living_area' and 'apartment_bathrooms' as the initial data inspection uncovered that these columns contain over 75% missing values
# deleting 'apartment_bathrooms' because it contains over 60% of missing values and is also not relevant for the analysis output

df = df.drop(['image', 'url', 'apartment_living_area', 'apartment_bathrooms', 'apartment_bedrooms'], axis=1)

In [57]:
df.shape

(147536, 9)

#### Removing Duplicates

In [58]:
# finding duplicates
df = df.drop_duplicates()
df

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,120 m²,315209.0
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500 m²,1108667.0
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,65 m²,173211.0
3,"1 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2020.0,15.0,5.0,2.0,,99900.0
4,"2 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2026.0,8.0,3.0,3.0,,67000.0
...,...,...,...,...,...,...,...,...,...
147531,"5 room apartment 310 m² in Gazipasa, Turkey",Turkey,"Mediterranean Region, Gazipasa, Turkey",,,,,310 m²,597810.0
147532,"4 room apartment 192 m² in Marmara Region, Turkey",Turkey,"Marmara Region, Turkey",2023.0,5.0,,5.0,192 m²,637195.0
147533,"2 room apartment in Marmara Region, Turkey",Turkey,"Marmara Region, Turkey",,,,3.0,,477146.0
147534,"Apartment in Akarca, Turkey",Turkey,"Akarca, Central Anatolia Region, Turkey",2023.0,,,,,819163.0


#### Formatting

In [61]:
# removing m2 suffix from "apartment_total_area" column
df["apartment_total_area"] = df["apartment_total_area"].str.rstrip("m²")
df

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,120,315209.0
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500,1108667.0
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,65,173211.0
3,"1 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2020.0,15.0,5.0,2.0,,99900.0
4,"2 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2026.0,8.0,3.0,3.0,,67000.0
...,...,...,...,...,...,...,...,...,...
147531,"5 room apartment 310 m² in Gazipasa, Turkey",Turkey,"Mediterranean Region, Gazipasa, Turkey",,,,,310,597810.0
147532,"4 room apartment 192 m² in Marmara Region, Turkey",Turkey,"Marmara Region, Turkey",2023.0,5.0,,5.0,192,637195.0
147533,"2 room apartment in Marmara Region, Turkey",Turkey,"Marmara Region, Turkey",,,,3.0,,477146.0
147534,"Apartment in Akarca, Turkey",Turkey,"Akarca, Central Anatolia Region, Turkey",2023.0,,,,,819163.0


In [116]:
# removing blanks in "apartment_total_area" column
df["apartment_total_area"] = df["apartment_total_area"].str.replace(" ","")
df

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,120,315209.0
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500,1108667.0
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,65,173211.0
3,"1 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2020.0,15.0,5.0,2.0,,99900.0
4,"2 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2026.0,8.0,3.0,3.0,,67000.0
...,...,...,...,...,...,...,...,...,...
147531,"5 room apartment 310 m² in Gazipasa, Turkey",Turkey,"Mediterranean Region, Gazipasa, Turkey",,,,,310,597810.0
147532,"4 room apartment 192 m² in Marmara Region, Turkey",Turkey,"Marmara Region, Turkey",2023.0,5.0,,5.0,192,637195.0
147533,"2 room apartment in Marmara Region, Turkey",Turkey,"Marmara Region, Turkey",,,,3.0,,477146.0
147534,"Apartment in Akarca, Turkey",Turkey,"Akarca, Central Anatolia Region, Turkey",2023.0,,,,,819163.0


In [237]:
pd.options.display.float_format = '{:,.0f}'.format

In [235]:
# addressing missing values
df=df.fillna('')

In [234]:
# formatting columns with float decimal numbers
#cols = ['building_total_floors', 'apartment_floor', 'apartment_rooms', 'apartment_total_area']
#df[cols] = df[cols].applymap(np.int64)

In [248]:
df['building_construction_year'] = df['building_construction_year'].astype(int)

ValueError: invalid literal for int() with base 10: ''

In [249]:
df

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD,Area,State,Region
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5,1,3,120,315209,Mediterranean Region,,Turkey
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021,2,,,500,1108667,Kalkan,"Kas, Turkey",Mediterranean Region
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5,2,2,65,173211,Mediterranean Region,Turkey,Antalya
5,"1 room apartment 28 m² in Batumi, Georgia",Georgia,"Abkhazia, Batumi, Georgia",2026,,,1,28,35622,Abkhazia,Georgia,Batumi
6,"4 room apartment 245 m² in Yesiloez, Turkey",Turkey,"Yesiloez, Mediterranean Region, Alanya, Turkey",2007,2,3,5,245,274415,Yesiloez,"Alanya, Turkey",Mediterranean Region
...,...,...,...,...,...,...,...,...,...,...,...,...
147529,"3 room apartment 106 m² in Yaylali, Turkey",Turkey,"Yaylali, Mediterranean Region, Alanya, Turkey",2023,,,3,106,510856,Yaylali,"Alanya, Turkey",Mediterranean Region
147530,3 room apartment 168 m² in Bahcelievler Mahall...,Turkey,"Bahcelievler Mahallesi, Marmara Region, Turkey",,,,,168,614848,Bahcelievler Mahallesi,Turkey,Marmara Region
147531,"5 room apartment 310 m² in Gazipasa, Turkey",Turkey,"Mediterranean Region, Gazipasa, Turkey",,,,,310,597810,Mediterranean Region,Turkey,Gazipasa
147532,"4 room apartment 192 m² in Marmara Region, Turkey",Turkey,"Marmara Region, Turkey",2023,5,,5,192,637195,Marmara Region,,Turkey


#### Missing Values

In [63]:
# finding missing values
df.isnull().sum()

title                             0
country                         117
location                        118
building_construction_year    78904
building_total_floors         75701
apartment_floor               89070
apartment_rooms               70256
apartment_total_area           5062
price_in_USD                   1667
dtype: int64

In [64]:
# addressing missing values
df=df.fillna('')
df

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,120,315209.0
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500,1108667.0
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,65,173211.0
3,"1 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2020.0,15.0,5.0,2.0,,99900.0
4,"2 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2026.0,8.0,3.0,3.0,,67000.0
...,...,...,...,...,...,...,...,...,...
147531,"5 room apartment 310 m² in Gazipasa, Turkey",Turkey,"Mediterranean Region, Gazipasa, Turkey",,,,,310,597810.0
147532,"4 room apartment 192 m² in Marmara Region, Turkey",Turkey,"Marmara Region, Turkey",2023.0,5.0,,5.0,192,637195.0
147533,"2 room apartment in Marmara Region, Turkey",Turkey,"Marmara Region, Turkey",,,,3.0,,477146.0
147534,"Apartment in Akarca, Turkey",Turkey,"Akarca, Central Anatolia Region, Turkey",2023.0,,,,,819163.0


In [73]:
df.dtypes

title                         object
country                       object
location                      object
building_construction_year    object
building_total_floors         object
apartment_floor               object
apartment_rooms               object
apartment_total_area          object
price_in_USD                  object
dtype: object

In [212]:
# adapting the data type of the individual columns
df['building_construction_year'] = pd.to_numeric(df['building_construction_year'])
df['building_total_floors'] = pd.to_numeric(df['building_total_floors'])
df['apartment_floor'] = pd.to_numeric(df['apartment_floor'])
df['apartment_rooms'] = pd.to_numeric(df['apartment_rooms'])
df['apartment_total_area'] = pd.to_numeric(df['apartment_total_area'])
df['price_in_USD'] = pd.to_numeric(df['price_in_USD'])

In [213]:
df.dtypes

title                          object
country                        object
location                       object
building_construction_year    float64
building_total_floors         float64
apartment_floor               float64
apartment_rooms               float64
apartment_total_area          float64
price_in_USD                  float64
Area                           object
State                          object
Region                         object
dtype: object

In [214]:
df

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD,Area,State,Region
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,120.0,315209.0,Mediterranean Region,,Turkey
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500.0,1108667.0,Kalkan,"Kas, Turkey",Mediterranean Region
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,65.0,173211.0,Mediterranean Region,Turkey,Antalya
5,"1 room apartment 28 m² in Batumi, Georgia",Georgia,"Abkhazia, Batumi, Georgia",2026.0,,,1.0,28.0,35622.0,Abkhazia,Georgia,Batumi
6,"4 room apartment 245 m² in Yesiloez, Turkey",Turkey,"Yesiloez, Mediterranean Region, Alanya, Turkey",2007.0,2.0,3.0,5.0,245.0,274415.0,Yesiloez,"Alanya, Turkey",Mediterranean Region
...,...,...,...,...,...,...,...,...,...,...,...,...
147529,"3 room apartment 106 m² in Yaylali, Turkey",Turkey,"Yaylali, Mediterranean Region, Alanya, Turkey",2023.0,,,3.0,106.0,510856.0,Yaylali,"Alanya, Turkey",Mediterranean Region
147530,3 room apartment 168 m² in Bahcelievler Mahall...,Turkey,"Bahcelievler Mahallesi, Marmara Region, Turkey",,,,,168.0,614848.0,Bahcelievler Mahallesi,Turkey,Marmara Region
147531,"5 room apartment 310 m² in Gazipasa, Turkey",Turkey,"Mediterranean Region, Gazipasa, Turkey",,,,,310.0,597810.0,Mediterranean Region,Turkey,Gazipasa
147532,"4 room apartment 192 m² in Marmara Region, Turkey",Turkey,"Marmara Region, Turkey",2023.0,5.0,,5.0,192.0,637195.0,Marmara Region,,Turkey


In [104]:
# adapting the data type of the individual columns
#df["building_construction_year"] = df["buildiang_construction_year"].apply(int)
#df["price_in_USD"] = df["price_in_USD"].apply(float)
#df['building_construction_year'] = df['building_construction_year'].astype(int)
#df['title', 'country', 'location'] = df['title', 'country', 'location'].astype(str)
#df['building_construction_year', 'building_total_floors', 'apartment_floor', 'apartment_rooms', 'apartment_total_area'] = df['building_construction_year', 'building_total_floors', 'apartment_floor', 'apartment_rooms', 'apartment_total_area'].astype(int)
#df['price_in_USD'] = df['price_in_USD'].astype(float)

#### Mixed-Type Columns

In [147]:
# checking whether a dataframe contains any mixed-type columns
for col in df.columns.tolist():
  weird = (df[[col]].map(type) != df[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df[weird]) > 0:
    print (col)

In [150]:
df.dtypes

title                          object
country                        object
location                       object
building_construction_year    float64
building_total_floors         float64
apartment_floor               float64
apartment_rooms               float64
apartment_total_area          float64
price_in_USD                  float64
dtype: object

In [132]:
# dropping missing values in specific columns
df.dropna(subset = ['country', 'location', 'apartment_total_area', 'price_in_USD'], inplace = True)

In [133]:
# show missing values
df.isnull().sum()

title                             0
country                           0
location                          0
building_construction_year    74965
building_total_floors         70764
apartment_floor               83568
apartment_rooms               65314
apartment_total_area              0
price_in_USD                      0
dtype: int64

In [136]:
df

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,120.0,315209.0
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500.0,1108667.0
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,65.0,173211.0
5,"1 room apartment 28 m² in Batumi, Georgia",Georgia,"Abkhazia, Batumi, Georgia",2026.0,,,1.0,28.0,35622.0
6,"4 room apartment 245 m² in Yesiloez, Turkey",Turkey,"Yesiloez, Mediterranean Region, Alanya, Turkey",2007.0,2.0,3.0,5.0,245.0,274415.0
...,...,...,...,...,...,...,...,...,...
147529,"3 room apartment 106 m² in Yaylali, Turkey",Turkey,"Yaylali, Mediterranean Region, Alanya, Turkey",2023.0,,,3.0,106.0,510856.0
147530,3 room apartment 168 m² in Bahcelievler Mahall...,Turkey,"Bahcelievler Mahallesi, Marmara Region, Turkey",,,,,168.0,614848.0
147531,"5 room apartment 310 m² in Gazipasa, Turkey",Turkey,"Mediterranean Region, Gazipasa, Turkey",,,,,310.0,597810.0
147532,"4 room apartment 192 m² in Marmara Region, Turkey",Turkey,"Marmara Region, Turkey",2023.0,5.0,,5.0,192.0,637195.0


In [205]:
# NULL values are shown blank in the df
#df = df.replace('NaN','')
df=df.fillna('')
df

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD,Area,State,Region
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,120.0,315209.0,Mediterranean Region,,Turkey
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500.0,1108667.0,Kalkan,"Kas, Turkey",Mediterranean Region
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,65.0,173211.0,Mediterranean Region,Turkey,Antalya
5,"1 room apartment 28 m² in Batumi, Georgia",Georgia,"Abkhazia, Batumi, Georgia",2026.0,,,1.0,28.0,35622.0,Abkhazia,Georgia,Batumi
6,"4 room apartment 245 m² in Yesiloez, Turkey",Turkey,"Yesiloez, Mediterranean Region, Alanya, Turkey",2007.0,2.0,3.0,5.0,245.0,274415.0,Yesiloez,"Alanya, Turkey",Mediterranean Region
...,...,...,...,...,...,...,...,...,...,...,...,...
147529,"3 room apartment 106 m² in Yaylali, Turkey",Turkey,"Yaylali, Mediterranean Region, Alanya, Turkey",2023.0,,,3.0,106.0,510856.0,Yaylali,"Alanya, Turkey",Mediterranean Region
147530,3 room apartment 168 m² in Bahcelievler Mahall...,Turkey,"Bahcelievler Mahallesi, Marmara Region, Turkey",,,,,168.0,614848.0,Bahcelievler Mahallesi,Turkey,Marmara Region
147531,"5 room apartment 310 m² in Gazipasa, Turkey",Turkey,"Mediterranean Region, Gazipasa, Turkey",,,,,310.0,597810.0,Mediterranean Region,Turkey,Gazipasa
147532,"4 room apartment 192 m² in Marmara Region, Turkey",Turkey,"Marmara Region, Turkey",2023.0,5.0,,5.0,192.0,637195.0,Marmara Region,,Turkey


### 03. Data Wrangling

In [173]:
# specifying regions by spliting the location column up
# df[["Area", "Region", "State"]] = df["location"].str.split(',',n=2, expand=True)

In [162]:
# selecting listings with specific criteria

# selecting apartments with total area of minimum 20 m2
df_selected = df[(df['apartment_total_area'] > 20.0) & (df['building_construction_year'].between(1800.0, 2024.0))]

In [179]:
df.columns

Index(['title', 'country', 'location', 'building_construction_year',
       'building_total_floors', 'apartment_floor', 'apartment_rooms',
       'apartment_total_area', 'price_in_USD', 'Area', 'State', 'Region'],
      dtype='object')

In [180]:
df_selected = df_selected.drop(columns = 'Area', 'State', 'Region')
df_selected

SyntaxError: positional argument follows keyword argument (374365232.py, line 1)

In [181]:
df_selected.shape

(46797, 9)

#### Creating World Regions

In [182]:
df_selected['country'].value_counts(dropna = False)

country
Belarus            11944
Turkey              9703
Hungary             7042
Greece              4747
Lithuania           2377
Russia              2367
Spain               1923
Georgia             1314
Croatia              864
Montenegro           778
UAE                  728
Latvia               611
Uzbekistan           593
Italy                361
Thailand             330
Poland               295
Northern Cyprus      289
Portugal             203
Austria              139
                      75
Indonesia             46
Serbia                25
Cyprus                24
Czech Republic        12
Finland                7
Name: count, dtype: int64

In [206]:
# Creating a regional segmentation ("Europe", "Africa", "Asia", "Australia", "Americas")
def country_to_region(country):
    if country in ['Cyprus', 'Serbia', 'Austria', 'Northern Cyprus', 'Finland', 'Poland', 'Czech Republic', 'Croatia', 'Portugal', 'Latvia', 'Lithuania', 'Italy', 'Montenegro', 'Greece', 'Belarus', 'Spain', 'Hungary', 'Turkey']:
        return 'Europe'
    elif country in ['UAE']:
        return 'Africa'
    elif country in ['Indonesia', 'Armenia', 'Georgia', 'Thailand', 'Uzbekistan']:
        return 'Asia'
    elif country in ['Australia']:
        return 'Australia'
    elif country in ['United States']:
        return 'Americas'
    else:
        return 'Other'

df_selected['Continent'] = df_selected['country'].apply(country_to_region)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Continent'] = df_selected['country'].apply(country_to_region)


In [207]:
df_selected

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD,Region,Continent
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500.0,1108667.0,Other,Europe
6,"4 room apartment 245 m² in Yesiloez, Turkey",Turkey,"Yesiloez, Mediterranean Region, Alanya, Turkey",2007.0,2.0,3.0,5.0,245.0,274415.0,Other,Europe
13,"Cottage 555 m² in Haranski sielski Saviet, Bel...",Belarus,"Minsk Region, Haranski sielski Saviet, Minsk D...",2017.0,3.0,,,555.0,295000.0,Europe,Europe
14,"1 room studio apartment 38 m² in UAE, UAE",UAE,UAE,2024.0,6.0,,1.0,38.0,97930.0,Africa,Africa
15,"2 room apartment 54 m² in Prague, Czech Republic",Czech Republic,"Prague, Czech Republic",2023.0,4.0,2.0,4.0,54.0,314990.0,Europe,Europe
...,...,...,...,...,...,...,...,...,...,...,...
147521,"Penthouse 5 bedrooms 191 m² in Avsallar, Turkey",Turkey,"Avsallar, Mediterranean Region, Alanya, Turkey",2023.0,9.0,,6.0,191.0,261217.0,Other,Europe
147523,"4 room apartment 215 m² in Istanbul, Turkey",Turkey,"Marmara Region, Istanbul, Turkey",2023.0,,,4.0,215.0,619549.0,Other,Europe
147525,"4 room apartment 225 m² in Ankara, Turkey",Turkey,"Central Anatolia Region, Ankara, Turkey",2022.0,25.0,,5.0,225.0,391279.0,Other,Europe
147529,"3 room apartment 106 m² in Yaylali, Turkey",Turkey,"Yaylali, Mediterranean Region, Alanya, Turkey",2023.0,,,3.0,106.0,510856.0,Other,Europe


In [188]:
df_selected['Continent'].value_counts()

Continent
Europe    41344
Other      2442
Asia       2283
Africa      728
Name: count, dtype: int64

In [208]:
df_selected = df_selected.reset_index(drop=True)
df_selected

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD,Region,Continent
0,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500.0,1108667.0,Other,Europe
1,"4 room apartment 245 m² in Yesiloez, Turkey",Turkey,"Yesiloez, Mediterranean Region, Alanya, Turkey",2007.0,2.0,3.0,5.0,245.0,274415.0,Other,Europe
2,"Cottage 555 m² in Haranski sielski Saviet, Bel...",Belarus,"Minsk Region, Haranski sielski Saviet, Minsk D...",2017.0,3.0,,,555.0,295000.0,Europe,Europe
3,"1 room studio apartment 38 m² in UAE, UAE",UAE,UAE,2024.0,6.0,,1.0,38.0,97930.0,Africa,Africa
4,"2 room apartment 54 m² in Prague, Czech Republic",Czech Republic,"Prague, Czech Republic",2023.0,4.0,2.0,4.0,54.0,314990.0,Europe,Europe
...,...,...,...,...,...,...,...,...,...,...,...
46792,"Penthouse 5 bedrooms 191 m² in Avsallar, Turkey",Turkey,"Avsallar, Mediterranean Region, Alanya, Turkey",2023.0,9.0,,6.0,191.0,261217.0,Other,Europe
46793,"4 room apartment 215 m² in Istanbul, Turkey",Turkey,"Marmara Region, Istanbul, Turkey",2023.0,,,4.0,215.0,619549.0,Other,Europe
46794,"4 room apartment 225 m² in Ankara, Turkey",Turkey,"Central Anatolia Region, Ankara, Turkey",2022.0,25.0,,5.0,225.0,391279.0,Other,Europe
46795,"3 room apartment 106 m² in Yaylali, Turkey",Turkey,"Yaylali, Mediterranean Region, Alanya, Turkey",2023.0,,,3.0,106.0,510856.0,Other,Europe


### 04. Data Understanding

In [209]:
df_selected[['building_construction_year', 'building_total_floors', 'apartment_floor', 'apartment_rooms', 'apartment_total_area', 'price_in_USD']].describe()

Unnamed: 0,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD
count,46797.0,29150.0,20879.0,26097.0,46797.0,46797.0
mean,2004.262495,7.536089,5.168686,2.622639,215.1853,276203.1
std,27.238502,7.833391,5.090878,1.111533,6156.81,528684.1
min,1800.0,1.0,-2.0,-1.0,21.0,1900.0
25%,1994.0,2.0,2.0,2.0,56.0,76507.0
50%,2018.0,5.0,4.0,3.0,82.0,143400.0
75%,2023.0,10.0,7.0,3.0,140.0,284169.0
max,2024.0,81.0,105.0,18.0,1148000.0,21752740.0


### Export

In [194]:
# creating a subset for future real estate project (everything after building_construction_year is 2025 or later)
df_future_realestate = df[df['building_construction_year'] > 2024.0]

In [195]:
df_future_realestate

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD,Area,State,Region
5,"1 room apartment 28 m² in Batumi, Georgia",Georgia,"Abkhazia, Batumi, Georgia",2026.0,,,1.0,28.0,35622.0,Abkhazia,Georgia,Batumi
10,"Penthouse 1 bedroom 96 m² in Kyrenia, Northern...",Northern Cyprus,"Kyrenia, Girne (Kyrenia) District, Northern Cy...",2027.0,1.0,,2.0,96.0,199268.0,Kyrenia,Northern Cyprus,Girne (Kyrenia) District
88,"2 room apartment 92 m² in Karakocali, Turkey",Turkey,"Karakocali, Mediterranean Region, Alanya, Turkey",2025.0,11.0,1.0,2.0,92.0,286406.0,Karakocali,"Alanya, Turkey",Mediterranean Region
128,"1 room apartment 70 m² in Istanbul, Turkey",Turkey,"Marmara Region, Istanbul, Turkey",2025.0,14.0,,2.0,70.0,241298.0,Marmara Region,Turkey,Istanbul
156,"1 room apartment 76 m² in Tashkent, Uzbekistan",Uzbekistan,"Tashkent, Chilanzar District, Uzbekistan",2025.0,11.0,8.0,2.0,76.0,90021.0,Tashkent,Uzbekistan,Chilanzar District
...,...,...,...,...,...,...,...,...,...,...,...,...
147464,"2 room apartment 204 m² in Bali, Indonesia",Indonesia,"Bali, Indonesia",2025.0,4.0,4.0,3.0,204.0,970000.0,Bali,,Indonesia
147469,"1 room apartment 81 m² in Wana Giri, Indonesia",Indonesia,"Wana Giri, West Nusa Tenggara, Kec Ampenan, In...",2025.0,4.0,4.0,2.0,81.0,350000.0,Wana Giri,"Kec Ampenan, Indonesia",West Nusa Tenggara
147476,"1 room apartment 41 m² in Dubai, UAE",UAE,"Dubai, UAE",2025.0,,,1.0,41.0,132143.0,Dubai,,UAE
147481,"1 room apartment 42 m² in Sharjah Emirate, UAE",UAE,"Sharjah Emirate, UAE",2025.0,6.0,,2.0,42.0,171233.0,Sharjah Emirate,,UAE


In [201]:
df_future_realestate = df_future_realestate.drop(columns = 'Area', 'State', 'Region')

SyntaxError: positional argument follows keyword argument (3074614905.py, line 1)

In [202]:
df_future_realestate.head()

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD,Area,State,Region
5,"1 room apartment 28 m² in Batumi, Georgia",Georgia,"Abkhazia, Batumi, Georgia",2026.0,,,1.0,28.0,35622.0,Abkhazia,Georgia,Batumi
10,"Penthouse 1 bedroom 96 m² in Kyrenia, Northern...",Northern Cyprus,"Kyrenia, Girne (Kyrenia) District, Northern Cy...",2027.0,1.0,,2.0,96.0,199268.0,Kyrenia,Northern Cyprus,Girne (Kyrenia) District
88,"2 room apartment 92 m² in Karakocali, Turkey",Turkey,"Karakocali, Mediterranean Region, Alanya, Turkey",2025.0,11.0,1.0,2.0,92.0,286406.0,Karakocali,"Alanya, Turkey",Mediterranean Region
128,"1 room apartment 70 m² in Istanbul, Turkey",Turkey,"Marmara Region, Istanbul, Turkey",2025.0,14.0,,2.0,70.0,241298.0,Marmara Region,Turkey,Istanbul
156,"1 room apartment 76 m² in Tashkent, Uzbekistan",Uzbekistan,"Tashkent, Chilanzar District, Uzbekistan",2025.0,11.0,8.0,2.0,76.0,90021.0,Tashkent,Uzbekistan,Chilanzar District


In [1]:
# export dataframe:
df_selected.to_csv(os.path.join(path, '02 Data','Prepared Data', 'listings_checked.csv'))

NameError: name 'df_selected_clean' is not defined