# World Real Estate Transaction globally

## Content:

##### 01. Importing Libraries & Data
##### 02. Consistency Check & Data Cleaning
##### 03. Data Wrangling
##### 04. Export

### 01. Importing Libraries & Data

In [2]:
# import libraries

import pandas as pd
import numpy as np
import os

In [3]:
# folder path to main project folder
path = r'C:\Users\ThinkPad T570\Documents\05-2024 OpenData WorldRealEstate Analysis_LazerHF'

In [4]:
# importing original World Real Estate dataframe
df = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'original_world_real_estate_data.csv'))

In [6]:
df.head()

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_bedrooms,apartment_bathrooms,apartment_total_area,apartment_living_area,price_in_USD,image,url
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,2.0,2.0,120 m²,110 m²,315209.0,https://realting.com/uploads/bigSlider/ab3/888...,https://realting.com/property-for-sale/turkey/...
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,,,500 m²,480 m²,1108667.0,https://realting.com/uploads/bigSlider/87b/679...,https://realting.com/property-for-sale/turkey/...
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,1.0,1.0,65 m²,60 m²,173211.0,https://realting.com/uploads/bigSlider/030/a11...,https://realting.com/property-for-sale/turkey/...
3,"1 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2020.0,15.0,5.0,2.0,1.0,1.0,,40 m²,99900.0,https://realting.com/uploads/bigSlider/e9a/e06...,https://realting.com/property-for-sale/thailan...
4,"2 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2026.0,8.0,3.0,3.0,2.0,1.0,,36 m²,67000.0,https://realting.com/uploads/bigSlider/453/aa2...,https://realting.com/property-for-sale/thailan...


In [7]:
df.shape

(147536, 14)

In [8]:
print(df.columns)

Index(['title', 'country', 'location', 'building_construction_year',
       'building_total_floors', 'apartment_floor', 'apartment_rooms',
       'apartment_bedrooms', 'apartment_bathrooms', 'apartment_total_area',
       'apartment_living_area', 'price_in_USD', 'image', 'url'],
      dtype='object')


### 02. Consistency Check & Data Cleaning

In [9]:
# deleting columns 'image' and 'url' because no relevant data for the analysis
# deleting columns 'apartment_living_area' and 'apartment_bathrooms' as the initial data inspection uncovered that these columns contain over 75% missing values
# deleting 'apartment_bathrooms' because it contains over 60% of missing values and is also not relevant for the analysis output

df = df.drop(['image', 'url', 'apartment_living_area', 'apartment_bathrooms', 'apartment_bedrooms'], axis=1)

In [10]:
df.shape

(147536, 9)

#### Removing Duplicates

In [11]:
# remove duplicates
df = df.drop_duplicates()

#### Mixed-Type Columns

In [12]:
# checking whether a dataframe contains any mixed-type columns
for col in df.columns.tolist():
  weird = (df[[col]].map(type) != df[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df[weird]) > 0:
    print (col)

country
location
apartment_total_area


In [14]:
df.dtypes

title                          object
country                        object
location                       object
building_construction_year    float64
building_total_floors         float64
apartment_floor               float64
apartment_rooms               float64
apartment_total_area           object
price_in_USD                  float64
dtype: object

#### Formatting

In [15]:
# removing m2 suffix from "apartment_total_area" column
df["apartment_total_area"] = df["apartment_total_area"].str.rstrip("m²")
df.head()

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,120.0,315209.0
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500.0,1108667.0
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,65.0,173211.0
3,"1 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2020.0,15.0,5.0,2.0,,99900.0
4,"2 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2026.0,8.0,3.0,3.0,,67000.0


In [18]:
# removing blanks in "apartment_total_area" column
df["apartment_total_area"] = df["apartment_total_area"].str.replace(" ","")

In [19]:
df['apartment_total_area'] = pd.to_numeric(df['apartment_total_area'])

In [20]:
df.dtypes

title                          object
country                        object
location                       object
building_construction_year    float64
building_total_floors         float64
apartment_floor               float64
apartment_rooms               float64
apartment_total_area          float64
price_in_USD                  float64
dtype: object

In [21]:
df.head()

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,120.0,315209.0
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500.0,1108667.0
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,65.0,173211.0
3,"1 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2020.0,15.0,5.0,2.0,,99900.0
4,"2 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2026.0,8.0,3.0,3.0,,67000.0


In [22]:
df['building_construction_year'] = df['building_construction_year'].round().astype('Int64')

In [23]:
df.dtypes

title                          object
country                        object
location                       object
building_construction_year      Int64
building_total_floors         float64
apartment_floor               float64
apartment_rooms               float64
apartment_total_area          float64
price_in_USD                  float64
dtype: object

In [24]:
df.head()

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,120.0,315209.0
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500.0,1108667.0
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,65.0,173211.0
3,"1 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2020.0,15.0,5.0,2.0,,99900.0
4,"2 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2026.0,8.0,3.0,3.0,,67000.0


In [25]:
# changing the float64 format
pd.options.display.float_format = '{:,.0f}'.format

In [29]:
df.head()

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5,1.0,3.0,120.0,315209
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2,,,500.0,1108667
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5,2.0,2.0,65.0,173211
3,"1 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2020.0,15,5.0,2.0,,99900
4,"2 room apartment in Pattaya, Thailand",Thailand,"Chon Buri Province, Pattaya, Thailand",2026.0,8,3.0,3.0,,67000


#### Missing Values

In [30]:
# finding missing values
df.isnull().sum()

title                             0
country                         117
location                        118
building_construction_year    78904
building_total_floors         75701
apartment_floor               89070
apartment_rooms               70256
apartment_total_area           5062
price_in_USD                   1667
dtype: int64

In [31]:
# dropping missing values in specific columns
df.dropna(subset = ['country', 'location', 'apartment_total_area', 'price_in_USD'], inplace = True)

In [32]:
# show missing values
df.isnull().sum()

title                             0
country                           0
location                          0
building_construction_year    74925
building_total_floors         70739
apartment_floor               83517
apartment_rooms               65275
apartment_total_area              0
price_in_USD                      0
dtype: int64

### 03. Data Wrangling

#### Creating World Regions

In [41]:
df['country'].value_counts(dropna = False)

country
Turkey             22787
Hungary            21684
Russia             17714
Spain              13214
Belarus            13081
Greece             11391
Montenegro          9690
Georgia             3162
Italy               2821
UAE                 2542
Lithuania           2460
Latvia              2258
Portugal            1835
Thailand            1812
Croatia             1808
Uzbekistan          1659
Czech Republic      1346
Poland              1196
Finland              743
Northern Cyprus      616
United States        375
Austria              187
Armenia              110
Serbia               104
Indonesia             99
Cyprus                69
Australia              1
Name: count, dtype: int64

In [47]:
# Creating a regional segmentation ("Europe", "Africa", "Asia", "Australia", "Americas")
def country_to_region(country):
    if country in ['Cyprus', 'Serbia', 'Austria', 'Northern Cyprus', 'Finland', 'Poland', 'Czech Republic', 'Croatia', 'Portugal', 'Latvia', 'Lithuania', 'Italy', 'Montenegro', 'Greece', 'Belarus', 'Spain', 'Hungary', 'Turkey']:
        return 'Europe'
    elif country in ['UAE']:
        return 'Africa'
    elif country in ['Indonesia', 'Armenia', 'Georgia', 'Thailand', 'Uzbekistan', 'Russia']:
        return 'Asia'
    elif country in ['Australia']:
        return 'Australia'
    elif country in ['United States']:
        return 'Americas'
    else:
        return 'Other'

df['Continent'] = df['country'].apply(country_to_region)

In [48]:
df.head()

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD,Continent
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,120,315209,Europe
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500,1108667,Europe
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,65,173211,Europe
5,"1 room apartment 28 m² in Batumi, Georgia",Georgia,"Abkhazia, Batumi, Georgia",2026.0,,,1.0,28,35622,Asia
6,"4 room apartment 245 m² in Yesiloez, Turkey",Turkey,"Yesiloez, Mediterranean Region, Alanya, Turkey",2007.0,2.0,3.0,5.0,245,274415,Europe


In [49]:
df['Continent'].value_counts()

Continent
Europe       107290
Asia          24556
Africa         2542
Americas        375
Australia         1
Name: count, dtype: int64

In [50]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD,Continent
0,2 room apartment 120 m² in Mediterranean Regio...,Turkey,"Mediterranean Region, Turkey",,5.0,1.0,3.0,120,315209,Europe
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021.0,2.0,,,500,1108667,Europe
2,"1 room apartment 65 m² in Antalya, Turkey",Turkey,"Mediterranean Region, Antalya, Turkey",,5.0,2.0,2.0,65,173211,Europe
3,"1 room apartment 28 m² in Batumi, Georgia",Georgia,"Abkhazia, Batumi, Georgia",2026.0,,,1.0,28,35622,Asia
4,"4 room apartment 245 m² in Yesiloez, Turkey",Turkey,"Yesiloez, Mediterranean Region, Alanya, Turkey",2007.0,2.0,3.0,5.0,245,274415,Europe


#### Creating a Subset for Future Reals Estate Projects

In [65]:
# creating a subset for future real estate project (everything after building_construction_year is 2025 or later)
df_future_realestate = df[df['building_construction_year'] > 2024]

In [67]:
df_future_realestate

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD,Continent
3,"1 room apartment 28 m² in Batumi, Georgia",Georgia,"Abkhazia, Batumi, Georgia",2026,,,1,28,35622,Asia
7,"Penthouse 1 bedroom 96 m² in Kyrenia, Northern...",Northern Cyprus,"Kyrenia, Girne (Kyrenia) District, Northern Cy...",2027,1,,2,96,199268,Europe
77,"2 room apartment 92 m² in Karakocali, Turkey",Turkey,"Karakocali, Mediterranean Region, Alanya, Turkey",2025,11,1,2,92,286406,Europe
115,"1 room apartment 70 m² in Istanbul, Turkey",Turkey,"Marmara Region, Istanbul, Turkey",2025,14,,2,70,241298,Europe
142,"1 room apartment 76 m² in Tashkent, Uzbekistan",Uzbekistan,"Tashkent, Chilanzar District, Uzbekistan",2025,11,8,2,76,90021,Asia
...,...,...,...,...,...,...,...,...,...,...
134699,"2 room apartment 204 m² in Bali, Indonesia",Indonesia,"Bali, Indonesia",2025,4,4,3,204,970000,Asia
134704,"1 room apartment 81 m² in Wana Giri, Indonesia",Indonesia,"Wana Giri, West Nusa Tenggara, Kec Ampenan, In...",2025,4,4,2,81,350000,Asia
134711,"1 room apartment 41 m² in Dubai, UAE",UAE,"Dubai, UAE",2025,,,1,41,132143,Africa
134716,"1 room apartment 42 m² in Sharjah Emirate, UAE",UAE,"Sharjah Emirate, UAE",2025,6,,2,42,171233,Africa


#### Real Estate Selection

In [68]:
# selecting listings with specific criteria

# selecting apartments with total area of minimum 20 m2 & built between 1800 and 2024
df_selected = df[(df['apartment_total_area'] > 20.0) & (df['building_construction_year'].between(1800, 2024))]

In [54]:
df_selected.shape

(46722, 10)

In [55]:
df_selected

Unnamed: 0,title,country,location,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD,Continent
1,"4 room villa 500 m² in Kalkan, Turkey",Turkey,"Kalkan, Mediterranean Region, Kas, Turkey",2021,2,,,500,1108667,Europe
4,"4 room apartment 245 m² in Yesiloez, Turkey",Turkey,"Yesiloez, Mediterranean Region, Alanya, Turkey",2007,2,3,5,245,274415,Europe
10,"Cottage 555 m² in Haranski sielski Saviet, Bel...",Belarus,"Minsk Region, Haranski sielski Saviet, Minsk D...",2017,3,,,555,295000,Europe
11,"1 room studio apartment 38 m² in UAE, UAE",UAE,UAE,2024,6,,1,38,97930,Africa
12,"2 room apartment 54 m² in Prague, Czech Republic",Czech Republic,"Prague, Czech Republic",2023,4,2,4,54,314990,Europe
...,...,...,...,...,...,...,...,...,...,...
134751,"Penthouse 5 bedrooms 191 m² in Avsallar, Turkey",Turkey,"Avsallar, Mediterranean Region, Alanya, Turkey",2023,9,,6,191,261217,Europe
134753,"4 room apartment 215 m² in Istanbul, Turkey",Turkey,"Marmara Region, Istanbul, Turkey",2023,,,4,215,619549,Europe
134755,"4 room apartment 225 m² in Ankara, Turkey",Turkey,"Central Anatolia Region, Ankara, Turkey",2022,25,,5,225,391279,Europe
134759,"3 room apartment 106 m² in Yaylali, Turkey",Turkey,"Yaylali, Mediterranean Region, Alanya, Turkey",2023,,,3,106,510856,Europe


#### Data Understanding

In [69]:
df_selected[['building_construction_year', 'building_total_floors', 'apartment_floor', 'apartment_rooms', 'apartment_total_area', 'price_in_USD']].describe()

Unnamed: 0,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD
count,46722,29078,20819,26041,46722,46722
mean,2004,8,5,3,215,276186
std,27,8,5,1,6162,529027
min,1800,1,-2,-1,21,1900
25%,1994,2,2,2,56,76507
50%,2018,5,4,3,82,143354
75%,2023,10,7,3,140,284041
max,2024,81,105,18,1148000,21752743


In [70]:
df_future_realestate[['building_construction_year', 'building_total_floors', 'apartment_floor', 'apartment_rooms', 'apartment_total_area', 'price_in_USD']].describe()

Unnamed: 0,building_construction_year,building_total_floors,apartment_floor,apartment_rooms,apartment_total_area,price_in_USD
count,12556,11773,11186,12286,12556,12556
mean,2025,16,9,2,60,202988
std,3,7,6,1,89,574164
min,2025,1,1,1,4,19500
25%,2025,12,5,1,37,88270
50%,2025,16,9,2,51,127384
75%,2026,18,13,3,67,162359
max,2316,115,74,7,5300,24109589


In [71]:
df['building_construction_year'].replace('2,316', '2,016', inplace=True)

## 04. Export

In [73]:
# export dataframe:
df_selected.to_csv(os.path.join(path, '02 Data','Prepared Data', 'listings_selected.csv'))

In [74]:
# export dataframe future real estate projects:
df_future_realestate.to_csv(os.path.join(path, '02 Data','Prepared Data', 'listings_future.csv'))