In [34]:
import pandas as pd

# Set options to show all columns
pd.set_option('display.max_columns', None)

data = r'C:\Users\izama\Desktop\immo_scraper_merged_test.csv'
data_scraped = pd.read_csv(data, sep = ',')

# Edit text in the columns
text_edit_columns = ['Subtype', 'Type_of_sale', 'Kitchen_type', 'State_of_building' ]

for column in text_edit_columns:
    data_scraped[column] = data_scraped[column].astype(str)  # Ensure the column is treated as string
    data_scraped[column] = data_scraped[column].str.replace('_', ' ').str.capitalize()

# Remove zip code from brackets, e.g. "Tielt (8700)" -> "Tielt"
data_scraped['locality_name'] = data_scraped['locality_name'].str.replace(r"\s*\(\d+\)", "", regex=True)

#Edit text of cities and street names
names_edit_columns = ['locality_name', 'street']

for column in names_edit_columns:
    data_scraped[column] = data_scraped[column].astype(str)  # Ensure the column is treated as string
    data_scraped[column] = data_scraped[column].str.title()

# Edit the number of facades
number_edit_columns = ['Number_of_bedrooms', 'Living_area', 'Number_of_facades', 'Price']

for column in number_edit_columns:
    data_scraped[column] = pd.to_numeric(data_scraped[column], errors='coerce').fillna(0).astype('Int64')

# Replace 'True', 'False' and empty cells value on binary values
columns_for_change_01 = ['Open_fire', 'Swimming_Pool', 'Furnished'] #Columns for change
data_scraped[columns_for_change_01] = data_scraped[columns_for_change_01].replace({False:0, True:1}).fillna(0).astype('Int64')

# Add m² for area columns 
columns_for_change_m2 = ['Living_area', 'terraceSurface', 'gardenSurface'] #Columns for change

for col in columns_for_change_m2:
    data_scraped[col] = pd.to_numeric(data_scraped[col], errors='coerce').fillna(0).astype('Int64') #Change string value to an integer

for col in columns_for_change_m2:
    data_scraped[col] = data_scraped[col].apply(lambda x: f"{x} m²" if pd.notna(x) and x != 0 else x) #Add m² to the area values

# Replace "0" and "Nan" on "Null"

columns_for_change_null = ['hasTerrace', 'terraceSurface', 'hasGarden', 'gardenSurface', 'epc', 'Kitchen_type', 'State_of_building']
data_scraped[columns_for_change_null] = data_scraped[columns_for_change_null].replace({0: 'null', 'Nan': 'null'})
data_scraped[columns_for_change_null] = data_scraped[columns_for_change_null].fillna('null')

# Remove lines where 'Starting_price' is True
data_scraped = data_scraped[data_scraped['Starting_price'] != True]

# Cleaning data from unnecessary subtypes
values_to_remove = ['House group','Mixed use building', 'Apartment block', 'Castle', 'Other property', 'Exceptional property']
data_scraped = data_scraped[~data_scraped['Subtype'].isin(values_to_remove)]

# Removing unnecessary columns
columns_to_drop = ['Number_of_rooms', 'sale_annuity', 'Starting_price']
data_scraped = data_scraped.drop(columns=columns_to_drop, errors='ignore')


###   Cleaning data from outliers

# Remove outliers 6, 5, 8, 9 from 'Number_of_facades' column
#values_to_remove = [6, 5, 8, 9]
#data_scraped = data_scraped[~data_scraped['Number_of_facades'].isin(values_to_remove)]

# Outliers by 'Number Of Bedrooms' column, checked via IQR.

# In this method, we determine quartile values ​​Q1 (25th percentile) and Q3 (75th percentile) and then calculate IQR.
# Outliers are those that fall outside the range [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]

q1 = data_scraped['Number_of_bedrooms'].quantile(0.25)
q3 = data_scraped['Number_of_bedrooms'].quantile(0.75)
iqr = q3 - q1

# Specifying the scope of outliners
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Data filtering
data_scraped = data_scraped[(data_scraped['Number_of_bedrooms'] >= lower_bound) & 
                             (data_scraped['Number_of_bedrooms'] <= upper_bound)]

###   Cleaning data from duplicates

# Check which rows are duplicates (indicates True for duplicates)

cleaned_data = data_scraped.drop_duplicates(subset=['latitude', 'longitude', 'street', 'number', 'Subtype'])
duplicates = data_scraped[data_scraped.duplicated(subset=['latitude', 'longitude', 'street', 'number', 'Subtype'])]

#print(duplicates)

cleaned_data.to_csv('clean_data.csv', index=False)
print("The CSV file was saved as 'clean_data.csv'.")


cleaned_data[750:770]


The CSV file was saved as 'clean_data.csv'.


  data_scraped[columns_for_change_01] = data_scraped[columns_for_change_01].replace({False:0, True:1}).fillna(0).astype('Int64')


Unnamed: 0,id,locality_name,Postal_code,Price,Subtype,Number_of_bedrooms,Living_area,Type_of_sale,street,number,latitude,longitude,Open_fire,Swimming_Pool,hasTerrace,terraceSurface,hasGarden,gardenSurface,Kitchen_type,Number_of_facades,State_of_building,Furnished,epc
1073,20228137,Lanaken,3620,898000,Villa,4,370 m²,For sale,Fazantenlaan,5,50.922871,5.669788,0,0,,,True,2842 m²,,4,Good,0,B
1074,20228456,Oud-Turnhout,2360,1350000,Villa,5,569 m²,For sale,Tweesprongdreef,9 - 11,51.309341,5.016089,0,1,True,,,,,4,As new,0,C
1075,20242754,Rumbeke,8800,335000,House,3,148 m²,For sale,Rode-Beukenstraat,5,50.928159,3.128492,0,0,True,60 m²,True,300 m²,,0,,0,C
1077,20243406,Dilbeek,1702,489000,House,3,128 m²,For sale,Tengaerdestraat,18,50.872259,4.256671,0,0,True,22 m²,,,Hyper equipped,3,As new,0,B
1078,20242468,Kuurne,8520,775000,Mansion,3,0,For sale,Roterijstraat,20,50.845546,3.279395,0,0,True,20 m²,True,590 m²,,2,As new,0,B
1079,20223100,Kortrijk,8500,189000,House,4,68 m²,For sale,Sint-Antoniusstraat,113,50.824433,3.275061,0,0,True,1 m²,,,Installed,2,Good,0,D
1080,20221067,Zottegem,9620,260000,House,3,219 m²,For sale,Sint-Andriessteenweg,116,50.868156,3.796439,0,0,,,,,Installed,0,Good,0,E
1082,20223107,Menen,8930,285000,House,4,165 m²,For sale,Harmoniestraat,73,50.7891,3.137096,0,0,True,10 m²,True,200 m²,Semi equipped,3,To renovate,0,F
1083,20221885,Zottegem,9620,289000,House,3,183 m²,For sale,Kleine Meerlaan,15,50.861507,3.80995,0,0,True,33 m²,True,400 m²,Installed,3,To renovate,0,F
1084,20223541,Aalst,9300,305000,House,4,179 m²,For sale,Bergekouter,18,50.941074,4.059296,0,0,True,30 m²,True,40 m²,Hyper equipped,2,Good,0,C


**HELPFUL CODES**

In [36]:
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6597 entries, 0 to 9988
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  6597 non-null   int64  
 1   locality_name       6597 non-null   object 
 2   Postal_code         6597 non-null   object 
 3   Price               6597 non-null   Int64  
 4   Subtype             6597 non-null   object 
 5   Number_of_bedrooms  6597 non-null   Int64  
 6   Living_area         6597 non-null   object 
 7   Type_of_sale        6597 non-null   object 
 8   street              6597 non-null   object 
 9   number              6572 non-null   object 
 10  latitude            6589 non-null   float64
 11  longitude           6589 non-null   float64
 12  Open_fire           6597 non-null   Int64  
 13  Swimming_Pool       6597 non-null   Int64  
 14  hasTerrace          6597 non-null   object 
 15  terraceSurface      6597 non-null   object 
 16  hasGarden  

In [33]:
#Checks the number of rows
data_scraped.shape[0]
#filtered_data.shape[0]
cleaned_data.shape[0]

6597

In [74]:
#Checks the number of columns
data_scraped.shape[1]

24

In [121]:
# Print columns names to list
data_scraped.columns.tolist()

['id',
 'locality_name',
 'Postal_code',
 'Price',
 'Subtype',
 'Number_of_bedrooms',
 'Living_area',
 'Type_of_sale',
 'street',
 'number',
 'latitude',
 'longitude',
 'Open_fire',
 'Swimming_Pool',
 'hasTerrace',
 'terraceSurface',
 'hasGarden',
 'gardenSurface',
 'Kitchen_type',
 'Number_of_facades',
 'State_of_building',
 'Furnished',
 'epc']

In [37]:
#Checks values in column
data_scraped['Kitchen_type'].value_counts()

Kitchen_type
null                  2732
Installed             2257
Hyper equipped        1228
Semi equipped          694
Not installed          219
Usa installed           38
Usa hyper equipped      33
Usa uninstalled          4
Usa semi equipped        2
Name: count, dtype: int64

In [69]:
# Displaying rows with some value from a column
values_to_display = ['Farmhouse'] #Enter value name 
filtered_data = data_scraped[data_scraped['Subtype'].isin(values_to_display)] #Enter the column name
print(filtered_data)

            id              locality_name Postal_code    Price    Subtype  \
1152  20242024                  Koekelare        8680   575000  Farmhouse   
1587  20239187                 Diepenbeek        3590   725000  Farmhouse   
1941  20121636            Corroy-le-grand        1325   449000  Farmhouse   
3249  20235626                 Huldenberg        3040  1200000  Farmhouse   
3285  20201495                  Maarkedal        9680   349000  Farmhouse   
3406  20203089                 Hoegaarden        3320   259000  Farmhouse   
3584  20250406              Celles pottes        7760   459000  Farmhouse   
3796  20205069                     Ravels        2380   865000  Farmhouse   
4067  20230971         Rouvroy dampicourt        6767   220000  Farmhouse   
4141  20150834  Merchtem brussegem vijlst        1785   875000  Farmhouse   
4318  20212276               Buissonville        5580   450000  Farmhouse   
4340  20247882                     Deinze        9800   495000  Farmhouse   

In [12]:
# Displaying duplicates
duplicate_count = data_scraped.duplicated(subset=['latitude', 'longitude', 'street', 'number']).sum()
print("Number of duplicates:", duplicate_count)
duplicates.info()

Number of duplicates: 663
<class 'pandas.core.frame.DataFrame'>
Index: 663 entries, 38 to 9980
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  663 non-null    int64  
 1   locality_name       663 non-null    object 
 2   Postal_code         663 non-null    object 
 3   Price               663 non-null    Int64  
 4   Subtype             663 non-null    object 
 5   Number_of_bedrooms  663 non-null    Int64  
 6   Living_area         663 non-null    object 
 7   Type_of_sale        663 non-null    object 
 8   street              663 non-null    object 
 9   number              474 non-null    object 
 10  latitude            478 non-null    float64
 11  longitude           478 non-null    float64
 12  Open_fire           663 non-null    Int64  
 13  Swimming_Pool       663 non-null    Int64  
 14  hasTerrace          663 non-null    object 
 15  terraceSurface      663 non-null  