Removing all duplicates 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# 1. Load the raw properties CSV file
path = r"C:\Users\vande\becode\immo-eliza-cats-analysis\Nancy\sample_data_copy\properties.csv"
df = pd.read_csv(path)

# 2. Count the duplicate rows. `duplicated()` compares every column, so a row
#    is only flagged when all of its values match an earlier row.
# NOTE(review): the comparison includes the `id` column - if ids are unique per
# row, no duplicate can ever be reported here. Confirm whether `id` should be
# excluded from the comparison.
num_duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")

# 3. Actually remove the duplicates (the first occurrence is kept) so that the
#    rest of the notebook works on de-duplicated data. This is a no-op when the
#    count above is 0.
df = df.drop_duplicates().reset_index(drop=True)


Number of duplicate rows: 0


No blank spaces (e.g. " I love python " => "I love python")
df.apply -> goes through each column
Lambda function -> checks if the column is a string (object type)
.str.strip() -> removes all leading and trailing spaces from text values

In [3]:
# Count, for every text (object) column, the cells that start or end with whitespace
def _count_padded_cells(col):
    """Return how many values in `col` begin or end with a whitespace character."""
    padded = col.str.match(r'^\s|.*\s$')
    return padded.sum()


text_columns = df.select_dtypes(include='object')
spaces_before = text_columns.apply(_count_padded_cells)

print("Cells with extra spaces BEFORE cleaning:")
print(spaces_before)


Cells with extra spaces BEFORE cleaning:
property_type       0
subproperty_type    0
region              0
province            0
locality            0
equipped_kitchen    0
state_building      0
epc                 0
heating_type        0
dtype: int64


In [4]:
def _strip_keep_non_text(col):
    """Strip surrounding whitespace from the strings in `col`, keeping other values.

    `.str.strip()` yields NaN for every element that is not a string, so an
    object column mixing text with other values (e.g. numbers) would silently
    lose those values. Wherever stripping produced NaN, the original value is
    restored; genuine NaN entries therefore stay NaN.
    """
    stripped = col.str.strip()
    return stripped.where(stripped.notna(), col)


# Strip leading/trailing spaces from all string (object) columns;
# columns of any other dtype are passed through unchanged.
df = df.apply(lambda col: _strip_keep_non_text(col) if col.dtype == "object" else col)

print("Blank spaces removed from string columns!")

Blank spaces removed from string columns!


In [5]:
# Run the same whitespace check again on the current frame:
# count the cells per text column that still start or end with whitespace
object_cols_after = df.select_dtypes(include='object')
spaces_after = object_cols_after.apply(
    lambda series: series.str.match(r'^\s|.*\s$').sum()
)

print("Cells with extra spaces AFTER cleaning:")
print(spaces_after)

Cells with extra spaces AFTER cleaning:
property_type       0
subproperty_type    0
region              0
province            0
locality            0
equipped_kitchen    0
state_building      0
epc                 0
heating_type        0
dtype: int64


Checking how many empty (NaN) values there are in each column of the dataset

In [6]:
# Number of missing (NaN) values in each column
df.isna().sum(axis=0)


id                                    0
price                                 0
property_type                         0
subproperty_type                      0
region                                0
province                              0
locality                              0
zip_code                              0
latitude                          14098
longitude                         14098
construction_year                 33391
total_area_sqm                     7615
surface_land_sqm                  36256
nbr_frontages                     26346
nbr_bedrooms                          0
equipped_kitchen                      0
fl_furnished                          0
fl_open_fire                          0
fl_terrace                            0
terrace_sqm                       13140
fl_garden                             0
garden_sqm                         2939
fl_swimming_pool                      0
fl_floodzone                          0
state_building                        0


Checking a detailed overview with percentages

In [7]:
# Absolute number of missing values per column
missing = df.isna().sum()
# Same figure expressed as a percentage of all rows
missing_percent = missing.div(len(df)).mul(100)

# One table with both figures, shown with the most incomplete columns first
missing_summary = missing.to_frame('Missing Values').assign(Percentage=missing_percent)
print(missing_summary.sort_values('Percentage', ascending=False))

                                Missing Values  Percentage
cadastral_income                         44967   59.550264
surface_land_sqm                         36256   48.014197
construction_year                        33391   44.220047
primary_energy_consumption_sqm           26567   35.182953
nbr_frontages                            26346   34.890281
longitude                                14098   18.670128
latitude                                 14098   18.670128
terrace_sqm                              13140   17.401438
total_area_sqm                            7615   10.084623
garden_sqm                                2939    3.892148
province                                     0    0.000000
region                                       0    0.000000
subproperty_type                             0    0.000000
property_type                                0    0.000000
zip_code                                     0    0.000000
locality                                     0    0.0000

Converting all common missing-value placeholders to NaN and checking that it worked

In [8]:
# Placeholder strings that should be treated as missing values
missing_markers = ["", " ", "NA", "NaN", "None", "n/a", "N/A", "unknown", "Unknown"]

# Replace common missing value indicators with np.nan.
# The result is reassigned instead of using `inplace=True`, so the cell reads
# as an explicit "new frame from old frame" step.
# NOTE(review): the NaN counts recorded after this cell are identical to the
# ones recorded before it, so none of these markers seem to occur in the data -
# confirm whether the dataset uses a different placeholder for missing values.
df = df.replace(missing_markers, np.nan)


In [9]:
# Missing values per column, to compare against the counts from before the replacement
df.isnull().sum()


id                                    0
price                                 0
property_type                         0
subproperty_type                      0
region                                0
province                              0
locality                              0
zip_code                              0
latitude                          14098
longitude                         14098
construction_year                 33391
total_area_sqm                     7615
surface_land_sqm                  36256
nbr_frontages                     26346
nbr_bedrooms                          0
equipped_kitchen                      0
fl_furnished                          0
fl_open_fire                          0
fl_terrace                            0
terrace_sqm                       13140
fl_garden                             0
garden_sqm                         2939
fl_swimming_pool                      0
fl_floodzone                          0
state_building                        0


In [10]:
# 1. Columns that are expected to hold numbers; they are listed explicitly so
#    wrongly encoded values can be handled column by column later on
numeric_cols = [
    "price",
    "zip_code",
    "latitude",
    "longitude",
    "construction_year",
    "total_area_sqm",
    "surface_land_sqm",
    "nbr_frontages",
    "nbr_bedrooms",
    "fl_furnished",
    "fl_open_fire",
    "fl_terrace",
    "terrace_sqm",
    "fl_garden",
    "garden_sqm",
    "fl_swimming_pool",
    "fl_floodzone",
    "primary_energy_consumption_sqm",
    "fl_double_glazing",
    "cadastral_income",
]


In [11]:
# 2. Detect non-numeric (wrongly encoded) values.
#    Every column in `numeric_cols` is converted with `errors='coerce'`, which
#    turns values that cannot be parsed as numbers into NaN. Because that
#    happens silently, the number of values lost this way is counted per column
#    and reported, so wrongly encoded data is actually visible.
coerced_counts = {}
for col in numeric_cols:
    converted = pd.to_numeric(df[col], errors='coerce')
    # Present before the conversion but NaN afterwards => not a valid number
    coerced_counts[col] = int((converted.isna() & df[col].notna()).sum())
    df[col] = converted

coerced = pd.Series(coerced_counts, dtype="int64")
print("Non-numeric values coerced to NaN per column:")
print(coerced[coerced > 0] if (coerced > 0).any() else "none")



In [12]:
# Verify the conversion: NaN count for each of the numeric columns
df.loc[:, numeric_cols].isna().sum()


price                                 0
zip_code                              0
latitude                          14098
longitude                         14098
construction_year                 33391
total_area_sqm                     7615
surface_land_sqm                  36256
nbr_frontages                     26346
nbr_bedrooms                          0
fl_furnished                          0
fl_open_fire                          0
fl_terrace                            0
terrace_sqm                       13140
fl_garden                             0
garden_sqm                         2939
fl_swimming_pool                      0
fl_floodzone                          0
primary_energy_consumption_sqm    26567
fl_double_glazing                     0
cadastral_income                  44967
dtype: int64

In [13]:
# Data type of each numeric column
df.dtypes.loc[numeric_cols]


price                             float64
zip_code                            int64
latitude                          float64
longitude                         float64
construction_year                 float64
total_area_sqm                    float64
surface_land_sqm                  float64
nbr_frontages                     float64
nbr_bedrooms                      float64
fl_furnished                        int64
fl_open_fire                        int64
fl_terrace                          int64
terrace_sqm                       float64
fl_garden                           int64
garden_sqm                        float64
fl_swimming_pool                    int64
fl_floodzone                        int64
primary_energy_consumption_sqm    float64
fl_double_glazing                   int64
cadastral_income                  float64
dtype: object

In [14]:
# Save the DataFrame with the changes made in this notebook as a new CSV file
# (the row index is not written to the file)
output_path = r"C:\Users\vande\becode\immo-eliza-cats-analysis\Nancy\cleaning.csv"
df.to_csv(output_path, index=False)

