In [2]:
import pandas as pd

# Residential listings clean-up: load the raw CSV, drop unusable rows,
# derive indicator columns, normalise dtypes, remove outliers, then write the
# reordered result to a new CSV.

# Load the raw listings.
df = pd.read_csv('filtered_data_residential.csv')

# Steps 1 + 2: keep the first row seen for each 'Property ID', then discard
# listings that have no price.
df = df.drop_duplicates(subset=['Property ID'], keep='first').dropna(subset=['Price'])

# Report, per column, how many values are still missing (only columns that
# actually have gaps are printed).
missing_data_summary = df.isna().sum()
missing_columns = missing_data_summary.loc[missing_data_summary.gt(0)]
print(missing_columns)

# Step 3: keep only rows that have a 'Region' and are residential sales.
# (Rows without a region contained odd values, including postal codes that
# are not from Belgium.)
df = df.loc[df['Region'].notna() & df['PriceType'].eq('residential_sale')]

# Switch every plain int64 column to pandas' nullable Int64 dtype.
int64_columns = df.select_dtypes('int64').columns
df = df.astype(dict.fromkeys(int64_columns, 'Int64'))

# Indicator columns: 1 when the corresponding area is strictly positive.
df['HasTerrace'] = df['Terrace_Area'].gt(0).astype(int)
df['HasGarden'] = df['Garden_Area'].gt(0).astype(int)

# HasKitchen is 1 when 'KitchenType' is present and not an empty string.
kitchen_type = df['KitchenType']
df['HasKitchen'] = (kitchen_type.notna() & kitchen_type.ne('')).astype(int)


def _round_to_nullable_int(series):
    """Round a Series to whole numbers and cast it to nullable Int64."""
    return series.round().astype('Int64')


# Area / size measurements are stored as whole numbers (missing values are
# preserved by the nullable dtype).
columns_to_convert = ['Terrace_Area', 'Garden_Area', 'LivingArea', 'LandWidth', 'LandSurface']
df[columns_to_convert] = df[columns_to_convert].apply(_round_to_nullable_int)

# Flag columns are stored as plain integers.
# NOTE(review): astype(int) raises if 'Open_fire' contains missing values --
# confirm that column is fully populated in the input file.
binary_columns = ['HasTerrace', 'HasGarden', 'HasKitchen', 'Open_fire']
df[binary_columns] = df[binary_columns].astype(int)

# Outlier filter: a row is removed when any one of these limits is hit.
too_many_floors = df['FloorCount'].ge(20)
too_many_bedrooms = df['BedroomCount'].gt(20)
too_many_facades = df['FacadeCount'].gt(8)
condition = too_many_floors | too_many_bedrooms | too_many_facades
df_filtered = df.loc[~condition]

# Show how much the outlier filter removed.
rows_dropped = len(df) - len(df_filtered)
print(f"Rows dropped: {rows_dropped}")
print(f"New dataset shape: {df_filtered.shape}")

# Preferred output column order; names absent from the data are skipped.
updated_column_order = [
    # identification and location
    'Property ID', 'Region', 'Province', 'District', 'Locality', 'PostalCode', 'Street',
    # price and property description
    'Price', 'PropertyType', 'PropertySubtype', 'ConstructionYear', 'StateBuilding',
    'KitchenType', 'FacadeCount', 'FloorCount', 'EPCScore',
    # indicator / feature flags
    'HasTerrace', 'HasGarden', 'HasKitchen', 'Open_fire', 'SwimmingPool', 'Furnished',
    # counts and measurements
    'BedroomCount', 'LivingArea', 'Terrace_Area', 'Garden_Area', 'LandWidth', 'LandSurface',
    # provenance and sale metadata
    'ScrapedURL', 'PriceType', 'TypeSale', 'LifeAnnuitySale',
]

present_columns = set(df_filtered.columns)
available_columns = [name for name in updated_column_order if name in present_columns]
df2 = df_filtered.loc[:, available_columns]
df2.to_csv('data_restructured_final_v.csv', index=False)

# Display summary information for the exported frame.
df2.info()


FileNotFoundError: [Errno 2] No such file or directory: 'filtered_data_residential.csv'