In [1]:
# Configuration parameters
CONFIG = {
    # Data paths
    "data": {
        "data_dir": "/mnt/c/Users/LP031/onedrive_unige/data/immobiliare.it",
    },
    
    # Filtering thresholds
    "filtering": {
        # Column-specific minimum occurrences thresholds
        "min_occurrences": {
            "default": 20,  # Default threshold for categorical values
            "city": 40,     # City-specific threshold
            "zone": 15,     # Zone-specific threshold
            "typology_name": 10,  # Property type threshold
            "garage_type": 25,    # Garage type threshold
            "garage_primary_type": 25  # Garage primary type threshold
        },
        "feature_threshold": 100,  # Minimum occurrences for binary feature extraction from ga4features
        "typologies_to_keep": ["Appartamento", "Attico"]  # Property types to keep in the analysis
    },
    
    # Machine learning parameters
    "ml": {
        "test_size": 0.2,  # Percentage of data to use for testing
        "random_state": 42,  # Random seed for reproducibility
        "n_estimators": 100,  # Number of estimators for tree-based models
        "top_features_to_select": 20  # Number of top features to select in feature selection methods
    },
    
    # Floor normalization mapping
    "floor_mapping": {
        "piano terra": 0,
        "seminterrato": 0,
        "interrato (-1)": -1,
        "interrato (-2)": -2,
        "interrato (-3)": -3,
        "piano rialzato": 0.5,
        "ammezzato": 0.5,
        "su più livelli": None  # Will be converted to np.nan
    }
}

# Print configuration for reference
print("Configuration loaded:")
for section, params in CONFIG.items():
    print(f"\n{section.upper()}:")
    for key, value in params.items():
        if isinstance(value, dict):
            print(f"  - {key}:")
            for subkey, subvalue in value.items():
                print(f"    - {subkey}: {subvalue}")
        else:
            print(f"  - {key}: {value}")

Configuration loaded:

DATA:
  - data_dir: /mnt/c/Users/LP031/onedrive_unige/data/immobiliare.it

FILTERING:
  - min_occurrences:
    - default: 20
    - city: 40
    - zone: 15
    - typology_name: 10
    - garage_type: 25
    - garage_primary_type: 25
  - feature_threshold: 100
  - typologies_to_keep: ['Appartamento', 'Attico']

ML:
  - test_size: 0.2
  - random_state: 42
  - n_estimators: 100
  - top_features_to_select: 20

FLOOR_MAPPING:
  - piano terra: 0
  - seminterrato: 0
  - interrato (-1): -1
  - interrato (-2): -2
  - interrato (-3): -3
  - piano rialzato: 0.5
  - ammezzato: 0.5
  - su più livelli: None


In [2]:
%pip install xgboost

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re
import ipywidgets as widgets
from IPython.display import display, HTML

# Data preprocessing and modeling
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, FunctionTransformer
from xgboost import XGBRegressor

# Use data directory from configuration
data_dir = CONFIG["data"]["data_dir"]
# Load data from CSV
df = pd.concat([
    pd.read_csv(f'{data_dir}/ads_savona_rent.csv'),
    pd.read_csv(f'{data_dir}/ads_genova_rent.csv')
], ignore_index=True)

# Let's select a subset of relevant features for our ML model
# We'll focus on physical property attributes, location data, and price

# First, examine missing values to help with feature selection
print("Missing values per column:")
print(df.isnull().sum().sort_values(ascending=False).head(15))

# Create a list of features we think are important for predicting property prices
relevant_columns = [
    # Target variable
    'price_value',
    
    # Core property attributes
    'surface',          # Size of the property
    'rooms',            # Number of rooms
    'bathrooms',        # Number of bathrooms
    'floor',            # Floor level
    'typology_name',    # Type of property (apartment, house, etc.)
    
    # Amenities/features
    'elevator',         # Presence of elevator
    'ga4Heating',       # Heating type
    'ga4Garage',        # Garage availability
    'ga4features',      # Additional features (e.g., garden, pool, etc.)
    
    # Location data (important for real estate)
    'latitude',
    'longitude',
    'city',
    'macrozone',        # Area within the city
    'zone',             # More specific location
    
    # Additional attributes that might influence price
    'isNew',            # New construction or not
    'luxury'            # Luxury property flag
]

# Check which columns from our selection actually exist in the dataset
existing_columns = [col for col in relevant_columns if col in df.columns]
missing_columns = [col for col in relevant_columns if col not in df.columns]

print("\nSelected columns that exist in the dataset:")
print(existing_columns)

print("\nSelected columns missing from the dataset:")
print(missing_columns)

# Create our dataset with only the relevant columns that exist
if 'zone' not in df.columns and 'macrozone' in df.columns:
    # If zone doesn't exist but macrozone does, use macrozone
    print("\nUsing 'macrozone' instead of 'zone' for location information")
    df['zone'] = df['macrozone']

# Create the subset dataframe
df_subset = df[existing_columns].copy()

# Display the first few rows of our subset
print("\nSubset DataFrame:")
df_subset.head()
df_subset['typology_name'].value_counts()

Looking in indexes: https://pypi.org/simple, https://consulting_feed_token:****@pkgs.dev.azure.com/RinaCUBE/a11c8c85-c3a2-4e18-ab34-382599832cf2/_packaging/Consulting_Libs/pypi/simple/
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Missing values per column:
matchSearch        1347
propertiesCount    1347
price_max          1347
price_min          1347
description        1345
ga4Garage           976
views               767
elevator            657
price_value         517
caption             387
macrozone           297
photo_caption       280
agency_name         214
agency_id           214
agency_type         214
dtype: int64

Selected columns that exist in the dataset:
['price_value', 'surface', 'rooms', 'bathrooms', 'floor', 'typology_name', 'elevator', 'ga4Heating', 'ga4Garage', 'ga4features', 'latitude', 'longitude', 'city', 'macrozone', 'isNew', 'luxury']

Selected columns missing from the dataset:


typology_name
Appartamento               1274
Attico                       26
Villa unifamiliare           22
Appartamento in villa         6
Palazzo - Edificio            5
Villa bifamiliare             3
Mansarda                      3
Loft                          2
Open space                    2
Villa a schiera               1
Sasso                         1
Villa plurifamiliare          1
Terratetto unifamiliare       1
Name: count, dtype: int64

# Filter the dataframe


In [None]:
# Filter the dataframe to include only rows with specific typology names from config
if 'typology_name' in df_subset.columns:
    df_subset = df_subset[df_subset['typology_name'].isin(CONFIG['filtering']['typologies_to_keep'])].copy()
    print(f"Filtered df_subset shape: {df_subset.shape}")
else:
    print("'typology_name' column not found in df_subset.")
    
# Filter out all cities having less than min_occurrences from config
# Get the city-specific threshold
min_occurrences_city = CONFIG['filtering']['min_occurrences']['city']
city_counts = df_subset['city'].value_counts()
cities_to_keep = city_counts[city_counts >= min_occurrences_city].index
df_subset = df_subset[df_subset['city'].isin(cities_to_keep)]
print(f"Filtered cities with fewer than {min_occurrences_city} occurrences.")
print(f"Remaining cities: {len(cities_to_keep)} out of {len(city_counts)}")
df_subset.city.value_counts()

# Floor Number Normalization

The `floor_number` column contains a mix of formats, including:
- Simple numeric values ('1', '2', '3', etc.)
- Italian text descriptions ('piano terra', 'piano rialzato')
- Floor ranges ('da 2 a 3', 'da seminterrato a piano terra')
- Multi-level properties ('su più livelli', 'piano terra, 1')

We'll normalize these values by converting them into numeric values, where:
- -3, -2, -1: Underground floors (interrato)
- 0: Ground floor (piano terra, seminterrato)
- 0.5: Mezzanine/raised ground floor (piano rialzato, ammezzato)
- 1-25: Standard floors
- For ranges or multiple floors, we'll use the highest floor as it typically represents the property's main position

In [None]:
# Create a function to normalize floor numbers

def normalize_floor(floor_str):
    """
    Normalize floor number strings to numeric values.
    
    Args:
        floor_str: String representation of floor(s)
        
    Returns:
        float: Normalized floor number
        
    Examples:
        '3' -> 3.0
        'piano terra' -> 0.0
        'da 2 a 5' -> 5.0 (taking the highest value)
        'seminterrato, 2' -> 2.0 (taking the highest value)
    """
    if pd.isna(floor_str):
        return np.nan
    
    floor_str = str(floor_str).lower().strip()
    
    # Use floor mapping from configuration
    floor_mapping = CONFIG['floor_mapping']
    # Convert None to np.nan
    floor_mapping = {k: (np.nan if v is None else v) for k, v in floor_mapping.items()}
    
    # Check direct mapping first
    if floor_str in floor_mapping:
        return float(floor_mapping[floor_str])
    
    # For complex strings, extract all numbers
    numbers = []
    
    # Look for Italian range pattern "da X a Y"
    range_match = re.search(r'da\s+(\d+|\w+)\s+a\s+(\d+|\w+)', floor_str)
    if range_match:
        # Extract the second number (end of range)
        end_value = range_match.group(2)
        if end_value.isdigit():
            numbers.append(int(end_value))
        # If the end value is a word (like "piano terra"), map it
        elif end_value in floor_mapping:
            numbers.append(floor_mapping[end_value])
    
    # Extract all digits
    digit_matches = re.findall(r'\d+', floor_str)
    numbers.extend([int(d) for d in digit_matches])
    
    # Extract all known floor types
    for term, value in floor_mapping.items():
        if term in floor_str and not pd.isna(value):  # Skip None/NaN values
            numbers.append(value)
    
    # Return the highest floor (most relevant for pricing)
    if numbers:
        return float(max(numbers))
    
    # If no numbers found, return NaN
    return np.nan

# Test the function on our unique values
floor_test_df = pd.DataFrame({'floor_number': df['floor_number'].unique()})
floor_test_df['normalized_floor'] = floor_test_df['floor_number'].apply(normalize_floor)

# Display the results to verify the normalization
floor_test_df.sort_values('normalized_floor').head(20)

In [None]:
# Perform more comprehensive data preprocessing on our subset

# Handle missing values and data conversion
df_clean = df_subset.copy()

# Convert string columns to appropriate numeric types
for col in ['surface', 'rooms', 'bathrooms']:
    if col in df_clean.columns:
        # Extract numbers from strings if necessary and convert to float
        if df_clean[col].dtype == 'object':
            df_clean[col] = df_clean[col].str.extract('(\d+\.?\d*)').astype(float)

# Handle floor information using our normalize_floor function
if 'floor_number' in df_clean.columns:
    # Apply our normalized floor function
    df_clean['floor_numeric'] = df_clean['floor_number'].apply(normalize_floor)
    
    # Drop the original floor column and keep the numeric version
    df_clean.drop('floor_number', axis=1, inplace=True)

elif 'floor' in df_clean.columns:
    # Apply our normalized floor function
    df_clean['floor_numeric'] = df_clean['floor'].apply(normalize_floor)
    
    # Drop the original floor column and keep the numeric version
    df_clean.drop('floor', axis=1, inplace=True)


## Clean up and collapse garage options
The `garage` column contains various formats and descriptions. We'll standardize these to a few categories
- 'no park': No parking space
- 'common park': common parking space
- 'private park': Private parking space
- 'common box': Common garage
- 'private box': Private garage

In [None]:
df['ga4Garage'].value_counts()

In [None]:
# Function to standardize garage options
def standardize_garage(garage_str):
    """
    Standardize garage descriptions into five categories:
    - 'no park': No parking space
    - 'common park': Common parking space
    - 'private park': Private parking space (not used in this case)
    - 'common box': Common garage/box
    - 'private box': Private garage/box
    
    For combined options, prioritize private box > common box > common park > no park
    
    Args:
        garage_str: String description of garage options
        
    Returns:
        str: Standardized category
    """
    if pd.isna(garage_str):
        return 'no park'
    
    garage_str = str(garage_str).lower()
    
    # Check for box privato (private box/garage)
    if 'box privato' in garage_str:
        return 'private box'
    
    # Check for parcheggio/garage comune (common parking/garage)
    if 'parcheggio' in garage_str or 'garage comune' in garage_str:
        return 'common park'
    
    # Default to no parking if nothing matches
    return 'no park'

# Let's create a more detailed function that considers counts and combinations
def detailed_garage_standardization(garage_str):
    """
    Create a more detailed standardization of garage options that considers:
    1. The priority type (private box > common park)
    2. The count of parking spaces
    
    Args:
        garage_str: String description of garage options
        
    Returns:
        dict: Dictionary with standardized categorization
              - 'primary_type': Main parking type (private_box, common_park, no_park)
              - 'private_box_count': Number of private boxes
              - 'common_park_count': Number of common parking spaces
              - 'total_count': Total number of parking spaces
    """
    if pd.isna(garage_str):
        return {
            'primary_type': 'no_park',
            'private_box_count': 0,
            'common_park_count': 0,
            'total_count': 0
        }
    
    garage_str = str(garage_str).lower()
    
    # Initialize counts
    private_box_count = 0
    common_park_count = 0
    
    # Look for private box counts
    if 'box privato' in garage_str:
        # Extract the count before "in box privato"
        box_matches = re.findall(r'(\d+)\s+in\s+box\s+privato', garage_str)
        if box_matches:
            private_box_count = sum(int(count) for count in box_matches)
    
    # Look for common parking counts
    if 'parcheggio' in garage_str or 'garage comune' in garage_str:
        # Extract the count before "in parcheggio/garage comune"
        park_matches = re.findall(r'(\d+)\s+in\s+parcheggio', garage_str)
        if park_matches:
            common_park_count = sum(int(count) for count in park_matches)
    
    # Determine primary type based on priority
    primary_type = 'no_park'
    if private_box_count > 0:
        primary_type = 'private_box'
    elif common_park_count > 0:
        primary_type = 'common_park'
    
    return {
        'primary_type': primary_type,
        'private_box_count': private_box_count,
        'common_park_count': common_park_count,
        'total_count': private_box_count + common_park_count
    }

# Apply the standardization to the DataFrame
if 'ga4Garage' in df_clean.columns:
    # Add a simple categorization
    df_clean['garage_type'] = df_clean['ga4Garage'].apply(standardize_garage)
    
    # Add detailed garage information
    garage_details = df_clean['ga4Garage'].apply(detailed_garage_standardization)
    df_clean['garage_primary_type'] = garage_details.apply(lambda x: x['primary_type'])
    df_clean['garage_total_count'] = garage_details.apply(lambda x: x['total_count'])
    
    # Drop the original column as we now have standardized versions
    df_clean.drop('ga4Garage', axis=1, inplace=True)

# Show the distribution of standardized garage types
if 'garage_type' in df_clean.columns:
    print("Standardized garage types distribution:")
    print(df_clean['garage_type'].value_counts())
    
    print("\nDetailed garage primary types:")
    print(df_clean['garage_primary_type'].value_counts())
    
    print("\nGarage space count distribution:")
    print(df_clean['garage_total_count'].value_counts().sort_index())

In [None]:
feature_counts = df_clean['ga4features'].str.split(',').explode().str.strip().value_counts()
print("Individual features and their counts:")
print(feature_counts)

# Check how many values meet our threshold from config
threshold = CONFIG['filtering']['feature_threshold']
top_features = feature_counts[feature_counts >= threshold].index.tolist()
print(f"\nFeatures with {threshold} or more occurrences: {len(top_features)}")
print(top_features)

In [None]:
# Create binary columns for features that occur at least feature_threshold times
threshold = CONFIG['filtering']['feature_threshold']
top_features = feature_counts[feature_counts >= threshold].index.tolist()

# Function to check if a feature is present in the comma-separated list
def has_feature(features_str, feature):
    if pd.isna(features_str):
        return 0
    
    feature_list = [f.strip() for f in str(features_str).split(',')]
    return 1 if feature in feature_list else 0

# Create binary columns for each top feature
for feature in top_features:
    # Create a sanitized column name (replace spaces with underscores, lowercase)
    feature_col_name = f"has_{feature.replace(' ', '_').lower()}"
    
    # Apply the function to create binary indicators
    df_clean[feature_col_name] = df_clean['ga4features'].apply(
        lambda x: has_feature(x, feature)
    )

# Drop the original ga4features column as we now have individual feature columns
df_clean.drop('ga4features', axis=1, inplace=True)

# Display the new binary columns
print("Sample of the DataFrame with new binary feature columns:")
print(df_clean[['price_value'] + [f"has_{feature.replace(' ', '_').lower()}" for feature in top_features[:5]]].head())

# Show the total counts for each binary feature to verify
binary_counts = pd.DataFrame({
    'feature': top_features,
    'count': [df_clean[f"has_{feature.replace(' ', '_').lower()}"].sum() for feature in top_features]
}).sort_values('count', ascending=False)

print("\nBinary feature counts:")
print(binary_counts)

In [None]:
# Filter out features that have less than min_occurrences occurrences
# Use column-specific thresholds from config
min_occurrences_dict = CONFIG['filtering']['min_occurrences']
default_min_occurrences = min_occurrences_dict['default']

# List of categorical columns to check for filtering
categorical_cols_to_filter = ['city', 'zone', 'typology_name', 'garage_type', 'garage_primary_type']
categorical_cols_to_filter = [col for col in categorical_cols_to_filter if col in df_clean.columns]

print(f"Filtering out values with fewer than specified occurrences in categorical columns:")
original_shape = df_clean.shape[0]

# For each categorical column, filter out values with fewer than min_occurrences
for col in categorical_cols_to_filter:
    # Get column-specific threshold (or default if not specified)
    threshold = min_occurrences_dict.get(col, default_min_occurrences)
    
    # Get value counts
    value_counts = df_clean[col].value_counts()
    
    # Find values to keep (those with at least the threshold occurrences)
    values_to_keep = value_counts[value_counts >= threshold].index
    
    # Filter the dataframe
    df_clean = df_clean[df_clean[col].isin(values_to_keep)]
    
    # Print how many values were kept for this column
    print(f"  - {col}: Kept {len(values_to_keep)} unique values out of {len(value_counts)} using threshold {threshold}")
    print(f"    Remaining records: {df_clean.shape[0]}")

# Show how many records were filtered out
records_removed = original_shape - df_clean.shape[0]
print(f"\nFiltering removed {records_removed} records ({records_removed/original_shape:.1%} of the original dataset)")
print(f"Shape after filtering: {df_clean.shape}")

# Show distribution of values in the filtered columns
for col in categorical_cols_to_filter:
    print(f"\n{col} value distribution after filtering:")
    print(df_clean[col].value_counts().head())

## Handle categorical columns


In [None]:
# Handle categorical features - exclude binary feature columns which are already 0/1
categorical_cols = ['typology_name', 'city', 'zone', 'elevator', 'ga4Heating', 'garage_type', 'garage_primary_type']
categorical_cols = [col for col in categorical_cols if col in df_clean.columns]

# Separate binary feature columns (starting with 'has_')
binary_feature_cols = [col for col in df_clean.columns if col.startswith('has_')]

# Handle numeric features
numeric_cols = ['surface', 'rooms', 'bathrooms', 'garage_total_count', 'floor_numeric']
numeric_cols = [col for col in numeric_cols if col in df_clean.columns]

# Fill missing values in numeric columns
for col in numeric_cols:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# Encode categorical variables
for col in categorical_cols:
    if df_clean[col].dtype == 'object':
        # Fill missing values with a placeholder
        df_clean[col] = df_clean[col].fillna('Unknown')
        
        # Create dummies with drop_first=True to avoid multicollinearity
        dummies = pd.get_dummies(df_clean[col], prefix=col, drop_first=True)
        
        # Add dummies to dataframe and drop the original column
        df_clean = pd.concat([df_clean, dummies], axis=1)
        df_clean.drop(col, axis=1, inplace=True)

# Handle boolean features
bool_cols = ['isNew', 'luxury'] + binary_feature_cols  # Include our binary feature columns
bool_cols = [col for col in bool_cols if col in df_clean.columns]

for col in bool_cols:
    df_clean[col] = df_clean[col].fillna(0).astype(int)

# Drop rows with missing target values
df_clean = df_clean.dropna(subset=['price_value'])

# Display the processed dataset
print("Processed dataset shape:", df_clean.shape)
print("\nProcessed dataset columns:")
print(df_clean.columns.tolist())
print("\nMissing values in processed dataset:")
print(df_clean.isnull().sum().sum())

# Preview the processed dataset
print("\nPreview of the processed dataset (first 3 rows, selected columns):")
preview_cols = ['price_value']
if 'floor_numeric' in df_clean.columns:
    preview_cols.append('floor_numeric')
if 'garage_total_count' in df_clean.columns:
    preview_cols.append('garage_total_count')
if binary_feature_cols:
    preview_cols += binary_feature_cols[:3]  # Add up to 3 binary feature columns
print(df_clean[preview_cols].head(3))

# Reusable Preprocessing Pipeline

Let's create a scikit-learn pipeline to make our preprocessing reusable and consistent. This will:
1. Handle all preprocessing steps in a single object
2. Apply the exact same transformations to new data during inference
3. Manage categorical variables properly, even when new categories appear
4. Preserve feature names for interpretation

In [None]:
# Custom transformer for extracting numbers from strings
class NumericExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        return X.apply(lambda x: pd.to_numeric(
            re.search(r'(\d+\.?\d*)', str(x)).group(1) if isinstance(x, str) and re.search(r'(\d+\.?\d*)', str(x)) else x, 
            errors='coerce'
        ))

# Custom transformer for floor normalization
class FloorNormalizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Use floor mapping from configuration
        self.floor_mapping = CONFIG['floor_mapping']
        # Convert None to np.nan
        self.floor_mapping = {k: (np.nan if v is None else v) for k, v in self.floor_mapping.items()}
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.apply(self._normalize_floor)
    
    def _normalize_floor(self, floor_str):
        # Check if this is a Series (when applied to a DataFrame column)
        if isinstance(floor_str, pd.Series):
            # Apply the function to each element in the Series
            return floor_str.apply(self._normalize_floor)
            
        # Process a single value
        if pd.isna(floor_str):
            return np.nan
        
        floor_str = str(floor_str).lower().strip()
        
        # Check direct mapping first
        if floor_str in self.floor_mapping:
            return float(self.floor_mapping[floor_str])
        
        # For complex strings, extract all numbers
        numbers = []
        
        # Look for Italian range pattern "da X a Y"
        range_match = re.search(r'da\s+(\d+|\w+)\s+a\s+(\d+|\w+)', floor_str)
        if range_match:
            # Extract the second number (end of range)
            end_value = range_match.group(2)
            if end_value.isdigit():
                numbers.append(int(end_value))
            # If the end value is a word (like "piano terra"), map it
            elif end_value in self.floor_mapping:
                numbers.append(self.floor_mapping[end_value])
        
        # Extract all digits
        digit_matches = re.findall(r'\d+', floor_str)
        numbers.extend([int(d) for d in digit_matches])
        
        # Extract all known floor types
        for term, value in self.floor_mapping.items():
            if term in floor_str and not pd.isna(value):  # Skip None/NaN values
                numbers.append(value)
        
        # Return the highest floor (most relevant for pricing)
        if numbers:
            return float(max(numbers))
        
        # If no numbers found, return NaN
        return np.nan

# Define a function that creates and returns a complete preprocessing pipeline
def create_preprocessing_pipeline(df=None):
    """
    Creates a scikit-learn preprocessing pipeline for the real estate data.
    
    Args:
        df: Optional DataFrame used for detecting columns.
              If None, the pipeline must be initialized with explicit columns later.
    
    Returns:
        A tuple containing:
        - preprocessor: A scikit-learn ColumnTransformer
        - column_info: A dictionary with information about detected columns
    """
    # Initialize column info dictionary
    column_info = {
        'numeric_cols': [],
        'floor_col': None,
        'categorical_cols': [],
        'bool_cols': [],
        'binary_features_cols': []
    }
    
    if df is not None:
        # Auto-detect columns from the dataframe
        numeric_cols = ['surface', 'rooms', 'bathrooms', 'garage_total_count'] 
        numeric_cols = [col for col in numeric_cols if col in df.columns]
        column_info['numeric_cols'] = numeric_cols
        
        floor_col = 'floor_number' if 'floor_number' in df.columns else 'floor'
        floor_col = floor_col if floor_col in df.columns else None
        column_info['floor_col'] = floor_col
        
        categorical_cols = ['typology_name', 'city', 'zone', 'elevator', 'ga4Heating', 
                           'garage_type', 'garage_primary_type']
        categorical_cols = [col for col in categorical_cols if col in df.columns]
        column_info['categorical_cols'] = categorical_cols
        
        bool_cols = ['isNew', 'luxury']
        bool_cols = [col for col in bool_cols if col in df.columns]
        column_info['bool_cols'] = bool_cols
        
        # Detect binary feature columns (starting with 'has_')
        binary_features_cols = [col for col in df.columns if col.startswith('has_')]
        column_info['binary_features_cols'] = binary_features_cols
    
    # Create transformers for each column type
    transformers = []
    
    # Numeric columns: extract numbers, impute missing, and scale
    if column_info['numeric_cols']:
        numeric_pipeline = Pipeline([
            ('extract_numeric', NumericExtractor()),
            ('impute', SimpleImputer(strategy='median')),
            ('scale', StandardScaler())
        ])
        transformers.append(('numeric', numeric_pipeline, column_info['numeric_cols']))
    
    # Floor column: normalize floor values and handle special cases
    if column_info['floor_col']:
        floor_pipeline = Pipeline([
            ('normalize_floor', FloorNormalizer()),
            ('impute', SimpleImputer(strategy='median'))
        ])
        transformers.append(('floor', floor_pipeline, [column_info['floor_col']]))
    
    # Categorical columns: one-hot encode with handling for unknown values
    if column_info['categorical_cols']:
        categorical_pipeline = Pipeline([
            ('impute', SimpleImputer(strategy='constant', fill_value='Unknown')),
            # Convert all values to strings to ensure uniform type
            ('to_string', FunctionTransformer(lambda X: X.astype(str))),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
        ])
        transformers.append(('categorical', categorical_pipeline, column_info['categorical_cols']))
    
    # Boolean columns: convert to int
    if column_info['bool_cols']:
        bool_pipeline = Pipeline([
            ('to_int', FunctionTransformer(lambda X: X.astype(int))),
            ('impute', SimpleImputer(strategy='most_frequent'))
        ])
        transformers.append(('boolean', bool_pipeline, column_info['bool_cols']))
    
    # Binary feature columns: already binary, just impute missing values
    if column_info['binary_features_cols']:
        binary_features_pipeline = Pipeline([
            ('impute', SimpleImputer(strategy='most_frequent'))
        ])
        transformers.append(('binary_features', binary_features_pipeline, column_info['binary_features_cols']))
    
    # Create the final preprocessor
    preprocessor = ColumnTransformer(
        transformers=transformers,
        remainder='drop'  # Drop any columns not explicitly listed
    )
    
    return preprocessor, column_info

# Create the pipeline based on our current dataframe
preprocessor, column_info = create_preprocessing_pipeline(df_subset)

# Now we have a reusable preprocessor that can be:
# 1. Fit on training data: preprocessor.fit(X_train)
# 2. Used to transform both train and test: X_train_processed = preprocessor.transform(X_train)
# 3. Saved to disk for later use: pickle.dump((preprocessor, column_info), open('preprocessor.pkl', 'wb'))
# 4. Loaded and applied to new data: pickle.load(open('preprocessor.pkl', 'rb'))[0].transform(new_data)

# Let's demonstrate how to use it with our current data
print("Using the preprocessing pipeline:")

# Create a copy of our data for demonstration
X = df_subset.copy()
y = None
if 'price_value' in X.columns:
    y = X['price_value']
    X = X.drop('price_value', axis=1)

# Show some sample rows before preprocessing
print("\nBefore preprocessing (first 3 rows, selected columns):")
display_cols = ['surface', 'rooms']
if 'floor_number' in X.columns:
    display_cols.append('floor_number')
elif 'floor' in X.columns:
    display_cols.append('floor')
if 'city' in X.columns:
    display_cols.append('city')
print(X[display_cols].head(3))

# Fit and transform
X_transformed = preprocessor.fit_transform(X)

# Access column information from our stored dictionary
numeric_cols = column_info['numeric_cols']
floor_col = column_info['floor_col']
categorical_cols = column_info['categorical_cols']
bool_cols = column_info['bool_cols']
binary_features_cols = column_info['binary_features_cols']

# Get feature names for the transformed data using the number of output features
# This ensures we have the correct number of names
n_features = X_transformed.shape[1]
print(f"Transformed data has {n_features} features")

# Create generic feature names if needed
feature_names = [f"feature_{i}" for i in range(n_features)]

# Try to get actual feature names where possible
feature_index = 0
numeric_cols_out = []
if 'numeric' in preprocessor.named_transformers_ and len(numeric_cols) > 0:
    numeric_cols_out = numeric_cols
    for i, col in enumerate(numeric_cols):
        if feature_index < n_features:
            feature_names[feature_index] = col
            feature_index += 1

floor_col_out = []
if 'floor' in preprocessor.named_transformers_ and floor_col is not None:
    floor_col_out = ['floor_numeric']
    if feature_index < n_features:
        feature_names[feature_index] = 'floor_numeric'
        feature_index += 1

cat_cols_out = []
if 'categorical' in preprocessor.named_transformers_ and len(categorical_cols) > 0:
    ohe = preprocessor.named_transformers_['categorical'].named_steps['onehot']
    cat_cols_out = ohe.get_feature_names_out(categorical_cols).tolist()
    for i, col in enumerate(cat_cols_out):
        if feature_index < n_features:
            feature_names[feature_index] = col
            feature_index += 1

bool_cols_out = []
if 'boolean' in preprocessor.named_transformers_ and len(bool_cols) > 0:
    bool_cols_out = bool_cols
    for i, col in enumerate(bool_cols):
        if feature_index < n_features:
            feature_names[feature_index] = col
            feature_index += 1

binary_features_cols_out = []
if 'binary_features' in preprocessor.named_transformers_ and len(binary_features_cols) > 0:
    binary_features_cols_out = binary_features_cols
    for i, col in enumerate(binary_features_cols):
        if feature_index < n_features:
            feature_names[feature_index] = col
            feature_index += 1

# Convert to DataFrame to show the transformed data
X_transformed_df = pd.DataFrame(X_transformed, columns=feature_names)
print("\nAfter preprocessing (first 3 rows, selected features):")
print(X_transformed_df.head(3))

print(f"\nTransformed data shape: {X_transformed.shape} with {len(feature_names)} features")
print(f"Number of feature types: {len(numeric_cols_out)} numeric, {len(floor_col_out)} floor, " 
      f"{len(cat_cols_out)} categorical, {len(bool_cols_out)} boolean, {len(binary_features_cols_out)} binary features")

# Example of saving the pipeline to disk
# pickle.dump((preprocessor, column_info), open('preprocessor.pkl', 'wb'))
# print("\nPreprocessor saved to 'preprocessor.pkl'")

# Example of how you would use this with new data for prediction
print("\nExample of using the pipeline with new data:")
print("# Load the pipeline and column info")
print("preprocessor, column_info = pickle.load(open('preprocessor.pkl', 'rb'))")
print("")
print("# Create new data with the same features as used during training")
print("new_data = pd.DataFrame([{")
print("    'surface': '120', ")
print("    'rooms': '3', ")
print(f"    '{column_info['floor_col']}': 'piano terra', ")
print("    'city': 'Genova',")
print("    # Add all other features that were used during training...")
print("}])")
print("")
print("# Make prediction")
print("preprocessed_features = preprocessor.transform(new_data)  # Returns preprocessed features ready for the model")

In [None]:
# Build and evaluate ML models using our cleaned dataset

# Drop rows with any remaining NaN values for training
df_ml = df_clean.dropna()
print(f"Dataset shape after dropping NaN values: {df_ml.shape}")

# Check and remove any non-numeric columns before modeling
numeric_columns = df_ml.select_dtypes(include=['number']).columns.tolist()
print(f"Number of numeric columns: {len(numeric_columns)}")

if 'price_value' not in numeric_columns:
    print("Warning: price_value column is not numeric!")
else:
    # Separate features and target
    X = df_ml[numeric_columns].drop('price_value', axis=1)
    y = df_ml['price_value']
    
    print("Data types in features:")
    print(X.dtypes.value_counts())

    # Split the data using parameters from config
    test_size = CONFIG['ml']['test_size']
    random_state = CONFIG['ml']['random_state']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    print(f"Training set: {X_train.shape}")
    print(f"Test set: {X_test.shape}")

    # Define a function to evaluate models
    def evaluate_model(model, X_train, X_test, y_train, y_test):
        # Fit model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        
        print(f"Mean Absolute Error: €{mae:.2f}")
        print(f"Root Mean Squared Error: €{rmse:.2f}")
        print(f"R² Score: {r2:.4f}")
        
        return model, mae, rmse, r2

    # Train and evaluate multiple models
    # Scale the features for linear models
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

# Linear models (using scaled data)
print("\n--- Linear Regression ---")
lr_model, lr_mae, lr_rmse, lr_r2 = evaluate_model(
    LinearRegression(), X_train_scaled, X_test_scaled, y_train, y_test
)

print("\n--- Ridge Regression ---")
ridge_model, ridge_mae, ridge_rmse, ridge_r2 = evaluate_model(
    Ridge(alpha=1.0), X_train_scaled, X_test_scaled, y_train, y_test
)

# Get n_estimators from config
n_estimators = CONFIG['ml']['n_estimators']
random_state = CONFIG['ml']['random_state']

# Tree-based models (don't need scaling)
print("\n--- Random Forest ---")
rf_model, rf_mae, rf_rmse, rf_r2 = evaluate_model(
    RandomForestRegressor(n_estimators=n_estimators, random_state=random_state), X_train, X_test, y_train, y_test
)

print("\n--- Gradient Boosting ---")
gb_model, gb_mae, gb_rmse, gb_r2 = evaluate_model(
    GradientBoostingRegressor(n_estimators=n_estimators, random_state=random_state), X_train, X_test, y_train, y_test
)

# Compare model performance
models = ['Linear Regression', 'Ridge Regression', 'Random Forest', 'Gradient Boosting']
r2_scores = [lr_r2, ridge_r2, rf_r2, gb_r2]
mae_scores = [lr_mae, ridge_mae, rf_mae, gb_mae]

# Find best model based on R² score
best_model_idx = np.argmax(r2_scores)
print(f"\nBest model based on R² score: {models[best_model_idx]} (R² = {r2_scores[best_model_idx]:.4f})")

# If we have RandomForest or GradientBoosting, check feature importance
if best_model_idx >= 2:  # Index 2 or 3 (tree-based models)
    best_model = [lr_model, ridge_model, rf_model, gb_model][best_model_idx]
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    print("\nFeature Importance:")
    print(feature_importance.head(10))
    
    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['Feature'].head(10)[::-1], 
             feature_importance['Importance'].head(10)[::-1])
    plt.xlabel('Importance')
    plt.title(f'Top 10 Most Important Features ({models[best_model_idx]})')
    plt.tight_layout()
    plt.show()

# Building Models with the Pipeline

Now we'll use our preprocessing pipeline to build a complete model pipeline that includes:
1. The preprocessing steps from above
2. The model training steps
3. Evaluation metrics

This approach ensures that the entire workflow from raw data to predictions is encapsulated in a single pipeline object.

In [None]:
# Prepare data: X and y from our subset
X = df_subset.copy()
if 'price_value' in X.columns:
    y = X.pop('price_value')  # Remove and get the target
else:
    raise ValueError("Target 'price_value' not found in the dataframe")

# Check for and remove NaN values
nan_mask = y.notna()
X = X[nan_mask]
y = y[nan_mask]

print(f"Removed {sum(~nan_mask)} rows with NaN target values")
print(f"Shape after removing NaN values: X: {X.shape}, y: {y.shape}")

# Use train_test_split parameters from config
test_size = CONFIG['ml']['test_size']
random_state = CONFIG['ml']['random_state']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Create a function to build a complete pipeline with preprocessing and model
def build_model_pipeline(model, preprocessor=None, column_info=None):
    """
    Builds a complete sklearn pipeline with preprocessing and model.
    
    Args:
        model: A scikit-learn model/estimator
        preprocessor: A fitted or unfitted preprocessing pipeline
        column_info: Dictionary with column information
        
    Returns:
        A complete pipeline
    """
    if preprocessor is None:
        # Create a new preprocessor if one isn't provided
        preprocessor, column_info = create_preprocessing_pipeline()
    
    # Create a pipeline that first preprocesses the data, then fits the model
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    return pipeline, column_info

# Define evaluation function
def evaluate_model_pipeline(pipeline, X_train, X_test, y_train, y_test, model_name="Model"):
    """
    Fits a model pipeline and evaluates its performance.
    
    Args:
        pipeline: A scikit-learn pipeline with preprocessing and model
        X_train: Training features
        X_test: Test features
        y_train: Training target
        y_test: Test target
        model_name: Name of the model for display
        
    Returns:
        The fitted pipeline and performance metrics
    """
    # Fit the pipeline on training data
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)
    
    # Calculate metrics
    train_mae = mean_absolute_error(y_train, y_pred_train)
    test_mae = mean_absolute_error(y_test, y_pred_test)
    
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    
    # Print results
    print(f"\n--- {model_name} ---")
    print(f"Training MAE: €{train_mae:.2f}, Test MAE: €{test_mae:.2f}")
    print(f"Training RMSE: €{train_rmse:.2f}, Test RMSE: €{test_rmse:.2f}")
    print(f"Training R²: {train_r2:.4f}, Test R²: {test_r2:.4f}")
    
    # Return the fitted pipeline and metrics
    return pipeline, {
        'name': model_name,
        'train_mae': train_mae,
        'test_mae': test_mae,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'train_r2': train_r2,
        'test_r2': test_r2
    }

# Create a shared preprocessor for all models
preprocessor, column_info = create_preprocessing_pipeline(X)

# Get n_estimators from config
n_estimators = CONFIG['ml']['n_estimators']
random_state = CONFIG['ml']['random_state']

# Define models
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge(alpha=1.0)),
    ('Random Forest', RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)),
    ('Gradient Boosting', GradientBoostingRegressor(n_estimators=n_estimators, random_state=random_state))
]

# Train and evaluate all models
results = []
trained_pipelines = {}

for name, model in models:
    pipeline, _ = build_model_pipeline(model, preprocessor, column_info)
    trained_pipeline, metrics = evaluate_model_pipeline(
        pipeline, X_train, X_test, y_train, y_test, name
    )
    results.append(metrics)
    trained_pipelines[name] = trained_pipeline

# Compare model performance with a visualization
results_df = pd.DataFrame(results)
best_model = results_df.loc[results_df['test_r2'].idxmax()]

print(f"\nBest model based on Test R² score: {best_model['name']} (R² = {best_model['test_r2']:.4f})")

# Plot R² scores
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
bars = plt.bar(results_df['name'], results_df['test_r2'], color='skyblue')
plt.title('Test R² Scores by Model')
plt.ylabel('R² Score (higher is better)')
plt.xticks(rotation=45)
plt.tight_layout()

# Highlight the best model
best_idx = results_df['test_r2'].idxmax()
bars[best_idx].set_color('darkblue')

# Plot MAE scores
plt.subplot(1, 2, 2)
bars = plt.bar(results_df['name'], results_df['test_mae'], color='lightcoral')
plt.title('Test MAE Scores by Model')
plt.ylabel('Mean Absolute Error € (lower is better)')
plt.xticks(rotation=45)
plt.tight_layout()

# Highlight the best model by MAE
best_mae_idx = results_df['test_mae'].idxmin()
bars[best_mae_idx].set_color('darkred')

plt.show()

# If the best model is Random Forest or Gradient Boosting, check feature importance
best_model_name = best_model['name']
if best_model_name in ['Random Forest', 'Gradient Boosting']:
    # Get the trained model from the pipeline
    model_step = trained_pipelines[best_model_name].named_steps['model']
    
    # Get the preprocessor to get feature names
    preprocessor_step = trained_pipelines[best_model_name].named_steps['preprocessor']
    
    # Extract feature names from the column information
    feature_names = []
    
    # Add numeric features
    if column_info['numeric_cols']:
        feature_names.extend(column_info['numeric_cols'])
    
    # Add floor feature
    if column_info['floor_col'] is not None:
        feature_names.append('floor_numeric')
    
    # Add categorical features (one-hot encoded)
    if column_info['categorical_cols']:
        categorical_transformer = preprocessor_step.named_transformers_.get('categorical')
        if categorical_transformer:
            ohe = categorical_transformer.named_steps['onehot']
            categorical_features = ohe.get_feature_names_out(column_info['categorical_cols']).tolist()
            feature_names.extend(categorical_features)
    
    # Add boolean features
    if column_info['bool_cols']:
        feature_names.extend(column_info['bool_cols'])
    
    # Add binary features from ga4features extraction
    if 'binary_features_cols' in column_info and column_info['binary_features_cols']:
        feature_names.extend(column_info['binary_features_cols'])
    
    # Create feature importance dataframe
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': model_step.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    print("\nFeature Importance:")
    print(feature_importance.head(15))  # Show top 15 instead of 10 to see more binary features
    
    # Plot feature importance
    plt.figure(figsize=(12, 8))  # Larger figure to accommodate more features
    plt.barh(feature_importance['Feature'].head(15)[::-1], 
             feature_importance['Importance'].head(15)[::-1])
    plt.xlabel('Importance')
    plt.title(f'Top 15 Most Important Features ({best_model_name})')
    plt.tight_layout()
    plt.show()
    
    # Specifically analyze the importance of the binary features from ga4features
    if 'binary_features_cols' in column_info and column_info['binary_features_cols']:
        binary_features_importance = feature_importance[
            feature_importance['Feature'].isin(column_info['binary_features_cols'])
        ].sort_values('Importance', ascending=False)
        
        print("\nImportance of Property Features (ga4features):")
        print(binary_features_importance)
        
        # Plot just the binary features importance if there are enough
        if len(binary_features_importance) > 3:
            plt.figure(figsize=(10, 6))
            plt.barh(binary_features_importance['Feature'].head(10)[::-1], 
                     binary_features_importance['Importance'].head(10)[::-1])
            plt.xlabel('Importance')
            plt.title(f'Property Features Importance ({best_model_name})')
            plt.tight_layout()
            plt.show()

# Save the best model for future use
best_pipeline = trained_pipelines[best_model_name]
# Save both the pipeline and column_info for later use
# pickle.dump({'pipeline': best_pipeline, 'column_info': column_info}, 
#           open(f'best_model_{best_model_name.replace(" ", "_").lower()}.pkl', 'wb'))
print(f"\nBest model pipeline is ready for use. You can save it using:")
print(f"pickle.dump({{'pipeline': best_pipeline, 'column_info': column_info}}, open('best_model.pkl', 'wb'))")

# Example of how to use the saved model for prediction
print("\nExample of using the saved model for prediction:")
print("# Load the model")
print("saved_data = pickle.load(open('best_model.pkl', 'rb'))")
print("model = saved_data['pipeline']")
print("column_info = saved_data['column_info']")
print("")
print("# Create new data with the same features as the training data")
print("new_data = pd.DataFrame([{")
print("    'surface': '120', ")
print("    'rooms': '3', ")
print(f"    '{column_info['floor_col']}': 'piano terra', ")
print("    'city': 'Genova',")
print("    'has_arredato': 1,  # Example of binary feature")
print("    'has_balcone': 1,   # Example of binary feature")
print("    # Add all other features that were used during training...")
print("}])")
print("")
print("# Make prediction")
print("prediction = model.predict(new_data)")
print("print(f'Predicted price: €{prediction[0]:.2f}')")

# Feature Selection

Let's add the ability to select specific features for our model. This can help:
1. Reduce model complexity and improve interpretability
2. Focus on the most important predictors
3. Remove redundant or irrelevant features
4. Create simpler models that may generalize better

We'll implement both manual feature selection and automated feature selection methods.

In [None]:
%pip install ipywidgets

# First, let's explore all available features in our dataset
X = df_subset.copy()
if 'price_value' in X.columns:
    y = X.pop('price_value')  # Remove the target variable
else:
    raise ValueError("Target 'price_value' not found in the dataframe")

# Get a clean mask for non-NaN values in the target
nan_mask = y.notna()
X = X[nan_mask]
y = y[nan_mask]

# Get the preprocessor to extract feature names
preprocessor, column_info = create_preprocessing_pipeline(X)

# Initialize lists to store different types of features
all_features = []

# Add numeric features
numeric_features = []
if column_info['numeric_cols']:
    numeric_features = column_info['numeric_cols']
    all_features.extend(numeric_features)
    
# Add floor feature
floor_feature = None
if column_info['floor_col'] is not None:
    floor_feature = 'floor_numeric'
    all_features.append(floor_feature)
    
# Add categorical features
categorical_features = []
if column_info['categorical_cols']:
    categorical_features = column_info['categorical_cols']
    all_features.extend(categorical_features)
    
# Add boolean features
boolean_features = []
if column_info['bool_cols']:
    boolean_features = column_info['bool_cols']
    all_features.extend(boolean_features)
    
# Add binary features from ga4features
binary_features = []
if 'binary_features_cols' in column_info and column_info['binary_features_cols']:
    binary_features = column_info['binary_features_cols']
    all_features.extend(binary_features)

print(f"Total number of potential features: {len(all_features)}")
print(f"\nNumeric features ({len(numeric_features)}):")
print(numeric_features)

print(f"\nFloor feature:")
print(floor_feature)

print(f"\nCategorical features ({len(categorical_features)}):")
print(categorical_features)

print(f"\nBoolean features ({len(boolean_features)}):")
print(boolean_features)

print(f"\nBinary features from property characteristics ({len(binary_features)}):")
print(binary_features[:10])  # Show first 10 to avoid overwhelming output
if len(binary_features) > 10:
    print(f"... and {len(binary_features) - 10} more binary features")

# Create feature selection widgets
feature_groups = {
    "Numeric Features": numeric_features,
    "Floor Feature": [floor_feature] if floor_feature else [],
    "Categorical Features": categorical_features,
    "Boolean Features": boolean_features,
    "Binary Property Features": binary_features
}

# Dictionary to store the selected features
selected_features = {}

# Create UI for each feature group
for group_name, features in feature_groups.items():
    if not features:  # Skip empty feature groups
        continue
        
    print(f"\n## {group_name}")
    
    # Create a checkbox widget for each feature
    checkboxes = []
    for feature in features:
        checkbox = widgets.Checkbox(
            value=True,  # Default to selected
            description=feature,
            disabled=False
        )
        checkboxes.append((feature, checkbox))
        display(checkbox)
    
    # Store checkboxes for later access
    selected_features[group_name] = checkboxes

# Create a button to finalize feature selection
select_button = widgets.Button(
    description='Apply Feature Selection',
    button_style='success',
    tooltip='Click to apply the selected features',
    icon='check'
)

# Function to handle button click
def on_select_button_clicked(b):
    # Collect all selected features
    final_selected_features = []
    
    for group_name, feature_checkboxes in selected_features.items():
        for feature, checkbox in feature_checkboxes:
            if checkbox.value:
                final_selected_features.append(feature)
    
    print(f"\nSelected {len(final_selected_features)} features for modeling:")
    print(final_selected_features)
    
    # Store in a global variable for use in next cells
    global user_selected_features
    user_selected_features = final_selected_features
    
    # Create a modified version of the preprocessing pipeline that only includes selected features
    global user_selected_column_info
    user_selected_column_info = {
        'numeric_cols': [f for f in numeric_features if f in final_selected_features],
        'floor_col': column_info['floor_col'] if floor_feature in final_selected_features else None,
        'categorical_cols': [f for f in categorical_features if f in final_selected_features],
        'bool_cols': [f for f in boolean_features if f in final_selected_features],
        'binary_features_cols': [f for f in binary_features if f in final_selected_features]
    }
    
    print("\nFeature selection applied successfully!")
    print("You can now train models with these features in the next cell.")

select_button.on_click(on_select_button_clicked)
display(select_button)

# Provide instructions
display(HTML("<p><i>Select the features you want to include in your model, then click 'Apply Feature Selection'</i></p>"))

In [None]:
# Automated Feature Selection Methods

# Make sure we have the preprocessed data
X = df_subset.copy()
if 'price_value' in X.columns:
    y = X.pop('price_value')
else:
    raise ValueError("Target 'price_value' not found in the dataframe")

# Handle NaN values
nan_mask = y.notna()
X = X[nan_mask]
y = y[nan_mask]

# Split data using parameters from config
test_size = CONFIG['ml']['test_size']
random_state = CONFIG['ml']['random_state']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

# Create a preprocessor
preprocessor, column_info = create_preprocessing_pipeline(X)

# Fit and transform training data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Get feature names
feature_names = []

# Add numeric features
if column_info['numeric_cols']:
    feature_names.extend(column_info['numeric_cols'])

# Add floor feature
if column_info['floor_col'] is not None:
    feature_names.append('floor_numeric')

# Add categorical features (one-hot encoded)
if column_info['categorical_cols']:
    categorical_transformer = preprocessor.named_transformers_.get('categorical')
    if categorical_transformer:
        ohe = categorical_transformer.named_steps['onehot']
        cat_features = ohe.get_feature_names_out(column_info['categorical_cols']).tolist()
        feature_names.extend(cat_features)

# Add boolean features
if column_info['bool_cols']:
    feature_names.extend(column_info['bool_cols'])

# Add binary features
if 'binary_features_cols' in column_info and column_info['binary_features_cols']:
    feature_names.extend(column_info['binary_features_cols'])

print(f"Number of features after preprocessing: {len(feature_names)}")
print(f"Shape of X_train_processed: {X_train_processed.shape}")

# Get top_features_to_select from config
top_n_features = CONFIG['ml']['top_features_to_select']

# Method 1: Correlation with Target
print("\n## Method 1: Correlation with Target (F-regression)")
selector = SelectKBest(score_func=f_regression, k=min(top_n_features, len(feature_names)))
X_new = selector.fit_transform(X_train_processed, y_train)

# Get scores and p-values
scores = selector.scores_
p_values = selector.pvalues_

# Debug: Check lengths of arrays
print(f"Length of feature_names: {len(feature_names)}")
print(f"Length of scores: {len(scores)}")
print(f"Length of p_values: {len(p_values)}")

# Make sure feature_names matches the number of features in X_train_processed
if len(feature_names) != X_train_processed.shape[1]:
    print("Warning: Feature names length doesn't match the number of features in the processed data!")
    # If more feature names than actual features, truncate feature_names
    if len(feature_names) > X_train_processed.shape[1]:
        feature_names = feature_names[:X_train_processed.shape[1]]
        print(f"Truncated feature_names to match X_train_processed shape: {len(feature_names)}")
    # If fewer feature names than actual features, add generic names
    else:
        additional_features = X_train_processed.shape[1] - len(feature_names)
        feature_names.extend([f"unknown_feature_{i}" for i in range(additional_features)])
        print(f"Added {additional_features} generic feature names to match X_train_processed shape")

# Create DataFrame with feature scores
feature_scores = pd.DataFrame({
    'Feature': feature_names,
    'Score': scores,
    'P-value': p_values
})

# Sort by score in descending order
feature_scores = feature_scores.sort_values('Score', ascending=False)

print(f"\nTop {top_n_features} features by correlation with target:")
print(feature_scores.head(top_n_features))

# Plot top features
plt.figure(figsize=(12, 8))
sns.barplot(x='Score', y='Feature', data=feature_scores.head(top_n_features))
plt.title(f'Top {top_n_features} Features by F-regression Score')
plt.tight_layout()
plt.show()

# Method 2: Recursive Feature Elimination (RFE)
print("\n## Method 2: Recursive Feature Elimination (RFE)")
n_features_to_select = min(top_n_features, len(feature_names))
n_estimators = CONFIG['ml']['n_estimators']
random_state = CONFIG['ml']['random_state']
estimator = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
rfe = RFE(estimator, n_features_to_select=n_features_to_select)

# Fit RFE
rfe.fit(X_train_processed, y_train)

# Get selected features
selected_mask = rfe.support_
ranking = rfe.ranking_

# Create DataFrame with feature ranking
feature_ranking = pd.DataFrame({
    'Feature': feature_names,
    'Selected': selected_mask,
    'Ranking': ranking
})

# Sort by ranking (ascending = better)
feature_ranking = feature_ranking.sort_values('Ranking')

print("\nTop features selected by RFE:")
print(feature_ranking[feature_ranking['Selected']].reset_index(drop=True))

# Plot ranking
plt.figure(figsize=(12, 8))
sns.barplot(x='Ranking', y='Feature', 
            data=feature_ranking.sort_values('Ranking').head(top_n_features))
plt.title('Feature Ranking by RFE (lower is better)')
plt.tight_layout()
plt.show()

# Method 3: Feature Importance from Random Forest
print("\n## Method 3: Feature Importance from Random Forest")
n_estimators = CONFIG['ml']['n_estimators']
random_state = CONFIG['ml']['random_state']
rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
rf.fit(X_train_processed, y_train)

# Get feature importances
importances = rf.feature_importances_

# Create DataFrame with feature importances
feature_importances = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Sort by importance in descending order
feature_importances = feature_importances.sort_values('Importance', ascending=False)

print(f"\nTop {top_n_features} features by Random Forest importance:")
print(feature_importances.head(top_n_features))

# Plot importances
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importances.head(top_n_features))
plt.title('Feature Importance from Random Forest')
plt.tight_layout()
plt.show()

# Create a button to choose top features automatically
top_n_slider = widgets.IntSlider(
    value=top_n_features,
    min=5,
    max=min(50, len(feature_names)),
    step=1,
    description='Top N features:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)

method_dropdown = widgets.Dropdown(
    options=['Correlation (F-regression)', 'RFE', 'Random Forest Importance'],
    value='Random Forest Importance',
    description='Method:',
    disabled=False,
)

auto_select_button = widgets.Button(
    description='Select Top Features',
    button_style='primary',
    tooltip='Click to select top features using the chosen method',
    icon='check'
)

display(method_dropdown)
display(top_n_slider)
display(auto_select_button)

# Function to handle automatic selection
def on_auto_select_clicked(b):
    method = method_dropdown.value
    n = top_n_slider.value
    
    if method == 'Correlation (F-regression)':
        top_features = feature_scores.head(n)['Feature'].tolist()
    elif method == 'RFE':
        top_features = feature_ranking.head(n)['Feature'].tolist()
    else:  # Random Forest Importance
        top_features = feature_importances.head(n)['Feature'].tolist()
    
    print(f"\nAutomatically selected top {n} features using {method}:")
    print(top_features)
    
    # Store in a global variable for use in next cells
    global auto_selected_features
    auto_selected_features = top_features
    
    # Create a modified version of the column info
    global auto_selected_column_info
    auto_selected_column_info = {
        'numeric_cols': [f for f in column_info['numeric_cols'] if f in top_features],
        'floor_col': column_info['floor_col'] if 'floor_numeric' in top_features else None,
        'categorical_cols': [],  # We'll handle these differently as they're one-hot encoded
        'bool_cols': [f for f in column_info['bool_cols'] if f in top_features],
        'binary_features_cols': [f for f in column_info.get('binary_features_cols', []) if f in top_features]
    }
    
    # Handle categorical features (need to match original feature names before one-hot encoding)
    for cat_feature in column_info['categorical_cols']:
        for top_feature in top_features:
            if top_feature.startswith(cat_feature + '_'):
                if cat_feature not in auto_selected_column_info['categorical_cols']:
                    auto_selected_column_info['categorical_cols'].append(cat_feature)
    
    print("\nAutomatic feature selection applied successfully!")
    print("You can now train models with these features in the next cell.")

auto_select_button.on_click(on_auto_select_clicked)

# Display help text
display(HTML("""<p><i>Choose the feature selection method and number of features to include, 
then click 'Select Top Features'</i></p>"""))

In [None]:
# Train model with selected features

# Function to train and evaluate models with selected features
def train_model_with_selected_features(selected_column_info, selection_method="user"):
    """
    Train and evaluate models using only the selected features
    
    Args:
        selected_column_info: Dictionary with information about selected columns
        selection_method: String describing the selection method (for display purposes)
    """
    print(f"\n## Training models with {selection_method} selected features")
    
    # Prepare data
    X = df_subset.copy()
    if 'price_value' in X.columns:
        y = X.pop('price_value')
    else:
        raise ValueError("Target 'price_value' not found in the dataframe")

    # Handle NaN values
    nan_mask = y.notna()
    X = X[nan_mask]
    y = y[nan_mask]

    # Split data using parameters from config
    test_size = CONFIG['ml']['test_size']
    random_state = CONFIG['ml']['random_state']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # Create a custom preprocessor with selected features
    preprocessor = create_preprocessing_pipeline(X)[0]  # Reuse existing function but extract only preprocessor
    
    # Get model parameters from config
    n_estimators = CONFIG['ml']['n_estimators']
    random_state = CONFIG['ml']['random_state']
    
    # Define models to train - REUSING the same models as before but with config parameters
    models = [
        ('Random Forest', RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)),
        ('Gradient Boosting', GradientBoostingRegressor(n_estimators=n_estimators, random_state=random_state)),
        ('XGBoost', XGBRegressor(n_estimators=n_estimators, learning_rate=0.1, random_state=random_state))
    ]
    
    # Train and evaluate all models - REUSING the existing functions
    results = []
    trained_pipelines = {}

    for name, model in models:
        print(f"\nTraining {name}...")
        # REUSE build_model_pipeline
        pipeline, _ = build_model_pipeline(model, preprocessor, selected_column_info)
        
        # REUSE evaluate_model_pipeline
        trained_pipeline, metrics = evaluate_model_pipeline(
            pipeline, X_train, X_test, y_train, y_test, name
        )
        
        results.append(metrics)
        trained_pipelines[name] = trained_pipeline

    # Compare model performance with a visualization - SIMILAR to existing code
    results_df = pd.DataFrame(results)
    best_model = results_df.loc[results_df['test_r2'].idxmax()]
    best_model_name = best_model['name']

    print(f"\nBest model based on Test R² score: {best_model_name} (R² = {best_model['test_r2']:.4f})")

    # Plot R² scores
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    bars = plt.bar(results_df['name'], results_df['test_r2'], color='skyblue')
    plt.title('Test R² Scores by Model')
    plt.ylabel('R² Score (higher is better)')
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Highlight the best model
    best_idx = results_df['test_r2'].idxmax()
    bars[best_idx].set_color('darkblue')

    # Plot MAE scores
    plt.subplot(1, 2, 2)
    bars = plt.bar(results_df['name'], results_df['test_mae'], color='lightcoral')
    plt.title('Test MAE Scores by Model')
    plt.ylabel('Mean Absolute Error € (lower is better)')
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Highlight the best model by MAE
    best_mae_idx = results_df['test_mae'].idxmin()
    bars[best_mae_idx].set_color('darkred')

    plt.show()
    
    # Return best model and its pipeline
    return best_model_name, trained_pipelines[best_model_name], results_df

# Create buttons to train models with selected features
user_select_train_button = widgets.Button(
    description='Train with User Selected Features',
    button_style='info',
    tooltip='Train models with manually selected features',
    disabled=False
)

auto_select_train_button = widgets.Button(
    description='Train with Automatically Selected Features',
    button_style='info',
    tooltip='Train models with automatically selected features',
    disabled=False
)

# Functions to handle button clicks
def on_user_select_train_clicked(b):
    if 'user_selected_features' not in globals():
        print("Please select features manually first using the feature selection widgets above")
        return
    
    global user_best_model_name, user_best_model, user_results
    user_best_model_name, user_best_model, user_results = train_model_with_selected_features(
        user_selected_column_info, "user"
    )

def on_auto_select_train_clicked(b):
    if 'auto_selected_features' not in globals():
        print("Please select features automatically first using the feature selection method above")
        return
    
    global auto_best_model_name, auto_best_model, auto_results
    auto_best_model_name, auto_best_model, auto_results = train_model_with_selected_features(
        auto_selected_column_info, "automatically"
    )

user_select_train_button.on_click(on_user_select_train_clicked)
auto_select_train_button.on_click(on_auto_select_train_clicked)

print("Click one of the buttons below to train models with your selected features:")
display(user_select_train_button)
display(auto_select_train_button)

In [None]:
# Compare models trained with different feature selection methods

# Function to compare model results
def compare_model_results():
    # Check which models are available
    available_models = []
    results_list = []
    
    # Check original models (from earlier in the notebook)
    if 'results_df' in globals():
        available_models.append('Original (All Features)')
        results_list.append(results_df)
        
    # Check user-selected feature models
    if 'user_results' in globals():
        available_models.append('User Selected Features')
        results_list.append(user_results)
        
    # Check auto-selected feature models
    if 'auto_results' in globals():
        available_models.append('Auto Selected Features')
        results_list.append(auto_results)
    
    if not available_models:
        print("No models available for comparison. Please train models first.")
        return
    
    print(f"Comparing {len(available_models)} sets of models:")
    print(available_models)
    
    # Create a comparison dataframe
    comparison_rows = []
    
    # Collect results for each model set and model type
    for i, (model_set, results) in enumerate(zip(available_models, results_list)):
        for _, row in results.iterrows():
            model_name = row['name']
            comparison_rows.append({
                'Model Set': model_set,
                'Model Type': model_name,
                'Test R²': row['test_r2'],
                'Test MAE': row['test_mae'],
                'Test RMSE': row['test_rmse']
            })
    
    comparison_df = pd.DataFrame(comparison_rows)
    
    # Print comparison table
    print("\nModel Performance Comparison:")
    print(comparison_df)
    
    # Find the best overall model
    best_model = comparison_df.loc[comparison_df['Test R²'].idxmax()]
    print(f"\nBest overall model: {best_model['Model Set']} - {best_model['Model Type']}")
    print(f"Test R²: {best_model['Test R²']:.4f}, Test MAE: €{best_model['Test MAE']:.2f}")
    
    # Create visualization
    plt.figure(figsize=(14, 8))
    
    # Plot R² comparison
    plt.subplot(2, 1, 1)
    colors = ['skyblue', 'lightgreen', 'salmon']
    
    # Group by Model Set and Model Type
    for i, model_set in enumerate(available_models):
        subset = comparison_df[comparison_df['Model Set'] == model_set]
        x_positions = [j + (i * 0.25) for j in range(len(subset))]
        
        plt.bar(x_positions, 
                subset['Test R²'], 
                width=0.2, 
                color=colors[i % len(colors)], 
                label=model_set)
        
        # Add value labels
        for x, y in zip(x_positions, subset['Test R²']):
            plt.text(x, y + 0.01, f'{y:.3f}', ha='center', va='bottom', fontsize=9)
    
    plt.title('Test R² Score Comparison (higher is better)')
    plt.ylabel('R² Score')
    plt.xticks([j + 0.25 for j in range(len(subset))], subset['Model Type'], rotation=45)
    plt.legend()
    plt.ylim(0, comparison_df['Test R²'].max() + 0.1)
    
    # Plot MAE comparison
    plt.subplot(2, 1, 2)
    
    # Group by Model Set and Model Type
    for i, model_set in enumerate(available_models):
        subset = comparison_df[comparison_df['Model Set'] == model_set]
        x_positions = [j + (i * 0.25) for j in range(len(subset))]
        
        plt.bar(x_positions, 
                subset['Test MAE'], 
                width=0.2, 
                color=colors[i % len(colors)], 
                label=model_set)
        
        # Add value labels
        for x, y in zip(x_positions, subset['Test MAE']):
            plt.text(x, y + 5, f'{y:.0f}', ha='center', va='bottom', fontsize=9)
    
    plt.title('Test MAE Comparison (lower is better)')
    plt.ylabel('Mean Absolute Error (€)')
    plt.xticks([j + 0.25 for j in range(len(subset))], subset['Model Type'], rotation=45)
    plt.legend()
    
    plt.tight_layout()
    plt.show()
    
    return comparison_df

# Create a button to compare results
compare_button = widgets.Button(
    description='Compare All Models',
    button_style='warning',
    tooltip='Compare performance of models trained with different feature sets',
    icon='bar-chart'
)

compare_button.on_click(lambda b: compare_model_results())
display(compare_button)

# Display help text
display(HTML("""<p><i>Click the button above to compare the performance of models trained 
with different feature selection methods.</i></p>
<p><i>Note: You need to have trained at least two sets of models for comparison.</i></p>"""))

# Conclusion: Feature Selection Impact

Now that we've implemented various feature selection methods, we can draw several conclusions:

1. **Manual Feature Selection**:
   - Allows domain expertise to guide the selection process
   - Provides control over which features are included
   - Can simplify the model and reduce noise

2. **Automated Feature Selection**:
   - Statistical methods like F-regression identify features correlated with target
   - Recursive Feature Elimination finds the most predictive feature subset
   - Random Forest importance provides insight into feature relevance

3. **Performance Impact**:
   - Feature selection often improves model generalization
   - Reduces overfitting by eliminating irrelevant features
   - Can sometimes improve R² score and reduce prediction error
   - Makes models more interpretable

4. **Best Practices**:
   - Start with domain knowledge to select initial features
   - Use automated methods to verify or refine selection
   - Consider both statistical significance and practical relevance
   - Compare models with different feature sets

The comparison tool we've built allows us to clearly see the impact of feature selection on model performance and choose the optimal approach for predicting real estate prices.