In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    PowerTransformer,
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler
)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import optuna
import os

from warnings import filterwarnings
filterwarnings('ignore')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_data(name, base_dir=None):
    # If base_dir is not provided, use the current working directory
    if base_dir is None:
        base_dir = os.getcwd()
    
    # Construct the path to the Data directory
    data_dir = os.path.join(base_dir, '..', 'Data')
    
    # Construct the full file path
    file_name = f"{name}.csv"
    file_path = os.path.join(data_dir, file_name)
    
    # Check if the file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_name} does not exist in the specified path.")
    
    # Read and return the CSV file
    return pd.read_csv(file_path)

In [3]:
data = get_data('preprocessed_data')
display(data.head())
print(data.shape)

Unnamed: 0,numerical__num_pipeline__serviceCharge,numerical__num_pipeline__pricetrend,numerical__num_pipeline__baseRent,numerical__num_pipeline__livingSpace,numerical__num_pipeline__residents,numerical__num_pipeline__zip_area_sq_km,numerical__num_pipeline__baseRentPerSquareMeter,numerical__num_pipeline__service_charge_ratio,numerical__num_pipeline__space_per_room,categorical__binary__newlyConst,categorical__binary__balcony,categorical__binary__hasKitchen,categorical__binary__cellar,categorical__binary__lift,categorical__binary__garden,categorical__low_card__stateName_Baden_Württemberg,categorical__low_card__stateName_Bayern,categorical__low_card__stateName_Berlin,categorical__low_card__stateName_Brandenburg,categorical__low_card__stateName_Bremen,categorical__low_card__stateName_Hamburg,categorical__low_card__stateName_Hessen,categorical__low_card__stateName_Mecklenburg_Vorpommern,categorical__low_card__stateName_Niedersachsen,categorical__low_card__stateName_Nordrhein_Westfalen,categorical__low_card__stateName_Rheinland_Pfalz,categorical__low_card__stateName_Saarland,categorical__low_card__stateName_Sachsen,categorical__low_card__stateName_Sachsen_Anhalt,categorical__low_card__stateName_Schleswig_Holstein,categorical__low_card__stateName_Thüringen,categorical__low_card__telekomTvOffer_ONE_YEAR_FREE,categorical__low_card__telekomTvOffer_ON_DEMAND,categorical__low_card__telekomTvOffer_unknown,categorical__low_card__petsAllowed_negotiable,categorical__low_card__petsAllowed_no,categorical__low_card__petsAllowed_unknown,categorical__low_card__petsAllowed_yes,categorical__low_card__typeOfFlat_apartment,categorical__low_card__typeOfFlat_ground_floor,categorical__low_card__typeOfFlat_half_basement,categorical__low_card__typeOfFlat_loft,categorical__low_card__typeOfFlat_maisonette,categorical__low_card__typeOfFlat_other,categorical__low_card__typeOfFlat_penthouse,categorical__low_card__typeOfFlat_raised_ground_floor,categorical__low_card__typeOfFlat_roof_storey,categorical__low_card__typeOfFlat_terraced_flat,categorical__med_card__heatingType,categorical__med_card__condition,categorical__med_card__interiorQual,categorical__high_card__geo_plz,categorical__high_card__cityName,categorical__high_card__firingTypes,categorical__high_card__address,categorical__numeric_binned__picturecount,categorical__numeric_binned__noRooms,categorical__ordinal__yearConstructed_category,categorical__ordinal__floor_category,totalRent
0,0.093818,0.841059,0.739537,0.608475,0.553207,0.412911,0.761024,0.672659,0.657155,0,0,0,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,747.076624,703.509425,619.951336,946.622478,687.787318,815.145102,946.622478,0.0,1.0,1.0,0.0,840.0
1,0.095355,0.8025,0.787266,0.603505,0.443616,0.294981,0.82727,0.532027,0.697159,1,1,0,1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1208.385582,1208.210683,1048.276586,812.293151,654.820533,781.698837,812.293151,0.0,0.0,3.0,0.0,1300.0
2,0.050773,0.774746,0.682779,0.534954,0.795387,0.678609,0.73667,0.397791,0.640781,0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,767.005579,709.370499,718.525449,585.580047,482.951137,805.314715,585.580047,0.0,0.0,1.0,0.0,655.0
3,0.073984,0.796737,0.764583,0.606162,0.393051,0.324112,0.797415,0.415548,0.699223,0,1,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,719.036013,636.490649,718.525449,974.782232,846.164639,781.698837,974.782232,0.0,0.0,0.0,0.0,903.0
4,0.07488,0.838392,0.673839,0.519483,0.104444,0.588819,0.73669,0.700137,0.690482,0,1,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,806.037467,703.509425,718.525449,792.184408,767.154854,781.698837,792.184408,0.0,0.0,3.0,0.0,655.0


(199588, 60)


In [4]:
data.columns

Index(['numerical__num_pipeline__serviceCharge',
       'numerical__num_pipeline__pricetrend',
       'numerical__num_pipeline__baseRent',
       'numerical__num_pipeline__livingSpace',
       'numerical__num_pipeline__residents',
       'numerical__num_pipeline__zip_area_sq_km',
       'numerical__num_pipeline__baseRentPerSquareMeter',
       'numerical__num_pipeline__service_charge_ratio',
       'numerical__num_pipeline__space_per_room',
       'categorical__binary__newlyConst', 'categorical__binary__balcony',
       'categorical__binary__hasKitchen', 'categorical__binary__cellar',
       'categorical__binary__lift', 'categorical__binary__garden',
       'categorical__low_card__stateName_Baden_Württemberg',
       'categorical__low_card__stateName_Bayern',
       'categorical__low_card__stateName_Berlin',
       'categorical__low_card__stateName_Brandenburg',
       'categorical__low_card__stateName_Bremen',
       'categorical__low_card__stateName_Hamburg',
       'categorical__low

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199588 entries, 0 to 199587
Data columns (total 60 columns):
 #   Column                                                   Non-Null Count   Dtype  
---  ------                                                   --------------   -----  
 0   numerical__num_pipeline__serviceCharge                   199588 non-null  float64
 1   numerical__num_pipeline__pricetrend                      199588 non-null  float64
 2   numerical__num_pipeline__baseRent                        199588 non-null  float64
 3   numerical__num_pipeline__livingSpace                     199588 non-null  float64
 4   numerical__num_pipeline__residents                       199588 non-null  float64
 5   numerical__num_pipeline__zip_area_sq_km                  199588 non-null  float64
 6   numerical__num_pipeline__baseRentPerSquareMeter          199588 non-null  float64
 7   numerical__num_pipeline__service_charge_ratio            199542 non-null  float64
 8   numerical__num

In [6]:
print(data.shape)
data.dropna(inplace=True)
print(data.shape)

(199588, 60)
(199542, 60)


In [7]:
X = data.drop(['totalRent'], axis =1)
y = data['totalRent']

In [8]:
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")


X_train shape: (119724, 59)
X_val shape: (39909, 59)
X_test shape: (39909, 59)


In [9]:
from sklearn.metrics import r2_score, mean_squared_error

def adjusted_r2_score(r2, n, p):
    """
    Calculate the adjusted R² score.

    Parameters:
    - r2: R² score
    - n: Number of observations
    - p: Number of predictors/features

    Returns:
    - Adjusted R² score
    """
    return 1 - ((1 - r2) * (n - 1)) / (n - p - 1)


### Linear Regression

In [10]:
# Define the objective function for Optuna
def objective(trial):
    # Define hyperparameters to tune for ElasticNet (L1 and L2 regularization)
    alpha = trial.suggest_loguniform('alpha', 1e-5, 1e1)  # Regularization strength
    l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)  # Mix of L1 and L2 penalties

    # Initialize the model with the sampled hyperparameters
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42, max_iter=2000)

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred_val = model.predict(X_val)

    # Calculate RMSE as the evaluation metric
    rmse = root_mean_squared_error(y_val, y_pred_val)  # RMSE

    return rmse

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='minimize')  
study.optimize(objective, n_trials=10)  

# Print the best hyperparameters and score
print("Best Hyperparameters:", study.best_params)
print("Best RMSE:", study.best_value)

# Evaluate on the validation and test set using the best model
best_params = study.best_params
best_model = ElasticNet(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Validation set performance
y_pred_val = best_model.predict(X_val)
r2_val = r2_score(y_val, y_pred_val)
rmse_val = root_mean_squared_error(y_val, y_pred_val)

# Test set performance
y_pred_test = best_model.predict(X_test)
r2_test = r2_score(y_test, y_pred_test)
rmse_test = root_mean_squared_error(y_test, y_pred_test)

print(f"\nValidation Set Performance with Tuned Model:")
print(f"R² score: {r2_val}")
print(f"RMSE: {rmse_val}")

print(f"\nTest Set Performance with Tuned Model:")
print(f"R² score: {r2_test}")
print(f"RMSE: {rmse_test}")


[I 2025-01-11 11:27:10,829] A new study created in memory with name: no-name-02e25bd0-1bc4-4c57-a01f-b4a528a8006c
[I 2025-01-11 11:27:27,130] Trial 0 finished with value: 276.4144700232694 and parameters: {'alpha': 0.0006353202976639726, 'l1_ratio': 0.8261689219381237}. Best is trial 0 with value: 276.4144700232694.
[I 2025-01-11 11:27:43,216] Trial 1 finished with value: 276.1793576960672 and parameters: {'alpha': 2.4328796653809698e-05, 'l1_ratio': 0.5310314759384213}. Best is trial 1 with value: 276.1793576960672.
[I 2025-01-11 11:27:58,828] Trial 2 finished with value: 277.8477019480019 and parameters: {'alpha': 0.0009298252797021549, 'l1_ratio': 0.502849319719116}. Best is trial 1 with value: 276.1793576960672.
[I 2025-01-11 11:28:14,435] Trial 3 finished with value: 346.3960165538439 and parameters: {'alpha': 0.9335977164260727, 'l1_ratio': 0.6713585654155484}. Best is trial 1 with value: 276.1793576960672.
[I 2025-01-11 11:28:30,180] Trial 4 finished with value: 276.166760969882

Best Hyperparameters: {'alpha': 2.8054012626139476e-05, 'l1_ratio': 0.9484952523420267}
Best RMSE: 276.16676096988283

Validation Set Performance with Tuned Model:
R² score: 0.7105731483248623
RMSE: 276.1667311940077

Test Set Performance with Tuned Model:
R² score: 0.7040636426674369
RMSE: 280.3147853533461


In [11]:
# Calculate feature importance
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': best_model.coef_
})
feature_importance = feature_importance.sort_values('coefficient', ascending=False)
feature_importance['importance_percent'] = (feature_importance['coefficient'] / 
                                         feature_importance['coefficient'].sum() * 100)

# Display top 15 most important features
print("\nTop 15 Most Important Features")
display(feature_importance.head(15).style.format({
    'coefficient': '{:.4f}',
    'importance_percent': '{:.2f}%'
}))


Top 15 Most Important Features


Unnamed: 0,feature,coefficient,importance_percent
3,numerical__num_pipeline__livingSpace,2747.5938,38.23%
6,numerical__num_pipeline__baseRentPerSquareMeter,2692.3043,37.46%
0,numerical__num_pipeline__serviceCharge,1654.036,23.02%
2,numerical__num_pipeline__baseRent,502.4627,6.99%
8,numerical__num_pipeline__space_per_room,197.6032,2.75%
7,numerical__num_pipeline__service_charge_ratio,177.8931,2.48%
44,categorical__low_card__typeOfFlat_penthouse,121.4164,1.69%
56,categorical__numeric_binned__noRooms,104.3834,1.45%
41,categorical__low_card__typeOfFlat_loft,91.621,1.27%
28,categorical__low_card__stateName_Sachsen_Anhalt,74.4818,1.04%


### Decison Tree Regressor

In [12]:
from sklearn.tree import DecisionTreeRegressor
def objective(trial):
    # Define hyperparameters to tune
    max_depth = trial.suggest_categorical('max_depth', [None, 5, 10, 20, 30])
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    splitter = trial.suggest_categorical('splitter', ['best', 'random'])

    # Initialize the model with the sampled hyperparameters
    model = DecisionTreeRegressor(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        splitter=splitter,
        random_state=42
    )

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred_val = model.predict(X_val)

    # Calculate RMSE as the evaluation metric
    rmse = root_mean_squared_error(y_val, y_pred_val)

    return rmse

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='minimize')  
study.optimize(objective, n_trials=100)  

# Print the best hyperparameters and score
print("Best Hyperparameters:", study.best_params)
print("Best RMSE:", study.best_value)

# Evaluate on the val and test set using the best model
best_params = study.best_params
best_model = DecisionTreeRegressor(**best_params, random_state=42)
best_model.fit(X_train, y_train)

y_pred_val = best_model.predict(X_val)
r2_val = r2_score(y_val, y_pred_val)
rmse_val = root_mean_squared_error(y_val, y_pred_val)

y_pred_test = best_model.predict(X_test)
r2_test = r2_score(y_test, y_pred_test)
rmse_test = root_mean_squared_error(y_test, y_pred_test)

print(f"\nValidation Set Performance with Tuned Model:")
print(f"R² score: {r2_test}")
print(f"RMSE: {rmse_test}")

print(f"\nTest Set Performance with Tuned Model:")
print(f"R² score: {r2_test}")
print(f"RMSE: {rmse_test}")

[I 2025-01-11 11:30:11,172] A new study created in memory with name: no-name-5a9db4df-9b33-4441-ba5f-175adbfe8fe9
[I 2025-01-11 11:30:12,062] Trial 0 finished with value: 259.02402142329527 and parameters: {'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None, 'splitter': 'random'}. Best is trial 0 with value: 259.02402142329527.
[I 2025-01-11 11:30:12,107] Trial 1 finished with value: 375.27033963323834 and parameters: {'max_depth': 5, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'log2', 'splitter': 'random'}. Best is trial 0 with value: 259.02402142329527.
[I 2025-01-11 11:30:12,259] Trial 2 finished with value: 294.8443957095757 and parameters: {'max_depth': 20, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'splitter': 'random'}. Best is trial 0 with value: 259.02402142329527.
[I 2025-01-11 11:30:12,600] Trial 3 finished with value: 288.83994181540805 and parameters: {'max_depth': 30, 'min_samples_split': 8,

Best Hyperparameters: {'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': None, 'splitter': 'best'}
Best RMSE: 227.39258411633205

Validation Set Performance with Tuned Model:
R² score: 0.7905738184573382
RMSE: 235.81006105292425

Test Set Performance with Tuned Model:
R² score: 0.7905738184573382
RMSE: 235.81006105292425


In [13]:
# Calculate feature importance
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_model.feature_importances_
})

feature_importance = feature_importance.sort_values('importance', ascending=False)
feature_importance['importance_percent'] = (feature_importance['importance'] / feature_importance['importance'].sum() * 100)

feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_model.feature_importances_
})

# Display the feature importance
display(feature_importance.sort_values('importance', ascending=False).head(15).round(3))

# Display best params
print(f"\nBest Hyperparameters: {best_params}")


Unnamed: 0,feature,importance
2,numerical__num_pipeline__baseRent,0.981
54,categorical__high_card__address,0.009
0,numerical__num_pipeline__serviceCharge,0.004
51,categorical__high_card__geo_plz,0.003
5,numerical__num_pipeline__zip_area_sq_km,0.002
7,numerical__num_pipeline__service_charge_ratio,0.001
42,categorical__low_card__typeOfFlat_maisonette,0.0
41,categorical__low_card__typeOfFlat_loft,0.0
40,categorical__low_card__typeOfFlat_half_basement,0.0
39,categorical__low_card__typeOfFlat_ground_floor,0.0



Best Hyperparameters: {'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': None, 'splitter': 'best'}


### XGboost

In [14]:
def objective(trial):
    # Define hyperparameters to tune for XGBoost Regressor
    params = {
        'objective': 'reg:squarederror',  # Regression task
        'n_estimators': trial.suggest_int('n_estimators', 50, 500, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 15),  # Depth of trees
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),  # Step size shrinkage
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),  # Proportion of samples to use
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),  # Proportion of features to use
        'alpha': trial.suggest_loguniform('alpha', 1e-5, 1e-1),  # L1 regularization term
        'lambda': trial.suggest_loguniform('lambda', 1e-5, 1e-1)  # L2 regularization term
    }

    # Initialize the model with the sampled hyperparameters
    model = xgb.XGBRegressor(**params, random_state=42)

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred_val = model.predict(X_val)

    # Calculate RMSE as the evaluation metric
    rmse = root_mean_squared_error(y_val, y_pred_val)  # RMSE

    return rmse

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='minimize')  
study.optimize(objective, n_trials=30)  

# Print the best hyperparameters and score
print("Best Hyperparameters:", study.best_params)
print("Best RMSE:", study.best_value)

# Evaluate on the validation and test set using the best model
best_params = study.best_params
best_model = xgb.XGBRegressor(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Validation set performance
y_pred_val = best_model.predict(X_val)
r2_val = r2_score(y_val, y_pred_val)
rmse_val = root_mean_squared_error(y_val, y_pred_val)

# Test set performance
y_pred_test = best_model.predict(X_test)
r2_test = r2_score(y_test, y_pred_test)
rmse_test = root_mean_squared_error(y_test, y_pred_test)

print(f"\nValidation Set Performance with Tuned Model:")
print(f"R² score: {r2_val}")
print(f"RMSE: {rmse_val}")

print(f"\nTest Set Performance with Tuned Model:")
print(f"R² score: {r2_test}")
print(f"RMSE: {rmse_test}")


[I 2025-01-11 11:33:53,380] A new study created in memory with name: no-name-51d026d8-e240-4ba1-b937-cc5477399b95
[I 2025-01-11 11:35:06,265] Trial 0 finished with value: 484.6810835579109 and parameters: {'n_estimators': 500, 'max_depth': 15, 'learning_rate': 0.00014140999658814496, 'subsample': 0.653070863875176, 'colsample_bytree': 0.838378552580817, 'alpha': 2.6983520567099917e-05, 'lambda': 0.03138544747980137}. Best is trial 0 with value: 484.6810835579109.
[I 2025-01-11 11:35:33,145] Trial 1 finished with value: 505.8182602517806 and parameters: {'n_estimators': 300, 'max_depth': 13, 'learning_rate': 5.974280226400025e-05, 'subsample': 0.8813486522843208, 'colsample_bytree': 0.7531140783083254, 'alpha': 0.02038564845337226, 'lambda': 0.0015381792354291512}. Best is trial 0 with value: 484.6810835579109.
[I 2025-01-11 11:35:40,194] Trial 2 finished with value: 282.3313677533885 and parameters: {'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.0019860869068085794, 'subsampl

Best Hyperparameters: {'n_estimators': 150, 'max_depth': 12, 'learning_rate': 0.035380529840647386, 'subsample': 0.93852320616696, 'colsample_bytree': 0.8754140787231365, 'alpha': 0.016830830194534335, 'lambda': 0.0012882090902766412}
Best RMSE: 207.93754382713678

Validation Set Performance with Tuned Model:
R² score: 0.8359176297695541
RMSE: 207.93754382713678

Test Set Performance with Tuned Model:
R² score: 0.8296184602350043
RMSE: 212.6954075801626


In [15]:
# Calculate feature importance
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_model.feature_importances_
})

feature_importance = feature_importance.sort_values('importance', ascending=False)
feature_importance['importance_percent'] = (feature_importance['importance'] / feature_importance['importance'].sum() * 100)

feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_model.feature_importances_
})

# Display the feature importance
display(feature_importance.sort_values('importance', ascending=False).head(15).round(3))

# Display best params
print(f"\nBest Hyperparameters: {best_params}")

Unnamed: 0,feature,importance
2,numerical__num_pipeline__baseRent,0.359
41,categorical__low_card__typeOfFlat_loft,0.086
54,categorical__high_card__address,0.044
6,numerical__num_pipeline__baseRentPerSquareMeter,0.039
3,numerical__num_pipeline__livingSpace,0.035
56,categorical__numeric_binned__noRooms,0.02
46,categorical__low_card__typeOfFlat_roof_storey,0.017
42,categorical__low_card__typeOfFlat_maisonette,0.017
51,categorical__high_card__geo_plz,0.015
15,categorical__low_card__stateName_Baden_Württem...,0.014



Best Hyperparameters: {'n_estimators': 150, 'max_depth': 12, 'learning_rate': 0.035380529840647386, 'subsample': 0.93852320616696, 'colsample_bytree': 0.8754140787231365, 'alpha': 0.016830830194534335, 'lambda': 0.0012882090902766412}
