In [7]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

import numpy as np
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OrdinalEncoder

import matplotlib.pyplot as plt

In [8]:
# Column pickers: numeric dtypes are routed to the imputer, object dtypes to
# the ordinal encoder.
num_selector = make_column_selector(dtype_include=np.number)
cat_selector = make_column_selector(dtype_include=object)

# Numeric branch: replace missing values with the column mean and append
# missing-value indicator columns (add_indicator=True).
num_tree_processor = SimpleImputer(
    strategy="mean",
    add_indicator=True,
)

# Categorical branch: encode each category as an integer code; categories not
# seen during fit are encoded as -1 instead of raising.
cat_tree_processor = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Preprocessor for tree-based models: numeric branch first, categorical second.
tree_preprocessor = make_column_transformer(
    (num_tree_processor, num_selector),
    (cat_tree_processor, cat_selector),
)


def make_rf_pipeline(
        n_estimators: int = 800, random_state: int = 42) -> Pipeline:
    """Build an unfitted preprocessing + random forest regression pipeline.

    Each call receives its own copy of the module-level ``tree_preprocessor``
    so that fitting one pipeline never refits a transformer that another
    pipeline is also holding.

    Args:
        n_estimators (int, optional): Number of trees in the forest.
            Defaults to 800.
        random_state (int, optional): Seed passed to the forest so results
            are reproducible. Defaults to 42.

    Returns:
        Pipeline: Unfitted sklearn pipeline (preprocessor -> regressor).
    """
    # Function-scope import: `clone` is not part of the notebook's import cell.
    from sklearn.base import clone

    # make_pipeline keeps the estimator objects it is handed without copying
    # them, so clone the shared template to get an independent transformer.
    preprocessor = clone(tree_preprocessor)
    forest = RandomForestRegressor(
        n_estimators=n_estimators, random_state=random_state)
    return make_pipeline(preprocessor, forest)

def plot_feat_importance(importances, index):
    """Draw a bar chart of feature importances, largest first.

    Creates a new matplotlib figure; nothing is returned.

    Args:
        importances: Importance value for each feature (anything
            ``pd.Series`` accepts as data).
        index: Feature names, aligned one-to-one with ``importances``.
    """
    ranked = pd.Series(importances, index=index)
    ranked = ranked.sort_values(ascending=False)

    figure, axes = plt.subplots()
    ranked.plot.bar(ax=axes)
    axes.set_ylabel("Mean decrease in impurity")
    figure.tight_layout()

In [9]:
# Columns removed before modelling. Names are spelled exactly as in the CSV
# header (including 'Health Instituion'). 'Crime Count' is dropped only after
# it has been used to derive 'Crime Rate' below.
columns_to_drop = [
    'Unnamed: 0', 'Property Address', 'Health Instituion', 'Bus Station',
    'Shopping_Center_Name', 'School Name', 'Bus Duration', 'School Duration',
    'Health Duration', 'Shop Duration', 'Postcode', 'Crime Count',
]

data = pd.read_csv('../data/curated/feature_data.csv')
# Crime count per head of population.
data = data.assign(**{'Crime Rate': data['Crime Count'] / data['Population']})
data = data.drop(columns=columns_to_drop)

def transform_property_type(x):
    """Collapse uncommon property types into a single 'Others' bucket.

    Args:
        x: Raw property type value.

    Returns:
        ``x`` unchanged when it is one of the five retained types, otherwise
        the string 'Others'.
    """
    retained_types = (
        'House',
        'Apartment / Unit / Flat',
        'Townhouse',
        'Studio',
        'Villa',
    )
    if x in retained_types:
        return x
    return 'Others'

def _parse_feature_list(raw):
    """Turn one 'Property Features' cell into a list of feature names.

    Args:
        raw: Cell value; presumably the string form of a Python list such as
            "['Heating', 'Balcony']" -- TODO confirm against the CSV.

    Returns:
        list: The parsed feature names, or an empty list when the cell is not
        a string (e.g. NaN for a missing value).

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python literal.
    """
    # Function-scope import: `ast` is not part of the notebook's import cell.
    import ast

    if not isinstance(raw, str):
        return []
    # The original called eval() on the CSV text. literal_eval only accepts
    # Python literals, so a crafted cell cannot execute arbitrary code.
    return ast.literal_eval(raw)


data['property_type'] = data.property_type.apply(transform_property_type)

# Parse every cell once and reuse the result for all indicator columns.
parsed_features = data['Property Features'].apply(_parse_feature_list)

# One 1/0 flag per keyword: does any listed feature contain the keyword?
# any() replaces the original comprehension, which yielded `_` -- a name that
# only exists because IPython defines it as the last cell output.
for feat_column, keyword in (
        ('heating_feat', 'Heating'),
        ('garden_feat', 'Garden'),
        ('parking_feat', 'Parking'),
        ('balcony_feat', 'Balcony'),
        ('laundry_feat', 'Laundry'),
):
    data[feat_column] = parsed_features.apply(
        lambda features, kw=keyword: int(any(kw in name for name in features)))

# NOTE(review): 'Property Features' itself is still in the frame after this
# cell, so a later step that encodes object columns will treat the raw list
# string as a category -- confirm this is intended.
# NOTE(review): every NaN (in any column) becomes -1 here, so later imputers
# will see no missing values -- confirm this is intended.
data = data.fillna(-1)


  data = pd.read_csv('../data/curated/feature_data.csv')


KeyError: 'Property Features'

In [None]:
# Separate the regression target from the predictors.
target_column = 'Weekly Rental Price'
X = data.drop(columns=[target_column])
y = data[target_column]

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

KeyError: "['Weekly Rental Price'] not found in axis"

In [None]:
# Build the pipeline with 500 trees (instead of the 800 default) and fit it
# on the training split.
rf_pipeline = make_rf_pipeline(
    n_estimators=500,
    random_state=42,
)
rf_pipeline.fit(X_train, y_train)

In [None]:
# Permutation importance on the held-out split: each feature is shuffled
# n_repeats times and the resulting drop in the model's score is recorded.
result = permutation_importance(
    rf_pipeline, X_test, y_test, n_repeats=15, random_state=42, n_jobs=4
)

# Order the features by mean importance (ascending) before plotting.
sorted_importances_idx = result.importances_mean.argsort()
importances = pd.DataFrame(
    result.importances[sorted_importances_idx].T,
    columns=X.columns[sorted_importances_idx],
)
ax = importances.plot.box(vert=False, whis=10)
ax.set_title("Permutation Importances (test set)")
# Reference line: a feature at x=0 does not change the score when shuffled.
ax.axvline(x=0, color="k", linestyle="--")
# No `scoring` argument is passed, so the regressor's default scorer (R^2) is
# what gets measured; the previous "accuracy" label was wrong for regression.
ax.set_xlabel("Decrease in $R^2$ score")
ax.figure.tight_layout()

In [None]:
# Same importances as the linear-scale plot, ordered by mean importance.
sorted_importances_idx = result.importances_mean.argsort()
importances = pd.DataFrame(
    result.importances[sorted_importances_idx].T,
    columns=X.columns[sorted_importances_idx],
)

# The logarithm is undefined for zero or negative importances. Mask them to
# NaN up front so np.log emits no runtime warnings and no -inf values reach
# the box-plot statistics; masked repeats are simply left out of the boxes.
positive_importances = importances.where(importances > 0)
log_importances = np.log(positive_importances)

ax = log_importances.plot.box(vert=False, whis=10)
ax.set_title("Log Permutation Importances (test set)")
# On this log scale x=0 corresponds to a raw score decrease of 1, not to
# "no importance".
ax.axvline(x=0, color="k", linestyle="--")
# No `scoring` argument was passed to permutation_importance, so the metric
# is the regressor's default R^2 score, shown here after a natural log.
ax.set_xlabel("log(decrease in $R^2$ score)")
ax.figure.tight_layout()