In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to project source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import polars as pl
import plotly.express as px

from housing_prediction.config import DATA_DIR
from housing_prediction.dataset import load_data

In [None]:
# Name of the target (label) column.
target_col = "price"

# Location of the raw CSV under DATA_DIR -- REPLACE THIS with your own file.
DATA_PATH = DATA_DIR / "raw/chicago_properties.csv"

# Load the data and preview the first rows.
df = load_data(DATA_PATH)
df.head()

In [None]:
# Missing values: number of null entries in every column
df.select(
    pl.all().is_null().sum()
)

In [None]:
# Sorted count plot of zip codes.
# Zip codes are labels, not quantities: cast the column to string so the
# x-axis is categorical. `categoryorder` only sorts categorical axes, so this
# keeps the descending-count ordering working however the column was typed
# when loaded (the cast is a no-op if it is already a string).
px.histogram(
    df.with_columns(pl.col('zipcode').cast(pl.Utf8)),
    x='zipcode',
    title='Number of properties in each zip code',
).update_xaxes(categoryorder='total descending')

In [None]:
# Spread of square footage for each bedroom count.
# A title is set so the figure stands alone, like the other titled plots
# in this notebook.
px.box(
    df,
    x='bedrooms',
    y='square_footage',
    title='Square footage distribution by number of bedrooms',
)

In [None]:
# Relationship between size and price, one marker per row of `df`.
# A title is set so the figure stands alone, like the other titled plots
# in this notebook.
px.scatter(
    df,
    x='square_footage',
    y='price',
    title='Price vs. square footage',
)

The scatter plot shows that a given square footage can correspond to many different price points. The other available features — bathrooms and bedrooms (categorical data) and zip codes (few data points per zip code) — do not seem sufficient to explain the price. We need additional information such as carpet area, house type, etc.

In [None]:
# Histogram of square footage (50 bins), used to inspect extreme values.
# A title is set so the figure stands alone, like the other titled plots
# in this notebook.
px.histogram(
    df,
    x='square_footage',
    nbins=50,
    title='Square footage distribution (50 bins)',
)

We can keep these extreme `square_footage` values because the price is clearly high for them. However, because there are only a few such points, they will affect the cross-validation score.

In [None]:
# Heatmap over (bathrooms, bedrooms); each cell is coloured by the average
# square footage of the rows that fall into it (histfunc="avg").
px.density_heatmap(
    df,
    x="bathrooms",
    y="bedrooms",
    z="square_footage",
    histfunc="avg",
    title="Average square footage by number of bathrooms and bedrooms",
)

In [None]:
# For every bathroom count, which fraction of its rows has each bedroom count?
#   share = count(bathrooms, bedrooms) / count(bathrooms), rounded to 2 decimals
# The value is a 0-1 fraction, so it is named `share` rather than `percentage`,
# and the title names the bathroom group that is actually the denominator.
px.scatter(
    df.with_columns(
        (
            pl.count('zipcode').over(['bathrooms', 'bedrooms'])
            / pl.count('zipcode').over(['bathrooms'])
        ).round(2).alias('share')
    )
    # The share is constant within a (bathrooms, bedrooms) pair, so keep one
    # row per pair instead of overplotting one identical marker per property.
    .select(['bathrooms', 'bedrooms', 'share'])
    .unique(),
    x='bathrooms',
    y='bedrooms',
    size='share',
    title='Share of each bedroom count within each bathroom count',
)

In [None]:
# Histogram (50 bins) of the natural log of `price`, added as a temporary
# `price_log` column for this plot only.
px.histogram(
    df.with_columns(price_log=pl.col("price").log()),
    x="price_log",
    nbins=50,
    title="Price (log-scale) distribution of properties in Chicago",
).show()

In [None]:
# Histogram (50 bins) of square footage across all rows of `df`.
px.histogram(
    df,
    x="square_footage",
    nbins=50,
    title="Square footage distribution of properties in Chicago",
).show()

In [None]:
# Square footage spread per zip code.
# Zip codes are labels, not quantities: cast the column to string so the
# x-axis is categorical. `categoryorder` only sorts categorical axes, so this
# keeps the requested ordering working however the column was typed when
# loaded (the cast is a no-op if it is already a string).
px.box(
    df.with_columns(pl.col('zipcode').cast(pl.Utf8)),
    x='zipcode',
    y='square_footage',
    title='Square footage distribution in each zip code',
).update_xaxes(categoryorder='total descending')