# Dataset import and exploration
- https://www.kaggle.com/shree1992/housedata

In [None]:
import numpy as np
import pandas as pd

# Load the house-price data (path is relative to the notebook's working directory)
df = pd.read_csv('data/data.csv')

# Fixed seed so the preview shows the same rows on every Restart & Run All
df.sample(5, random_state=42)

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Count of missing values per column
df.isna().sum()

<br>

# Drop columns we won't need

In [None]:
# Columns excluded from the rest of the analysis
to_drop = ['date', 'street', 'statezip', 'country']
df = df.drop(columns=to_drop)
df.head()

<br>

# Feature engineering

- Houses that weren't renovated have `yr_renovated = 0`
- Here's how to get the first renovation year

In [None]:
# Earliest renovation year, ignoring the 0 placeholder used for "never renovated"
renovated_years = df.loc[df['yr_renovated'] != 0, 'yr_renovated']
renovated_years.min()

- Let's create a couple of features:
    - House age 
    - Was the house renovated?
    - Was the renovation recent? (10 years or less)
    - Was the renovation not that recent? (more than 10 years ago, but no more than 30)
- We'll then drop the original features

In [None]:
# Reference year for all age calculations, kept in one place instead of
# repeating the literal in every expression
REFERENCE_YEAR = 2021

# How old is the house? (vectorised; no Python-level loop over rows)
df['house_age'] = REFERENCE_YEAR - df['yr_built']

# Years elapsed since the renovation. Houses that were never renovated have
# yr_renovated == 0, which gives a value far outside both windows below, so
# they end up with 0 in both recency flags.
years_since_renovation = REFERENCE_YEAR - df['yr_renovated']

# Was the house renovated and was the renovation recent?
df['was_renovated'] = (df['yr_renovated'] != 0).astype('int64')
df['was_renovated_10_yrs'] = (years_since_renovation <= 10).astype('int64')
df['was_renovated_30_yrs'] = (
    (years_since_renovation > 10) & (years_since_renovation <= 30)
).astype('int64')

# Drop original columns
df = df.drop(['yr_built', 'yr_renovated'], axis=1)
df.head()

- A lot of City options

In [None]:
# Number of rows per city
df['city'].value_counts()

- Let's declare a function that will get rid of all city values that don't occur often
- The original value will be replaced with 'Rare':

In [None]:
def remap_location(data: pd.DataFrame, location: str, threshold: int = 50, column: str = 'city') -> str:
    """Return `location`, or 'Rare' if it occurs too seldom in `data[column]`.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the location column.
    location : str
        The value to look up.
    threshold : int, default 50
        Minimum number of rows that must equal `location` for it to be kept.
    column : str, default 'city'
        Name of the column to count occurrences in.

    Returns
    -------
    str
        `location` unchanged if it appears at least `threshold` times,
        otherwise the literal string 'Rare'.
    """
    # Sum the boolean mask instead of materialising a filtered copy of the frame
    occurrences = int((data[column] == location).sum())
    if occurrences < threshold:
        return 'Rare'
    return location

- Test:

In [None]:
# Sanity check: a presumably common city should come back unchanged
remap_location(data=df, location='Seattle')

In [None]:
# Sanity check: a presumably uncommon city should come back as 'Rare'
remap_location(data=df, location='Fall City')

In [None]:
# Decide the replacement once per distinct city rather than once per row:
# remap_location scans the whole frame on every call, so calling it for each
# row repeats the same scan many times over.
city_mapping = {
    city: remap_location(data=df, location=city)
    for city in df['city'].unique()
}
df['city'] = df['city'].map(city_mapping)
df.sample(10)

<br>

# Target variable visualization

In [None]:
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Notebook-wide plot defaults: wide figures without the top and right spines
rcParams.update({
    'figure.figsize': (16, 6),
    'axes.spines.top': False,
    'axes.spines.right': False,
})

In [None]:
# Distribution of the target, labelled so the figure can be read on its own
plt.hist(df['price'], bins=100)
plt.title('Price distribution')
plt.xlabel('price')
plt.ylabel('Number of houses');

- The distribution is highly skewed, so let's calculate Z-scores and remove outliers (assuming the distribution is otherwise normal)

In [None]:
from scipy import stats

# Absolute z-score of each price: distance from the mean in standard deviations
price_zscores = stats.zscore(df['price'])
df['price_z'] = np.abs(price_zscores)
df.head()

In [None]:
# Keep only the rows whose price_z value does not exceed 3
within_three_sigma = df['price_z'] <= 3
df = df.loc[within_three_sigma]
df.shape

In [None]:
# Same histogram after the z-score filter, labelled so it can be told apart
plt.hist(df['price'], bins=100)
plt.title('Price distribution after removing outliers (|z| > 3)')
plt.xlabel('price')
plt.ylabel('Number of houses');

- Still a bit of skew present
- There seem to be houses selling for $0
    - Let's remove them:

In [None]:
# Rows recorded with a sale price of exactly 0
is_zero_price = df['price'] == 0
df.loc[is_zero_price]

In [None]:
# Discard the rows with a price of exactly 0
df = df[df['price'] != 0]

# Re-plot the target, labelled so the figure can be read on its own
plt.hist(df['price'], bins=100)
plt.title('Price distribution after removing zero-price rows')
plt.xlabel('price')
plt.ylabel('Number of houses');

In [None]:
# The helper z-score column is no longer needed
df = df.drop(columns='price_z')

In [None]:
# Preview the first rows of the current frame
df.head()

<br>

# Data preparation for ML

- We'll MinMaxScale the numerical features and one-hot encode the categorical ones
- The features `waterfront`, `was_renovated`, `was_renovated_10_yrs` and `was_renovated_30_yrs` are ignored, since they're already in (0, 1) format

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Any column not listed in a transformer is handled by `remainder`, whose
# default ('drop') silently discards it. Every feature is therefore listed
# explicitly:
#   - numeric features are min-max scaled
#   - categorical features, including `city`, are one-hot encoded
#   - the already-binary flags are passed through unchanged
# sparse_threshold=1.0 keeps the stacked result a sparse matrix (the one-hot
# block always contains zeros), so the later `.toarray()` calls stay valid.
transformer = make_column_transformer(
    (MinMaxScaler(), ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'house_age']),
    (OneHotEncoder(handle_unknown='ignore'), ['bedrooms', 'bathrooms', 'floors', 'view', 'condition', 'city']),
    ('passthrough', ['waterfront', 'was_renovated', 'was_renovated_10_yrs', 'was_renovated_30_yrs']),
    sparse_threshold=1.0
)

- Train/test split - 80:20:

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target (price) from the feature columns
y = df['price']
X = df.drop(columns='price')

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

- Let's apply the transformations:

In [None]:
# Fit the transformer on the training rows only
transformer.fit(X_train)

# Transform both splits with the fitted transformer; the right-hand side is
# fully evaluated before either name is rebound
X_train, X_test = transformer.transform(X_train), transformer.transform(X_test)

In [None]:
# Shapes of both splits after the transformation
X_train.shape, X_test.shape

- Sparse array format:

In [None]:
# Show the transformed training features as returned by the transformer
X_train

- Convert to array:

In [None]:
# Dense view of the same data (only displayed here, not stored)
X_train.toarray()

In [None]:
# Replace both sparse matrices with their dense array equivalents
X_train, X_test = (matrix.toarray() for matrix in (X_train, X_test))

<br>

# Model training

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

- RMSE is the best metric, as the error is displayed in the same units the target variable is in

In [None]:
def rmse(y_true, y_pred):
    """Root mean squared error between `y_true` and `y_pred`.

    Computed with Keras backend ops: the difference is squared, averaged over
    all elements, and the square root of that mean is returned.
    """
    residuals = y_pred - y_true
    mean_squared_error = K.mean(K.square(residuals))
    return K.sqrt(mean_squared_error)

- Really simple model:

In [None]:
# Seed TensorFlow's global random generator before the layers are created
tf.random.set_seed(42)

# Three ReLU hidden layers followed by a single linear output unit
hidden_units = (256, 256, 128)
model = Sequential(
    [Dense(units, activation='relu') for units in hidden_units] + [Dense(1)]
)

# RMSE is used both as the training loss and as the reported metric
model.compile(optimizer=Adam(), loss=rmse, metrics=[rmse])

model.fit(X_train, y_train, epochs=100)

<br>

- Predict on the test set:

In [None]:
# Model output for the held-out test features
predictions = model.predict(X_test)

In [None]:
# Peek at the first five predictions
predictions[:5]

- Convert to a 1D array before visualization:

In [None]:
# Flatten the predictions to a 1-D array
predictions = predictions.ravel()
predictions[:5]

In [None]:
# RMSE on the test set; .numpy() extracts the value from the returned tensor
rmse(y_test, predictions).numpy()