This notebook will explore a dataset provided by Kaggle for housing price prediction.
First, we will train a random forest model to establish a baseline; then we will implement a more complex neural network solution.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Location of the Kaggle training CSV, relative to the notebook's working directory.
data_path = "./train.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Display the (row_count, column_count) of the freshly loaded training table.
df.shape

In [None]:
# Preview the first rows to get a feel for the columns and their values.
df.head()

In [None]:
# Remove the "Id" column in place so it is not fed to the models as a feature.
# NOTE(review): presumably "Id" is just a row identifier — confirm against the
# dataset description.
df.drop(columns="Id", inplace=True)

In [None]:
# Summarize every column's dtype and non-null count to spot missing values.
df.info()

In [None]:
# Pull the target column out of the feature table: pop() returns the
# "SalePrice" series and removes it from df in a single step.
labels = df.pop("SalePrice")

# Hold out 30% of the rows as a dev set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_dev, Y_train, Y_dev = train_test_split(
    df,
    labels,
    test_size=0.3,
    random_state=53,
)

In [None]:
# Inspect the dev-split features: dtype and non-null count for each column.
X_dev.info()

In [None]:
# Re-attach each split's target to its features, column-wise, so every split
# is a single frame that still contains the "SalePrice" label column.
train_df, dev_df = (
    pd.concat([features, target], axis="columns")
    for features, target in ((X_train, Y_train), (X_dev, Y_dev))
)

In [None]:
# Check the combined dev frame: the label column should now appear alongside
# the feature columns.
dev_df.info()

In [None]:
# Plot the distribution of the target values using 100 bins.
_, price_axis = plt.subplots()
price_axis.hist(labels, bins=100)
plt.show()

In [None]:
# Baseline model: a TensorFlow Decision Forests random forest set up for
# regression. The hyperparameters are collected in one place so they are easy
# to scan and tweak.
rf_hyperparameters = {
    "num_trees": 300,  # number of trees in the ensemble
    "max_depth": 10,  # cap on the depth of each tree
    "min_examples": 5,  # minimum number of examples required in a node
    "categorical_algorithm": "CART",  # split strategy for categorical features
    "compute_oob_variable_importances": True,  # also derive out-of-bag importances
}
RF_model = tfdf.keras.RandomForestModel(
    task=tfdf.keras.Task.REGRESSION,
    **rf_hyperparameters,
)

In [None]:
# Convert the training frame into a TensorFlow dataset, with "SalePrice" as
# the regression label, then train the forest on it.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    train_df,
    label="SalePrice",
    task=tfdf.keras.Task.REGRESSION,
)
RF_model.fit(train_ds)

In [None]:
# Build an inspector for the trained model and print the evaluation it holds.
inspector = RF_model.make_inspector()
# NOTE(review): per the TF-DF inspector docs, a random forest's self-evaluation
# is presumably computed out-of-bag, so despite the variable name this may not
# be a plain training-set score — confirm before quoting these numbers.
training_metrics = inspector.evaluation()
print(training_metrics)

In [None]:
# Sanity check: print the task the model reports (it was constructed with
# Task.REGRESSION).
print(RF_model.task)

In [None]:
# Score the forest on the held-out dev split (dev_df) rather than on the rows
# it was fitted on, so the reported numbers estimate generalization error
# instead of how well the trees memorized the training data.
#
# Keras only reports metrics that were registered through compile(); without
# this call the result dictionary would carry just the loss entry, which is
# not a useful quality measure for a decision forest.
RF_model.compile(metrics=["mse", "mae"])
dev_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    dev_df,
    task=tfdf.keras.Task.REGRESSION,
    label="SalePrice",
)
RF_results = RF_model.evaluate(dev_ds, return_dict=True)

In [None]:
# Show the metric-name -> value dictionary returned by RF_model.evaluate().
print(RF_results)