In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load the competition training CSV into a DataFrame and print its (rows, columns) shape.
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
print(train.shape)

In [None]:
# Preview the first rows of the freshly loaded training data.
train.head()

In [None]:
# Remove the 'Id' column so it is not handed to the model as a feature.
# `errors='ignore'` keeps this cell idempotent: re-running it after 'Id' has
# already been dropped no longer raises a KeyError.
train = train.drop(columns='Id', errors='ignore')
train.head()

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
train.info()

In [None]:
# Summary statistics of the target column, followed by a plot of its distribution.
print(train['SalePrice'].describe())
plt.figure(figsize=(9,8))
# `sns.distplot` is deprecated in seaborn; `histplot` with `kde=True` and
# density scaling draws the same histogram-plus-KDE view without the warning.
sns.histplot(train['SalePrice'], color='g', bins=100, alpha=0.4, kde=True, stat='density');

In [None]:
# Show the distinct column dtypes present in the frame (set order is arbitrary).
list(set(train.dtypes.tolist()))

In [None]:
# Keep only the float64 / int64 columns for the numeric exploration below.
train_num=train.select_dtypes(include = ['float64', 'int64'])
train_num.head()

In [None]:
# Draw one 50-bin histogram per numeric column; small tick labels keep the grid readable.
train_num.hist(figsize=(15,25), bins=50, xlabelsize=5, ylabelsize=5)

In [None]:
def split_dataset(dataset, test_ratio=0.3, seed=None):
    """Randomly partition ``dataset`` into a training part and a testing part.

    Every row is assigned independently: it goes to the testing part when a
    uniform draw in [0, 1) falls below ``test_ratio``. The resulting sizes are
    therefore only approximately ``(1 - test_ratio, test_ratio)`` of the input.

    Args:
        dataset: Object supporting ``len()`` and boolean-mask indexing
            (for example a pandas DataFrame).
        test_ratio: Probability that a given row lands in the testing part.
        seed: Optional seed for a dedicated random generator, making the split
            reproducible. When ``None`` (the default) the draw comes from
            NumPy's global random state, exactly as before this parameter
            existed.

    Returns:
        A ``(train_part, test_part)`` tuple; the two parts are disjoint and
        together contain every row of ``dataset``.
    """
    n_rows = len(dataset)
    if seed is None:
        # Legacy path: honours any np.random.seed(...) set by the caller.
        draws = np.random.rand(n_rows)
    else:
        draws = np.random.default_rng(seed).random(n_rows)
    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]

# Seed NumPy's global random state before splitting so the train/test
# partition, and every metric derived from it, is reproducible across
# Restart & Run All.
np.random.seed(42)
train_split, test_split= split_dataset(train)
print("{} examples in training, {} examples in testing.".format(len(train_split), len(test_split)))

In [None]:
# Convert both pandas splits into TensorFlow datasets for a regression task,
# using the same target column for each.
label = 'SalePrice'
train_data, test_data = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label=label, task=tfdf.keras.Task.REGRESSION)
    for frame in (train_split, test_split)
)

In [None]:
# Create a TF-DF random forest configured for regression.
rf = tfdf.keras.RandomForestModel(task=tfdf.keras.Task.REGRESSION)
# Request mean squared error as a metric for later evaluation.
rf.compile(metrics=["mse"])

In [None]:
# Train the forest on the training split.
rf.fit(x=train_data)

In [None]:
# Render the tree at index 0 of the trained forest, limited to a depth of 3.
tfdf.model_plotter.plot_model_in_colab(rf,tree_idx=0, max_depth=3)

In [None]:
# Plot the RMSE recorded in the training logs against the number of trees.
logs = rf.make_inspector().training_logs()
plt.gca().plot(
    [entry.num_trees for entry in logs],
    [entry.evaluation.rmse for entry in logs],
)
plt.gca().set_xlabel("Number of trees")
plt.gca().set_ylabel("RMSE (out-of-bag)")
plt.show()

In [None]:
# Keep an inspector handle for the trained model (the variable-importance
# cells further down reuse it) and display the evaluation it reports.
inspector = rf.make_inspector()
inspector.evaluation()

In [None]:
# Evaluate on the held-out split; return_dict=True yields a name -> value mapping.
evaluation = rf.evaluate(x=test_data, return_dict=True)

# Print each reported metric with four decimal places.
for name, value in evaluation.items():
    print(f"{name}: {value:.4f}")

In [None]:
# List the names of the variable-importance metrics the inspector exposes.
# (Message typo fixed; the string needs no f-prefix since it has no placeholders.)
print("Available variable importances:")
for importance in inspector.variable_importances().keys():
    print('\t',importance)

In [None]:
# Display the entries stored under the "NUM_AS_ROOT" importance metric.
inspector.variable_importances()["NUM_AS_ROOT"]

In [None]:
# Horizontal bar chart of one variable-importance metric, one bar per feature.
plt.figure(figsize=(15,5))

variable_importance_metric = "NUM_AS_ROOT"
variable_importances = inspector.variable_importances()[variable_importance_metric]

# Each entry is indexed as a pair: element 0 exposes the feature's `.name`,
# element 1 is the importance value.
feature_names = [vi[0].name for vi in variable_importances]
feature_importances = [vi[1] for vi in variable_importances]

feature_ranks = range(len(feature_names))

bar = plt.barh(feature_ranks, feature_importances, label=[str(x) for x in feature_ranks])
plt.yticks(feature_ranks, feature_names)
# Flip the y-axis so the first entry (rank 0) is drawn at the top.
plt.gca().invert_yaxis()

# Annotate every bar with its numeric value. The format spec must be inside
# braces; with parentheses the literal text "(importance:.4f)" was drawn.
for importance, patch in zip(feature_importances, bar.patches):
    plt.text(patch.get_x() + patch.get_width(), patch.get_y(), f"{importance:.4f}", va="top")

plt.xlabel(variable_importance_metric)
# The model is a regressor, so the former "class 1 vs the others" wording did not apply.
plt.title(f"{variable_importance_metric} variable importance")
plt.tight_layout()
plt.show()

In [None]:
# Load the competition test set; pop() removes 'Id' from the frame and keeps
# the values so they can be attached to the predictions afterwards.
test_sub= pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
ids = test_sub.pop('Id')

# No label is passed here: this dataset is only used for prediction.
testing = tfdf.keras.pd_dataframe_to_tf_dataset(test_sub, task=tfdf.keras.Task.REGRESSION)
preds= rf.predict(testing)
# squeeze() drops length-1 axes; presumably preds is (n_rows, 1) - TODO confirm.
output = pd.DataFrame({'Id': ids, "SalePrice": preds.squeeze()})
output.head()

In [None]:
# Fill the sample submission with the model's predictions and write it out.
submission =pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv")
# Reuse the predictions computed in the previous cell instead of running
# inference a second time, and flatten them before assigning to the column.
# Rows are matched purely by position, so guard against a length mismatch.
assert len(submission) == len(preds), "sample submission and predictions differ in length"
submission['SalePrice']= preds.squeeze()
submission.to_csv('/kaggle/working/submission.csv', index=False)
submission.head()