In [1]:
import sys

# Put the parent directory on the import path so the project-local modules
# imported by this notebook can be resolved (presumably build_features and
# build_model live there — confirm against the repo layout).
# Guarded so that re-running this cell does not keep appending duplicates.
if '../' not in sys.path:
    sys.path.append('../')

In [2]:
import numpy as np
import pandas as pd
from build_features import build_data_sets
from build_model import check_model
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Location of the processed dataset, relative to the notebook's directory.
DATA_PATH = '../data/processed/otodom_cleaned.csv'

# The CSV's first column is used as the DataFrame index.
df = pd.read_csv(DATA_PATH, index_col=0)

# Project helper that returns the train/test features and targets.
# NOTE(review): confirm build_data_sets uses a fixed seed for the split,
# otherwise every metric below changes between runs.
X_train, X_test, y_train, y_test = build_data_sets(df=df)

# Linear Regression 

In [4]:
# Baseline: ordinary least squares with default settings.
lin_reg = LinearRegression()
# check_model is a project helper; it reports MAE, R2 and RMSE for the
# train and test splits (presumably after fitting on the train split).
check_model(lin_reg, X_train, X_test, y_train, y_test)

	 train	|  test
mae	 209884 |  237152
r2	 0.73 	|  0.59
rmse	 361600 |  395825


# Linear Regression with L1 regularization (LASSO)

In [5]:
# L1-regularized linear model; alpha is the strength of the L1 penalty.
# NOTE(review): alpha=100 appears hand-picked — no tuning is visible in
# this notebook.
lasso = Lasso(alpha=100)
# Report train/test MAE, R2 and RMSE via the project helper.
check_model(lasso, X_train, X_test, y_train, y_test)

	 train	|  test
mae	 209557 |  236887
r2	 0.73 	|  0.59
rmse	 361617 |  395845


# Decision Tree Regressor

In [6]:
# Fully grown decision tree (no depth or split limits) as an overfitting
# reference point. The seed is fixed because the tree permutes features at
# each split, so an unseeded tree can give different metrics on every run.
unrestricted_tree = DecisionTreeRegressor(random_state=42)
# Report train/test MAE, R2 and RMSE via the project helper.
check_model(unrestricted_tree, X_train, X_test, y_train, y_test)

	 train	|  test
mae	 849 |  173370
r2	 1.0 	|  0.65
rmse	 5176 |  367895


In [7]:
# Regularized decision tree: depth and minimum split size are capped to
# limit overfitting; the seed is fixed for reproducibility.
tree = DecisionTreeRegressor(
    criterion='squared_error',
    max_depth=6,
    min_samples_split=4,
    random_state=42,
)
# Report train/test MAE, R2 and RMSE via the project helper.
check_model(tree, X_train, X_test, y_train, y_test)

	 train	|  test
mae	 171390 |  200798
r2	 0.81 	|  0.6
rmse	 301582 |  390806


# Random Forest Regressor

In [8]:
# Random forest with default hyperparameters as a reference point.
# Bootstrap sampling and per-split feature selection are random, so the
# seed is fixed to make the reported metrics reproducible.
unrestricted_forest = RandomForestRegressor(random_state=42)
# Report train/test MAE, R2 and RMSE via the project helper.
check_model(unrestricted_forest, X_train, X_test, y_train, y_test)

	 train	|  test
mae	 59356 |  153775
r2	 0.96 	|  0.7
rmse	 135080 |  338920


In [9]:
# Regularized random forest: 100 trees with capped depth and minimum split
# size, trained on all available cores (n_jobs=-1) with a fixed seed.
forest = RandomForestRegressor(
    n_estimators=100,
    max_depth=6,
    min_samples_split=4,
    n_jobs=-1,
    random_state=42,
)
# Report train/test MAE, R2 and RMSE via the project helper.
check_model(forest, X_train, X_test, y_train, y_test)

	 train	|  test
mae	 153404 |  192847
r2	 0.87 	|  0.63
rmse	 255053 |  375235


In [10]:
# z = zip(forest.feature_names_in_,forest.feature_importances_)
# lz = list(z)
# sorted(lz, key=lambda x: x[1])[::-1]

In [11]:
# Gradient Boosting Regressor with default hyperparameters as a reference
# point. The seed is fixed because the per-split feature permutation is
# random, so unseeded runs are not guaranteed to reproduce the same metrics.
unrestricted_grad_boost = GradientBoostingRegressor(random_state=42)
# Report train/test MAE, R2 and RMSE via the project helper.
check_model(unrestricted_grad_boost, X_train, X_test, y_train, y_test)

	 train	|  test
mae	 130685 |  182605
r2	 0.89 	|  0.62
rmse	 226986 |  382391


In [12]:
# Tuned gradient boosting: 150 boosting stages of depth-5 trees, using the
# Friedman MSE split criterion and a fixed seed.
grad_boost = GradientBoostingRegressor(
    n_estimators=150,
    criterion='friedman_mse',
    max_depth=5,
    random_state=42,
)
# Report train/test MAE, R2 and RMSE via the project helper.
check_model(grad_boost, X_train, X_test, y_train, y_test)

	 train	|  test
mae	 61474 |  139801
r2	 0.98 	|  0.75
rmse	 99260 |  308314


In [13]:
# z = zip(grad_boost.feature_names_in_,grad_boost.feature_importances_)
# lz = list(z)
# sorted(lz, key=lambda x: x[1])[::-1]

# Neural Network 

In [14]:
# from sklearn.neural_network import MLPRegressor

# reg_model = MLPRegressor(
#     hidden_layer_sizes=(100, 100, 50),
#     activation='relu',
#     solver='adam',
#     random_state=42,
#     max_iter=1000,
#     batch_size=8)

# check_model(reg_model, X_train, X_test, y_train, y_test)