In [3]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
data = pd.read_csv("../outputs/clean_houses.csv")

In [None]:
data.rename(columns={"Unnamed: 0":"index"},inplace=True)
data.head()

In [None]:
# Para realizar nuestra predicción necesitamos que todos nuestros datos sean numéricos.
# Vamos a comprobar por tanto que tipos de datos tenemos.

data.info()

In [None]:
carac = pd.DataFrame(data.buy_price.describe())

carac

In [None]:
def transf(column):

    le = preprocessing.LabelEncoder()
    data[column] = le.fit_transform(data[column])
    return data[column]

In [None]:
# Transformamos ahora todas las columnas type object y bool a float64, excepto floor.

transf("house_type_id")
transf("is_renewal_needed")
transf("is_new_development")
transf("is_exterior")
transf("has_lift")
transf("has_parking")
transf("has_storage_room")
transf("has_terrace")
transf("has_balcony")
transf("has_pool")

In [None]:
# Aplicamos ahora get_dummies a la columna floor.

data = pd.get_dummies(data, columns = ["floor"])

In [None]:
data.columns

In [None]:
# Vamos a ordenar ahora las columnas y nos deshacemos de la columna neighborhood.

data = data[['index', 'house_type_id', 'is_renewal_needed',
        'is_new_development', 'sq_mt_built', 'n_rooms', 'n_bathrooms',
        'is_exterior', 'has_lift', 'has_parking', 'has_storage_room',
        'has_terrace', 'has_balcony', 'has_pool',
        'latitude', 'longitude', 'floor_1', 'floor_2',
        'floor_3', 'floor_4', 'floor_5', 'floor_6', 'floor_7', 'floor_8',
        'floor_9', 'floor_Bajo', 'floor_Entreplanta exterior',
        'floor_Entreplanta interior', 'floor_Semi-sótano exterior',
        'floor_Semi-sótano interior', 'floor_Sótano exterior',
        'floor_Sótano interior', 'buy_price']]

In [None]:
data.set_index("index", inplace=True)
data.head()

In [None]:
# Vamos a realizar un análisis de los datos, como tenemos muchas columnas, vamos a dividir el data frame
# en varios dataframes para compararlos con nuestro groundtruth y ver así que relación tienen.

In [None]:
# Heatmap.

cm = np.corrcoef(data.values.T)
sns.set(font_scale=1)
plt.figure(figsize=(20, 20))
hm = sns.heatmap(cm,
                cbar=True,
                annot=True,
                square=True,
                fmt='.2f',
                annot_kws={'size': 10},
                yticklabels=data.columns,
                xticklabels=data.columns)        

In [None]:
# Calculate and show pairplot

d_analysis1 = data[["house_type_id", "is_renewal_needed", "is_new_development","sq_mt_built", "n_rooms",
                    "n_bathrooms", 'buy_price']]

sns.pairplot(data, size=2.5, vars=d_analysis1)
plt.tight_layout()

In [None]:
d_analysis2 = data[["is_exterior", "has_lift", 'has_parking', 'has_storage_room', 'has_terrace', 
                    'has_balcony', 'buy_price']]

sns.pairplot(data, size=2.5, vars=d_analysis2)
plt.tight_layout()

In [None]:
d_analysis3 = data[['has_pool', 'latitude', 'longitude', 'floor_1', 'floor_2','floor_3', 'buy_price']]

sns.pairplot(data, size=2.5, vars=d_analysis3)
plt.tight_layout()

In [None]:
d_analysis4 = data[['floor_4', 'floor_5', 'floor_6', 'floor_7', 'floor_8','floor_9', 'buy_price']]

sns.pairplot(data, size=2.5, vars=d_analysis4)
plt.tight_layout()

In [None]:
d_analysis5 = data[['floor_Bajo', 'floor_Entreplanta exterior','floor_Entreplanta interior', 
                    'floor_Semi-sótano exterior','floor_Semi-sótano interior', 'floor_Sótano exterior',
                    'floor_Sótano interior', 'buy_price']]

sns.pairplot(data, size=2.5, vars=d_analysis5)
plt.tight_layout()

In [None]:
import visuals as vs

vs.ModelLearning(features, prices)

In [None]:
# Aquí comentarios con respecto a los gráficos anteriores.

### Una vez tenemos todos los datos correctamente organizados, vamos a proceder a entrenar los datos.
### En este caso nuestro GroundTruth es buy price, por lo que vamos a proceder a realizar el estudio.

In [None]:
columnas = [col for col in data.columns if col not in ["buy_price"]]
X = data[columnas]
y = data["buy_price"]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

### Vamos a descubrir cual es el modelo que mejor funciona

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, VotingRegressor
from sklearn.metrics import  explained_variance_score, max_error, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [None]:
models = {
    "DecisionTree" : DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "ExtraTrees" : ExtraTreesRegressor(),
    "GradientBoosting" : GradientBoostingRegressor(),
    "HistGradientBoosting": HistGradientBoostingRegressor(),
    "Voting": VotingRegressor(
        [('etr', ExtraTreesRegressor()),
         ('rf', RandomForestRegressor())]
    )
}

In [None]:
for name, model in models.items():
    print(f"Entrenando {name}")
    model.fit(X_train, y_train)
print("Proceso terminado ")

In [None]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"-------{name}-------")
    print("EVS", round(explained_variance_score(y_test,y_pred),3))
    print("ME", round(max_error(y_test,y_pred),3))
    print("MAE", round(mean_absolute_error(y_test,y_pred),3))
    print("RMSE", round(np.sqrt(mean_squared_error(y_test,y_pred)),3))
    print("R2", round(r2_score(y_test,y_pred),3))
    plt.figure()
    plt.title(name)
    plt.scatter(y_test,y_pred)
    plt.plot([y_test.min(), y_test.max()], [y_pred.min(), y_pred.max()], 'k--', lw=2)

In [None]:
for name, model in models.items():
    scores = cross_val_score(model,X,y, cv=10)
    print(f"{name} accuracy{np.mean(scores)}")

## ------------------------------  Prueba con otras features.  ------------------------------------------

In [None]:
columnas = [col for col in data.columns if col not in ["has_pool","has_terrace","has_balcony","buy_price"]]
X = data[columnas]
y = data["buy_price"]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [None]:
models = {
    "DecisionTree" : DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "ExtraTrees" : ExtraTreesRegressor(),
    "GradientBoosting" : GradientBoostingRegressor(),
    "HistGradientBoosting": HistGradientBoostingRegressor(),
    "Voting": VotingRegressor(
        [('etr', ExtraTreesRegressor()),
         ('rf', RandomForestRegressor())]
    )
}

In [None]:
for name, model in models.items():
    print(f"Entrenando {name}")
    model.fit(X_train, y_train)
print("Proceso terminado ")

In [None]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"-------{name}-------")
    print("EVS", round(explained_variance_score(y_test,y_pred),3))
    print("ME", round(max_error(y_test,y_pred),3))
    print("MAE", round(mean_absolute_error(y_test,y_pred),3))
    print("RMSE", round(np.sqrt(mean_squared_error(y_test,y_pred)),3))
    print("R2", round(r2_score(y_test,y_pred),3))
    plt.pyplot.figure()
    plt.pyplot.title(name)
    plt.pyplot.scatter(y_test,y_pred)
    plt.pyplot.plot([y_test.min(), y_test.max()], [y_pred.min(), y_pred.max()], 'k--', lw=2)

In [None]:
for name, model in models.items():
    scores = cross_val_score(model,X,y, cv=10)
    print(f"{name} accuracy{np.mean(scores)}")

## Pruebas eliminando viviendas con precios por encima de 1.200.000€

In [None]:
data1 = pd.read_csv("../outputs/clean_houses.csv")

In [None]:
data1.rename(columns={"Unnamed: 0":"index"},inplace=True)
data1.set_index("index", inplace=True)
data1.head()

In [None]:
data1[data1.buy_price>2000000].count()

In [None]:
data1 = data1.drop(data1[data1["buy_price"]>1200000].index)

In [None]:
data1

In [None]:
def transf(column):

    le = preprocessing.LabelEncoder()
    data1[column] = le.fit_transform(data1[column])
    return data1[column]

In [None]:
transf("house_type_id")
transf("is_renewal_needed")
transf("is_new_development")
transf("is_exterior")
transf("has_lift")
transf("has_parking")
transf("has_storage_room")
transf("has_terrace")
transf("has_balcony")
transf("has_pool")

In [None]:
# Aplicamos ahora get_dummies a la columna floor.

data1 = pd.get_dummies(data1, columns = ["floor"])

In [None]:
data1.columns

In [None]:
data1 = data1[['latitude', 'longitude', 'house_type_id', 'sq_mt_built', 'n_rooms', 'n_bathrooms',
       'is_exterior', 'has_lift', 'has_parking', 'has_storage_room',
       'has_terrace', 'has_balcony', 'has_pool', 'floor_1', 'floor_2',
       'floor_3', 'floor_4', 'floor_5', 'floor_6', 'floor_7', 'floor_8',
       'floor_9', 'floor_Bajo', 'floor_Entreplanta exterior',
       'floor_Entreplanta interior', 'floor_Semi-sótano exterior',
       'floor_Semi-sótano interior', 'floor_Sótano exterior',
       'floor_Sótano interior', 'buy_price']]

In [None]:
columnas = [col for col in data1.columns if col not in ["buy_price"]]
X = data1[columnas]
y = data1["buy_price"]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [None]:
models = {
    "DecisionTree" : DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "ExtraTrees" : ExtraTreesRegressor(),
    "GradientBoosting" : GradientBoostingRegressor(),
    "HistGradientBoosting": HistGradientBoostingRegressor(),
    "Voting": VotingRegressor(
        [('etr', ExtraTreesRegressor()),
         ('rf', RandomForestRegressor())]
    )
}

In [None]:
for name, model in models.items():
    print(f"Entrenando {name}")
    model.fit(X_train, y_train)
print("Proceso terminado ")

In [None]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"-------{name}-------")
    print("EVS", round(explained_variance_score(y_test,y_pred),3))
    print("ME", round(max_error(y_test,y_pred),3))
    print("MAE", round(mean_absolute_error(y_test,y_pred),3))
    print("RMSE", round(np.sqrt(mean_squared_error(y_test,y_pred)),3))
    print("R2", round(r2_score(y_test,y_pred),3))
    plt.figure()
    plt.title(name)
    plt.scatter(y_test,y_pred)
    plt.plot([y_test.min(), y_test.max()], [y_pred.min(), y_pred.max()], 'k--', lw=2)

In [None]:
for name, model in models.items():
    scores = cross_val_score(model,X,y, cv=10)
    print(f"{name} accuracy{np.mean(scores)}")

## Pruebas según heatmap

In [4]:
data2 = pd.read_csv("../outputs/clean_houses.csv")

In [5]:
data2.rename(columns={"Unnamed: 0":"index"},inplace=True)
data2.set_index("index", inplace=True)
data2.head()

Unnamed: 0_level_0,neighborhood_id,house_type_id,is_renewal_needed,is_new_development,sq_mt_built,n_rooms,n_bathrooms,floor,is_exterior,has_lift,has_parking,has_storage_room,has_terrace,has_balcony,has_pool,buy_price,buy_price_by_area,latitude,longitude
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,"San Cristóbal, Villaverde(Madrid)",HouseType 1: Pisos,False,False,64.0,2,1.0,3,True,False,False,False,False,False,False,85000,1328,40.343263,-3.688352
1,"Los Ángeles, Villaverde(Madrid)",HouseType 1: Pisos,True,False,70.0,3,1.0,4,True,True,False,False,True,False,False,129900,1856,40.355109,-3.700105
2,"San Andrés, Villaverde(Madrid)",HouseType 1: Pisos,False,False,94.0,2,2.0,1,True,True,False,True,False,False,False,144247,1535,40.345539,-3.71097
3,"San Andrés, Villaverde(Madrid)",HouseType 1: Pisos,False,False,64.0,2,1.0,Bajo,True,True,False,True,False,False,False,109900,1717,40.345539,-3.71097
4,"Los Rosales, Villaverde(Madrid)",HouseType 1: Pisos,False,False,108.0,2,2.0,4,True,True,True,True,False,False,True,260000,2407,40.355841,-3.688399


In [6]:
def transf(column):

    le = preprocessing.LabelEncoder()
    data2[column] = le.fit_transform(data2[column])
    return data2[column]

In [7]:
transf("house_type_id")
transf("is_renewal_needed")
transf("is_new_development")
transf("is_exterior")
transf("has_lift")
transf("has_parking")
transf("has_storage_room")
transf("has_terrace")
transf("has_balcony")
transf("has_pool")

index
0        0
1        0
2        0
3        0
4        1
        ..
17393    1
17394    1
17395    0
17396    1
17397    1
Name: has_pool, Length: 17398, dtype: int64

In [8]:
data2.sq_mt_built.value_counts().head(300)

70.0     428
60.0     412
80.0     379
65.0     339
90.0     316
        ... 
291.0      6
366.0      6
288.0      6
318.0      6
282.0      6
Name: sq_mt_built, Length: 300, dtype: int64

In [9]:
data2 = data2.drop(data2[data2["buy_price"]>1200000].index)
data2 = data2.drop(data2[data2["sq_mt_built"]>300].index)
data2 = data2.drop(data2[data2["n_rooms"]>5].index)
data2 = data2.drop(data2[data2["n_bathrooms"]>5].index)

In [10]:
data2.reset_index()
data2

Unnamed: 0_level_0,neighborhood_id,house_type_id,is_renewal_needed,is_new_development,sq_mt_built,n_rooms,n_bathrooms,floor,is_exterior,has_lift,has_parking,has_storage_room,has_terrace,has_balcony,has_pool,buy_price,buy_price_by_area,latitude,longitude
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,"San Cristóbal, Villaverde(Madrid)",0,0,0,64.0,2,1.0,3,1,0,0,0,0,0,0,85000,1328,40.343263,-3.688352
1,"Los Ángeles, Villaverde(Madrid)",0,1,0,70.0,3,1.0,4,1,1,0,0,1,0,0,129900,1856,40.355109,-3.700105
2,"San Andrés, Villaverde(Madrid)",0,0,0,94.0,2,2.0,1,1,1,0,1,0,0,0,144247,1535,40.345539,-3.710970
3,"San Andrés, Villaverde(Madrid)",0,0,0,64.0,2,1.0,Bajo,1,1,0,1,0,0,0,109900,1717,40.345539,-3.710970
4,"Los Rosales, Villaverde(Madrid)",0,0,0,108.0,2,2.0,4,1,1,1,1,0,0,1,260000,2407,40.355841,-3.688399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17393,"Campo de las Naciones-Corralejos, Barajas(Madrid)",0,0,0,88.0,2,2.0,1,1,1,1,1,0,0,1,360000,4420,40.459749,-3.612738
17394,"Campo de las Naciones-Corralejos, Barajas(Madrid)",0,0,0,99.0,2,2.0,1,1,1,1,1,0,0,1,349000,3525,40.459749,-3.612738
17395,"Casco Histórico de Barajas, Barajas(Madrid)",2,0,0,78.0,2,2.0,4,1,1,1,1,0,0,0,350000,4487,40.479840,-3.580251
17396,"Campo de las Naciones-Corralejos, Barajas(Madrid)",0,0,0,96.0,2,2.0,3,1,1,1,1,0,0,1,425000,4427,40.459749,-3.612738


In [11]:
# Aplicamos ahora get_dummies a la columna floor.

data2 = pd.get_dummies(data2, columns = ["floor"])

In [12]:
data2.columns

Index(['neighborhood_id', 'house_type_id', 'is_renewal_needed',
       'is_new_development', 'sq_mt_built', 'n_rooms', 'n_bathrooms',
       'is_exterior', 'has_lift', 'has_parking', 'has_storage_room',
       'has_terrace', 'has_balcony', 'has_pool', 'buy_price',
       'buy_price_by_area', 'latitude', 'longitude', 'floor_1', 'floor_2',
       'floor_3', 'floor_4', 'floor_5', 'floor_6', 'floor_7', 'floor_8',
       'floor_9', 'floor_Bajo', 'floor_Entreplanta exterior',
       'floor_Entreplanta interior', 'floor_Semi-sótano exterior',
       'floor_Semi-sótano interior', 'floor_Sótano exterior',
       'floor_Sótano interior'],
      dtype='object')

In [13]:
data2 = data2[['house_type_id', 'is_renewal_needed',
       'is_new_development', 'sq_mt_built', 'n_rooms', 'n_bathrooms',
       'is_exterior', 'has_lift', 'has_parking', 'has_storage_room',
       'has_terrace', 'has_balcony', 'has_pool', 'latitude', 'longitude', 'floor_1', 'floor_2',
       'floor_3', 'floor_4', 'floor_5', 'floor_6', 'floor_7', 'floor_8',
       'floor_9', 'floor_Bajo', 'floor_Entreplanta exterior',
       'floor_Entreplanta interior', 'floor_Semi-sótano exterior',
       'floor_Semi-sótano interior', 'floor_Sótano exterior',
       'floor_Sótano interior','buy_price']]

In [None]:
data2.to_csv("../outputs/h2o.csv")

In [None]:
columnas = [col for col in data2.columns if col not in ["buy_price"]]
X = data2[columnas]
y = data2["buy_price"]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [None]:
models = {
    "DecisionTree" : DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(max_depth= 60),
    "ExtraTrees" : ExtraTreesRegressor(),
    "GradientBoosting" : GradientBoostingRegressor(),
    "HistGradientBoosting": HistGradientBoostingRegressor(),
    "Voting": VotingRegressor(
        [('etr', ExtraTreesRegressor()),
         ('rf', RandomForestRegressor())]
    )
}

In [None]:
for name, model in models.items():
    print(f"Entrenando {name}")
    model.fit(X_train, y_train)
print("Proceso terminado ")

In [None]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"-------{name}-------")
    print("EVS", round(explained_variance_score(y_test,y_pred),3))
    print("ME", round(max_error(y_test,y_pred),3))
    print("MAE", round(mean_absolute_error(y_test,y_pred),3))
    print("RMSE", round(np.sqrt(mean_squared_error(y_test,y_pred)),3))
    print("R2", round(r2_score(y_test,y_pred),3))
    plt.figure()
    plt.title(name)
    plt.scatter(y_test,y_pred)
    plt.plot([y_test.min(), y_test.max()], [y_pred.min(), y_pred.max()], 'k--', lw=2)

In [None]:
for name, model in models.items():
    scores = cross_val_score(model,X,y, cv=10)
    print(f"{name} accuracy{np.mean(scores)}")

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, random_state=1)

regr = MLPRegressor(random_state=1, max_iter=5000).fit(X_train, y_train)
regr.predict(X_test[:2])

regr.score(X_test, y_test)


In [None]:
parameters = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
#Reducimos para hacer la prueba con diferentes n_estimators
params = {
     'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
grid = GridSearchCV(RandomForestRegressor(),params,verbose=1)
grid.fit(X_train,y_train)

In [None]:
print(grid.best_params_)

In [14]:
# H2O
import h2o
from h2o.estimators import H2ORandomForestEstimator

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "14.0.1" 2020-04-14; Java(TM) SE Runtime Environment (build 14.0.1+7); Java HotSpot(TM) 64-Bit Server VM (build 14.0.1+7, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/67/cftjwrk93ll2_6bwxqr4mkzc0000gn/T/tmpohidiqhi
  JVM stdout: /var/folders/67/cftjwrk93ll2_6bwxqr4mkzc0000gn/T/tmpohidiqhi/h2o_Javi_started_from_python.out
  JVM stderr: /var/folders/67/cftjwrk93ll2_6bwxqr4mkzc0000gn/T/tmpohidiqhi/h2o_Javi_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Madrid
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.3
H2O_cluster_version_age:,13 days
H2O_cluster_name:,H2O_from_python_Javi_h9vypp
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [None]:
#data2 = h2o.import_file("../outputs/h2o.csv")

In [15]:
h2train = h2o.H2OFrame(data2)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [20]:
columnas = [col for col in data2.columns if col not in ["buy_price"]]

x = list(columnas)

y = "buy_price"

In [None]:
train, test = data2.split_frame(ratios=[0.8])

In [21]:
from h2o.automl import H2OAutoML

automl = H2OAutoML(max_models=50, seed=1,max_runtime_secs=1200, sort_metric='RMSE')
automl.train(x, y, training_frame=h2train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [None]:
# Define model
model = H2ORandomForestEstimator(ntrees=50, max_depth=20, nfolds=10, metric)

# Train model
model.train(x=training_columns, y=response_column, training_frame=X)

In [22]:
leader_board = automl.leaderboard
leader_board.head()

model_id,rmse,mean_residual_deviance,mse,mae,rmsle
StackedEnsemble_AllModels_AutoML_20201011_203132,66919.3,4478190000.0,4478190000.0,41757.9,0.160805
StackedEnsemble_BestOfFamily_AutoML_20201011_203132,67648.9,4576380000.0,4576380000.0,42342.7,0.162938
XGBoost_grid__1_AutoML_20201011_203132_model_1,69427.3,4820150000.0,4820150000.0,44178.4,0.170085
XGBoost_grid__1_AutoML_20201011_203132_model_3,69465.4,4825440000.0,4825440000.0,44344.2,0.169072
GBM_grid__1_AutoML_20201011_203132_model_9,70054.1,4907580000.0,4907580000.0,43936.9,0.168495
GBM_4_AutoML_20201011_203132,70333.2,4946760000.0,4946760000.0,45151.4,0.167822
XGBoost_grid__1_AutoML_20201011_203132_model_12,70473.0,4966440000.0,4966440000.0,43746.5,0.172262
XGBoost_grid__1_AutoML_20201011_203132_model_2,70555.3,4978050000.0,4978050000.0,45126.9,0.170704
GBM_grid__1_AutoML_20201011_203132_model_11,70571.3,4980310000.0,4980310000.0,44984.3,0.168454
GBM_3_AutoML_20201011_203132,70910.3,5028270000.0,5028270000.0,45913.8,0.168302




In [None]:
# Model performance
performance = model.model_performance(test_data=test)

print performance