In [1]:
import numpy  as np
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import cross_val_score
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.xgboost import H2OXGBoostEstimator
from h2o.estimators import H2ORandomForestEstimator

# Cleaning Data:

In [2]:
# Load the training split of the diamonds data; the path is relative to the
# notebook's working directory.
diamonds = pd.read_csv(filepath_or_buffer="../input_diamonds/diamonds_train.csv")

In [3]:
# Preview the first five raw rows before any cleaning.
diamonds.head(n=5)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.21,Ideal,H,VS2,63.0,57.0,6.73,6.7,4.23,6134
1,1,0.28,Very Good,D,VVS2,64.0,56.0,4.14,4.17,2.66,532
2,2,0.42,Premium,F,VS1,61.2,58.0,4.86,4.82,2.96,1103
3,3,0.26,Ideal,H,IF,61.1,57.0,4.16,4.12,2.53,600
4,4,1.1,Good,G,SI1,63.4,57.0,6.52,6.55,4.14,4997


**Convert categorical data to numerical:**

In [4]:
# Encode the three categorical grade columns as integer codes.
#
# The result is assigned back to the column instead of calling
# `diamonds[col].replace(..., inplace=True)`: that chained in-place call works
# on an intermediate Series, is deprecated in pandas 2.x, and leaves the frame
# untouched once copy-on-write is enabled.
#
# NOTE(review): the integer codes are kept exactly as they were; they do not
# look sorted by grade quality -- confirm the ordering is intentional.
cut_codes = {"Premium": 2, "Ideal": 1, "Very Good": 0, "Good": -1, "Fair": -2}
color_codes = {"G": 3, "E": 2, "F": 1, "H": 0, "D": -1, "I": -2, "J": -3}
clarity_codes = {
    "SI1": 4, "VS2": 3, "SI2": 2, "VS1": 1,
    "VVS2": 0, "VVS1": -1, "IF": -2, "I1": -3,
}

for column, codes in (("cut", cut_codes), ("color", color_codes), ("clarity", clarity_codes)):
    # infer_objects() keeps the column numeric on pandas versions where
    # replace() no longer downcasts object columns automatically.
    diamonds[column] = diamonds[column].replace(codes).infer_objects()

**Create volume column with ["x","y","z"]:**

In [5]:
# Derive a "volume" column as the element-wise product x * y * z.
diamonds["volume"] = diamonds["x"].mul(diamonds["y"]).mul(diamonds["z"])

In [6]:
# Remove the raw dimensions (already combined into "volume") and the leftover
# 'Unnamed: 0' column read from the CSV.
diamonds.drop(columns=['x', 'y', 'z', 'Unnamed: 0'], inplace=True)

In [7]:
# Check the cleaned frame: encoded categoricals plus the new "volume" column.
diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,volume
0,1.21,1,0,3,63.0,57.0,6134,190.73493
1,0.28,0,-1,0,64.0,56.0,532,45.921708
2,0.42,2,1,1,61.2,58.0,1103,69.338592
3,0.26,1,0,-2,61.1,57.0,600,43.362176
4,1.1,-1,3,4,63.4,57.0,4997,176.80284


# H2OAutoML:

**Initialize the h2o environment:**

In [8]:
# Connect to a running H2O instance or start a local one.
# NOTE(review): max_mem_size=26 asks for a large heap (the cluster banner
# reports ~23 Gb free); confirm the host actually has that much memory.
h2o.init(
    nthreads=-1,
    max_mem_size=26,
)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_252"; OpenJDK Runtime Environment (AdoptOpenJDK)(build 1.8.0_252-b09); OpenJDK 64-Bit Server VM (AdoptOpenJDK)(build 25.252-b09, mixed mode)
  Starting server from /usr/local/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/1r/0z83blfn37n4310vb80zx8bh0000gn/T/tmpn587w9n5
  JVM stdout: /var/folders/1r/0z83blfn37n4310vb80zx8bh0000gn/T/tmpn587w9n5/h2o_mugald_started_from_python.out
  JVM stderr: /var/folders/1r/0z83blfn37n4310vb80zx8bh0000gn/T/tmpn587w9n5/h2o_mugald_started_from_python.err
  Server is running at http://127.0.0.1:54323
Connecting to H2O server at http://127.0.0.1:54323 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Madrid
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.2
H2O_cluster_version_age:,12 days
H2O_cluster_name:,H2O_from_python_mugald_ar86se
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,23.11 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


**Load the Pandas DF into h2o:**

In [9]:
# Upload the cleaned pandas frame to the H2O cluster.
diamonds_h2o = h2o.H2OFrame(diamonds)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [10]:
# Sanity-check the uploaded frame (first ten rows).
diamonds_h2o.head(rows=10)

carat,cut,color,clarity,depth,table,price,volume
1.21,1,0,3,63.0,57.0,6134,190.735
0.28,0,-1,0,64.0,56.0,532,45.9217
0.42,2,1,1,61.2,58.0,1103,69.3386
0.26,1,0,-2,61.1,57.0,600,43.3622
1.1,-1,3,4,63.4,57.0,4997,176.803
0.35,1,1,4,62.2,56.0,583,56.6989
0.31,1,-2,3,62.3,54.0,452,52.4924
0.43,2,0,4,60.2,57.0,919,71.6506
0.51,2,-2,2,60.4,59.0,956,84.1469
1.22,1,-2,1,61.9,55.7,6469,197.035




In [11]:
# Predictor columns and the regression target.
# NOTE(review): "table" and the derived "volume" column are not in the feature
# list even though "volume" was computed above -- confirm this is intentional.
X = [
    "carat",
    "color",
    "cut",
    "clarity",
    "depth",
]
y = "price"

In [12]:
# Split roughly 80/20 into train and hold-out frames.
# A seed is passed so the split is the same on every run; without it the
# hold-out rows (and every metric computed on them) change between executions
# even though the AutoML run itself is seeded.
train, test = diamonds_h2o.split_frame(ratios=[.8], seed=1)

# Feature / target views of each split.
X_train = train[X]
y_train = train[y]
X_test = test[X]
y_test = test[y]

In [13]:
# Run AutoML with a time and model budget; nfolds=0 turns cross-validation off.
# NOTE(review): the hold-out frame is passed as validation_frame and is also
# the frame scored in the following cells -- confirm that reuse is acceptable.
modelH20 = H2OAutoML(
    max_runtime_secs=360,
    max_models=15,
    seed=1,
    nfolds=0,
)
modelH20.train(
    x=X,
    y=y,
    training_frame=train,
    validation_frame=test,
)

# Leaderboard of the trained models.
ld = modelH20.leaderboard
ld

AutoML progress: |████████████████████████████████████████████████████████| 100%


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
GBM_5_AutoML_20200511_180243,342203,584.981,342203,301.81,0.118862
GBM_3_AutoML_20200511_180243,342602,585.323,342602,300.68,0.1242
GBM_2_AutoML_20200511_180243,345310,587.631,345310,304.344,0.118995
GBM_1_AutoML_20200511_180243,346520,588.66,346520,304.319,0.113958
GBM_4_AutoML_20200511_180243,351224,592.642,351224,308.688,0.134358
XGBoost_3_AutoML_20200511_180243,366526,605.414,366526,320.495,0.131779
XGBoost_2_AutoML_20200511_180243,382882,618.774,382882,330.066,0.140464
GBM_grid__1_AutoML_20200511_180243_model_1,389386,624.008,389386,322.893,0.126654
XGBoost_grid__1_AutoML_20200511_180243_model_1,432788,657.866,432788,325.06,0.115659
XGBoost_1_AutoML_20200511_180243,459157,677.612,459157,375.3,




In [14]:
# Score the hold-out frame with the top model on the leaderboard.
y_pred = modelH20.leader.predict(test)
y_pred

gbm prediction progress: |████████████████████████████████████████████████| 100%


predict
1177.37
591.113
3507.28
8422.26
2519.45
2272.95
1543.72
1769.22
934.169
6775.03




In [15]:
# Show the true hold-out prices for a visual comparison with the predictions above.
y_test

price
1103
600
3856
9760
2557
2384
1436
1755
1123
7632




# Test:

In [16]:
# Load the unlabeled scoring set (same columns as the training file, minus "price").
diamonds_test = pd.read_csv(filepath_or_buffer="../input_diamonds/diamonds_test.csv")
diamonds_test.head(n=5)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.3,Ideal,H,SI2,60.0,56.0,4.41,4.43,2.65
1,1,0.34,Ideal,D,IF,62.1,57.0,4.52,4.46,2.79
2,2,1.57,Very Good,I,VS2,60.3,58.0,7.58,7.55,4.56
3,3,0.31,Ideal,H,VS2,61.8,57.0,4.32,4.36,2.68
4,4,1.51,Good,I,VVS1,64.0,60.0,7.26,7.21,4.63


In [17]:
# Apply the same integer encoding used for the training data to the scoring set.
#
# The result is assigned back to the column instead of calling
# `diamonds_test[col].replace(..., inplace=True)`: that chained in-place call
# works on an intermediate Series, is deprecated in pandas 2.x, and leaves the
# frame untouched once copy-on-write is enabled.
test_cut_codes = {"Premium": 2, "Ideal": 1, "Very Good": 0, "Good": -1, "Fair": -2}
test_color_codes = {"G": 3, "E": 2, "F": 1, "H": 0, "D": -1, "I": -2, "J": -3}
test_clarity_codes = {
    "SI1": 4, "VS2": 3, "SI2": 2, "VS1": 1,
    "VVS2": 0, "VVS1": -1, "IF": -2, "I1": -3,
}

for column, codes in (
    ("cut", test_cut_codes),
    ("color", test_color_codes),
    ("clarity", test_clarity_codes),
):
    # infer_objects() keeps the column numeric on pandas versions where
    # replace() no longer downcasts object columns automatically.
    diamonds_test[column] = diamonds_test[column].replace(codes).infer_objects()

In [18]:
# Derive "volume" for the scoring set as the element-wise product x * y * z.
diamonds_test["volume"] = diamonds_test["x"].mul(diamonds_test["y"]).mul(diamonds_test["z"])

In [19]:
# Remove the raw dimensions and the leftover 'Unnamed: 0' column, as done for the training frame.
diamonds_test.drop(columns=['x', 'y', 'z', 'Unnamed: 0'], inplace=True)

In [20]:
# Upload the cleaned scoring set to the H2O cluster.
diamonds_test_h2o = h2o.H2OFrame(diamonds_test)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [21]:
# Same predictor list used to train the models (restated here for the scoring section).
X = [
    "carat",
    "color",
    "cut",
    "clarity",
    "depth",
]

In [22]:
# Select the predictor columns of the scoring set.
# NOTE(review): this rebinds X_train, which earlier held the training features;
# despite its name it now refers to the unlabeled scoring data.
X_train = diamonds_test_h2o[X]

In [23]:
# Predict prices for the scoring set with the AutoML leader.
# (X_train holds the scoring-set features at this point.)
price_pred = modelH20.leader.predict(X_train)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [24]:
# Display the AutoML predictions for the scoring set.
price_pred

predict
388.682
1831.02
9704.49
667.435
9971.83
2481.62
996.971
10223.2
806.786
480.152




In [25]:
# Bring the AutoML predictions back into pandas, name the column "price",
# and cast it to int (fractional parts are dropped by the cast).
df_price = (
    price_pred.as_data_frame()
    .rename(columns={"predict": "price"})
    .astype({"price": int})
)

In [26]:
# Label the row index "id" so it is written as the id column on export.
df_price.index.name = 'id'
df_price

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,388
1,1831
2,9704
3,667
4,9971
...,...
13444,4759
13445,520
13446,14130
13447,10786


In [27]:
# Disabled export of the AutoML predictions; uncomment to write the file.
#df_price.to_csv("./output/test10.csv")

# H2O XGBoost Estimator:

In [43]:
# Fit an H2O XGBoost regressor with default hyper-parameters on the training split.
# No validation frame is supplied, so the metrics printed below are training metrics.
modelXG = H2OXGBoostEstimator()
modelXG.train(
    x=X,
    y=y,
    training_frame=train,
)

# Score the hold-out split, then display the model summary.
y_pred2 = modelXG.predict(test)
modelXG

xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost prediction progress: |████████████████████████████████████████████| 100%
Model Details
H2OXGBoostEstimator :  XGBoost
Model Key:  XGBoost_model_python_1589212956477_3


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees
0,,50.0




ModelMetricsRegression: xgboost
** Reported on train data. **

MSE: 207599.8862664702
RMSE: 455.631305187067
MAE: 255.04833810484993
RMSLE: 0.10163975567735312
Mean Residual Deviance: 207599.8862664702

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2020-05-11 18:16:02,0.054 sec,0.0,5581.201814,3920.142396,31149810.0
1,,2020-05-11 18:16:02,0.215 sec,1.0,3973.2572,2746.665473,15786770.0
2,,2020-05-11 18:16:02,0.333 sec,2.0,2861.384409,1924.763443,8187521.0
3,,2020-05-11 18:16:02,0.421 sec,3.0,2094.283731,1352.855003,4386024.0
4,,2020-05-11 18:16:02,0.502 sec,4.0,1570.925924,965.197951,2467808.0
5,,2020-05-11 18:16:02,0.575 sec,5.0,1228.444444,717.58072,1509076.0
6,,2020-05-11 18:16:02,0.642 sec,6.0,994.401085,565.218136,988833.5
7,,2020-05-11 18:16:02,0.710 sec,7.0,854.999942,476.80862,731024.9
8,,2020-05-11 18:16:03,0.779 sec,8.0,756.727363,416.935908,572636.3
9,,2020-05-11 18:16:03,0.842 sec,9.0,694.073577,380.352681,481738.1



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,carat,878626600000.0,1.0,0.89482
1,clarity,68683640000.0,0.078172,0.06995
2,color,29083610000.0,0.033101,0.02962
3,cut,2855465000.0,0.00325,0.002908
4,depth,2653803000.0,0.00302,0.002703




In [44]:
# Upload the cleaned scoring set again under a separate name for the XGBoost section.
diamonds_test_xg = h2o.H2OFrame(diamonds_test)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [45]:
# Predictor list for the XGBoost scoring step (same columns the model was trained on).
X = [
    "carat",
    "color",
    "cut",
    "clarity",
    "depth",
]

In [46]:
# Select the predictor columns of the scoring set for XGBoost.
# NOTE(review): X_train is rebound to scoring data here, not training data.
X_train = diamonds_test_xg[X]

In [47]:
# Predict prices for the scoring set with the XGBoost model.
# (X_train holds the scoring-set features at this point.)
price_pred2 = modelXG.predict(X_train)


xgboost prediction progress: |████████████████████████████████████████████| 100%


In [48]:
# Bring the XGBoost predictions back into pandas.
# Fix: this previously read `price_pred` (the AutoML predictions), so the
# XGBoost output in `price_pred2` was never used.
df_price2 = price_pred2.as_data_frame()
df_price2.rename(columns={"predict": "price"}, inplace=True)
# Cast to int; fractional parts are dropped by the cast.
df_price2["price"] = df_price2.price.astype(int)

In [49]:
# Label the index of the XGBoost result frame and display it.
# Fix: this cell previously renamed and displayed `df_price` (the AutoML
# frame) instead of the `df_price2` frame built in the cell above.
df_price2.index.rename('id', inplace=True)
df_price2

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,363
1,1574
2,9760
3,586
4,9794
...,...
13444,4684
13445,408
13446,14455
13447,10377


In [51]:
# Export the XGBoost predictions.
# Fix: this previously wrote `df_price` (the AutoML frame), so test11.csv did
# not contain the XGBoost predictions held in `df_price2`.
# index_label is given explicitly so the id column header is written whether
# or not the index has already been named.
df_price2.to_csv("./output/test11.csv", index_label="id")