In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [2]:
# Snowpark
from snowflake.snowpark.session import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark.types import *
from snowflake.snowpark.version import VERSION
from snowflake.snowpark.functions import pandas_udf


In [3]:
# Read credentials
with open('creds.json') as f:
    connection_parameters = json.load(f)    
session = Session.builder.configs(connection_parameters).create()

In [4]:
snowpark_version = VERSION
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Role                        : {}'.format(session.get_current_role()))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))

Database                    : "HOUSING"
Schema                      : "PUBLIC"
Warehouse                   : "XSMALL_WH"
Role                        : "ACCOUNTADMIN"
Snowpark for Python version : 1.0.0


In [5]:
session.sql("SELECT count(*) FROM HOUSING.PUBLIC.HOUSINGPRICE").collect()

[Row(COUNT(*)=30144)]

In [6]:
housepricingdf = session.table("HOUSING.PUBLIC.HOUSINGPRICE")

In [7]:
housepricingdf=housepricingdf.to_pandas()
type(housepricingdf)

pandas.core.frame.DataFrame

In [8]:
housepricingdf.columns = map(lambda x: str(x).upper(), housepricingdf.columns)

In [9]:
housepricingdf.head()

Unnamed: 0,DATE,SUBURB,ADDRESS,ROOMS,TYPE,PRICE,METHOD,SELLERG,DATE.1,DISTANCE,...,BATHROOM,CAR,LANDSIZE,BUILDINGAREA,YEARBUILT,COUNCILAREA,LATTITUDE,LONGTITUDE,REGIONNAME,PROPERTYCOUNT
0,2016-01-28,Surrey Hills,999A Riversdale Rd,3,h,1205000.0,S,Fletchers,2016-01-28,11.2,...,1.0,2.0,490.0,,,Boroondara City Council,-37.8361,145.1006,Southern Metropolitan,5457.0
1,2016-01-28,Surrey Hills,1/10 Florence Rd,2,u,813000.0,S,Fletchers,2016-01-28,11.2,...,1.0,2.0,108.0,,,Boroondara City Council,-37.8276,145.1023,Southern Metropolitan,5457.0
2,2016-01-28,Canterbury,140 Canterbury Rd,4,h,,SP,Fletchers,2016-01-28,9.0,...,2.0,0.0,808.0,198.0,1910.0,Boroondara City Council,-37.8235,145.0751,Southern Metropolitan,3265.0
3,2016-03-09,Williamstown,54 Twyford St,3,h,1535000.0,S,Greg,2016-03-09,8.0,...,1.0,1.0,507.0,186.0,1910.0,Hobsons Bay City Council,-37.8681,144.8994,Western Metropolitan,6380.0
4,2016-03-09,Newport,1/26 Thorpe St,3,h,826000.0,S,Village,2016-03-09,8.4,...,1.0,2.0,281.0,100.0,1957.0,Hobsons Bay City Council,-37.8457,144.8651,Western Metropolitan,5498.0


In [10]:
housepricingdf.isnull().sum()

DATE                 0
SUBURB               0
ADDRESS              0
ROOMS                0
TYPE                 0
PRICE             6571
METHOD               0
SELLERG              0
DATE.1               0
DISTANCE             0
POSTCODE             0
BEDROOM2          6817
BATHROOM          6823
CAR               7162
LANDSIZE         10066
BUILDINGAREA     17997
YEARBUILT        16317
COUNCILAREA          2
LATTITUDE         6600
LONGTITUDE        6600
REGIONNAME           2
PROPERTYCOUNT        2
dtype: int64

In [11]:
housepricingdf.nunique()

DATE                68
SUBURB             346
ADDRESS          29486
ROOMS               12
TYPE                 3
PRICE             2722
METHOD               9
SELLERG            363
DATE.1              68
DISTANCE           213
POSTCODE           208
BEDROOM2            15
BATHROOM            11
CAR                 14
LANDSIZE          1631
BUILDINGAREA       721
YEARBUILT          156
COUNCILAREA         33
LATTITUDE        11161
LONGTITUDE       12203
REGIONNAME           8
PROPERTYCOUNT      339
dtype: int64

In [12]:
cols=['SUBURB','ROOMS','TYPE','METHOD','SELLERG','REGIONNAME','PROPERTYCOUNT','DISTANCE','COUNCILAREA','BEDROOM2','BATHROOM'
           ,'CAR','LANDSIZE','BUILDINGAREA','PRICE']
housepricingdf=housepricingdf[cols]
housepricingdf

Unnamed: 0,SUBURB,ROOMS,TYPE,METHOD,SELLERG,REGIONNAME,PROPERTYCOUNT,DISTANCE,COUNCILAREA,BEDROOM2,BATHROOM,CAR,LANDSIZE,BUILDINGAREA,PRICE
0,Surrey Hills,3,h,S,Fletchers,Southern Metropolitan,5457.0,11.2,Boroondara City Council,3.0,1.0,2.0,490.0,,1205000.0
1,Surrey Hills,2,u,S,Fletchers,Southern Metropolitan,5457.0,11.2,Boroondara City Council,2.0,1.0,2.0,108.0,,813000.0
2,Canterbury,4,h,SP,Fletchers,Southern Metropolitan,3265.0,9.0,Boroondara City Council,4.0,2.0,0.0,808.0,198.0,
3,Williamstown,3,h,S,Greg,Western Metropolitan,6380.0,8.0,Hobsons Bay City Council,3.0,1.0,1.0,507.0,186.0,1535000.0
4,Newport,3,h,S,Village,Western Metropolitan,5498.0,8.4,Hobsons Bay City Council,2.0,1.0,2.0,281.0,100.0,826000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30139,Highett,3,h,S,Greg,Southern Metropolitan,4794.0,16.0,Bayside City Council,3.0,1.0,2.0,700.0,,1242000.0
30140,Highett,3,t,PI,Purplebricks,Southern Metropolitan,4794.0,16.0,Bayside City Council,,,,,,1395000.0
30141,Hoppers Crossing,4,h,S,Barry,Western Metropolitan,13830.0,18.4,Wyndham City Council,4.0,2.0,2.0,650.0,,540000.0
30142,South Melbourne,4,h,PI,Marshall,Southern Metropolitan,5943.0,1.9,Port Phillip City Council,4.0,2.0,0.0,178.0,174.0,


In [13]:
housepricingdf.isnull().sum()

SUBURB               0
ROOMS                0
TYPE                 0
METHOD               0
SELLERG              0
REGIONNAME           2
PROPERTYCOUNT        2
DISTANCE             0
COUNCILAREA          2
BEDROOM2          6817
BATHROOM          6823
CAR               7162
LANDSIZE         10066
BUILDINGAREA     17997
PRICE             6571
dtype: int64

In [14]:
cols_zero = ['PROPERTYCOUNT','DISTANCE','BEDROOM2','BATHROOM','CAR']
housepricingdf[cols_zero]=housepricingdf[cols_zero].fillna(0)


In [15]:
housepricingdf.LANDSIZE=housepricingdf.LANDSIZE.fillna(housepricingdf.LANDSIZE.mean())
housepricingdf.BUILDINGAREA=housepricingdf.BUILDINGAREA.fillna(housepricingdf.BUILDINGAREA.mean())


In [16]:
housepricingdf.dropna(inplace=True)
housepricingdf.isnull().sum()

SUBURB           0
ROOMS            0
TYPE             0
METHOD           0
SELLERG          0
REGIONNAME       0
PROPERTYCOUNT    0
DISTANCE         0
COUNCILAREA      0
BEDROOM2         0
BATHROOM         0
CAR              0
LANDSIZE         0
BUILDINGAREA     0
PRICE            0
dtype: int64

In [17]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [18]:
# le.fit(housepricingdf) 
le1=LabelEncoder()
le2=LabelEncoder()
le3=LabelEncoder()
le4=LabelEncoder()
le5=LabelEncoder()
le6=LabelEncoder()
housepricingdf.SUBURB=le1.fit_transform(housepricingdf.SUBURB)
housepricingdf.TYPE = le2.fit_transform(housepricingdf.TYPE)
housepricingdf.METHOD = le3.fit_transform(housepricingdf.METHOD)
housepricingdf.SELLERG=le4.fit_transform(housepricingdf.SELLERG)
housepricingdf.REGIONNAME=le5.fit_transform(housepricingdf.REGIONNAME)
housepricingdf.COUNCILAREA = le6.fit_transform(housepricingdf.COUNCILAREA)
housepricingdf

Unnamed: 0,SUBURB,ROOMS,TYPE,METHOD,SELLERG,REGIONNAME,PROPERTYCOUNT,DISTANCE,COUNCILAREA,BEDROOM2,BATHROOM,CAR,LANDSIZE,BUILDINGAREA,PRICE
0,292,3,0,1,95,5,5457.0,11.2,2,3.0,1.0,2.0,490.000000,158.001108,1205000.0
1,292,2,2,1,95,5,5457.0,11.2,2,2.0,1.0,2.0,108.000000,158.001108,813000.0
3,329,3,0,1,110,6,6380.0,8.0,10,3.0,1.0,1.0,507.000000,186.000000,1535000.0
4,228,3,0,1,296,6,5498.0,8.4,10,2.0,1.0,2.0,281.000000,100.000000,826000.0
5,228,3,0,3,275,6,5498.0,8.4,10,3.0,2.0,2.0,301.000000,195.000000,1025000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30137,166,4,0,3,314,6,5556.0,18.0,3,4.0,2.0,2.0,600.000000,174.000000,580000.0
30138,165,2,2,1,121,5,4794.0,16.0,1,2.0,1.0,1.0,130.000000,158.001108,677500.0
30139,165,3,0,1,110,5,4794.0,16.0,1,3.0,1.0,2.0,700.000000,158.001108,1242000.0
30140,165,3,1,0,235,5,4794.0,16.0,1,0.0,0.0,0.0,584.022263,158.001108,1395000.0


In [19]:
X4 = housepricingdf.drop(['PRICE'],axis=1)
y4=housepricingdf.PRICE

In [20]:
from sklearn.model_selection import train_test_split
X4_train, X4_test, y4_train, y4_test = train_test_split(X4,y4, test_size = .20, random_state= 0)

In [21]:
from xgboost import XGBRegressor
import xgboost as xgb

xgb = XGBRegressor()
xgb.fit(X4_train,y4_train)
pred_xgb=xgb.predict(X4_test)

  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [23]:
from sklearn import metrics
# Calculated R Squared
print('R^2 =',metrics.explained_variance_score(y4_test,pred_xgb))

R^2 = 0.7997427606518817


In [25]:
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y4_test, pred_xgb))
print('Mean Squared Error:', metrics.mean_squared_error(y4_test, pred_xgb))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y4_test, pred_xgb)))

Mean Absolute Error: 175130.13917351537
Mean Squared Error: 79463067551.61269
Root Mean Squared Error: 281891.94304132333


In [26]:
errors = abs(pred_xgb - y4_test)
MSE=round(np.mean(errors),2)
MAPE=100*(errors/y4_test)
accuracy=round (100 - np.mean(MAPE),2)
print(accuracy)

83.33


In [28]:
regressor=XGBRegressor(eval_metric='rmsle')

In [29]:
from sklearn.model_selection import GridSearchCV
# set up our search grid
param_grid = {"max_depth":    [4, 5],
              "n_estimators": [500, 600, 700],
              "learning_rate": [0.01, 0.015]}

# try out every combination of the above values
search = GridSearchCV(regressor, param_grid, cv=5,verbose=50).fit(X4_train, y4_train)
print("The best hyperparameters are ",search.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5; 1/12] START learning_rate=0.01, max_depth=4, n_estimators=500..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 1/5; 1/12] END learning_rate=0.01, max_depth=4, n_estimators=500;, score=0.707 total time=   2.6s
[CV 2/5; 1/12] START learning_rate=0.01, max_depth=4, n_estimators=500..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 2/5; 1/12] END learning_rate=0.01, max_depth=4, n_estimators=500;, score=0.718 total time=   2.6s
[CV 3/5; 1/12] START learning_rate=0.01, max_depth=4, n_estimators=500..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 3/5; 1/12] END learning_rate=0.01, max_depth=4, n_estimators=500;, score=0.716 total time=   2.6s
[CV 4/5; 1/12] START learning_rate=0.01, max_depth=4, n_estimators=500..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 4/5; 1/12] END learning_rate=0.01, max_depth=4, n_estimators=500;, score=0.714 total time=   2.9s
[CV 5/5; 1/12] START learning_rate=0.01, max_depth=4, n_estimators=500..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 5/5; 1/12] END learning_rate=0.01, max_depth=4, n_estimators=500;, score=0.716 total time=   2.4s
[CV 1/5; 2/12] START learning_rate=0.01, max_depth=4, n_estimators=600..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 1/5; 2/12] END learning_rate=0.01, max_depth=4, n_estimators=600;, score=0.716 total time=   3.1s
[CV 2/5; 2/12] START learning_rate=0.01, max_depth=4, n_estimators=600..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 2/5; 2/12] END learning_rate=0.01, max_depth=4, n_estimators=600;, score=0.728 total time=   3.1s
[CV 3/5; 2/12] START learning_rate=0.01, max_depth=4, n_estimators=600..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 3/5; 2/12] END learning_rate=0.01, max_depth=4, n_estimators=600;, score=0.720 total time=   3.1s
[CV 4/5; 2/12] START learning_rate=0.01, max_depth=4, n_estimators=600..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 4/5; 2/12] END learning_rate=0.01, max_depth=4, n_estimators=600;, score=0.724 total time=   3.1s
[CV 5/5; 2/12] START learning_rate=0.01, max_depth=4, n_estimators=600..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 5/5; 2/12] END learning_rate=0.01, max_depth=4, n_estimators=600;, score=0.726 total time=   3.9s
[CV 1/5; 3/12] START learning_rate=0.01, max_depth=4, n_estimators=700..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 1/5; 3/12] END learning_rate=0.01, max_depth=4, n_estimators=700;, score=0.723 total time=   3.9s
[CV 2/5; 3/12] START learning_rate=0.01, max_depth=4, n_estimators=700..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 2/5; 3/12] END learning_rate=0.01, max_depth=4, n_estimators=700;, score=0.736 total time=   3.7s
[CV 3/5; 3/12] START learning_rate=0.01, max_depth=4, n_estimators=700..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 3/5; 3/12] END learning_rate=0.01, max_depth=4, n_estimators=700;, score=0.725 total time=   3.4s
[CV 4/5; 3/12] START learning_rate=0.01, max_depth=4, n_estimators=700..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 4/5; 3/12] END learning_rate=0.01, max_depth=4, n_estimators=700;, score=0.732 total time=   3.6s
[CV 5/5; 3/12] START learning_rate=0.01, max_depth=4, n_estimators=700..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 5/5; 3/12] END learning_rate=0.01, max_depth=4, n_estimators=700;, score=0.732 total time=   3.7s
[CV 1/5; 4/12] START learning_rate=0.01, max_depth=5, n_estimators=500..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 1/5; 4/12] END learning_rate=0.01, max_depth=5, n_estimators=500;, score=0.726 total time=   3.3s
[CV 2/5; 4/12] START learning_rate=0.01, max_depth=5, n_estimators=500..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 2/5; 4/12] END learning_rate=0.01, max_depth=5, n_estimators=500;, score=0.747 total time=   3.9s
[CV 3/5; 4/12] START learning_rate=0.01, max_depth=5, n_estimators=500..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 3/5; 4/12] END learning_rate=0.01, max_depth=5, n_estimators=500;, score=0.737 total time=   3.6s
[CV 4/5; 4/12] START learning_rate=0.01, max_depth=5, n_estimators=500..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 4/5; 4/12] END learning_rate=0.01, max_depth=5, n_estimators=500;, score=0.738 total time=   3.4s
[CV 5/5; 4/12] START learning_rate=0.01, max_depth=5, n_estimators=500..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 5/5; 4/12] END learning_rate=0.01, max_depth=5, n_estimators=500;, score=0.736 total time=   3.2s
[CV 1/5; 5/12] START learning_rate=0.01, max_depth=5, n_estimators=600..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 1/5; 5/12] END learning_rate=0.01, max_depth=5, n_estimators=600;, score=0.732 total time=   3.8s
[CV 2/5; 5/12] START learning_rate=0.01, max_depth=5, n_estimators=600..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 2/5; 5/12] END learning_rate=0.01, max_depth=5, n_estimators=600;, score=0.753 total time=   3.9s
[CV 3/5; 5/12] START learning_rate=0.01, max_depth=5, n_estimators=600..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 3/5; 5/12] END learning_rate=0.01, max_depth=5, n_estimators=600;, score=0.741 total time=   3.9s
[CV 4/5; 5/12] START learning_rate=0.01, max_depth=5, n_estimators=600..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 4/5; 5/12] END learning_rate=0.01, max_depth=5, n_estimators=600;, score=0.747 total time=   4.0s
[CV 5/5; 5/12] START learning_rate=0.01, max_depth=5, n_estimators=600..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 5/5; 5/12] END learning_rate=0.01, max_depth=5, n_estimators=600;, score=0.741 total time=   4.4s
[CV 1/5; 6/12] START learning_rate=0.01, max_depth=5, n_estimators=700..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 1/5; 6/12] END learning_rate=0.01, max_depth=5, n_estimators=700;, score=0.737 total time=   4.8s
[CV 2/5; 6/12] START learning_rate=0.01, max_depth=5, n_estimators=700..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 2/5; 6/12] END learning_rate=0.01, max_depth=5, n_estimators=700;, score=0.759 total time=   4.7s
[CV 3/5; 6/12] START learning_rate=0.01, max_depth=5, n_estimators=700..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 3/5; 6/12] END learning_rate=0.01, max_depth=5, n_estimators=700;, score=0.746 total time=   4.8s
[CV 4/5; 6/12] START learning_rate=0.01, max_depth=5, n_estimators=700..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 4/5; 6/12] END learning_rate=0.01, max_depth=5, n_estimators=700;, score=0.753 total time=   4.9s
[CV 5/5; 6/12] START learning_rate=0.01, max_depth=5, n_estimators=700..........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 5/5; 6/12] END learning_rate=0.01, max_depth=5, n_estimators=700;, score=0.744 total time=   5.0s
[CV 1/5; 7/12] START learning_rate=0.015, max_depth=4, n_estimators=500.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 1/5; 7/12] END learning_rate=0.015, max_depth=4, n_estimators=500;, score=0.725 total time=   2.6s
[CV 2/5; 7/12] START learning_rate=0.015, max_depth=4, n_estimators=500.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 2/5; 7/12] END learning_rate=0.015, max_depth=4, n_estimators=500;, score=0.737 total time=   2.8s
[CV 3/5; 7/12] START learning_rate=0.015, max_depth=4, n_estimators=500.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 3/5; 7/12] END learning_rate=0.015, max_depth=4, n_estimators=500;, score=0.728 total time=   2.4s
[CV 4/5; 7/12] START learning_rate=0.015, max_depth=4, n_estimators=500.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 4/5; 7/12] END learning_rate=0.015, max_depth=4, n_estimators=500;, score=0.734 total time=   2.3s
[CV 5/5; 7/12] START learning_rate=0.015, max_depth=4, n_estimators=500.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 5/5; 7/12] END learning_rate=0.015, max_depth=4, n_estimators=500;, score=0.735 total time=   2.4s
[CV 1/5; 8/12] START learning_rate=0.015, max_depth=4, n_estimators=600.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 1/5; 8/12] END learning_rate=0.015, max_depth=4, n_estimators=600;, score=0.732 total time=   3.0s
[CV 2/5; 8/12] START learning_rate=0.015, max_depth=4, n_estimators=600.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 2/5; 8/12] END learning_rate=0.015, max_depth=4, n_estimators=600;, score=0.743 total time=   2.8s
[CV 3/5; 8/12] START learning_rate=0.015, max_depth=4, n_estimators=600.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 3/5; 8/12] END learning_rate=0.015, max_depth=4, n_estimators=600;, score=0.733 total time=   2.9s
[CV 4/5; 8/12] START learning_rate=0.015, max_depth=4, n_estimators=600.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 4/5; 8/12] END learning_rate=0.015, max_depth=4, n_estimators=600;, score=0.740 total time=   2.8s
[CV 5/5; 8/12] START learning_rate=0.015, max_depth=4, n_estimators=600.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 5/5; 8/12] END learning_rate=0.015, max_depth=4, n_estimators=600;, score=0.741 total time=   3.0s
[CV 1/5; 9/12] START learning_rate=0.015, max_depth=4, n_estimators=700.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 1/5; 9/12] END learning_rate=0.015, max_depth=4, n_estimators=700;, score=0.735 total time=   3.4s
[CV 2/5; 9/12] START learning_rate=0.015, max_depth=4, n_estimators=700.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 2/5; 9/12] END learning_rate=0.015, max_depth=4, n_estimators=700;, score=0.749 total time=   3.3s
[CV 3/5; 9/12] START learning_rate=0.015, max_depth=4, n_estimators=700.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 3/5; 9/12] END learning_rate=0.015, max_depth=4, n_estimators=700;, score=0.735 total time=   3.3s
[CV 4/5; 9/12] START learning_rate=0.015, max_depth=4, n_estimators=700.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 4/5; 9/12] END learning_rate=0.015, max_depth=4, n_estimators=700;, score=0.743 total time=   4.3s
[CV 5/5; 9/12] START learning_rate=0.015, max_depth=4, n_estimators=700.........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 5/5; 9/12] END learning_rate=0.015, max_depth=4, n_estimators=700;, score=0.745 total time=   3.6s
[CV 1/5; 10/12] START learning_rate=0.015, max_depth=5, n_estimators=500........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 1/5; 10/12] END learning_rate=0.015, max_depth=5, n_estimators=500;, score=0.738 total time=   3.1s
[CV 2/5; 10/12] START learning_rate=0.015, max_depth=5, n_estimators=500........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 2/5; 10/12] END learning_rate=0.015, max_depth=5, n_estimators=500;, score=0.759 total time=   3.0s
[CV 3/5; 10/12] START learning_rate=0.015, max_depth=5, n_estimators=500........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 3/5; 10/12] END learning_rate=0.015, max_depth=5, n_estimators=500;, score=0.747 total time=   3.0s
[CV 4/5; 10/12] START learning_rate=0.015, max_depth=5, n_estimators=500........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 4/5; 10/12] END learning_rate=0.015, max_depth=5, n_estimators=500;, score=0.755 total time=   3.0s
[CV 5/5; 10/12] START learning_rate=0.015, max_depth=5, n_estimators=500........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 5/5; 10/12] END learning_rate=0.015, max_depth=5, n_estimators=500;, score=0.744 total time=   3.0s
[CV 1/5; 11/12] START learning_rate=0.015, max_depth=5, n_estimators=600........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 1/5; 11/12] END learning_rate=0.015, max_depth=5, n_estimators=600;, score=0.742 total time=   4.3s
[CV 2/5; 11/12] START learning_rate=0.015, max_depth=5, n_estimators=600........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 2/5; 11/12] END learning_rate=0.015, max_depth=5, n_estimators=600;, score=0.763 total time=   4.1s
[CV 3/5; 11/12] START learning_rate=0.015, max_depth=5, n_estimators=600........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 3/5; 11/12] END learning_rate=0.015, max_depth=5, n_estimators=600;, score=0.749 total time=   3.8s
[CV 4/5; 11/12] START learning_rate=0.015, max_depth=5, n_estimators=600........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 4/5; 11/12] END learning_rate=0.015, max_depth=5, n_estimators=600;, score=0.759 total time=   3.7s
[CV 5/5; 11/12] START learning_rate=0.015, max_depth=5, n_estimators=600........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 5/5; 11/12] END learning_rate=0.015, max_depth=5, n_estimators=600;, score=0.747 total time=   4.1s
[CV 1/5; 12/12] START learning_rate=0.015, max_depth=5, n_estimators=700........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 1/5; 12/12] END learning_rate=0.015, max_depth=5, n_estimators=700;, score=0.744 total time=   4.3s
[CV 2/5; 12/12] START learning_rate=0.015, max_depth=5, n_estimators=700........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 2/5; 12/12] END learning_rate=0.015, max_depth=5, n_estimators=700;, score=0.766 total time=   4.2s
[CV 3/5; 12/12] START learning_rate=0.015, max_depth=5, n_estimators=700........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 3/5; 12/12] END learning_rate=0.015, max_depth=5, n_estimators=700;, score=0.752 total time=   4.3s
[CV 4/5; 12/12] START learning_rate=0.015, max_depth=5, n_estimators=700........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 4/5; 12/12] END learning_rate=0.015, max_depth=5, n_estimators=700;, score=0.761 total time=   4.3s
[CV 5/5; 12/12] START learning_rate=0.015, max_depth=5, n_estimators=700........


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[CV 5/5; 12/12] END learning_rate=0.015, max_depth=5, n_estimators=700;, score=0.748 total time=   4.3s


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


The best hyperparameters are  {'learning_rate': 0.015, 'max_depth': 5, 'n_estimators': 700}


In [32]:
regressor=XGBRegressor(learning_rate = search.best_params_["learning_rate"],
                           n_estimators  = search.best_params_["n_estimators"],
                           max_depth     = search.best_params_["max_depth"],)

regressor.fit(X4_train, y4_train)

In [33]:
predictions = regressor.predict(X4_test)

In [34]:
from sklearn import metrics
# Calculated R Squared
print('R^2 =',metrics.explained_variance_score(y4_test,predictions))

R^2 = 0.798923944952734


In [35]:

print('Mean Absolute Error:', metrics.mean_absolute_error(y4_test, predictions))
print('Mean Squared Error:', metrics.mean_squared_error(y4_test, predictions))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y4_test, predictions)))

Mean Absolute Error: 175747.35149456523
Mean Squared Error: 79776312381.72511
Root Mean Squared Error: 282447.00809483737


In [36]:
errors = abs(predictions - y4_test)
MSE=round(np.mean(errors),2)
MAPE=100*(errors/y4_test)
accuracy=round (100 - np.mean(MAPE),2)
print(accuracy)

83.16


In [38]:
df4= pd.DataFrame({'Actual_Price': np.round(y4_test), 
                   'Predicted_Price': np.round(predictions),
                 })
df4['difference']=df4.apply(lambda x: x.Predicted_Price-x.Actual_Price,axis=1)
df4.head(10)

Unnamed: 0,Actual_Price,Predicted_Price,difference
13805,1800000.0,1337102.0,-462898.0
6821,1335000.0,1234586.0,-100414.0
6590,662000.0,625350.0,-36650.0
24336,769000.0,660864.0,-108136.0
22019,720000.0,703638.0,-16362.0
20671,2536000.0,2321266.0,-214734.0
14715,910000.0,887401.0,-22599.0
17980,945000.0,971199.0,26199.0
10676,672000.0,676229.0,4229.0
9673,360000.0,440673.0,80673.0


In [44]:
# Creating a User Defined Function within Snowflake to do the scoring there
def predict_pandas_udf_XGB(df4: pd.DataFrame) -> pd.Series:
    from xgboost import XGBRegressor
    import xgboost as xgb
    return pd.Series(regressor.predict(df4))  


In [45]:
housepricingdf.columns = map(lambda x: str(x).upper(), housepricingdf.columns)
snowdf_details = session.createDataFrame(housepricingdf)
snowdf_details.show()
snowdf_details.write.mode("overwrite").saveAsTable("HOUSING.PUBLIC.FULL_HOUSINGPRICE_encoded_XGB") 

session.table("HOUSING.PUBLIC.FULL_HOUSINGPRICE_encoded_XGB").show(5)

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SUBURB"  |"ROOMS"  |"TYPE"  |"METHOD"  |"SELLERG"  |"REGIONNAME"  |"PROPERTYCOUNT"  |"DISTANCE"  |"COUNCILAREA"  |"BEDROOM2"  |"BATHROOM"  |"CAR"  |"LANDSIZE"  |"BUILDINGAREA"      |"PRICE"    |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|292       |3        |0       |1         |95         |5             |5457.0           |11.2        |2              |3.0         |1.0         |2.0    |490.0       |158.00110757388654  |1205000.0  |
|292       |2        |2       |1         |95         |5             |5457.0           |11.2        |2              |2.0         |1.0         |2.0    |108.0       |158.00110757388654  |813000.0   |
|329       |3  

In [47]:
XGB_model_vec = pandas_udf(func=predict_pandas_udf_XGB,
                                return_type=FloatType(),
                                input_types=[IntegerType(),IntegerType(),IntegerType(),IntegerType(),IntegerType(),IntegerType(),FloatType(),FloatType(),FloatType(),IntegerType(),FloatType(),FloatType(),FloatType(),FloatType()],
                                session=session,
                                packages = ("pandas","scikit-learn","xgboost"), max_batch_size=200)

In [49]:

# Calling the UDF to do the scoring (pushing down to Snowflake)
output_XGB = session.table('HOUSING.PUBLIC.FULL_HOUSINGPRICE_encoded_XGB').select(*list(X4.columns),
                    XGB_model_vec(list(X4.columns)).alias('PREDICTED_PRICE'),
                    (F.col('Price')).alias('ACTUAL_PRICE')                                              
                    )

output_XGB.show(5)


Passing arguments to a UDF with a list or tuple is deprecated. We still respect this invocation but please consider passing variable-length arguments without a list or tuple.


-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SUBURB"  |"ROOMS"  |"TYPE"  |"METHOD"  |"SELLERG"  |"REGIONNAME"  |"PROPERTYCOUNT"  |"DISTANCE"  |"COUNCILAREA"  |"BEDROOM2"  |"BATHROOM"  |"CAR"  |"LANDSIZE"  |"BUILDINGAREA"      |"PREDICTED_PRICE"  |"ACTUAL_PRICE"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|292       |3        |0       |1         |95         |5             |5457.0           |11.2        |2              |3.0         |1.0         |2.0    |490.0       |158.00110757388654  |1557488.625        |1205000.0       |
|292       |2        |2       |1         |95         |5             |5457.0           |11.2        |2           

In [50]:
output_XGB=output_XGB.to_pandas()

In [51]:
output_XGB.head()

Unnamed: 0,SUBURB,ROOMS,TYPE,METHOD,SELLERG,REGIONNAME,PROPERTYCOUNT,DISTANCE,COUNCILAREA,BEDROOM2,BATHROOM,CAR,LANDSIZE,BUILDINGAREA,PREDICTED_PRICE,ACTUAL_PRICE
0,292,2,2,1,95,5,5457.0,11.2,2,2.0,1.0,2.0,108.0,158.001108,643362.8,813000.0
1,218,3,2,1,197,6,6232.0,6.9,22,3.0,2.0,2.0,0.0,120.0,767232.4,670000.0
2,40,3,0,1,22,6,3589.0,10.8,16,3.0,2.0,4.0,700.0,158.001108,945559.3,870000.0
3,30,2,0,1,317,5,10969.0,13.9,8,2.0,1.0,2.0,768.0,158.001108,1208220.0,1410000.0
4,239,4,0,1,312,4,3692.0,14.7,12,0.0,0.0,0.0,584.022263,158.001108,1194439.0,991000.0


In [52]:
output_XGB.SUBURB=le1.inverse_transform(output_XGB.SUBURB)
output_XGB.TYPE = le2.inverse_transform(output_XGB.TYPE)
output_XGB.METHOD = le3.inverse_transform(output_XGB.METHOD)
output_XGB.SELLERG=le4.inverse_transform(output_XGB.SELLERG)
output_XGB.REGIONNAME=le5.inverse_transform(output_XGB.REGIONNAME)
output_XGB.COUNCILAREA = le6.inverse_transform(output_XGB.COUNCILAREA)


In [53]:
output_XGB.head()

Unnamed: 0,SUBURB,ROOMS,TYPE,METHOD,SELLERG,REGIONNAME,PROPERTYCOUNT,DISTANCE,COUNCILAREA,BEDROOM2,BATHROOM,CAR,LANDSIZE,BUILDINGAREA,PREDICTED_PRICE,ACTUAL_PRICE
0,Surrey Hills,2,u,S,Fletchers,Southern Metropolitan,5457.0,11.2,Boroondara City Council,2.0,1.0,2.0,108.0,158.001108,643362.8,813000.0
1,Moonee Ponds,3,u,S,Nelson,Western Metropolitan,6232.0,6.9,Moonee Valley City Council,3.0,2.0,2.0,0.0,120.0,767232.4,670000.0
2,Braybrook,3,h,S,Barry,Western Metropolitan,3589.0,10.8,Maribyrnong City Council,3.0,2.0,4.0,700.0,158.001108,945559.3,870000.0
3,Bentleigh East,2,h,S,hockingstuart,Southern Metropolitan,10969.0,13.9,Glen Eira City Council,2.0,1.0,2.0,768.0,158.001108,1208220.0,1410000.0
4,Oakleigh South,4,h,S,Woodards,South-Eastern Metropolitan,3692.0,14.7,Kingston City Council,0.0,0.0,0.0,584.022263,158.001108,1194439.0,991000.0


In [54]:
snowdf_details_XGB = session.createDataFrame(output_XGB)
snowdf_details_XGB.show()
snowdf_details_XGB.write.mode("overwrite").saveAsTable("HOUSING.PUBLIC.FULL_HOUSINGPRICE_PREDICTED_XGB") 




-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SUBURB"        |"ROOMS"  |"TYPE"  |"METHOD"  |"SELLERG"      |"REGIONNAME"                |"PROPERTYCOUNT"  |"DISTANCE"  |"COUNCILAREA"               |"BEDROOM2"  |"BATHROOM"  |"CAR"  |"LANDSIZE"         |"BUILDINGAREA"      |"PREDICTED_PRICE"  |"ACTUAL_PRICE"  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|Surrey Hills    |2        |u       |S         |Fletchers      |Southern Metropolitan       |5457.0           |11.2        |Boroondara City Council     |2.0         |1.0         |2.0    |108.0          

In [55]:
# Also get a local dataframe to review the results
snowdf_details_XGB=snowdf_details_XGB.toPandas()
snowdf_details_XGB

Unnamed: 0,SUBURB,ROOMS,TYPE,METHOD,SELLERG,REGIONNAME,PROPERTYCOUNT,DISTANCE,COUNCILAREA,BEDROOM2,BATHROOM,CAR,LANDSIZE,BUILDINGAREA,PREDICTED_PRICE,ACTUAL_PRICE
0,Surrey Hills,2,u,S,Fletchers,Southern Metropolitan,5457.0,11.2,Boroondara City Council,2.0,1.0,2.0,108.000000,158.001108,6.433628e+05,813000.0
1,Moonee Ponds,3,u,S,Nelson,Western Metropolitan,6232.0,6.9,Moonee Valley City Council,3.0,2.0,2.0,0.000000,120.000000,7.672324e+05,670000.0
2,Braybrook,3,h,S,Barry,Western Metropolitan,3589.0,10.8,Maribyrnong City Council,3.0,2.0,4.0,700.000000,158.001108,9.455593e+05,870000.0
3,Bentleigh East,2,h,S,hockingstuart,Southern Metropolitan,10969.0,13.9,Glen Eira City Council,2.0,1.0,2.0,768.000000,158.001108,1.208220e+06,1410000.0
4,Oakleigh South,4,h,S,Woodards,South-Eastern Metropolitan,3692.0,14.7,Kingston City Council,0.0,0.0,0.0,584.022263,158.001108,1.194439e+06,991000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23566,Maidstone,2,t,S,Biggin,Western Metropolitan,3873.0,6.4,Maribyrnong City Council,2.0,2.0,1.0,126.000000,95.000000,5.962979e+05,642500.0
23567,Hawthorn East,1,u,SP,Noel,Southern Metropolitan,6482.0,6.2,Boroondara City Council,1.0,1.0,1.0,0.000000,52.000000,3.868471e+05,505000.0
23568,Heidelberg,3,h,S,Miles,Eastern Metropolitan,2890.0,8.9,Banyule City Council,3.0,1.0,0.0,668.000000,158.001108,1.028496e+06,1250000.0
23569,Ivanhoe,3,h,VB,Nelson,Eastern Metropolitan,5549.0,7.8,Banyule City Council,3.0,1.0,2.0,360.000000,158.001108,1.090875e+06,950000.0
