### 1. Import Libraries and Loading Dataset

In [1]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

import plotly.express as px
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error,mean_absolute_error

%matplotlib inline

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
data_dir = "/content/drive/MyDrive/zillow-prize-1"

# Read Property Information from 2016 and 2017
properties_2016 = pd.read_csv( data_dir +"/properties_2016.csv", low_memory=False)
properties_2017 = pd.read_csv( data_dir +"/properties_2017.csv", low_memory=False)

# Read Training Information
train_2016  = pd.read_csv( data_dir +"/train_2016_v2.csv", low_memory=False)
train_2017  = pd.read_csv( data_dir +"/train_2017.csv", low_memory=False)

# Read Submission Data
sample_submission_data = pd.read_csv(data_dir+'/sample_submission.csv', low_memory = False)

#merge training with properties
train_2016 = pd.merge(train_2016, properties_2016, how = 'left', on = 'parcelid').copy()
train_2017 = pd.merge(train_2017, properties_2017, how = 'left', on = 'parcelid').copy()

train_df = pd.concat([train_2016,train_2017],axis=0)
original_train_df=train_df
train_df # Merge Dataset from 2016 and 2017 for Train Data

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,0.027600,2016-01-01,1.0,,,2.0,3.0,,4.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,6.037107e+13
1,14366692,-0.168400,2016-01-01,,,,3.5,4.0,,,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,-0.004000,2016-01-01,1.0,,,3.0,2.0,,4.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,6.037464e+13
3,12643413,0.021800,2016-01-02,1.0,,,2.0,2.0,,4.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,6.037296e+13
4,14432541,-0.005000,2016-01-02,,,,2.5,4.0,,,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,6.059042e+13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77608,10833991,-0.002245,2017-09-20,1.0,,,3.0,3.0,,8.0,...,,,265000.0,379000.0,2016.0,114000.0,4685.34,,,6.037132e+13
77609,11000655,0.020615,2017-09-20,,,,2.0,2.0,,6.0,...,,,70917.0,354621.0,2016.0,283704.0,4478.43,,,6.037101e+13
77610,17239384,0.013209,2017-09-21,,,,2.0,4.0,,,...,1.0,,50683.0,67205.0,2016.0,16522.0,1107.48,,,6.111008e+13
77611,12773139,0.037129,2017-09-21,1.0,,,1.0,3.0,,4.0,...,,,32797.0,49546.0,2016.0,16749.0,876.43,,,6.037434e+13


### 2. Data Preprocessing Stage

In [3]:
#replace nan 
def replace_nan(data): 
  print("Replace Empty Values with 0")
  print()
  data = data.fillna(0) 
  #test_df = test_df.fillna(0)   
  return data 
 
#one_hot_encoding 
def one_hot_encoding(train_filtered): 
  categorical_features = ['airconditioningtypeid', 'heatingorsystemtypeid', 'propertylandusedesc', 'storytypeid', 'architecturalstyletypeid', 'typeconstructiontypeid', 'buildingclasstypeid']

  print("Perform One hot encoding")
  print()

  for i in categorical_features:
    if i in train_filtered.columns:
      feature_df = pd.DataFrame(train_filtered, columns=[i]) 
      dum_df = pd.get_dummies(feature_df, columns=[i], prefix=[ i + "_"]) 
      train_filtered = pd.concat([train_filtered, dum_df], axis = 1) 
      train_filtered = train_filtered.drop(columns=[i])   
   
  return train_filtered

def clean_data(data):
  print("Cleaning Data")
  print()


  dataframe = data.copy()

  
  dataframe['property_tax_per_sqft'] = dataframe['taxamount'] / dataframe['calculatedfinishedsquarefeet']
  dataframe['missing_finished_area'] = dataframe['calculatedfinishedsquarefeet'].isnull().astype(np.float32)


  empty_pool_index = dataframe[dataframe["poolcnt"]==0].index
  dataframe.loc[empty_pool_index,'poolsizesum'] = 0

  pool_index = dataframe[(dataframe['poolcnt'] > 0) & (dataframe['poolsizesum'].isnull())].index
  dataframe.loc[pool_index,'poolsizesum'] = np.nan

  dataframe['derived_room_cnt'] = dataframe['bedroomcnt'] + dataframe['bathroomcnt']

  #bad_index = dataframe[dataframe.unitcnt.isnull()].index   
  #dataframe.loc[bad_index,'unitcnt'] = 1
  dataframe['location_1'] = dataframe['latitude'] + dataframe['longitude']
  dataframe['location_2'] = dataframe['latitude'] - dataframe['longitude']
  dataframe['location_3'] = dataframe['latitude'] + 0.5 * dataframe['longitude']
  dataframe['location_4'] = dataframe['latitude'] - 0.5 * dataframe['longitude']

  dataframe['avg_garage_size'] = dataframe['garagetotalsqft'] / dataframe['garagecarcnt']

  dataframe['fireplacecnt'] = np.where(dataframe['fireplacecnt'].isnull(),0,1)

  dataframe['decktypeid'] = np.where(dataframe['decktypeid'].isnull(),0,1)

  dataframe['taxdelinquencyflag'] = np.where(dataframe['taxdelinquencyflag']=='',0,1)
  dataframe['derived_room_cnt'] = dataframe['bedroomcnt'] + dataframe['bathroomcnt'] 
  dataframe['hashottuborspa'] = np.where(dataframe['hashottuborspa']=='',0,1)
  
  #dataframe['airconditioningtypeid'] = np.where(dataframe['airconditioningtypeid'].isnull(),0,1)
  #dataframe['heatingorsystemtypeid'] = np.where(dataframe['heatingorsystemtypeid'].isnull(),0,1)

  #dataframe['airconditioningtypeid'] = np.where((dataframe["airconditioningtypeid"].isnull()) & (dataframe.heatingorsystemtypeid==2),1,dataframe["airconditioningtypeid"])

  #dataframe.loc[dataframe[dataframe["heatingorsystemtypeid"].isnull()].index,'heatingorsystemtypeid']=0
  

  invalid_garages = dataframe.loc[dataframe['calculatedfinishedsquarefeet']<dataframe['finishedfloor1squarefeet']].index
  dataframe.loc[invalid_garages,'finishedfloor1squarefeet'] = np.nan

  garage_index = dataframe[(dataframe.garagecarcnt==0) & (dataframe['garagetotalsqft']>0)].index
  dataframe.loc[garage_index,'garagecarcnt'] = np.nan

  #bad_index = dataframe[dataframe['taxvaluedollarcnt'].isnull()].index
  #dataframe.loc[bad_index,'taxvaluedollarcnt'] = dataframe.loc[bad_index,'structuretaxvaluedollarcnt'] + dataframe.loc[bad_index,'landtaxvaluedollarcnt']
  
  return dataframe


def transform_date(X): 
  print("Transforming Date Information")
  print()
  dt = pd.to_datetime(X['transactiondate']).dt
  X['transaction_year'] = dt.year
  #X['transaction_month'] = ((dt.year - 2016)*12 + dt.month).astype('category')
  X['transaction_month'] = dt.month
  #X['transaction_day'] = dt.day
  #X['transaction_quarter'] = ((dt.year - 2016)*4 + dt.quarter).astype('category')
  X = X.drop(['transactiondate'], axis=1)

  return X


def columns_to_remove(df, threshold):
  print("Removing Columns with" + str(threshold) + "% missing data")
  columns_to_drop = []
  columns_to_keep = []

  percent_missing = train_df.isnull().sum() * 100 / len(train_df)
  missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
  missing_value_df.sort_values('percent_missing', inplace=True,ascending=False)

  print(missing_value_df)

  # Example: 100 means 100 % of the data is missing while 20 means 20% of the data is missing
  #fig = px.bar(missing_value_df,x=missing_value_df.index.values, y=missing_value_df.percent_missing,title='Missing Information')
  #fig.show()

  for index, row in missing_value_df.iterrows():
      if(row['percent_missing']>threshold):
        columns_to_drop.append(index)
      else:
        columns_to_keep.append(index)

  print("Columns to Drop",len(columns_to_drop),columns_to_drop)
  print("Columns to Keep",len(columns_to_keep),columns_to_keep)
  print()

  df = df.drop(columns_to_drop,axis=1)

  return df,columns_to_drop

def select_features_correlate_positive(train_df):
  features_corr = train_df.corr()['logerror'].sort_values(ascending=False).reset_index().rename(columns={'index':'feature','logerror':'correlation'})
  positive_feature = features_corr[features_corr.correlation>0]
  negative_feature = features_corr[(features_corr.correlation<0) & (features_corr.correlation>-0.001)]

  cols = [col for col in positive_feature.feature if col not in ['logerror','parcelid']]
  neg_cor = [col for col in negative_feature.feature if col not in ['logerror','parcelid']]

  print(features_corr)
  cols = cols +['propertycountylandusecode']#'propertyzoningdesc',
  print('we have about {} selected features'.format(len(cols)))
  return cols, neg_cor

def data_processing(df):
  df = clean_data(df)
  df, columns_to_drop = columns_to_remove(df,87)
  df = one_hot_encoding(df)
  df = replace_nan(df) 
  df = transform_date(df)
  return df,columns_to_drop


### Feature Selection

In [None]:
# cols, neg_cor = select_features_correlate_positive(train_df)
# print(neg_cor)
# print(cols)

In [4]:
train_df = original_train_df.copy()
# train_df = train_df.drop(neg_cor,axis=1)
train_df, columns_to_drop_test = data_processing(train_df)
#train_df

Cleaning Data

Removing Columns with87% missing data
                              percent_missing
buildingclasstypeid                 99.981535
finishedsquarefeet13                99.955327
storytypeid                         99.944606
basementsqft                        99.944606
yardbuildingsqft26                  99.901720
fireplaceflag                       99.765320
architecturalstyletypeid            99.721243
typeconstructiontypeid              99.689078
finishedsquarefeet6                 99.519322
decktypeid                          99.242352
pooltypeid10                        99.031497
poolsizesum                         98.905223
pooltypeid2                         98.643143
hashottuborspa                      97.674640
taxdelinquencyyear                  97.210640
taxdelinquencyflag                  97.210640
yardbuildingsqft17                  96.998594
finishedsquarefeet15                96.074168
finishedfloor1squarefeet            92.320476
finishedsquarefeet50       

### 4. Splitting Data for Training Data

In [5]:
print(train_df.shape)

(167888, 64)


In [6]:
def prepare_dataset(df, columns_to_drop, test_size, random_state):    
     
    # divide data to train and test
    df_train, df_test = train_test_split(df, test_size=test_size, random_state=random_state)
    
    # scale the training inputs
    x_train = df_train.drop(columns_to_drop,axis=1)
    y_train = df_train['logerror'].to_numpy()

    x_test = df_test.drop(columns_to_drop,axis=1)
    y_test = df_test['logerror'].to_numpy() 
    
    return x_train, y_train, x_test, y_test

In [7]:
additonal_columns_to_drop= ['logerror', 'propertycountylandusecode', 'parcelid', 'propertyzoningdesc']
X_train, y_train, X_test, y_test = prepare_dataset(train_df, additonal_columns_to_drop, 0.2,20)

In [8]:
X_train

Unnamed: 0,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,garagecarcnt,garagetotalsqft,...,heatingorsystemtypeid__10.0,heatingorsystemtypeid__11.0,heatingorsystemtypeid__12.0,heatingorsystemtypeid__13.0,heatingorsystemtypeid__14.0,heatingorsystemtypeid__18.0,heatingorsystemtypeid__20.0,heatingorsystemtypeid__24.0,transaction_year,transaction_month
66756,3.0,4.0,7.0,3.0,2698.0,2698.0,6037.0,3.0,0.0,0.0,...,0,0,0,0,0,0,0,0,2017,8
3808,2.0,2.0,4.0,2.0,1463.0,1463.0,6037.0,2.0,0.0,0.0,...,0,0,0,0,0,0,0,0,2016,1
941,1.5,2.0,0.0,1.5,993.0,993.0,6059.0,1.0,2.0,0.0,...,0,0,0,0,0,0,0,0,2017,1
22922,2.0,4.0,0.0,2.0,1430.0,1430.0,6111.0,2.0,1.0,390.0,...,0,0,0,0,0,0,0,0,2016,4
36824,3.0,4.0,4.0,3.0,1740.0,0.0,6037.0,3.0,0.0,0.0,...,0,0,0,0,0,0,0,0,2017,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34033,1.0,0.0,6.0,1.0,426.0,426.0,6037.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,2017,5
31962,2.0,2.0,0.0,2.0,975.0,975.0,6059.0,2.0,1.0,0.0,...,0,0,0,0,0,0,0,0,2016,5
23775,2.0,3.0,7.0,2.0,1098.0,1098.0,6037.0,2.0,0.0,0.0,...,0,0,0,0,0,0,0,0,2016,4
37135,2.0,3.0,4.0,2.0,1410.0,1410.0,6037.0,2.0,0.0,0.0,...,0,0,0,0,0,0,0,0,2016,5


In [None]:
#X_train

5. Model Training

In [9]:
X_train.dtypes

bathroomcnt                     float64
bedroomcnt                      float64
buildingqualitytypeid           float64
calculatedbathnbr               float64
calculatedfinishedsquarefeet    float64
finishedsquarefeet12            float64
fips                            float64
fullbathcnt                     float64
garagecarcnt                    float64
garagetotalsqft                 float64
latitude                        float64
longitude                       float64
lotsizesquarefeet               float64
poolcnt                         float64
pooltypeid7                     float64
propertylandusetypeid           float64
rawcensustractandblock          float64
regionidcity                    float64
regionidcounty                  float64
regionidneighborhood            float64
regionidzip                     float64
roomcnt                         float64
threequarterbathnbr             float64
unitcnt                         float64
yearbuilt                       float64


### Recursive Feature Elimination

In [None]:
# # evaluate RFE for regression
# from numpy import mean
# from numpy import std
# from sklearn.datasets import make_regression
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedKFold
# from sklearn.feature_selection import RFE
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import AdaBoostRegressor
# from sklearn.pipeline import Pipeline

In [None]:
# rfe = RFE(estimator=AdaBoostRegressor())
# rfe=rfe.fit(X_train, y_train)

In [None]:
# important_features_rfe = rfe.get_support(1) #the most important features


In [None]:
# print(important_features_rfe)
# print(len(important_features_rfe))

In [None]:
# rfe_columns=[]
# for i in important_features_rfe:
#   rfe_columns.append(X_train.columns[i])

# print(rfe_columns)
# print(len(rfe_columns))

In [None]:
# #using randomforest features
# #rfe_columns = ['bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'fullbathcnt', 'garagetotalsqft', 'latitude', 'longitude', 'lotsizesquarefeet', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidneighborhood', 'regionidzip', 'roomcnt', 'yearbuilt', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount', 'censustractandblock', 'transaction_year', 'transaction_month']
# X_train = X_train.loc[:, rfe_columns]
# X_train

### Random Forest Regressor

In [13]:
#model = RandomForestRegressor()
model = RandomForestRegressor(n_estimators=100, max_depth=5, bootstrap=True, max_features = 'auto', min_samples_split = 2, min_samples_leaf = 1, n_jobs=1, random_state=0, oob_score = True)

In [14]:
X_train.replace([np.inf, -np.inf], 0, inplace=True)
model.fit(X_train,y_train)

RandomForestRegressor(max_depth=5, n_jobs=1, oob_score=True, random_state=0)

In [15]:
y_pred= model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Model: {model}")
print(f"MAE: {mae}, RMSE: {rmse}")

Model: RandomForestRegressor(max_depth=5, n_jobs=1, oob_score=True, random_state=0)
MAE: 0.07008432865573837, RMSE: 0.16971652858774366


### Linear Regression

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
X_train.replace([np.inf, -np.inf], 0, inplace=True)
model2 = Ridge()

In [15]:
model2.fit(X_train,y_train)

Ridge()

In [16]:
y_pred= model2.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Model: {model2}")
print(f"MAE: {mae}, RMSE: {rmse}")

Model: Ridge()
MAE: 0.07032130922957193, RMSE: 0.1701182124334736


### Gradient Boosting Regressor

In [17]:
from sklearn.ensemble import GradientBoostingRegressor
model3 = GradientBoostingRegressor()
X_train.replace([np.inf, -np.inf], 0, inplace=True)

In [18]:
model3.fit(X_train,y_train)

GradientBoostingRegressor()

In [19]:
y_pred= model3.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Model: {model3}")
print(f"MAE: {mae}, RMSE: {rmse}")

Model: GradientBoostingRegressor()
MAE: 0.07017298866397752, RMSE: 0.16973255201425239


### Extratree

In [20]:
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor()
X_train.replace([np.inf, -np.inf], 0, inplace=True)

In [21]:
model.fit(X_train,y_train)

ExtraTreesRegressor()

In [22]:
y_pred= model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Model: {model}")
print(f"MAE: {mae}, RMSE: {rmse}")

Model: ExtraTreesRegressor()
MAE: 0.07752351316355587, RMSE: 0.1740435545286477


### Catboost

In [10]:
!pip install catboost
from catboost import CatBoostRegressor
#X_train.replace([np.inf, -np.inf], 0, inplace=True)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
catBoost = CatBoostRegressor(loss_function='MAE', eval_metric='MAE')
catBoost.fit(X_train,y_train)


0:	learn: 0.0686064	total: 89.5ms	remaining: 1m 29s
1:	learn: 0.0685896	total: 137ms	remaining: 1m 8s
2:	learn: 0.0685743	total: 174ms	remaining: 57.8s
3:	learn: 0.0685584	total: 209ms	remaining: 52.1s
4:	learn: 0.0685429	total: 244ms	remaining: 48.6s
5:	learn: 0.0685279	total: 280ms	remaining: 46.5s
6:	learn: 0.0685144	total: 323ms	remaining: 45.8s
7:	learn: 0.0685012	total: 357ms	remaining: 44.3s
8:	learn: 0.0684877	total: 393ms	remaining: 43.3s
9:	learn: 0.0684750	total: 430ms	remaining: 42.6s
10:	learn: 0.0684631	total: 464ms	remaining: 41.8s
11:	learn: 0.0684519	total: 499ms	remaining: 41.1s
12:	learn: 0.0684407	total: 538ms	remaining: 40.8s
13:	learn: 0.0684289	total: 577ms	remaining: 40.6s
14:	learn: 0.0684177	total: 612ms	remaining: 40.2s
15:	learn: 0.0684072	total: 648ms	remaining: 39.9s
16:	learn: 0.0683981	total: 682ms	remaining: 39.5s
17:	learn: 0.0683891	total: 715ms	remaining: 39s
18:	learn: 0.0683811	total: 774ms	remaining: 40s
19:	learn: 0.0683747	total: 856ms	remaining

<catboost.core.CatBoostRegressor at 0x7ff3623f7310>

In [12]:
y_pred= catBoost.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Model: {catBoost}")
print(f"MAE: {mae}, RMSE: {rmse}")

Model: <catboost.core.CatBoostRegressor object at 0x7ff3623f7310>
MAE: 0.06881215816463569, RMSE: 0.16950772300785028


### XGB Regressor

In [26]:
from xgboost import XGBRegressor
X_train.replace([np.inf, -np.inf], 0, inplace=True)
model5 = XGBRegressor()

In [27]:
model5.fit(X_train,y_train)



XGBRegressor()

In [28]:
y_pred= model5.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Model: {model5}")
print(f"MAE: {mae}, RMSE: {rmse}")

Model: XGBRegressor()
MAE: 0.07013339537560732, RMSE: 0.16948975737298827


###LightGBM

In [29]:
import lightgbm as lgb
X_train.replace([np.inf, -np.inf], 0, inplace=True)
# LightGBM parameters
params = {}

params['objective'] = 'regression'
params['metric'] = 'mae'
params['num_threads'] = 4  # set to number of real CPU cores for best performance

params['boosting_type'] = 'gbdt'
params['num_boost_round'] = 2000
params['learning_rate'] = 0.003  # shrinkage_rate
params['early_stopping_rounds'] = 30  # Early stopping based on validation set performance

# Control tree growing
params['num_leaves'] = 127  # max number of leaves in one tree (default 31)
params['min_data'] = 150  # min_data_in_leaf
params['min_hessian'] = 0.001  # min_sum_hessian_in_leaf (default 1e-3)
params['max_depth'] = -1  # limit the max depth of tree model, defult -1 (no limit)
params['max_bin'] = 255  # max number of bins that feature values are bucketed in (small -> less overfitting, default 255)
params['sub_feature'] = 0.5    # feature_fraction (small values => use very different submodels)

# Row subsampling (speed up training and alleviate overfitting)
params['bagging_fraction'] = 0.7
params['bagging_freq'] = 50  # perform bagging at every k iteration

# Constraints on categorical features
params['min_data_per_group'] = 100  # minimal number of data per categorical group (default 100)
params['cat_smooth'] = 15.0  # reduce effect of noises in categorical features, especially for those with few data (default 10.0)

# Regularization (default 0.0)
params['lambda_l1'] = 0.0
params['lambda_l2'] = 0.0

# Random seeds (keep default values)
params['feature_fraction_seed'] = 2
params['bagging_seed'] = 3

In [30]:
#categorical_features = ['airconditioningtypeid', 'heatingorsystemtypeid', 'propertylandusedesc', 'storytypeid', 'architecturalstyletypeid', 'typeconstructiontypeid', 'buildingclasstypeid']
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_test = lgb.Dataset(X_test, label=y_test, reference=lgb_train)
model = lgb.train(params=params, train_set=lgb_train, verbose_eval=False,
                valid_sets=[lgb_train, lgb_test], valid_names=['train', 'test'])



In [31]:
y_pred= model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Model: {model}")
print(f"MAE: {mae}, RMSE: {rmse}")

Model: <lightgbm.basic.Booster object at 0x7fcb88d14850>
MAE: 0.06983244023325673, RMSE: 0.16960320450829094
