### 1. Import Libraries and Loading Dataset

In [1]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

import plotly.express as px
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error,mean_absolute_error

%matplotlib inline

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
data_dir = "/content/drive/MyDrive/zillow-prize-1"

# Read Property Information from 2016 and 2017
properties_2016 = pd.read_csv( data_dir +"/properties_2016.csv", low_memory=False)
properties_2017 = pd.read_csv( data_dir +"/properties_2017.csv", low_memory=False)

# Read Training Information
train_2016  = pd.read_csv( data_dir +"/train_2016_v2.csv", low_memory=False)
train_2017  = pd.read_csv( data_dir +"/train_2017.csv", low_memory=False)

# Read Submission Data
sample_submission_data = pd.read_csv(data_dir+'/sample_submission.csv', low_memory = False)

In [3]:
#merge training with properties
train_2016 = pd.merge(train_2016, properties_2016, how = 'left', on = 'parcelid').copy()
train_2017 = pd.merge(train_2017, properties_2017, how = 'left', on = 'parcelid').copy()

train_df = pd.concat([train_2016,train_2017],axis=0)
original_train_df=train_df
train_df # Merge Dataset from 2016 and 2017 for Train Data

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,0.027600,2016-01-01,1.0,,,2.0,3.0,,4.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,6.037107e+13
1,14366692,-0.168400,2016-01-01,,,,3.5,4.0,,,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,-0.004000,2016-01-01,1.0,,,3.0,2.0,,4.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,6.037464e+13
3,12643413,0.021800,2016-01-02,1.0,,,2.0,2.0,,4.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,6.037296e+13
4,14432541,-0.005000,2016-01-02,,,,2.5,4.0,,,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,6.059042e+13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77608,10833991,-0.002245,2017-09-20,1.0,,,3.0,3.0,,8.0,...,,,265000.0,379000.0,2016.0,114000.0,4685.34,,,6.037132e+13
77609,11000655,0.020615,2017-09-20,,,,2.0,2.0,,6.0,...,,,70917.0,354621.0,2016.0,283704.0,4478.43,,,6.037101e+13
77610,17239384,0.013209,2017-09-21,,,,2.0,4.0,,,...,1.0,,50683.0,67205.0,2016.0,16522.0,1107.48,,,6.111008e+13
77611,12773139,0.037129,2017-09-21,1.0,,,1.0,3.0,,4.0,...,,,32797.0,49546.0,2016.0,16749.0,876.43,,,6.037434e+13


In [4]:
sample_submission_data

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0,0,0,0,0,0
1,10759547,0,0,0,0,0,0
2,10843547,0,0,0,0,0,0
3,10859147,0,0,0,0,0,0
4,10879947,0,0,0,0,0,0
...,...,...,...,...,...,...,...
2985212,168176230,0,0,0,0,0,0
2985213,14273630,0,0,0,0,0,0
2985214,168040630,0,0,0,0,0,0
2985215,168040830,0,0,0,0,0,0


### 2. Data Preprocessing Stage

In [5]:
#replace nan 
def replace_nan(data): 
  print("Replace Empty Values with 0")
  print()
  data = data.fillna(0) 
  #test_df = test_df.fillna(0)   
  return data 
 
#one_hot_encoding 
def one_hot_encoding(train_filtered): 
  categorical_features = ['airconditioningtypeid', 'heatingorsystemtypeid', 'propertylandusedesc', 'storytypeid', 'architecturalstyletypeid', 'typeconstructiontypeid', 'buildingclasstypeid']

  print("Perform One hot encoding")
  print()

  for i in categorical_features:
    if i in train_filtered.columns:
      feature_df = pd.DataFrame(train_filtered, columns=[i]) 
      dum_df = pd.get_dummies(feature_df, columns=[i], prefix=[ i + "_"]) 
      train_filtered = pd.concat([train_filtered, dum_df], axis = 1) 
      train_filtered = train_filtered.drop(columns=[i])   
   
  return train_filtered

def clean_data(data):
  print("Cleaning Data")
  print()

  dataframe = data.copy()

  empty_pool_index = dataframe[dataframe["poolcnt"]==0].index
  dataframe.loc[empty_pool_index,'poolsizesum'] = 0

  pool_index = dataframe[(dataframe['poolcnt'] > 0) & (dataframe['poolsizesum'].isnull())].index
  dataframe.loc[pool_index,'poolsizesum'] = np.nan

  #bad_index = dataframe[dataframe.unitcnt.isnull()].index   
  #dataframe.loc[bad_index,'unitcnt'] = 1

  dataframe['fireplacecnt'] = np.where(dataframe['fireplacecnt'].isnull(),0,1)

  dataframe['decktypeid'] = np.where(dataframe['decktypeid'].isnull(),0,1)

  dataframe['taxdelinquencyflag'] = np.where(dataframe['taxdelinquencyflag']=='',0,1)

  dataframe['hashottuborspa'] = np.where(dataframe['hashottuborspa']=='',0,1)
  
  #dataframe['airconditioningtypeid'] = np.where((dataframe["airconditioningtypeid"].isnull()) & (dataframe.heatingorsystemtypeid==2),1,dataframe["airconditioningtypeid"])

  #dataframe.loc[dataframe[dataframe["heatingorsystemtypeid"].isnull()].index,'heatingorsystemtypeid']=0


  invalid_garages = dataframe.loc[dataframe['calculatedfinishedsquarefeet']<dataframe['finishedfloor1squarefeet']].index
  dataframe.loc[invalid_garages,'finishedfloor1squarefeet'] = np.nan

  garage_index = dataframe[(dataframe.garagecarcnt==0) & (dataframe['garagetotalsqft']>0)].index
  dataframe.loc[garage_index,'garagecarcnt'] = np.nan

  #bad_index = dataframe[dataframe['taxvaluedollarcnt'].isnull()].index
  #dataframe.loc[bad_index,'taxvaluedollarcnt'] = dataframe.loc[bad_index,'structuretaxvaluedollarcnt'] + dataframe.loc[bad_index,'landtaxvaluedollarcnt']
  
  return dataframe


def transform_date(X): 
  print("Transforming Date Information")
  print()
  dt = pd.to_datetime(X['transactiondate']).dt
  X['transaction_year'] = dt.year
  #X['transaction_month'] = ((dt.year - 2016)*12 + dt.month).astype('category')
  X['transaction_month'] = dt.month
  #X['transaction_day'] = dt.day
  #X['transaction_quarter'] = ((dt.year - 2016)*4 + dt.quarter).astype('category')
  X = X.drop(['transactiondate'], axis=1)

  return X


def columns_to_remove(df, threshold):
  print("Removing Columns with" + str(threshold) + "% missing data")
  columns_to_drop = []
  columns_to_keep = []

  percent_missing = train_df.isnull().sum() * 100 / len(train_df)
  missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
  missing_value_df.sort_values('percent_missing', inplace=True,ascending=False)

  #print(missing_value_df)

  # Example: 100 means 100 % of the data is missing while 20 means 20% of the data is missing
  #fig = px.bar(missing_value_df,x=missing_value_df.index.values, y=missing_value_df.percent_missing,title='Missing Information')
  #fig.show()

  for index, row in missing_value_df.iterrows():
      if(row['percent_missing']>threshold):
        columns_to_drop.append(index)
      else:
        columns_to_keep.append(index)

  print("Columns to Drop",len(columns_to_drop),columns_to_drop)
  print("Columns to Keep",len(columns_to_keep),columns_to_keep)
  print()

  df = df.drop(columns_to_drop,axis=1)

  return df,columns_to_drop

def select_features_correlate_positive(train_df):
  features_corr = train_df.corr()['logerror'].sort_values(ascending=False).reset_index().rename(columns={'index':'feature','logerror':'correlation'})
  positive_feature = features_corr[features_corr.correlation>0]
  negative_feature = features_corr[(features_corr.correlation<0) & (features_corr.correlation>-0.001)]

  cols = [col for col in positive_feature.feature if col not in ['logerror','parcelid']]
  neg_cor = [col for col in negative_feature.feature if col not in ['logerror','parcelid']]

  print(features_corr)
  cols = cols +['propertycountylandusecode']#'propertyzoningdesc',
  print('we have about {} selected features'.format(len(cols)))
  return cols, neg_cor

def data_processing(df):
  df = clean_data(df)
  df, columns_to_drop = columns_to_remove(df,90)
  df = one_hot_encoding(df)
  df = replace_nan(df) 
  df = transform_date(df)
  cols, neg_cor = select_features_correlate_positive(df)
  df = df.drop(neg_cor,axis=1)
  print(neg_cor)
  print(cols)

  return df,columns_to_drop,neg_cor


### Feature Selection

In [6]:
train_df = original_train_df.copy()
train_df, columns_to_drop_test,neg_cor = data_processing(train_df)
train_df

Cleaning Data

Removing Columns with90% missing data
Columns to Drop 20 ['buildingclasstypeid', 'finishedsquarefeet13', 'storytypeid', 'basementsqft', 'yardbuildingsqft26', 'fireplaceflag', 'architecturalstyletypeid', 'typeconstructiontypeid', 'finishedsquarefeet6', 'decktypeid', 'pooltypeid10', 'poolsizesum', 'pooltypeid2', 'hashottuborspa', 'taxdelinquencyyear', 'taxdelinquencyflag', 'yardbuildingsqft17', 'finishedsquarefeet15', 'finishedfloor1squarefeet', 'finishedsquarefeet50']
Columns to Keep 40 ['fireplacecnt', 'threequarterbathnbr', 'pooltypeid7', 'poolcnt', 'numberofstories', 'airconditioningtypeid', 'garagetotalsqft', 'garagecarcnt', 'regionidneighborhood', 'heatingorsystemtypeid', 'buildingqualitytypeid', 'propertyzoningdesc', 'unitcnt', 'lotsizesquarefeet', 'finishedsquarefeet12', 'regionidcity', 'fullbathcnt', 'calculatedbathnbr', 'yearbuilt', 'calculatedfinishedsquarefeet', 'censustractandblock', 'structuretaxvaluedollarcnt', 'regionidzip', 'taxamount', 'landtaxvaluedollar

Unnamed: 0,parcelid,logerror,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fireplacecnt,...,heatingorsystemtypeid__6.0,heatingorsystemtypeid__7.0,heatingorsystemtypeid__10.0,heatingorsystemtypeid__11.0,heatingorsystemtypeid__13.0,heatingorsystemtypeid__18.0,heatingorsystemtypeid__20.0,heatingorsystemtypeid__24.0,transaction_year,transaction_month
0,11016594,0.027600,2.0,3.0,4.0,2.0,1684.0,1684.0,6037.0,0,...,0,0,0,0,0,0,0,0,2016,1
1,14366692,-0.168400,3.5,4.0,0.0,3.5,2263.0,2263.0,6059.0,0,...,0,0,0,0,0,0,0,0,2016,1
2,12098116,-0.004000,3.0,2.0,4.0,3.0,2217.0,2217.0,6037.0,0,...,0,0,0,0,0,0,0,0,2016,1
3,12643413,0.021800,2.0,2.0,4.0,2.0,839.0,839.0,6037.0,0,...,0,0,0,0,0,0,0,0,2016,1
4,14432541,-0.005000,2.5,4.0,0.0,2.5,2283.0,2283.0,6059.0,0,...,0,0,0,0,0,0,0,0,2016,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77608,10833991,-0.002245,3.0,3.0,8.0,3.0,1741.0,1741.0,6037.0,0,...,0,0,0,0,0,0,0,0,2017,9
77609,11000655,0.020615,2.0,2.0,6.0,2.0,1286.0,1286.0,6037.0,0,...,0,0,0,0,0,0,0,0,2017,9
77610,17239384,0.013209,2.0,4.0,0.0,2.0,1612.0,1612.0,6111.0,1,...,0,0,0,0,0,0,0,0,2017,9
77611,12773139,0.037129,1.0,3.0,4.0,1.0,1032.0,1032.0,6037.0,0,...,0,0,0,0,0,0,0,0,2017,9


### 4. Splitting Data for Training Data

In [7]:
print(train_df.shape)

(167888, 51)


In [8]:
def prepare_dataset(df, columns_to_drop, test_size, random_state):    
     
    # divide data to train and test
    df_train, df_test = train_test_split(df, test_size=test_size, random_state=random_state)
    
    # scale the training inputs
    x_train = df_train.drop(columns_to_drop,axis=1)
    y_train = df_train['logerror'].to_numpy()

    x_test = df_test.drop(columns_to_drop,axis=1)
    y_test = df_test['logerror'].to_numpy() 
    
    return x_train, y_train, x_test, y_test

In [9]:
additonal_columns_to_drop= ['logerror', 'propertycountylandusecode', 'parcelid', 'propertyzoningdesc','garagecarcnt']
X_train, y_train, X_test, y_test = prepare_dataset(train_df, additonal_columns_to_drop, 0.2,20)

In [10]:
#X_train

5. Model Training

In [11]:
X_train.dtypes

bathroomcnt                     float64
bedroomcnt                      float64
buildingqualitytypeid           float64
calculatedbathnbr               float64
calculatedfinishedsquarefeet    float64
finishedsquarefeet12            float64
fips                            float64
fireplacecnt                      int64
fullbathcnt                     float64
garagetotalsqft                 float64
latitude                        float64
longitude                       float64
lotsizesquarefeet               float64
poolcnt                         float64
pooltypeid7                     float64
propertylandusetypeid           float64
rawcensustractandblock          float64
regionidcounty                  float64
regionidneighborhood            float64
regionidzip                     float64
roomcnt                         float64
threequarterbathnbr             float64
unitcnt                         float64
yearbuilt                       float64
numberofstories                 float64


### Random Forest Regressor

In [12]:
# model_RandomForest = RandomForestRegressor(n_estimators=100, max_depth=3, bootstrap=True, n_jobs=1, random_state=0, oob_score = True)
# model_RandomForest.fit(X_train,y_train)

# y_pred= model_RandomForest.predict(X_test)
# rmse = mean_squared_error(y_test, y_pred, squared=False)
# mae = mean_absolute_error(y_test, y_pred)
# print(f"Model: {model_RandomForest}")
# print(f"MAE: {mae}, RMSE: {rmse}")

### Gradient Boosting Regressor

In [13]:
# from sklearn.ensemble import GradientBoostingRegressor
# model_GradientBoostingRegressor = GradientBoostingRegressor()
# model_GradientBoostingRegressor.fit(X_train,y_train)

# model_GradientBoostingRegressor.fit(X_train,y_train)

# y_pred= model_GradientBoostingRegressor.predict(X_test)
# rmse = mean_squared_error(y_test, y_pred, squared=False)
# mae = mean_absolute_error(y_test, y_pred)
# print(f"Model: {model_GradientBoostingRegressor}")
# print(f"MAE: {mae}, RMSE: {rmse}")

### CatBoost

In [14]:
!pip install catboost
from catboost import   CatBoostRegressor

catBoost_models = []
for i in range(5):
  catBoost = CatBoostRegressor(loss_function='MAE', eval_metric='MAE', random_seed = i)
  catBoost.fit(X_train,y_train)
  catBoost_models.append(catBoost)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0:	learn: 0.0686085	total: 87.4ms	remaining: 1m 27s
1:	learn: 0.0685908	total: 123ms	remaining: 1m 1s
2:	learn: 0.0685727	total: 157ms	remaining: 52.2s
3:	learn: 0.0685590	total: 191ms	remaining: 47.5s
4:	learn: 0.0685458	total: 224ms	remaining: 44.5s
5:	learn: 0.0685301	total: 259ms	remaining: 42.9s
6:	learn: 0.0685158	total: 299ms	remaining: 42.4s
7:	learn: 0.0685030	total: 345ms	remaining: 42.8s
8:	learn: 0.0684915	total: 379ms	remaining: 41.8s
9:	learn: 0.0684803	total: 412ms	remaining: 40.8s
10:	learn: 0.0684694	total: 444ms	remaining: 39.9s
11:	learn: 0.0684576	total: 476ms	remaining: 39.2s
12:	learn: 0.0684471	total: 509ms	remaining: 38.6s
13:	learn: 0.0684362	total: 541ms	remaining: 38.1s
14:	learn: 0.0684259	total: 574ms	remaining: 37.7s
15:	learn: 0.0684144	total: 608ms	remaining: 37.4s
16:	learn: 0.0684055	total: 643ms	remaining: 37.2s
17:	learn: 0.0683964	total: 679ms	remaining: 37.1s
18:	learn: 0.0683891	tota

In [15]:
catBoost_models

[<catboost.core.CatBoostRegressor at 0x7f131a571290>,
 <catboost.core.CatBoostRegressor at 0x7f1317efdb10>,
 <catboost.core.CatBoostRegressor at 0x7f1317ccd910>,
 <catboost.core.CatBoostRegressor at 0x7f13179c6e10>,
 <catboost.core.CatBoostRegressor at 0x7f1317351fd0>]

In [16]:
for i in catBoost_models:
  y_pred= i.predict(X_test)
  rmse = mean_squared_error(y_test, y_pred, squared=False)
  mae = mean_absolute_error(y_test, y_pred)
  print(f"Model: {i}")
  print(f"MAE: {mae}, RMSE: {rmse}")

Model: <catboost.core.CatBoostRegressor object at 0x7f131a571290>
MAE: 0.06880870169010894, RMSE: 0.1694793050227536
Model: <catboost.core.CatBoostRegressor object at 0x7f1317efdb10>
MAE: 0.06884041120813152, RMSE: 0.16954431953140606
Model: <catboost.core.CatBoostRegressor object at 0x7f1317ccd910>
MAE: 0.06882279105091865, RMSE: 0.1694970265595756
Model: <catboost.core.CatBoostRegressor object at 0x7f13179c6e10>
MAE: 0.06880364264594914, RMSE: 0.16943954806928774
Model: <catboost.core.CatBoostRegressor object at 0x7f1317351fd0>
MAE: 0.06881810854076008, RMSE: 0.16952520086771816


### LightGBM

In [17]:
import lightgbm as lgb
# LightGBM parameters
params = {}

params['objective'] = 'regression'
params['metric'] = 'mae'
params['num_threads'] = 4  # set to number of real CPU cores for best performance

params['boosting_type'] = 'gbdt'
params['num_boost_round'] = 2000
params['learning_rate'] = 0.003  # shrinkage_rate
params['early_stopping_rounds'] = 30  # Early stopping based on validation set performance

# Control tree growing
params['num_leaves'] = 127  # max number of leaves in one tree (default 31)
params['min_data'] = 150  # min_data_in_leaf
params['min_hessian'] = 0.001  # min_sum_hessian_in_leaf (default 1e-3)
params['max_depth'] = -1  # limit the max depth of tree model, defult -1 (no limit)
params['max_bin'] = 255  # max number of bins that feature values are bucketed in (small -> less overfitting, default 255)
params['sub_feature'] = 0.5    # feature_fraction (small values => use very different submodels)

# Row subsampling (speed up training and alleviate overfitting)
params['bagging_fraction'] = 0.7
params['bagging_freq'] = 50  # perform bagging at every k iteration

# Constraints on categorical features
params['min_data_per_group'] = 100  # minimal number of data per categorical group (default 100)
params['cat_smooth'] = 15.0  # reduce effect of noises in categorical features, especially for those with few data (default 10.0)

# Regularization (default 0.0)
params['lambda_l1'] = 0.0
params['lambda_l2'] = 0.0

# Random seeds (keep default values)
params['feature_fraction_seed'] = 2
params['bagging_seed'] = 3

In [18]:
#categorical_features = ['airconditioningtypeid', 'heatingorsystemtypeid', 'propertylandusedesc', 'storytypeid', 'architecturalstyletypeid', 'typeconstructiontypeid', 'buildingclasstypeid']
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_test = lgb.Dataset(X_test, label=y_test, reference=lgb_train)
lightGBM_model = lgb.train(params=params, train_set=lgb_train, verbose_eval=False,
                valid_sets=[lgb_train, lgb_test], valid_names=['train', 'test'])



In [19]:
y_pred= lightGBM_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Model: {lightGBM_model}")
print(f"MAE: {mae}, RMSE: {rmse}")

Model: <lightgbm.basic.Booster object at 0x7f13173b1310>
MAE: 0.06984298850537582, RMSE: 0.16962863544283907


# Model Testing

In [20]:
sample_submission_data

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0,0,0,0,0,0
1,10759547,0,0,0,0,0,0
2,10843547,0,0,0,0,0,0
3,10859147,0,0,0,0,0,0
4,10879947,0,0,0,0,0,0
...,...,...,...,...,...,...,...
2985212,168176230,0,0,0,0,0,0
2985213,14273630,0,0,0,0,0,0
2985214,168040630,0,0,0,0,0,0
2985215,168040830,0,0,0,0,0,0


In [21]:
original_test_df = pd.merge(sample_submission_data[['ParcelId']], properties_2017.rename(columns = {'parcelid': 'ParcelId'}), how = 'left', on = 'ParcelId')
original_test_df

Unnamed: 0,ParcelId,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2016.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,5.0,,,,...,1.0,,660680.0,1434941.0,2016.0,774261.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,6.0,,,...,1.0,,580059.0,1174475.0,2016.0,594416.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,1.0,,196751.0,440101.0,2016.0,243350.0,5725.17,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2985212,168176230,,,,0.0,0.0,,,,,...,,,110245.0,226900.0,2016.0,116655.0,3035.26,,,
2985213,14273630,,,,3.0,3.0,,,3.0,,...,,,378466.0,762157.0,2016.0,383691.0,8653.10,,,
2985214,168040630,,,,2.5,3.0,,,2.5,,...,,,201575.0,483188.0,2016.0,281613.0,5671.40,,,
2985215,168040830,,,,3.0,4.0,,,3.0,,...,,,347863.0,578621.0,2016.0,230758.0,5894.36,,,


In [22]:
additonal_test_columns_to_drop= ['propertycountylandusecode', 'propertyzoningdesc','garagecarcnt','ParcelId','airconditioningtypeid__12.0']
def test_data_processing(df):
  df = clean_data(df)
  df = df.drop(columns_to_drop_test,axis=1)
  df = one_hot_encoding(df)
  df = df.drop(neg_cor,axis=1)
  #print(df.columns)
  df = df.drop(additonal_test_columns_to_drop,axis=1)
  df = replace_nan(df) 

  return df

In [23]:
basic_test_df = test_data_processing(original_test_df)
basic_test_df

Cleaning Data

Perform One hot encoding

Replace Empty Values with 0



Unnamed: 0,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fireplacecnt,fullbathcnt,garagetotalsqft,...,heatingorsystemtypeid__6.0,heatingorsystemtypeid__7.0,heatingorsystemtypeid__10.0,heatingorsystemtypeid__11.0,heatingorsystemtypeid__13.0,heatingorsystemtypeid__18.0,heatingorsystemtypeid__19.0,heatingorsystemtypeid__20.0,heatingorsystemtypeid__21.0,heatingorsystemtypeid__24.0
0,0.0,0.0,0.0,0.0,0.0,0.0,6037.0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,6037.0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,73026.0,0.0,6037.0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,6.0,0.0,5068.0,0.0,6037.0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,1776.0,0.0,6037.0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2985212,0.0,0.0,0.0,0.0,0.0,0.0,6059.0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2985213,3.0,3.0,0.0,3.0,2009.0,2009.0,6059.0,0,2.0,549.0,...,0,0,0,0,0,0,0,0,0,0
2985214,2.5,3.0,0.0,2.5,1390.0,1390.0,6059.0,0,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2985215,3.0,4.0,0.0,3.0,2381.0,2381.0,6059.0,0,3.0,442.0,...,0,0,0,0,0,0,0,0,0,0


### CatBoostx5 Model Prediction

In [24]:
Prediction_Dates = {'201610':'2016-10-01','201611':'2016-11-01','201612':'2016-12-01','201710':'2017-10-01','201711':'2017-11-01','201712':'2017-12-01'}
prediction_list = []
catBoost_models_sample_submission_data = sample_submission_data.copy()
for key,val in Prediction_Dates.items():
  print(key,val)
  test_df = basic_test_df.copy()
  test_df['transactiondate'] = pd.Timestamp(val) 
  test_df = transform_date(test_df)
  #predictions = model4.predict(test_df)
  #predictions2 = model7.predict(test_df)
  for i in catBoost_models:
    print(catBoost_models.index(i))
    prediction = i.predict(test_df)
    prediction_list.append(prediction)
  predictions = (prediction_list[0]+prediction_list[1]+prediction_list[2]+prediction_list[3]+prediction_list[4])/5
  catBoost_models_sample_submission_data[key] = [float(format(x, '.4f')) for x in predictions]

201610 2016-10-01
Transforming Date Information

0
1
2
3
4
201611 2016-11-01
Transforming Date Information

0
1
2
3
4
201612 2016-12-01
Transforming Date Information

0
1
2
3
4
201710 2017-10-01
Transforming Date Information

0
1
2
3
4
201711 2017-11-01
Transforming Date Information

0
1
2
3
4
201712 2017-12-01
Transforming Date Information

0
1
2
3
4


In [25]:
catBoost_models_sample_submission_data

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.0147,0.0147,0.0147,0.0147,0.0147,0.0147
1,10759547,0.0230,0.0230,0.0230,0.0230,0.0230,0.0230
2,10843547,0.0458,0.0458,0.0458,0.0458,0.0458,0.0458
3,10859147,0.0236,0.0236,0.0236,0.0236,0.0236,0.0236
4,10879947,0.0219,0.0219,0.0219,0.0219,0.0219,0.0219
...,...,...,...,...,...,...,...
2985212,168176230,0.0090,0.0090,0.0090,0.0090,0.0090,0.0090
2985213,14273630,0.0107,0.0107,0.0107,0.0107,0.0107,0.0107
2985214,168040630,0.0110,0.0110,0.0110,0.0110,0.0110,0.0110
2985215,168040830,0.0238,0.0238,0.0238,0.0238,0.0238,0.0238


In [26]:
catBoost_models_sample_submission_data.to_csv('catBoost_submission.csv',index=False)
from google.colab import files
#files.download("catBoost_submission.csv")

### LightGBM

In [27]:
Prediction_Dates = {'201610':'2016-10-01','201611':'2016-11-01','201612':'2016-12-01','201710':'2017-10-01','201711':'2017-11-01','201712':'2017-12-01'}
lightGBM_model_sample_submission_data = sample_submission_data.copy()
for key,val in Prediction_Dates.items():
  print(key,val)
  test_df = basic_test_df.copy()
  test_df['transactiondate'] = pd.Timestamp(val) 
  test_df = transform_date(test_df)
  predictions = lightGBM_model.predict(test_df)
  lightGBM_model_sample_submission_data[key] = [float(format(x, '.4f')) for x in predictions]

201610 2016-10-01
Transforming Date Information

201611 2016-11-01
Transforming Date Information

201612 2016-12-01
Transforming Date Information

201710 2017-10-01
Transforming Date Information

201711 2017-11-01
Transforming Date Information

201712 2017-12-01
Transforming Date Information



In [28]:
lightGBM_model_sample_submission_data

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.0469,0.0469,0.0469,0.0469,0.0469,0.0469
1,10759547,0.0401,0.0401,0.0401,0.0401,0.0401,0.0401
2,10843547,0.0147,0.0147,0.0147,0.0147,0.0147,0.0147
3,10859147,0.0213,0.0213,0.0213,0.0213,0.0213,0.0213
4,10879947,0.0006,0.0006,0.0006,0.0006,0.0006,0.0006
...,...,...,...,...,...,...,...
2985212,168176230,0.0098,0.0098,0.0098,0.0098,0.0098,0.0098
2985213,14273630,0.0205,0.0205,0.0205,0.0205,0.0205,0.0205
2985214,168040630,0.0136,0.0136,0.0136,0.0136,0.0136,0.0136
2985215,168040830,0.0186,0.0186,0.0186,0.0186,0.0186,0.0186


In [29]:
lightGBM_model_sample_submission_data.to_csv('lightGBM_submission.csv',index=False)
#files.download("lightGBM_submission.csv")

### Combined Model

In [36]:
catboost_weight = 0.9
combined_stacked_model = pd.DataFrame()

In [37]:
combined_stacked_model['ParcelId'] = lightGBM_model_sample_submission_data['ParcelId']
for col in ['201610', '201611', '201612', '201710', '201711', '201712']:
    combined_stacked_model[col] = catboost_weight * catBoost_models_sample_submission_data[col] + (1 - catboost_weight) * lightGBM_model_sample_submission_data[col]

print(combined_stacked_model.head())
combined_stacked_model.to_csv('combined_stacked_model.csv', index=False)
files.download("combined_stacked_model.csv")

   ParcelId   201610   201611   201612   201710   201711   201712
0  10754147  0.01792  0.01792  0.01792  0.01792  0.01792  0.01792
1  10759547  0.02471  0.02471  0.02471  0.02471  0.02471  0.02471
2  10843547  0.04269  0.04269  0.04269  0.04269  0.04269  0.04269
3  10859147  0.02337  0.02337  0.02337  0.02337  0.02337  0.02337
4  10879947  0.01977  0.01977  0.01977  0.01977  0.01977  0.01977


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>