** In this challenge, your task is to predict a transformed count of hazards or pre-existing damages using a dataset of property information. **

This will enable Liberty Mutual to more accurately identify high risk homes that require additional examination to confirm their insurability.

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn import preprocessing
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer

# load train and test
train = pd.read_csv('../input/liberty/train.csv', index_col=0)
test = pd.read_csv('../input/liberty/test.csv', index_col=0)
sample = pd.read_csv('../input/liberty1/sample_submission.csv', index_col=0)


In [2]:
print(train.shape)
print(test.shape)

(50999, 33)
(51000, 32)


In [3]:
train.head(5)

Unnamed: 0_level_0,Hazard,T1_V1,T1_V2,T1_V3,T1_V4,T1_V5,T1_V6,T1_V7,T1_V8,T1_V9,...,T2_V6,T2_V7,T2_V8,T2_V9,T2_V10,T2_V11,T2_V12,T2_V13,T2_V14,T2_V15
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,15,3,2,N,B,N,B,B,D,...,2,37,1,11,6,Y,N,E,2,2
2,4,16,14,5,H,B,N,B,B,C,...,2,22,1,18,5,Y,Y,E,2,1
3,1,10,10,5,N,K,N,B,B,E,...,6,37,2,14,6,Y,Y,E,6,1
4,1,18,18,5,N,K,N,B,B,E,...,2,25,1,1,6,Y,N,C,2,6
5,1,13,19,5,N,H,N,B,B,E,...,1,22,1,2,7,N,N,E,1,1


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50999 entries, 1 to 101999
Data columns (total 33 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Hazard  50999 non-null  int64 
 1   T1_V1   50999 non-null  int64 
 2   T1_V2   50999 non-null  int64 
 3   T1_V3   50999 non-null  int64 
 4   T1_V4   50999 non-null  object
 5   T1_V5   50999 non-null  object
 6   T1_V6   50999 non-null  object
 7   T1_V7   50999 non-null  object
 8   T1_V8   50999 non-null  object
 9   T1_V9   50999 non-null  object
 10  T1_V10  50999 non-null  int64 
 11  T1_V11  50999 non-null  object
 12  T1_V12  50999 non-null  object
 13  T1_V13  50999 non-null  int64 
 14  T1_V14  50999 non-null  int64 
 15  T1_V15  50999 non-null  object
 16  T1_V16  50999 non-null  object
 17  T1_V17  50999 non-null  object
 18  T2_V1   50999 non-null  int64 
 19  T2_V2   50999 non-null  int64 
 20  T2_V3   50999 non-null  object
 21  T2_V4   50999 non-null  int64 
 22  T2_V5   50999 non-nul

In [5]:
train.apply(lambda x: len(x.unique()))

Hazard     50
T1_V1      19
T1_V2      24
T1_V3       9
T1_V4       8
T1_V5      10
T1_V6       2
T1_V7       4
T1_V8       4
T1_V9       6
T1_V10      5
T1_V11     12
T1_V12      4
T1_V13      4
T1_V14      5
T1_V15      8
T1_V16     18
T1_V17      2
T2_V1     100
T2_V2      39
T2_V3       2
T2_V4      22
T2_V5       6
T2_V6       7
T2_V7       7
T2_V8       3
T2_V9      25
T2_V10      7
T2_V11      2
T2_V12      2
T2_V13      5
T2_V14      7
T2_V15     12
dtype: int64

In [6]:
test.head(5)

Unnamed: 0_level_0,T1_V1,T1_V2,T1_V3,T1_V4,T1_V5,T1_V6,T1_V7,T1_V8,T1_V9,T1_V10,...,T2_V6,T2_V7,T2_V8,T2_V9,T2_V10,T2_V11,T2_V12,T2_V13,T2_V14,T2_V15
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,2,13,4,C,A,Y,B,B,D,12,...,2,28,1,22,6,Y,N,E,2,7
7,10,10,7,N,C,Y,B,B,D,8,...,3,28,1,4,3,Y,N,E,5,8
8,9,20,4,N,H,Y,B,B,E,8,...,2,22,1,1,7,N,N,C,6,1
9,11,18,2,N,H,Y,B,B,D,12,...,4,40,1,20,6,Y,N,E,5,5
10,4,5,4,H,K,Y,B,B,E,8,...,2,34,1,11,7,Y,Y,E,2,1


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51000 entries, 6 to 101997
Data columns (total 32 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   T1_V1   51000 non-null  int64 
 1   T1_V2   51000 non-null  int64 
 2   T1_V3   51000 non-null  int64 
 3   T1_V4   51000 non-null  object
 4   T1_V5   51000 non-null  object
 5   T1_V6   51000 non-null  object
 6   T1_V7   51000 non-null  object
 7   T1_V8   51000 non-null  object
 8   T1_V9   51000 non-null  object
 9   T1_V10  51000 non-null  int64 
 10  T1_V11  51000 non-null  object
 11  T1_V12  51000 non-null  object
 12  T1_V13  51000 non-null  int64 
 13  T1_V14  51000 non-null  int64 
 14  T1_V15  51000 non-null  object
 15  T1_V16  51000 non-null  object
 16  T1_V17  51000 non-null  object
 17  T2_V1   51000 non-null  int64 
 18  T2_V2   51000 non-null  int64 
 19  T2_V3   51000 non-null  object
 20  T2_V4   51000 non-null  int64 
 21  T2_V5   51000 non-null  object
 22  T2_V6   51000 non-nul

In [8]:
test.apply(lambda x: len(x.unique()))

T1_V1      19
T1_V2      24
T1_V3       9
T1_V4       8
T1_V5      10
T1_V6       2
T1_V7       4
T1_V8       4
T1_V9       6
T1_V10      5
T1_V11     12
T1_V12      4
T1_V13      4
T1_V14      5
T1_V15      8
T1_V16     18
T1_V17      2
T2_V1     100
T2_V2      39
T2_V3       2
T2_V4      22
T2_V5       6
T2_V6       7
T2_V7       7
T2_V8       3
T2_V9      25
T2_V10      7
T2_V11      2
T2_V12      2
T2_V13      5
T2_V14      7
T2_V15     12
dtype: int64

both train, test datasets are made up of categorical values.

In [9]:
sample.head(5)

Unnamed: 0_level_0,Hazard
Id,Unnamed: 1_level_1
6,0
7,0
8,0
9,0
10,0


# plan

algorithm : xgboost

encoding : label_encoding, DictVectorizer

submission = pred_1(label_encoding) + pred_2(DictVectorizer)

# Preprocessing

In [10]:
labels = train.Hazard
train.drop('Hazard', axis=1, inplace=True)

train_s = train
test_s = test

columns = train.columns
test_ind = test.index

train_s = np.array(train_s)
test_s = np.array(test_s)

In [11]:
# label encode the categorical variables
for i in range(train_s.shape[1]):
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_s[:, i]) + list(test_s[:, i]))
    train_s[:, i] = lbl.transform(train_s[:, i])
    test_s[:, i] = lbl.transform(test_s[:, i])

train_s = train_s.astype(float)
test_s = test_s.astype(float)

XGboost Function

In [12]:
def xgboost_pred(train, labels, test):
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.005
    params["min_child_weight"] = 6
    params["subsample"] = 0.7
    params["colsample_bytree"] = 0.7
    params["scale_pos_weight"] = 1
    params["silent"] = 1
    params["max_depth"] = 9

    plst = list(params.items())

    # Using 4000 rows for early stopping.
    offset = 4000

    num_rounds = 10000
    xgtest = xgb.DMatrix(test)

    # create a train and validation dmatrices
    xgtrain = xgb.DMatrix(train[offset:, :], label=labels[offset:])
    xgval = xgb.DMatrix(train[:offset, :], label=labels[:offset])

    # train using early stopping and predict
    watchlist = [(xgtrain, 'train'), (xgval, 'val')]
    model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)
    preds1 = model.predict(xgtest, ntree_limit=model.best_iteration)

    # reverse train and labels and use different 5k for early stopping.
    train = train[::-1, :]
    labels = np.log(labels[::-1])

    xgtrain = xgb.DMatrix(train[offset:, :], label=labels[offset:])
    xgval = xgb.DMatrix(train[:offset, :], label=labels[:offset])

    watchlist = [(xgtrain, 'train'), (xgval, 'val')]
    model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)
    preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)

    # combine predictions
    preds = (preds1) * 1.4 + (preds2) * 8.6
    return preds

In [13]:
# model_1. xgboost - label encoding

preds1 = xgboost_pred(train_s, labels, test_s)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:5.33730	val-rmse:5.28459
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 120 rounds.
[1]	train-rmse:5.32382	val-rmse:5.27193
[2]	train-rmse:5.31057	val-rmse:5.25943
[3]	train-rmse:5.29736	val-rmse:5.24691
[4]	train-rmse:5.28454	val-rmse:5.23475
[5]	train-rmse:5.27147	val-rmse:5.22246
[6]	train-rmse:5.25888	val-rmse:5.21033
[7]	train-rmse:5.24602	val-rmse:5.19807
[8]	train-rmse:5.23382	val-rmse:5.18644
[9]	train-rmse:5.22154	val-rmse:5.17467
[10]	train-rmse:5.20920	val-rmse:5.16278
[11]	train-rmse:5.19714	val-rmse:5.15122
[12]	train-rmse:5.18530	val-rmse:5.13985
[13]	train-rmse:5.17331	val-rmse:5.12842
[14]	train-r

In [14]:
# model_2. xgboost - DictVectorizer

train = train.T.to_dict().values()
test = test.T.to_dict().values()

vec = DictVectorizer()
train = vec.fit_transform(train)
test = vec.transform(test)

preds2 = xgboost_pred(train, labels, test)

preds = 0.47 * (preds1 ** 0.2) + 0.53 * (preds2 ** 0.8)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:5.33691	val-rmse:5.28411
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 120 rounds.
[1]	train-rmse:5.32349	val-rmse:5.27138
[2]	train-rmse:5.31050	val-rmse:5.25911
[3]	train-rmse:5.29733	val-rmse:5.24672
[4]	train-rmse:5.28484	val-rmse:5.23477
[5]	train-rmse:5.27214	val-rmse:5.22249
[6]	train-rmse:5.25964	val-rmse:5.21066
[7]	train-rmse:5.24703	val-rmse:5.19846
[8]	train-rmse:5.23444	val-rmse:5.18632
[9]	train-rmse:5.22191	val-rmse:5.17467
[10]	train-rmse:5.20980	val-rmse:5.16293
[11]	train-rmse:5.19739	val-rmse:5.15103
[12]	train-rmse:5.18496	val-rmse:5.13951
[13]	train-rmse:5.17268	val-rmse:5.12778
[14]	train-r

In [15]:
# generate solution
preds = pd.DataFrame({"Id": test_ind, "Hazard": preds})
preds = preds.set_index('Id')
preds.to_csv('submission.csv')