# Model Algorithm (Hyperparameter Tuning):

In [None]:
features = [x for x in train.columns if x not in ['id','loss']]

cat_features = [x for x in train.select_dtypes(
        include=['object']).columns if x not in ['id','loss']]
num_features = [x for x in train.select_dtypes(
        exclude=['object']).columns if x not in ['id','loss',]]

In [None]:
ntrain = train.shape[0]

train_x = train[features]
train_y = train['loss']
#categorized each categorical variables into numerical value by using below code
for c in range(len(cat_features)):
    train_x[cat_features[c]] = train_x[cat_features[c]].astype('category').cat.codes
    
print( "Xtrain:", train_x.shape)
print( "ytrain:", train_y.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Xtrain: (188318, 130)
ytrain: (188318,)


In [None]:
# train_test split
from sklearn.model_selection import train_test_split
x_train,x_cv_base,y_train,y_cv=train_test_split(train_x,train_y,test_size=0.2)

In [None]:
# hyperparameter tuning by using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
xgb_param_grid = {'gamma':[ 0.1 * i for i in range(0,5)],
                 'subsample':[ 0.1 * i for i in range(5,10)],
                'colsample_bytree':[ 0.1 * i for i in range(5,10)]}

xgb_model = XGBRegressor()
XGB_random = RandomizedSearchCV(xgb_model, param_distributions=xgb_param_grid,
                                   n_iter=5,cv = 3,scoring='neg_mean_absolute_error',random_state=25,verbose=3,n_jobs=-1)

XGB_random.fit(x_train, y_train.values)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


RandomizedSearchCV(cv=3, estimator=XGBRegressor(), n_iter=5, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.5,
                                                             0.6000000000000001,
                                                             0.7000000000000001,
                                                             0.8, 0.9],
                                        'gamma': [0.0, 0.1, 0.2,
                                                  0.30000000000000004, 0.4],
                                        'subsample': [0.5, 0.6000000000000001,
                                                      0.7000000000000001, 0.8,
                                                      0.9]},
                   random_state=25, scoring='neg_mean_absolute_error',
                   verbose=3)

In [None]:
XGB_random.cv_results_

{'mean_fit_time': array([42.1159087 , 37.78492133, 39.81208793, 41.54098248, 32.77899106]),
 'mean_score_time': array([0.35750484, 0.3298804 , 0.31528974, 0.30607009, 0.2866923 ]),
 'mean_test_score': array([-1255.83230928, -1253.69140322, -1253.75059459, -1258.11357556,
        -1252.06375291]),
 'param_colsample_bytree': masked_array(data=[0.6000000000000001, 0.6000000000000001,
                    0.6000000000000001, 0.6000000000000001,
                    0.6000000000000001],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_gamma': masked_array(data=[0.30000000000000004, 0.1, 0.30000000000000004, 0.2,
                    0.1],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_subsample': masked_array(data=[0.6000000000000001, 0.8, 0.7000000000000001, 0.5, 0.9],
              mask=[False, False, False, False, False],
        fill_value='?',
           

In [None]:
scores = abs(XGB_random.cv_results_['mean_test_score'])
best_gamma = list(XGB_random.cv_results_['param_gamma'].data)
best_subsample = list(XGB_random.cv_results_['param_subsample'].data)

In [None]:
XGB_random.best_score_

-1252.063752907384

In [None]:
XGB_random.best_params_

{'colsample_bytree': 0.6000000000000001, 'gamma': 0.1, 'subsample': 0.9}

##Final Model :

In [None]:
bst_model = XGBRegressor(num_boost_round=200, eta=0.07, gamma=0.1, max_depth=8, min_child_weight=6,
                                         colsample_bytree=0.6, subsample=0.9)
bst_model.fit(train_x, train_y.values)



XGBRegressor(colsample_bytree=0.6, eta=0.07, gamma=0.1, max_depth=8,
             min_child_weight=6, num_boost_round=200, subsample=0.9)

In [None]:
y_pred = bst_model.predict(x_cv_base)
print(mean_absolute_error(y_cv, y_pred))

1094.658741213913


#Conclusion :

Here after hyperparameter tuning we get best score of 1094.65. This score can be further improved by using more complex model and using different hyperparameter tuning.



 
### Room for Improvement 
1.   Fit a more complex XGBoost model by adding even more estimators and reducing eta at the same time.
2.   Run Grid Search on different values of hyperparameters.
3.   Ensemble several XGBoost models, trained with different hyperparameters. This can be done by bagging (averaging the score of the models), blending and stacking.

