In [23]:
import  numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sys
import xgboost as xgb


In [3]:
sys.path.append('/home/boris/projects/aimasters/utils')
from utils import *

In [4]:
train = pd.read_csv('data/train_contest.csv')
test = pd.read_csv('data/test_contest.csv')

In [5]:
train.columns
test.columns

Index(['index', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9',
       ...
       'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11',
       'cont12', 'cont13', 'cont14'],
      dtype='object', length=131)

In [15]:
correlation_matrix(new_train)

Расчет матрицы корреляций для 131 признаков...
Матрица корреляций рассчитана.
Матрица корреляций слишком велика для визуализации heatmap.

Поиск сильно скоррелированных пар (абсолютное значение > 0.95)...
Найдено 3 пар с корреляцией > 0.9:
  - cont12 и cont11: 0.9944
  - cat89 и cat7: 0.9586
  - cat90 и cat3: 0.9572


In [6]:
cat_features = train.select_dtypes('object').columns
num_features = train.select_dtypes(np.number).columns
features = train.columns.drop('target')
target = 'target'

In [7]:
from sklearn.preprocessing import OneHotEncoder
from  sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
ct = ColumnTransformer([
    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', encoded_missing_value=np.nan), cat_features),
    ('num', StandardScaler(), num_features)
])
pipeline = Pipeline([
    ('ct', ct)
])
# pipeline.fit(train)
new_train = pd.DataFrame(enc.fit_transform(train[cat_features]))
new_test = pd.DataFrame(enc.transform(test[cat_features]))
new_train.columns = cat_features
new_test.columns = cat_features
numer = train[num_features]
new_train = pd.concat([new_train, numer], axis=1)
new_test = pd.concat([new_test, test[num_features[:-1]]], axis=1)


In [8]:
print(new_train.columns)
print(num_features[:-1])
new_test['target'] = 0

Index(['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9',
       'cat10',
       ...
       'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
       'cont13', 'cont14', 'target'],
      dtype='object', length=131)
Index(['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8',
       'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14'],
      dtype='object')


In [12]:
tr_tr, tr_test = get_split(new_train)
print(tr_tr.shape)
print(tr_test.shape)
print()

(80462, 131)
(39632, 131)


In [17]:
print(tr_tr.columns)

Index(['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9',
       'cat10',
       ...
       'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
       'cont13', 'cont14', 'target'],
      dtype='object', length=131)


In [16]:

baseline = train_xgb_model(tr = tr_tr, val = tr_test, features=features, target_col='target', params= {'objective': 'reg:squarederror'})

[0]	val_name-rmse:2845.05323


Parameters: { "metric", "verbose" } are not used.

  self.starting_round = model.num_boosted_rounds()


[50]	val_name-rmse:2433.10727
[100]	val_name-rmse:2228.60009
[150]	val_name-rmse:2120.49147
[200]	val_name-rmse:2059.49725
[250]	val_name-rmse:2020.01692
[299]	val_name-rmse:1993.26120


In [21]:
features_no_corr = tr_tr.columns.drop('cat89')
features_no_corr = features_no_corr.drop('cat90')
features_no_corr = features_no_corr.drop('cont12')

In [22]:
no_corr_baseline = train_xgb_model(tr = tr_tr, val = tr_test, features=features_no_corr, target_col='target', params= {'objective': 'reg:squarederror'})


[0]	val_name-rmse:2829.00242


Parameters: { "metric", "verbose" } are not used.

  self.starting_round = model.num_boosted_rounds()


[50]	val_name-rmse:1730.18841
[100]	val_name-rmse:1075.86102
[150]	val_name-rmse:693.80686
[200]	val_name-rmse:483.85519
[250]	val_name-rmse:378.82773
[299]	val_name-rmse:333.72808


In [None]:
from typing import Tuple

def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the gradient squared log error.'''
    y = dtrain.get_label()
    return (np.log1p(predt) - np.log1p(y)) / (predt + 1)

def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the hessian for squared log error.'''
    y = dtrain.get_label()
    return ((-np.log1p(predt) + np.log1p(y) + 1) /
            np.power(predt + 1, 2))

def squared_log(predt: np.ndarray,
                dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    '''Squared Log Error objective. A simplified version for RMSLE used as
    objective function.
    '''
    predt[predt < -1] = -1 + 1e-6
    grad = gradient(predt, dtrain)
    hess = hessian(predt, dtrain)
    return grad, hess

