# Фишка с выбросами
В данных есть явные выбросы, которые сильно влияют на оцениваемую метрику ошибки
В предыдущем ядре, мы вручную добавили бинарный признак, характеризующий строку данных как выброс или нет
## Шаг 1
мы обучим модель на данных без выбросов и будем использовать ее прогноз как базовый
## Шаг 2
обучим новую модель для предсказания, являются ли данные выбросом
## Шаг 3
берем первые n (10к) наиболее вероятных выбросов из прогноза модели с шага 2 и заменяем ответы у этих card_id в прогнозе, полученном от модели без выбросов (из шага 1), на ответы нашей лучшей модели, которая обучалась на смешанных данных

Ссылка на соревнование: https://www.kaggle.com/c/elo-merchant-category-recommendation

In [None]:
import numpy as np
import pandas as pd
import time
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

In [9]:
# rmse
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

## Step 1. Training model without outliers

In [10]:
%%time
df_train = pd.read_csv('D:\Ellunium\elo/train_data_clean_16_02.csv')
df_test = pd.read_csv('D:\Ellunium\elo/test_data_clean_16_02.csv')

Wall time: 24.4 s


In [12]:
FEATS_EXCLUDED = ['first_active_month', 'target', 'card_id', 'outliers',
                  'hist_purchase_date_max', 'hist_purchase_date_min', 'hist_card_id_size',
                  'new_purchase_date_max', 'new_purchase_date_min', 'new_card_id_size',
                  'OOF_PRED', 'month_0','observation_date_x','observation_date_y']

df_train = df_train[df_train['outliers'] == 0]
target = df_train['target']
del df_train['target']
features = [c for c in df_train.columns if c not in FEATS_EXCLUDED]
categorical_feats = [c for c in features if 'feature_' in c]

In [13]:
%%time
param = {'objective':'regression',
         'num_leaves': 31,
         'min_data_in_leaf': 25,
         'max_depth': 7,
         'learning_rate': 0.01,
         'lambda_l1':0.13,
         "boosting": "gbdt",
         "feature_fraction":0.85,
         'bagging_freq':8,
         "bagging_fraction": 0.9 ,
         "metric": 'rmse',
         "verbosity": -1,
         "random_state": 2333}

folds = StratifiedKFold(n_splits=11, shuffle=True, random_state=1245)
oof_lgb = np.zeros(len(df_train))
predictions_lgb = np.zeros(len(df_test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])#, categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx])#, categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval= 100, early_stopping_rounds = 200)
    oof_lgb[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions_lgb += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(mean_squared_error(oof_lgb, target)**0.5))

fold 0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 1.60934	valid_1's rmse: 1.60201
[200]	training's rmse: 1.57868	valid_1's rmse: 1.57461
[300]	training's rmse: 1.56401	valid_1's rmse: 1.56407
[400]	training's rmse: 1.55456	valid_1's rmse: 1.55898
[500]	training's rmse: 1.5474	valid_1's rmse: 1.55583
[600]	training's rmse: 1.54136	valid_1's rmse: 1.55376
[700]	training's rmse: 1.5359	valid_1's rmse: 1.55246
[800]	training's rmse: 1.53098	valid_1's rmse: 1.55116
[900]	training's rmse: 1.52655	valid_1's rmse: 1.55059
[1000]	training's rmse: 1.52244	valid_1's rmse: 1.54971
[1100]	training's rmse: 1.5186	valid_1's rmse: 1.54921
[1200]	training's rmse: 1.51484	valid_1's rmse: 1.54904
[1300]	training's rmse: 1.51115	valid_1's rmse: 1.54861
[1400]	training's rmse: 1.50763	valid_1's rmse: 1.54828
[1500]	training's rmse: 1.50425	valid_1's rmse: 1.54808
[1600]	training's rmse: 1.50073	valid_1's rmse: 1.54781
[1700]	training's rmse: 1.49736	valid_1's rmse

[1700]	training's rmse: 1.49258	valid_1's rmse: 1.59724
[1800]	training's rmse: 1.48917	valid_1's rmse: 1.59725
[1900]	training's rmse: 1.48607	valid_1's rmse: 1.59711
[2000]	training's rmse: 1.48284	valid_1's rmse: 1.597
[2100]	training's rmse: 1.4796	valid_1's rmse: 1.59679
[2200]	training's rmse: 1.47652	valid_1's rmse: 1.59665
[2300]	training's rmse: 1.47356	valid_1's rmse: 1.59666
[2400]	training's rmse: 1.47061	valid_1's rmse: 1.59663
[2500]	training's rmse: 1.46753	valid_1's rmse: 1.59653
[2600]	training's rmse: 1.46448	valid_1's rmse: 1.59656
[2700]	training's rmse: 1.46166	valid_1's rmse: 1.59653
[2800]	training's rmse: 1.45874	valid_1's rmse: 1.59644
[2900]	training's rmse: 1.45592	valid_1's rmse: 1.59623
[3000]	training's rmse: 1.45302	valid_1's rmse: 1.59617
[3100]	training's rmse: 1.45024	valid_1's rmse: 1.59642
Early stopping, best iteration is:
[2923]	training's rmse: 1.45525	valid_1's rmse: 1.5961
fold 6
Training until validation scores don't improve for 200 rounds.
[10

In [34]:
#Cat boost
import catboost as cb

oof_cb = np.zeros(len(df_train))
predictions_cb = np.zeros(df_test.shape[0])

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=345)

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train[features], df_train['outliers'].values)):
    train_x, train_y = df_train[features].iloc[train_idx], target.iloc[train_idx]
    valid_x, valid_y = df_train[features].iloc[valid_idx], target.iloc[valid_idx]
    
    # CatBoost Regressor estimator
    model = cb.CatBoostRegressor(
        learning_rate = 0.03,
        iterations = 1000,
        eval_metric = 'RMSE',
        allow_writing_files = False,
        od_type = 'Iter',
        bagging_temperature = 0.2,
        depth = 10,
        od_wait = 20,
        silent = True
    )
    
    # Fit
    model.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        verbose=None,
        early_stopping_rounds=100
    )
    
    print("CB " + str(n_fold) + "-" * 50)
    
    oof_cb[valid_idx] = model.predict(valid_x)
    test_preds = model.predict(df_test[features])
    predictions_cb += test_preds / folds.n_splits
    print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_cb[valid_idx])))

print('Cat Boost RMSE: {:<8.5f}'.format(rmse(oof_cb, target)))

CB 0--------------------------------------------------
Fold  1 RMSE : 1.547999
CB 1--------------------------------------------------
Fold  2 RMSE : 1.557780
CB 2--------------------------------------------------
Fold  3 RMSE : 1.564711
CB 3--------------------------------------------------
Fold  4 RMSE : 1.568365
CB 4--------------------------------------------------
Fold  5 RMSE : 1.556924
Cat Boost RMSE: 1.55917 


In [44]:
%%time
#XGB
import xgboost as xgb

xgb_params = {'eta': 0.005, 'max_depth': 9, 'subsample': 0.8, 'colsample_bytree': 0.8, 
          'objective': 'reg:linear', 'eval_metric': 'rmse', 'silent': True}

oof_xgb = np.zeros(len(df_train))
predictions_xgb = np.zeros(len(df_test))

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=345)

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(df_train[features], df_train['outliers'].values)):
    trn_data = xgb.DMatrix(data=df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = xgb.DMatrix(data=df_train.iloc[val_idx][features], label=target.iloc[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid')]
    print("XGB " + str(n_fold) + "-" * 50)
    num_round = 10000
    xgb_model = xgb.train(xgb_params, trn_data, num_round, watchlist, early_stopping_rounds=50, verbose_eval=1000)
    oof_xgb[val_idx] = xgb_model.predict(xgb.DMatrix(df_train.iloc[val_idx][features]), ntree_limit=xgb_model.best_ntree_limit+50)

    predictions_xgb += xgb_model.predict(xgb.DMatrix(df_test[features]), ntree_limit=xgb_model.best_ntree_limit+50) / folds.n_splits
    print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(target.iloc[val_idx], oof_xgb[val_idx])))
    
print(np.sqrt(mean_squared_error(oof_xgb, target)))

XGB 0--------------------------------------------------
[0]	train-rmse:1.80037	valid-rmse:1.77758
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[1000]	train-rmse:1.38643	valid-rmse:1.54772
Stopping. Best iteration:
[1460]	train-rmse:1.34052	valid-rmse:1.54646

Fold  1 RMSE : 1.546468
XGB 1--------------------------------------------------
[0]	train-rmse:1.79627	valid-rmse:1.7941
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[1000]	train-rmse:1.3854	valid-rmse:1.55947
Stopping. Best iteration:
[1729]	train-rmse:1.32039	valid-rmse:1.5577

Fold  2 RMSE : 1.557722
XGB 2--------------------------------------------------
[0]	train-rmse:1.79402	valid-rmse:1.80305
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 

In [36]:
from scipy.optimize import minimize

def find_best_weight(preds, target):
    def _validate_func(weights):
        ''' scipy minimize will pass the weights as a numpy array '''
        final_prediction = 0
        for weight, prediction in zip(weights, preds):
                final_prediction += weight * prediction
        return np.sqrt(mean_squared_error(final_prediction, target))

    #the algorithms need a starting value, right not we chose 0.5 for all weights
    #its better to choose many random starting points and run minimize a few times
    starting_values = [0.5]*len(preds)

    #adding constraints and a different solver as suggested by user 16universe
    #https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
    cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
    #our weights are bound between 0 and 1
    bounds = [(0, 1)] * len(preds)
    
    res = minimize(_validate_func, starting_values, method='Nelder-Mead', bounds=bounds, constraints=cons)
    
    print('Ensemble Score: {best_score}'.format(best_score=(1-res['fun'])))
    print('Best Weights: {weights}'.format(weights=res['x']))
    
    return res

In [45]:
res = find_best_weight([oof_lgb, oof_cb, oof_xgb], target)



Ensemble Score: -0.55650325720128
Best Weights: [0.36024638 0.31976519 0.35192096]


In [46]:
model_without_outliers = pd.DataFrame({"card_id":df_test["card_id"].values})
model_without_outliers["target"] = 0.36024638*predictions_lgb + 0.31976519*predictions_cb + 0.35192096*predictions_xgb

In [14]:
# simple model
model_without_outliers = pd.DataFrame({"card_id":df_test["card_id"].values})
model_without_outliers["target"] = predictions_lgb

## Step 2. Predicting outliers

In [15]:
%%time
df_train = pd.read_csv('D:\Ellunium\elo/train_data_clean_16_02.csv')
df_test = pd.read_csv('D:\Ellunium\elo/test_data_clean_16_02.csv')

Wall time: 26.3 s


In [16]:
target = df_train['outliers']
del df_train['outliers']
del df_train['target']

In [17]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': 6,
         'learning_rate': 0.01,
         "boosting": "rf",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "random_state": 2333}

In [18]:
%%time
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
feature_importance_df = pd.DataFrame()

start = time.time()


for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(log_loss(target, oof)))

fold n°0




Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0439813	valid_1's binary_logloss: 0.0469813
[200]	training's binary_logloss: 0.0440024	valid_1's binary_logloss: 0.0470181
Early stopping, best iteration is:
[63]	training's binary_logloss: 0.0439481	valid_1's binary_logloss: 0.0469078
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0442106	valid_1's binary_logloss: 0.0453628
[200]	training's binary_logloss: 0.0441944	valid_1's binary_logloss: 0.0453443
Early stopping, best iteration is:
[65]	training's binary_logloss: 0.0442016	valid_1's binary_logloss: 0.0452901
fold n°2
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0444285	valid_1's binary_logloss: 0.0438536
[200]	training's binary_logloss: 0.0444377	valid_1's binary_logloss: 0.0438545
Early stopping, best iteration is:
[49]	training's binary_logloss: 0.0444558	valid_1's binary

In [19]:
df_outlier_prob = pd.DataFrame({"card_id":df_test["card_id"].values})
df_outlier_prob["target"] = predictions
df_outlier_prob.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0.078945
1,C_ID_130fd0cbdd,0.001863
2,C_ID_b709037bc5,0.008478
3,C_ID_d27d835a9f,0.001863
4,C_ID_2b5e3df5c2,0.001878


## Step 3. Combining submissions

In [20]:
# In case missing some predictable outlier, we choose top 25000 with highest outliers likelyhood.
outlier_id = pd.DataFrame(df_outlier_prob.sort_values(by='target',ascending = False).head(25000)['card_id'])

In [21]:
best_submission = pd.read_csv('D:\Ellunium\elo/submission_elo_strat_lgb_3_64489.csv')

In [22]:
most_likely_liers = best_submission.merge(outlier_id,how='right')
most_likely_liers.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-2.935556
1,C_ID_f7cada36d3,0.431253
2,C_ID_6d8dba8475,-0.73517
3,C_ID_7f1041e8e1,-6.152643
4,C_ID_22e4a47c72,-0.620218


In [23]:
%%time
for card_id in most_likely_liers['card_id']:
    model_without_outliers.loc[model_without_outliers['card_id']==card_id,'target']\
    = most_likely_liers.loc[most_likely_liers['card_id']==card_id,'target'].values

Wall time: 15min 41s


In [24]:
model_without_outliers.to_csv("combining_submission_lgb1702.csv", index=False)