# Бейзлайн для ДЗ 2

In [103]:
import pandas as pd
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the training and test tables. Despite the .tsv extension the files are
# parsed with read_csv's default comma separator.
train = pd.read_csv(filepath_or_buffer="train.tsv")
test = pd.read_csv(filepath_or_buffer="test.tsv")

In [104]:
# Take the feature columns by position: from column 6 onward in train and from
# column 5 onward in test.
# NOTE(review): presumably train carries one extra leading column (the target `y`),
# which is why its features start one position later - confirm against the files.
X_train = train.iloc[:,6:].copy()
X_test = test.iloc[:,5:].copy()

# Standardization statistics are computed on the training features only and then
# reused for the test features, so both frames share the same scale.
data_mean = X_train.mean(axis=0)
data_std = X_train.std(axis=0)
# A constant training column has std == 0; dividing by it would produce NaN/inf.
# Scale such columns by 1 so they simply become (value - mean).
data_std[data_std == 0] = 1.0

X_train = (X_train - data_mean)/data_std
X_test = (X_test - data_mean)/data_std

# Regression target.
Y_train = train['y']

In [105]:
# Prepend the unscaled leading columns to the standardized feature frames.
# The leading columns are dropped from the feature frames first (a no-op on the
# first run), so re-running this cell rebuilds the same frame instead of stacking
# duplicate columns.
# NOTE: the test slice is one column wider than the train slice; the displayed
# X_test shows that the extra column is `item_id`.
train_meta = train.iloc[:,2:5]
test_meta = test.iloc[:,1:5]
X_train = pd.concat([train_meta, X_train.drop(columns=train_meta.columns, errors='ignore')], axis=1)
X_test = pd.concat([test_meta, X_test.drop(columns=test_meta.columns, errors='ignore')], axis=1)

In [107]:
# Preview the first five rows of the assembled test features.
X_test.head(n=5)

Unnamed: 0,year,week,shift,item_id,f1,f2,f3,f4,f5,f6,...,f51,f52,f53,f54,f55,f56,f57,f58,f59,f60
0,2015,3,3,20447918,-0.384962,-0.387639,-0.38742,-0.386206,-0.389701,-0.392829,...,-0.404677,-0.407896,-0.406941,-0.406165,-0.40893,-0.405078,-0.408676,-0.410854,-0.41057,-0.410564
1,2015,3,3,20447902,-0.360966,-0.353005,-0.356585,-0.362939,-0.354454,-0.362705,...,-0.338836,-0.364061,-0.341776,-0.352009,-0.352404,-0.36038,-0.357172,-0.353676,-0.335137,-0.344264
2,2015,3,3,20447732,-0.047945,0.043532,0.127935,0.030801,0.207132,0.259203,...,0.558835,0.111039,0.303967,0.423395,0.369111,0.359363,0.263894,0.397682,0.531011,0.532121
3,2015,3,3,20443951,-0.326079,-0.318165,-0.311002,-0.331141,-0.316858,-0.314412,...,-0.284356,-0.363387,-0.337733,-0.339604,-0.310419,-0.335916,-0.362284,-0.351659,-0.31687,-0.302322
4,2015,3,3,20443944,-0.374715,-0.372791,-0.380992,-0.386053,-0.384384,-0.387942,...,-0.409131,-0.409341,-0.40987,-0.411387,-0.411679,-0.411989,-0.412479,-0.414126,-0.414844,-0.414836


In [115]:
%%time

parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'gamma': [i/10.0 for i in range(3,6)],
              'max_depth': [5, 6, 7],
              'min_child_weight': [4, 5],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

param_comb = 5
model = XGBRegressor(nthread=-1)
search_params = RandomizedSearchCV(model, param_distributions = parameters, n_iter = param_comb,
                                   scoring = 'r2', n_jobs = -1, iid = False, verbose = 3, random_state = 0)

search_params.fit(X_train,
         Y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 17.9min finished
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


CPU times: user 3min 50s, sys: 779 ms, total: 3min 50s
Wall time: 21min 43s


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=-1,
                                          objective='reg:linear',
                                          random...
                                          seed=None, silent=None, subsample=1,
                                          verbosity=1),
                   iid=False, n_iter=5, n_jobs=-1,
   

In [118]:
# Remove `item_id` from the test features before prediction.
# errors='ignore' keeps the cell re-runnable: a second execution is a no-op
# instead of raising KeyError because the column is already gone.
X_test = X_test.drop(columns=['item_id'], errors='ignore')

In [120]:
# Predict the target for the test features with the fitted search object.
y_pred = search_params.predict(X_test)

# Load the submission template and fill its `y` column with the predictions.
sample_submission = pd.read_csv("sample_submission.tsv").assign(y=y_pred)

In [121]:
# Preview the first five rows of the filled submission.
sample_submission.head()

Unnamed: 0,Num,y
0,348622,1525.586792
1,348623,25151.701172
2,348624,297314.71875
3,348625,27381.027344
4,348626,1140.900757


В прогнозах GBM могут появиться отрицательные числа:

In [122]:
# Print every submission row whose predicted `y` is below zero
# (prints an empty frame when there are none).
print(sample_submission.loc[sample_submission['y'].lt(0)])

Empty DataFrame
Columns: [Num, y]
Index: []


Если они есть — можем занулить:

In [10]:
# Keep strictly positive predictions; every other value (negative, zero, NaN)
# becomes 0.0. Done as one vectorized operation over the whole column.
sample_submission['y'] = sample_submission['y'].where(sample_submission['y'] > 0, 0.0)

In [123]:
# Write the submission as comma-separated text, without the index column.
sample_submission.to_csv(
    "baseline_submission.tsv",
    index=False,
    sep=',',
)