In [1]:
# https://towardsdatascience.com/hyperparameter-tuning-in-lasso-and-ridge-regressions-70a4b158ae6d
# https://medium.com/analytics-vidhya/hyperparameter-tuning-in-linear-regression-e0e0f1f968a1
# https://towardsdatascience.com/how-to-tune-a-decision-tree-f03721801680
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
#### models
# https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html
# https://catboost.ai/en/docs/concepts/python-reference_catboostregressor
# https://xgboost.readthedocs.io/en/stable/parameter.html

In [2]:
import pandas as pd
import numpy as np

In [3]:
y_train = pd.read_pickle('../data/preprocessed/y_train.pkl')
y_train.head()

Unnamed: 0_level_0,TARGET
ID,Unnamed: 1_level_1
1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1079f498d53d42ff438|0fedacb725c2c2128d2b42505880d91d238e616b926278c41caff6da4c622e51|f793de205ead5ac302c4a1627829dea41f176b1068b993a32373fc869918374b|SEMANA_01,0
1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1079f498d53d42ff438|0fedacb725c2c2128d2b42505880d91d238e616b926278c41caff6da4c622e51|f793de205ead5ac302c4a1627829dea41f176b1068b993a32373fc869918374b|SEMANA_02,0
1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1079f498d53d42ff438|0fedacb725c2c2128d2b42505880d91d238e616b926278c41caff6da4c622e51|f793de205ead5ac302c4a1627829dea41f176b1068b993a32373fc869918374b|SEMANA_03,0
1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1079f498d53d42ff438|0fedacb725c2c2128d2b42505880d91d238e616b926278c41caff6da4c622e51|f793de205ead5ac302c4a1627829dea41f176b1068b993a32373fc869918374b|SEMANA_04,0
1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1079f498d53d42ff438|0fedacb725c2c2128d2b42505880d91d238e616b926278c41caff6da4c622e51|f793de205ead5ac302c4a1627829dea41f176b1068b993a32373fc869918374b|SEMANA_05,0


In [4]:
train_probs_cb = pd.read_pickle('../data/preprocessed/catboost/train_probs.pkl')
train_probs_lg = pd.read_pickle('../data/preprocessed/lightgbm/train_probs.pkl')
train_probs_xg = pd.read_pickle('../data/preprocessed/xgboost/train_probs.pkl')

test_probs_cb = pd.read_pickle('../data/preprocessed/catboost/test_probs.pkl')
test_probs_lg = pd.read_pickle('../data/preprocessed/lightgbm/test_probs.pkl')
test_probs_xg = pd.read_pickle('../data/preprocessed/xgboost/test_probs.pkl')

In [5]:
train_probs_total = pd.concat([train_probs_cb,  train_probs_lg, train_probs_xg], axis=1) # train_probs_xg
test_probs_total =  pd.concat([test_probs_cb, test_probs_lg, test_probs_xg], axis=1) # test_probs_xg
train_probs_total.head()

Unnamed: 0_level_0,probs_cb,probs_lg,probs_xg
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1079f498d53d42ff438|0fedacb725c2c2128d2b42505880d91d238e616b926278c41caff6da4c622e51|f793de205ead5ac302c4a1627829dea41f176b1068b993a32373fc869918374b|SEMANA_01,0.174639,0.203581,0.34305
1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1079f498d53d42ff438|0fedacb725c2c2128d2b42505880d91d238e616b926278c41caff6da4c622e51|f793de205ead5ac302c4a1627829dea41f176b1068b993a32373fc869918374b|SEMANA_02,0.206461,0.203581,0.342209
1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1079f498d53d42ff438|0fedacb725c2c2128d2b42505880d91d238e616b926278c41caff6da4c622e51|f793de205ead5ac302c4a1627829dea41f176b1068b993a32373fc869918374b|SEMANA_03,0.181526,0.203581,0.352065
1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1079f498d53d42ff438|0fedacb725c2c2128d2b42505880d91d238e616b926278c41caff6da4c622e51|f793de205ead5ac302c4a1627829dea41f176b1068b993a32373fc869918374b|SEMANA_04,0.185617,0.203581,0.367332
1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1079f498d53d42ff438|0fedacb725c2c2128d2b42505880d91d238e616b926278c41caff6da4c622e51|f793de205ead5ac302c4a1627829dea41f176b1068b993a32373fc869918374b|SEMANA_05,0.179352,0.203581,0.34305


In [6]:
display(train_probs_total.corr(), test_probs_total.corr())

Unnamed: 0,probs_cb,probs_lg,probs_xg
probs_cb,1.0,0.959003,0.933867
probs_lg,0.959003,1.0,0.925664
probs_xg,0.933867,0.925664,1.0


Unnamed: 0,probs_cb,probs_lg,probs_xg
probs_cb,1.0,0.948077,0.915042
probs_lg,0.948077,1.0,0.929457
probs_xg,0.915042,0.929457,1.0


In [7]:
folds = []
demanda2 = pd.read_pickle('../data/preprocessed/demanda2.pkl')
train = demanda2[demanda2['TARGET']>=0].copy()
folds.append(list(train[(train['SEMANA_XX']>=1) & (train['SEMANA_XX']<=10)].index))
folds.append(list(train[(train['SEMANA_XX']>=11) & (train['SEMANA_XX']<=20)].index))
folds.append(list(train[(train['SEMANA_XX']>=21) & (train['SEMANA_XX']<=30)].index))
folds.append(list(train[(train['SEMANA_XX']>=31) & (train['SEMANA_XX']<=40)].index))
folds.append(list(train[(train['SEMANA_XX']>=41) & (train['SEMANA_XX']<=50)].index))
len(folds), len(folds[0])

(5, 471730)

In [8]:
train = train_probs_total.copy()
test = test_probs_total.copy()

In [9]:
from sklearn.metrics import mean_squared_error
from xgboost  import XGBRegressor
fi = []
test_probs = []
train_probs = []
for i, idx in enumerate(folds):
    print("*"*10, i, "*"*10)
    Xt = train.drop(idx)
    yt = y_train.loc[Xt.index].TARGET

    Xv = train.loc[idx]
    yv = y_train.loc[Xv.index].TARGET

    # learner = XGBRegressor(n_estimators=1000,  random_state=42, **param)
    learner = XGBRegressor(n_estimators=1000,  random_state=42, n_jobs=-1, max_depth=2, learning_rate=0.05, booster='gblinear') #gbtree | dart
    learner.fit(Xt, yt,  early_stopping_rounds=10, eval_metric="rmse",
                eval_set=[(Xt, yt), (Xv, yv)], verbose=10)
    
    test_probs.append(pd.Series(learner.predict(test), index=test.index, name="fold_" + str(i)))
    train_probs.append(pd.Series(learner.predict(Xv), index=Xv.index, name="probs"))
    fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))

test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
test_probs = test_probs.apply(lambda x: 0 if x<0 else x)

train_probs = pd.concat(train_probs)
train_probs = train_probs.apply(lambda x: 0 if x<0 else x)

fi = pd.concat(fi, axis=1).mean(axis=1)
score = mean_squared_error(y_train.loc[train.index].TARGET, train_probs.loc[train.index], squared=False)
print('SCORE RMSE: ', score)

********** 0 **********
Parameters: { "max_depth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-rmse:5.20115	validation_1-rmse:6.57427




[10]	validation_0-rmse:3.66792	validation_1-rmse:4.77140
[20]	validation_0-rmse:3.56556	validation_1-rmse:4.61475
[30]	validation_0-rmse:3.55206	validation_1-rmse:4.59250
[40]	validation_0-rmse:3.54741	validation_1-rmse:4.58781
[50]	validation_0-rmse:3.54509	validation_1-rmse:4.58697
[60]	validation_0-rmse:3.54364	validation_1-rmse:4.58720
[61]	validation_0-rmse:3.54352	validation_1-rmse:4.58725
********** 1 **********
Parameters: { "max_depth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-rmse:5.60737	validation_1-rmse:4.98357




[10]	validation_0-rmse:4.01548	validation_1-rmse:3.35520
[20]	validation_0-rmse:3.91551	validation_1-rmse:3.26396
[30]	validation_0-rmse:3.90389	validation_1-rmse:3.25428
[40]	validation_0-rmse:3.90049	validation_1-rmse:3.25060
[50]	validation_0-rmse:3.89911	validation_1-rmse:3.24870
[60]	validation_0-rmse:3.89846	validation_1-rmse:3.24759
[70]	validation_0-rmse:3.89809	validation_1-rmse:3.24685
[80]	validation_0-rmse:3.89784	validation_1-rmse:3.24627
[90]	validation_0-rmse:3.89763	validation_1-rmse:3.24578
[100]	validation_0-rmse:3.89745	validation_1-rmse:3.24534
[110]	validation_0-rmse:3.89728	validation_1-rmse:3.24495
[120]	validation_0-rmse:3.89713	validation_1-rmse:3.24458
[130]	validation_0-rmse:3.89699	validation_1-rmse:3.24425
[140]	validation_0-rmse:3.89685	validation_1-rmse:3.24394
[150]	validation_0-rmse:3.89673	validation_1-rmse:3.24366
[160]	validation_0-rmse:3.89661	validation_1-rmse:3.24340
[170]	validation_0-rmse:3.89651	validation_1-rmse:3.24316
[180]	validation_0-rmse



[10]	validation_0-rmse:3.87923	validation_1-rmse:3.97055
[20]	validation_0-rmse:3.78294	validation_1-rmse:3.84260
[30]	validation_0-rmse:3.77098	validation_1-rmse:3.82838
[40]	validation_0-rmse:3.76713	validation_1-rmse:3.82455
[50]	validation_0-rmse:3.76536	validation_1-rmse:3.82319
[60]	validation_0-rmse:3.76433	validation_1-rmse:3.82276
[70]	validation_0-rmse:3.76361	validation_1-rmse:3.82271
[76]	validation_0-rmse:3.76324	validation_1-rmse:3.82278
********** 3 **********
Parameters: { "max_depth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-rmse:5.50141	validation_1-rmse:5.43717




[10]	validation_0-rmse:3.97225	validation_1-rmse:3.53517
[20]	validation_0-rmse:3.87219	validation_1-rmse:3.45414
[30]	validation_0-rmse:3.85962	validation_1-rmse:3.45418
[32]	validation_0-rmse:3.85852	validation_1-rmse:3.45432
********** 4 **********
Parameters: { "max_depth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-rmse:5.75095	validation_1-rmse:4.29717




[10]	validation_0-rmse:3.94123	validation_1-rmse:3.71407
[20]	validation_0-rmse:3.82570	validation_1-rmse:3.67332
[30]	validation_0-rmse:3.81302	validation_1-rmse:3.66270
[40]	validation_0-rmse:3.80939	validation_1-rmse:3.65781
[50]	validation_0-rmse:3.80789	validation_1-rmse:3.65530
[60]	validation_0-rmse:3.80716	validation_1-rmse:3.65386
[70]	validation_0-rmse:3.80671	validation_1-rmse:3.65292
[80]	validation_0-rmse:3.80639	validation_1-rmse:3.65221
[90]	validation_0-rmse:3.80612	validation_1-rmse:3.65164
[100]	validation_0-rmse:3.80589	validation_1-rmse:3.65114
[110]	validation_0-rmse:3.80568	validation_1-rmse:3.65069
[120]	validation_0-rmse:3.80548	validation_1-rmse:3.65028
[130]	validation_0-rmse:3.80530	validation_1-rmse:3.64990
[140]	validation_0-rmse:3.80514	validation_1-rmse:3.64953
[150]	validation_0-rmse:3.80498	validation_1-rmse:3.64919
[160]	validation_0-rmse:3.80484	validation_1-rmse:3.64886
[170]	validation_0-rmse:3.80471	validation_1-rmse:3.64855
[180]	validation_0-rmse

In [10]:
test_probs.name = 'Demanda'
test_submit = test_probs.reset_index()[['ID','Demanda']]
test_submit

Unnamed: 0,ID,Demanda
0,1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1...,0.276594
1,1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1...,0.285215
2,1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1...,0.307395
3,1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1...,0.307178
4,1161c077d04c02902341ddd6f20f322d6fdaac2619ecf1...,0.293745
...,...,...
471725,ddf2b5328dafe4a5863dc2463578ea789284d951943120...,0.167383
471726,ddf2b5328dafe4a5863dc2463578ea789284d951943120...,0.165282
471727,ddf2b5328dafe4a5863dc2463578ea789284d951943120...,0.165106
471728,ddf2b5328dafe4a5863dc2463578ea789284d951943120...,0.167068


In [11]:
test_submit['Demanda'].describe()

count    471730.000000
mean          0.668325
std           2.438453
min           0.070592
25%           0.180210
50%           0.264086
75%           0.516798
max         224.080734
Name: Demanda, dtype: float64

In [12]:
test_submit.to_csv('../results/test_stacking_xg_cb_lg_training_xg_cv.csv', index=False)

In [13]:
!kaggle competitions submit datathon-entel-2022-reto2 -f ../results/test_stacking_xg_cb_lg_training_xg_cv.csv -m "test"

Successfully submitted to DATATHON ENTEL 2022 - Reto2



  0%|          | 0.00/101M [00:00<?, ?B/s]
  0%|          | 8.00k/101M [00:00<31:53, 55.5kB/s]
  0%|          | 96.0k/101M [00:00<04:04, 434kB/s] 
  0%|          | 240k/101M [00:00<02:16, 779kB/s] 
  1%|          | 880k/101M [00:00<00:38, 2.72MB/s]
  1%|1         | 1.19M/101M [00:00<00:35, 2.92MB/s]
  2%|1         | 1.62M/101M [00:00<00:31, 3.37MB/s]
  2%|2         | 2.07M/101M [00:00<00:28, 3.70MB/s]
  2%|2         | 2.48M/101M [00:00<00:26, 3.86MB/s]
  3%|2         | 2.96M/101M [00:01<00:24, 4.17MB/s]
  3%|3         | 3.38M/101M [00:01<00:24, 4.20MB/s]
  4%|3         | 3.80M/101M [00:01<00:24, 4.25MB/s]
  4%|4         | 4.22M/101M [00:01<00:23, 4.28MB/s]
  5%|4         | 5.06M/101M [00:01<00:18, 5.59MB/s]
  6%|5         | 5.60M/101M [00:01<00:19, 5.04MB/s]
  6%|6         | 6.09M/101M [00:01<00:20, 4.85MB/s]
  6%|6         | 6.57M/101M [00:01<00:22, 4.38MB/s]
  7%|6         | 7.00M/101M [00:01<00:23, 4.29MB/s]
  7%|7         | 7.42M/101M [00:02<00:27, 3.55MB/s]
  8%|7         | 8.03M