# Housekeeping

Clean up the code and write functions that perform all necessary preprocessing and data generation.

Data Preparation
1. `create_train`
2. `calculate_customer_product_pair`
3. generate `data_month_{}.hdf`
    - `eda_4_7.ipynb` to `eda_4_10.ipynb`
4. `calculate_customer_product_pair`
5. `mean_encoding_month_product`
6. `count_pattern_2`
7. `count_history` and its helper functions
8. `calculate_weight`
9. `train_test_month`
10. calculate feature importance

In [1]:
from santander_helper import *
%matplotlib inline

# Data-preparation calls; all three helpers come from the star import of
# santander_helper, so their behaviour is not visible in this notebook.
# NOTE(review): presumably this produces the per-month `data_month_{}.hdf`
# files named in the checklist above — confirm in santander_helper.
create_monthly_data()

# The result is bound to `target`, which no later visible cell reads.
# NOTE(review): presumably the helper also caches its output — confirm.
target = calculate_customer_product_pair()

# The result is bound to `mean_encoding_result`, which no later visible cell
# reads either.
# NOTE(review): presumably called here to precompute the encoding — confirm.
mean_encoding_result = mean_encoding_month_product()

In [2]:
# Calculate feature importance
# param = {'objective': 'multi:softprob', 
#          'eta': 0.05, 
#          'max_depth': 12, 
#          'silent': 1, 
#          'num_class': len(target_cols),
#          'eval_metric': 'mlogloss',
#          'min_child_weight': 1,
#          'subsample': 0.7,
#          'colsample_bytree': 0.7,
#          'seed': 0}

# fi = calculate_feature_importance(param, 
#                                   num_rounds=50, 
#                                   n_repeat=5,
#                                   random_seed=42,
#                                   fi_name='feature_importance.csv')

In [None]:
# Single-month experiment: train on 2015-06-28 and predict.
# The two name parts below are joined to label every artefact of this run.
simulation_name0 = 'eda_5_1'
simulation_name1 = '1506_1'

# XGBoost booster settings; one class per entry of target_cols.
param = dict(
    objective='multi:softprob',
    eta=0.05,
    max_depth=8,
    silent=1,
    num_class=len(target_cols),
    eval_metric='merror',
    min_child_weight=10,
    min_split_loss=1,
    subsample=0.7,
    colsample_bytree=0.7,
    seed=3870,
)
num_rounds = 125
n_repeat = 5

# NOTE(review): sub_name looks like the output file for the predictions —
# confirm against train_test_month in santander_helper.
(history, model_dict, y_pred, y_sub) = train_test_month(
    param,
    num_rounds,
    '2015-06-28',
    '2016-05-28',
    sub_name='{}_{}.csv.gz'.format(simulation_name0, simulation_name1),
    n_repeat=n_repeat,
    random_seed=54,
    n_features=350,
    eval_train_flag=False,
)

# Learning curves from the recorded history.
plot_history_val(history)

# Feature importance derived from the fitted models.
fi = plot_feature_importance(model_dict)

# Persist everything produced by this run, including the settings used.
save_pickle(
    '{}_{}.pickle'.format(simulation_name0, simulation_name1),
    (history, model_dict, y_pred, y_sub, fi, param),
)

In [None]:
# Build one training set from every usable month, then cross-validate on it.
fixed_lag = 6
# NOTE(review): presumably skipped because the first month has no history and
# 2016-06-28 is the prediction month — confirm.
skipped_months = ('2015-01-28', '2016-06-28')

# Per-month pieces are collected here and concatenated below.
x_train, y_train, w_train = [], [], []
for i, m in tqdm.tqdm_notebook(enumerate(month_list), total=len(month_list)):
    if m not in skipped_months:
        # The position of the month in month_list is passed as max_lag.
        x_tmp, y_tmp, w_tmp = create_train(m, max_lag=i, fixed_lag=fixed_lag, pattern_flag=True)
        x_train.append(x_tmp)
        y_train.append(y_tmp)
        w_train.append(w_tmp)
# Drop the last per-month references before the large concatenations.
del x_tmp, y_tmp, w_tmp
gc.collect()

# Stack the monthly pieces row-wise with a fresh index; each list is replaced
# by its concatenated result in turn.
concat_options = {'axis': 0, 'ignore_index': True, 'sort': False}
x_train = pd.concat(x_train, **concat_options)
y_train = pd.concat(y_train, **concat_options)
w_train = pd.concat(w_train, **concat_options)

gc.collect()

# XGBoost booster settings; one class per entry of target_cols.
param = {
    'objective': 'multi:softprob',
    'eta': 0.1,
    'max_depth': 8,
    'silent': 1,
    'num_class': len(target_cols),
    'eval_metric': 'merror',
    'min_child_weight': 10,
    'lambda': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 0,
}

# Row cap on the training set: None keeps every row (use this for real runs);
# a small integer gives a quick test run.
n_rows = None
n_repeats = 3
n_trees = 50
train = {
    'x': x_train.iloc[:n_rows, :],
    'y': y_train.iloc[:n_rows],
    'w': w_train.iloc[:n_rows],
}
clfs, running_time = cv_all_month(
    param,
    train,
    n_features=350,
    num_boost_round=n_trees,
    n_repeats=n_repeats,
    random_state=3870,
    verbose_eval=True,
)