In [1]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features

import matplotlib #collection of functions for scientific and publication-ready visualization

import numpy as np #foundational package for scientific computing

import scipy as sp #collection of functions for scientific computing and advance mathematics

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook

import sklearn #collection of machine learning algorithms

#misc libraries
import random
import time
import datetime as dt

#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

import featuretools as ft
from sklearn.feature_extraction.text import CountVectorizer

-------------------------


In [2]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
import xgboost as xgb

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.model_selection import train_test_split

from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8



In [3]:
data_raw = pd.read_csv('events_up_to_01062018.csv')
data_val = pd.read_csv('labels_training_set.csv')

In [4]:
data_raw.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [5]:
df_labels = data_val.copy(deep=True) 
df = data_raw.copy(deep=True)

In [6]:
pd.set_option('display.max_columns', 23)

In [7]:
df.describe(include= 'all')

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
count,2341681,2341681,2341681,191131,1320530.0,1321513,1320530,1320530,1320530,505949,113763,11201,191286,106406,204069,204069,204069,204069,204069,204069,204066,204069,204069
unique,1490912,11,38829,248,,208,5,8,63,52267,10964,14,23,4,7,2,2206,122,51,4,393,131,366
top,2018-05-30 23:13:56,viewed product,c76b8417,/,,iPhone 6,Bom,16GB,Preto,"2820,6706,6720,2750,6649,7251,6663,12604,7224,...",Iphone,CustomerService,google,Google,Paid,Returning,Unknown,Sao Paulo,Brazil,Smartphone,360x640,Windows 7,Chrome 66.0
freq,14,1248124,4438,64187,,107262,547617,442096,314925,2606,2577,5239,123354,105195,91753,165827,36866,57304,197699,103502,73234,46648,57953
mean,,,,,6899.178,,,,,,,,,,,,,,,,,,
std,,,,,4028.042,,,,,,,,,,,,,,,,,,
min,,,,,71.0,,,,,,,,,,,,,,,,,,
25%,,,,,2929.0,,,,,,,,,,,,,,,,,,
50%,,,,,7057.0,,,,,,,,,,,,,,,,,,
75%,,,,,10014.0,,,,,,,,,,,,,,,,,,


In [8]:
persons = (df.drop_duplicates('person'))['person'].to_frame()
persons_to_train = df_labels['person'].to_frame()
persons_to_predict = persons.loc[~persons['person'].isin(persons_to_train['person'])]


print(persons_to_train.shape)
print(persons_to_predict.shape)


(19414, 1)
(19415, 1)


Obtengo las personas que son para entrenar, y para predecir

# Feature DIFERENCIA entre primer y ultima fecha del mes 5

 - Algunos features mas sacados sobre la fecha usando libreria featuretools

In [9]:
df.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,,,,,,


In [10]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(['timestamp','event'])
df = df.reset_index(drop=True)



In [11]:
df_dates_per_month = persons
df_months = df
df_months['month'] = df_months['timestamp'].dt.month
df_months['days'] = df_months['timestamp'].dt.day

In [12]:
df_month = df_months.loc[df['month'] == 5]
df_dates = df_month.groupby('person').agg({'timestamp':['max', 'min']}).reset_index()
df_dates.columns = ['person', 'last month 5', 'first month 5']
df_dates['diferencia 5'] = (df_dates['last month 5'] - df_dates['first month 5']).dt.days



In [13]:
df_dates_per_month =df_dates_per_month.merge(df_dates, on='person', how='left')

In [14]:
es = ft.EntitySet(id = 'person')
es = es.entity_from_dataframe(entity_id = 'person_id', dataframe = df_dates_per_month, index = 'person')

In [15]:
%time feat_date, feature_names = ft.dfs(entityset=es, target_entity='person_id', max_depth = 2)

CPU times: user 1.1 s, sys: 0 ns, total: 1.1 s
Wall time: 1.7 s


In [16]:
feat_date.head()

Unnamed: 0_level_0,diferencia 5,DAY(last month 5),DAY(first month 5),YEAR(last month 5),YEAR(first month 5),MONTH(last month 5),MONTH(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5)
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0008ed71,0.0,17.0,17.0,2018.0,2018.0,5.0,5.0,3.0,3.0
00091926,27.0,31.0,3.0,2018.0,2018.0,5.0,5.0,3.0,3.0
00091a7a,,,,,,,,,
000ba417,9.0,26.0,17.0,2018.0,2018.0,5.0,5.0,5.0,3.0
000c79fe,0.0,29.0,29.0,2018.0,2018.0,5.0,5.0,1.0,1.0


In [17]:
feat_date['person'] = feat_date.index
feat_date = feat_date.reset_index(drop=True)
feat_date.head()

Unnamed: 0,diferencia 5,DAY(last month 5),DAY(first month 5),YEAR(last month 5),YEAR(first month 5),MONTH(last month 5),MONTH(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5),person
0,0.0,17.0,17.0,2018.0,2018.0,5.0,5.0,3.0,3.0,0008ed71
1,27.0,31.0,3.0,2018.0,2018.0,5.0,5.0,3.0,3.0,00091926
2,,,,,,,,,,00091a7a
3,9.0,26.0,17.0,2018.0,2018.0,5.0,5.0,5.0,3.0,000ba417
4,0.0,29.0,29.0,2018.0,2018.0,5.0,5.0,1.0,1.0,000c79fe


In [18]:
lista = ['diferencia 5','DAY(last month 5)', 'DAY(first month 5)','WEEKDAY(last month 5)','WEEKDAY(first month 5)','person']

feat_date = feat_date[lista]

 


Saco los años ya que para este tp todos los años son el mismo y no aporta nada

In [19]:
feat_date = persons.merge(feat_date, on='person', how='left')

# Feature sobre eventos en el mes 5

In [20]:
df_month['days'] = df['timestamp'].dt.day
df_month['month'].value_counts()

5    1713920
Name: month, dtype: int64

#### Personas que hicieron checkout

 - Promedio de checkouts por dia 
 - Suma de checkouts que tuvo total

In [21]:
(df.loc[df['event'] == 'checkout']).drop_duplicates('person').shape

(32833, 25)

In [22]:
df_checkout = (df.loc[df['event'] == 'checkout'])
df_checkout = df_checkout.loc[df_checkout['month']==5]
df_checkout['count'] = 1

In [23]:
df_checkout = df_checkout.groupby(['person','month', 'days']).agg({'count':'sum'}).reset_index()


In [24]:
df_checkout = df_checkout.groupby('person').agg({'count':['sum' ,'mean']}).reset_index()

In [25]:
df_checkout.head()

Unnamed: 0_level_0,person,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
0,0008ed71,3,3.0
1,00091926,2,1.0
2,000ba417,6,2.0
3,000c79fe,1,1.0
4,000e4d9e,1,1.0


In [26]:
df_checkout = persons.merge(df_checkout, on='person', how='left')

In [27]:
df_checkout.shape

(38829, 3)

In [28]:
df_checkout.columns = ['person' , 'checkout_sum', 'checkout_mean']

Pongo nombres mas descriptivos 

#### Personas que miraron productos en el mes 5
 - Suma total de los modelos vistos
 - Media de productos vistos por dia
 - Desviacion standard de vistos por dia

In [29]:
(df.loc[df['event'] == 'viewed product']).drop_duplicates('person').shape

(37130, 25)

In [30]:
df_viewed = (df.loc[df['event'] == 'viewed product'])

df_viewed['count'] = 1

In [31]:
df_viewed = df_viewed.groupby(['person', 'days','month']).agg({'count':'sum'}).reset_index()


In [32]:
df_viewed = df_viewed.groupby('person').agg({'count':['sum' , 'std','mean']}).reset_index()

In [33]:
df_viewed.head()

Unnamed: 0_level_0,person,count,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,std,mean
0,00091926,372,18.899735,18.6
1,00091a7a,3,,3.0
2,000ba417,153,16.093477,51.0
3,000c79fe,3,,3.0
4,000e4d9e,339,49.401417,37.666667


In [34]:
df_viewed = persons.merge(df_viewed, on='person', how='left')

In [35]:
df_viewed.shape

(38829, 4)

In [36]:
df_viewed.columns = ['person' , 'viewed_sum', 'viewed_mean', 'viewed_std']

Le pongo nombres descriptivos

### Personas que realizaron conversiones en el mes 5
 - Si realizo conversiones en el mes 5
 - Media de compras que realizo en el mes 5

In [37]:
(df.loc[df['event'] == 'conversion']).drop_duplicates('person').shape

(4293, 25)

In [38]:
df_conversion = (df.loc[df['event'] == 'conversion'])
df_conversion = df_conversion.loc[df_conversion['month']==5]

df_conversion['count'] = 1

In [39]:
df_conversion = df_conversion.groupby(['person','month', 'days']).agg({'count':'sum'}).reset_index()


In [40]:
df_conversion = df_conversion.groupby('person').agg({'count':['sum','mean']}).reset_index()

In [41]:
df_conversion.head()

Unnamed: 0_level_0,person,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
0,000ba417,1,1.0
1,001001be,1,1.0
2,001804a2,1,1.0
3,0019e639,1,1.0
4,001b0bf9,1,1.0


In [42]:
df_conversion = persons.merge(df_conversion, on='person', how='left')

In [43]:
df_conversion = df_conversion.fillna(0)
df_conversion.shape

(38829, 3)

In [44]:
df_conversion.columns = ['person' , 'conversion_sum', 'conversion_mean']

- Realice analisis mas exhautivos a lo largo del trabajo y los eventos mas optimos fueron los analizados en este notebook


# Checkout en mes 5 date

 - Mismo analisis que el anterior
 

In [45]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(['timestamp','event'])
df = df.reset_index(drop=True)



In [46]:
df_dates_5_checkout = persons
df_months = df
df_months['month'] = df_months['timestamp'].dt.month

In [47]:
df_month_checkout = df_months.loc[df_months['month'] == 5]
df_month_checkout = df_month_checkout.loc[df_month_checkout['event'] == 'checkout']
df_dates_checkout = df_month_checkout.groupby('person').agg({'timestamp':['max', 'min']}).reset_index()
df_dates_checkout.columns = ['person', 'last month 5', 'first month 5']
df_dates_checkout['diferencia 5'] = (df_dates_checkout['last month 5'] - df_dates_checkout['first month 5']).dt.days



In [48]:
df_dates_5_checkout =df_dates_5_checkout.merge(df_dates_checkout, on='person', how='left')

In [49]:
es = ft.EntitySet(id = 'person')
es = es.entity_from_dataframe(entity_id = 'person_id', dataframe = df_dates_5_checkout, index = 'person')

In [50]:
%time features_date_checkout, feature_names = ft.dfs(entityset=es, target_entity='person_id', max_depth = 2)

CPU times: user 1.16 s, sys: 8.5 ms, total: 1.17 s
Wall time: 3.03 s


In [51]:
features_date_checkout.head()

Unnamed: 0_level_0,diferencia 5,DAY(last month 5),DAY(first month 5),YEAR(last month 5),YEAR(first month 5),MONTH(last month 5),MONTH(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5)
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0008ed71,0.0,17.0,17.0,2018.0,2018.0,5.0,5.0,3.0,3.0
00091926,5.0,26.0,20.0,2018.0,2018.0,5.0,5.0,5.0,6.0
00091a7a,,,,,,,,,
000ba417,9.0,26.0,17.0,2018.0,2018.0,5.0,5.0,5.0,3.0
000c79fe,0.0,29.0,29.0,2018.0,2018.0,5.0,5.0,1.0,1.0


In [52]:
features_date_checkout['person'] = features_date_checkout.index
features_date_checkout = features_date_checkout.reset_index(drop=True)
features_date_checkout.head()

Unnamed: 0,diferencia 5,DAY(last month 5),DAY(first month 5),YEAR(last month 5),YEAR(first month 5),MONTH(last month 5),MONTH(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5),person
0,0.0,17.0,17.0,2018.0,2018.0,5.0,5.0,3.0,3.0,0008ed71
1,5.0,26.0,20.0,2018.0,2018.0,5.0,5.0,5.0,6.0,00091926
2,,,,,,,,,,00091a7a
3,9.0,26.0,17.0,2018.0,2018.0,5.0,5.0,5.0,3.0,000ba417
4,0.0,29.0,29.0,2018.0,2018.0,5.0,5.0,1.0,1.0,000c79fe


In [53]:
lista = ['person','diferencia 5','DAY(last month 5)', 'DAY(first month 5)','WEEKDAY(last month 5)','WEEKDAY(first month 5)']
features_date_checkout = features_date_checkout[lista]

 


In [54]:
features_date_checkout.columns = ['person', 'dif_5_check', 'last_day_check', 'first_day_check', 'last_week_check', 'first_week_check']

In [55]:
features_date_checkout.head()

Unnamed: 0,person,dif_5_check,last_day_check,first_day_check,last_week_check,first_week_check
0,0008ed71,0.0,17.0,17.0,3.0,3.0
1,00091926,5.0,26.0,20.0,5.0,6.0
2,00091a7a,,,,,
3,000ba417,9.0,26.0,17.0,5.0,3.0
4,000c79fe,0.0,29.0,29.0,1.0,1.0


# Junto mis features

In [56]:
features = persons
features = features.merge(df_conversion, on='person', how='left')
features = features.merge(df_viewed, on='person', how='left')
features = features.merge(features_date_checkout, on='person', how='left')

In [57]:
features.to_csv('features_nacho.csv', index = False)

In [58]:
features.columns

Index(['person', 'conversion_sum', 'conversion_mean', 'viewed_sum',
       'viewed_mean', 'viewed_std', 'dif_5_check', 'last_day_check',
       'first_day_check', 'last_week_check', 'first_week_check'],
      dtype='object')

In [59]:
features.head()

Unnamed: 0,person,conversion_sum,conversion_mean,viewed_sum,viewed_mean,viewed_std,dif_5_check,last_day_check,first_day_check,last_week_check,first_week_check
0,4886f805,0.0,0.0,4.0,,4.0,0.0,18.0,18.0,4.0,4.0
1,ad93850f,0.0,0.0,20.0,7.371115,6.666667,0.0,14.0,14.0,0.0,0.0
2,0297fc1e,0.0,0.0,404.0,5.228295,7.214286,12.0,22.0,10.0,1.0,3.0
3,2d681dd8,0.0,0.0,13.0,0.707107,6.5,0.0,27.0,27.0,6.0,6.0
4,cccea85e,0.0,0.0,739.0,26.241971,56.846154,0.0,11.0,11.0,4.0,4.0


## XGboost entrenamiento

In [60]:
df_train = df_labels.merge(features , left_on='person', right_on='person' , how='inner')

In [62]:
df_train.shape

(19414, 12)

In [162]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X.head()

Unnamed: 0,conversion_sum,conversion_mean,viewed_sum,viewed_mean,viewed_std,dif_5_check,last_day_check,first_day_check,last_week_check,first_week_check
0,1.0,1.0,23.0,12.020815,11.5,0.0,23.0,23.0,2.0,2.0
1,0.0,0.0,,,,,,,,
2,0.0,0.0,31.0,3.778595,3.444444,0.0,29.0,29.0,1.0,1.0
3,0.0,0.0,24.0,12.124356,8.0,,,,,
4,0.0,0.0,9.0,4.949747,4.5,0.0,18.0,18.0,4.0,4.0


In [163]:
data_dmatrix = xgb.DMatrix(data=X,label=y)


## Xgboost

Para evaluar usen esta medida que me da valores muy parecidos a los de kaggle, para hacer las predicciones usen el otro

In [164]:
import xgboost as xgb
model = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 7,
                subsample = 0.9,
                gamma = 1,
                n_estimators = 65)






Este es el arbol con sus hiperparametros

In [165]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [173]:
model.fit(X_train,y_train,early_stopping_rounds=10, eval_metric="auc", eval_set=[(X_test, y_test)], verbose=True)

[0]	validation_0-auc:0.845775
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.860141
[2]	validation_0-auc:0.859984
[3]	validation_0-auc:0.862747
[4]	validation_0-auc:0.861647
[5]	validation_0-auc:0.862297
[6]	validation_0-auc:0.862074
[7]	validation_0-auc:0.861783
[8]	validation_0-auc:0.862078
[9]	validation_0-auc:0.862769
[10]	validation_0-auc:0.862941
[11]	validation_0-auc:0.864488
[12]	validation_0-auc:0.865108
[13]	validation_0-auc:0.866503
[14]	validation_0-auc:0.866524
[15]	validation_0-auc:0.866272
[16]	validation_0-auc:0.86643
[17]	validation_0-auc:0.866392
[18]	validation_0-auc:0.86613
[19]	validation_0-auc:0.86613
[20]	validation_0-auc:0.865826
[21]	validation_0-auc:0.866311
[22]	validation_0-auc:0.866064
[23]	validation_0-auc:0.865952
[24]	validation_0-auc:0.865838
Stopping. Best iteration:
[14]	validation_0-auc:0.866524



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=65,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.9)

In [167]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,  model.predict_proba(X_test)[:,1])

0.8665240537060461

In [172]:
params = {"objective":"binary:logistic",'colsample_bytree':1,
          'learning_rate': 0.1, 'max_depth': 7, 'gamma': 1,'n_estimators': 65}
cv_val = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10,
                    metrics="auc", as_pandas=True, seed=123)

[15:56:10] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 10 pruned nodes, max_depth=7
[15:56:10] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 16 pruned nodes, max_depth=7
[15:56:10] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 10 pruned nodes, max_depth=7
[15:56:10] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 8 pruned nodes, max_depth=7
[15:56:10] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 8 pruned nodes, max_depth=7
[15:56:10] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 10 pruned nodes, max_depth=7
[15:56:10] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 10 pruned nodes, max_depth=7
[15:56:10] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 10 pruned nodes, max_depth=7
[15:56:10]

[15:56:11] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 26 pruned nodes, max_depth=7
[15:56:11] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 98 extra nodes, 30 pruned nodes, max_depth=7
[15:56:11] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 104 extra nodes, 30 pruned nodes, max_depth=7
[15:56:11] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 110 extra nodes, 26 pruned nodes, max_depth=7
[15:56:11] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 100 extra nodes, 38 pruned nodes, max_depth=7
[15:56:11] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 94 extra nodes, 22 pruned nodes, max_depth=7
[15:56:12] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 104 extra nodes, 42 pruned nodes, max_depth=7
[15:56:12] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 108 extra nodes, 20 pruned nodes, max_depth=7
[1

In [169]:
cv_val.tail(1)

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
28,0.909531,0.004054,0.854809,0.005428
