In [1]:
%%javascript

// Register two command-mode keyboard shortcuts that move the selected cells:
// Ctrl-k moves the selection up, Ctrl-j moves it down. Each handler performs
// the move and then returns false.
[
    ['Ctrl-k', 'move up selected cells', 'jupyter-notebook:move-selection-up',
     function () { IPython.notebook.move_selection_up(); }],
    ['Ctrl-j', 'move down selected cells', 'jupyter-notebook:move-selection-down',
     function () { IPython.notebook.move_selection_down(); }]
].forEach(function (spec) {
    var move = spec[3];
    IPython.keyboard_manager.command_shortcuts.add_shortcut(spec[0], {
        help : spec[1],
        help_index : spec[2],
        handler : function (event) {
            move();
            return false;
        }
    });
});

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from scipy.sparse import hstack
from xgboost.sklearn import XGBClassifier # <3
from sklearn.model_selection import train_test_split
import gc
import matplotlib.pyplot as plt

from scipy.sparse.csr import csr_matrix

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook; several cells below assign columns onto filtered frames.
pd.options.mode.chained_assignment = None


In [3]:

# Widen the pandas display limits so wide feature tables remain readable.
for option_name, option_value in (
    ('display.width', 400),
    ('display.max_columns', 50),
    ('display.max_rows', 200),
):
    pd.set_option(option_name, option_value)


In [4]:
# Load the raw event log (one row per event) and the training labels.
# low_memory=False makes pandas read each file in one pass instead of chunks.
df = pd.read_csv('events_up_to_01062018.csv', low_memory = False)
df_labels= pd.read_csv('labels_training_set.csv', low_memory = False)


In [5]:
# One row per distinct person seen in the event log; used below as the left
# side of the feature merges.
subjects = df[['person']].drop_duplicates()
subjects.shape

(38829, 1)

In [6]:
# Preview the first rows of the raw event log.
df.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,,,,,,


In [7]:
# Number of rows per event type, most frequent first.
df['event'].value_counts()

viewed product       1248124
brand listing         216312
visited site          204069
ad campaign hit       191388
generic listing       160176
searched products     130616
search engine hit     106406
checkout               65315
staticpage             11201
conversion              7091
lead                     983
Name: event, dtype: int64

## Modelo count vector

In [8]:
# Collect, for every person, the list of device models attached to their
# events (rows without a model are ignored).
df_model = (
    df[df['model'].notnull()]
    .groupby('person')['model']
    .apply(list)
    .reset_index()
)

In [9]:
# Size of the per-person model table (rows without a model were already
# filtered out when df_model was built).
df_model.shape

(38256, 2)

In [10]:
#df_all = df_labels.merge(df_model, left_on = 'person', right_on = 'person', how = 'left')
#df_all.head()

In [11]:
# Flatten each person's list of models into a single comma-separated string
# so it can be fed to CountVectorizer as one document.
# NOTE: not idempotent - re-running this cell joins the characters of the
# already-joined string.
df_model['model'] = df_model['model'].map(
    lambda models: ', '.join(str(model) for model in models)
)
df_model = df_model.fillna('')
df_model.shape

(38256, 2)

In [12]:
# Token counts over each person's joined model string: one row per person,
# one column per token.
word_vectorizer = CountVectorizer()

tf_mat = word_vectorizer.fit_transform(df_model['model'])

# Dense copy of the sparse count matrix; a later cell wraps it in a DataFrame.
tf_array = tf_mat.toarray()
tf_array.shape

(38256, 150)

In [13]:
# Inspect the non-zero token counts of one document (row `doc`).
doc = 0
# scikit-learn replaced get_feature_names() with get_feature_names_out();
# use whichever the installed version provides and keep a plain list.
feature_names = list(
    word_vectorizer.get_feature_names_out()
    if hasattr(word_vectorizer, 'get_feature_names_out')
    else word_vectorizer.get_feature_names()
)
# Column indices of the tokens that occur in the chosen document.
feature_index = tf_mat[doc, :].nonzero()[1]
# Materialised as a list (not a one-shot zip iterator) so the printing cell
# below can be re-run and still show the pairs.
tfidf_scores = [(i, tf_mat[doc, i]) for i in feature_index]

In [14]:
# Print every (token, count) pair collected for the inspected document.
for token_idx, s in tfidf_scores:
    w = feature_names[token_idx]
    print(w, s)

se 1
iphone 1
flat 1
s6 1
galaxy 1
samsung 1
h818p 1
g4 1
lg 1


In [15]:
# Token-count table: one column per token plus the owning person. Rows line
# up with df_model because both carry the same default integer index.
df_tfidf = pd.DataFrame(tf_array, columns=feature_names).assign(
    person=df_model['person']
)
df_tfidf.head()

Unnamed: 0,10,15,16,2016,2017,32,3g,4g,4s,5c,5s,6s,a3,a5,a7,a7010,a9,air,aqua,asus,beat,chip,classic,com,compact,...,samsung,screen,se,selfie,slim,sony,style,stylus,tab,tv,ultra,up,vibe,wi,win,x2,xiaomi,xperia,you,young,z2,z3,z5,zenfone,person
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0008ed71
1,0,0,0,3,7,0,0,3,0,0,0,147,0,2,5,0,2,1,0,0,0,0,0,0,0,...,61,0,4,0,0,1,35,0,0,0,0,0,1,1,0,10,0,1,0,0,0,1,0,0,00091926
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,00091a7a
3,0,0,0,22,7,0,3,29,0,6,1,0,46,22,5,0,0,0,1,0,0,0,0,0,0,...,110,0,0,0,4,1,0,0,0,6,0,0,0,0,0,8,0,1,0,0,0,0,0,0,000ba417
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,000c79fe


## Top Index ?

In [16]:
# Conversion events only, then the subset whose model is among the 50 models
# with the most conversions.
df_conversion = df[df['event'] == 'conversion']
most_converted_models = df_conversion['model'].value_counts().nlargest(50).index
df_top_conversion = df_conversion[df_conversion['model'].isin(most_converted_models)]

In [17]:
# Names of the 70 models with the most conversions, as a numpy array of strings.
# NOTE(review): the previous cell keeps the top 50, this one the top 70 -
# confirm the difference is intended.
conversion_model_counts = df_conversion['model'].value_counts().nlargest(70)
a = np.array(conversion_model_counts.index.tolist())

In [18]:
# Fit a fresh vectorizer on the top converted model names only; its vocabulary
# is the set of tokens appearing in those names. This rebinds word_vectorizer
# and tf_mat from the earlier cells.
word_vectorizer = CountVectorizer()
tf_mat = word_vectorizer.fit_transform(a)

In [19]:
# Vocabulary of the vectorizer fitted on the top converted models.
# scikit-learn replaced get_feature_names() with get_feature_names_out();
# use whichever the installed version provides and keep a plain list so it can
# be used directly as a column selector.
feature_names_top = list(
    word_vectorizer.get_feature_names_out()
    if hasattr(word_vectorizer, 'get_feature_names_out')
    else word_vectorizer.get_feature_names()
)

## Xg con los index que elegi


In [20]:
# Keep only the token columns that occur in the top converted model names.
df_top = df_tfidf.loc[:, feature_names_top]

In [21]:
# Size of the full token-count table (all tokens plus the person column).
df_tfidf.shape

(38256, 151)

In [22]:
# Size of the reduced table restricted to the selected token columns.
df_top.shape

(38256, 65)

In [23]:
# Re-attach the person id; rows are matched through the shared integer index.
df_top = df_top.assign(person=df_model['person'])
df_top.head()

Unnamed: 0,2016,2017,3g,4g,4s,5c,5s,6s,a5,a7,a9,core,dtv,dual,duos,edge,edition,flat,g2,g3,g4,g5,galaxy,gran,grand,...,mini,moto,motorola,neo,new,note,on,play,plus,prime,pro,s3,s4,s5,s6,s7,s8,samsung,se,tv,vibe,win,x2,z2,person
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0008ed71
1,3,7,0,3,0,0,0,147,2,5,2,0,5,2,0,14,0,15,0,0,6,3,61,0,0,...,0,55,55,1,0,0,0,7,108,7,2,0,0,0,20,10,7,61,4,0,1,0,10,0,00091926
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,00091a7a
3,22,7,3,29,0,6,1,0,22,5,0,1,0,2,51,0,0,1,1,28,0,0,110,9,2,...,1,37,37,0,0,4,0,0,0,9,0,5,0,2,1,0,0,110,0,6,0,0,8,0,000ba417
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,000c79fe


In [24]:
# Expand to one row per known person; people without any model tokens get NaN
# in every token column.
df_top = pd.merge(subjects, df_top, how='left', on='person')

## Diff de lo de caro 0.62

In [25]:
# Add the precomputed feature(s) stored in diff.csv, matched on person.
df_diff = pd.read_csv('diff.csv')
df_top = df_top.merge(df_diff, on='person', how='left')
df_top.shape

(38829, 67)

## Feature: most viewed and searched in a day (0.63780)

In [26]:
# Feature: the largest number of 'viewed product' events a person generated on
# any single calendar day. The result has exactly two columns, ['person',
# 'count'], sorted by person.
# Work on an explicit copy so new columns are not assigned onto a slice of df.
df_v = df.loc[df['event'] == 'viewed product', ['person', 'timestamp']].copy()
# Truncate to the full calendar date (year included) so that the same
# month/day in different years is never pooled into one bucket.
df_v['date'] = pd.to_datetime(df_v['timestamp']).dt.normalize()

df_v = (
    df_v.groupby(['person', 'date']).size()   # views per person per day
        .groupby(level='person').max()        # busiest day of each person
        .reset_index(name='count')
)
df_v.head()

Unnamed: 0,person,count
0,00091926,62
1,00091a7a,3
2,000ba417,69
3,000c79fe,3
4,000e4d9e,153


In [27]:
# Feature: the largest number of 'searched products' events a person generated
# on any single calendar day. The result has exactly two columns, ['person',
# 'count'], sorted by person.
# Work on an explicit copy so new columns are not assigned onto a slice of df.
df_s = df.loc[df['event'] == 'searched products', ['person', 'timestamp']].copy()
# Truncate to the full calendar date (year included) so that the same
# month/day in different years is never pooled into one bucket.
df_s['date'] = pd.to_datetime(df_s['timestamp']).dt.normalize()

df_s = (
    df_s.groupby(['person', 'date']).size()   # searches per person per day
        .groupby(level='person').max()        # busiest day of each person
        .reset_index(name='count')
)
df_s['person'].describe()

count        13093
unique       13093
top       9e160f67
freq             1
Name: person, dtype: object

In [28]:
# Give the two daily-peak tables distinct feature names before merging them.
df_v.columns = ['person', 'viewed']
df_s.columns = ['person', 'searched']
df_s.head()

Unnamed: 0,person,searched
0,000c79fe,9
1,000e619d,5
2,001001be,17
3,001802e4,4
4,0019e639,7


In [29]:
# Combine both per-person daily-peak features. An outer join keeps people who
# appear in only one of the two tables (e.g. searched but never viewed a
# product); the missing side is left as NaN. A left join from df_v would
# silently discard the `searched` value of such people.
df_vs_feat = df_v.merge(df_s, on='person', how='outer')
df_vs_feat.head()

Unnamed: 0,person,viewed,searched
0,00091926,62,
1,00091a7a,3,
2,000ba417,69,
3,000c79fe,3,9.0
4,000e4d9e,153,


In [30]:
# Attach the viewed/searched daily-peak features to the per-person table.
df_top = df_top.merge(df_vs_feat, on='person', how='left')
df_top.shape

(38829, 69)

## Lead and conversion

In [31]:
# Flag table: one row per person with at least one 'lead' event.
df_lead = (
    df.loc[df['event'] == 'lead', ['person']]
    .drop_duplicates()
    .assign(lead=1)
)

# Flag table: one row per person with at least one 'conversion' event.
# This rebinds df_conversion, which earlier held the raw conversion events.
df_conversion = (
    df.loc[df['event'] == 'conversion', ['person']]
    .drop_duplicates()
    .assign(conversion=1)
)


In [32]:
# Add the lead and conversion flags; people without the event keep NaN.
for flag_frame in (df_lead, df_conversion):
    df_top = df_top.merge(flag_frame, on='person', how='left')

df_top.shape

(38829, 71)

## Most common searched values in conversion persons count vector

In [33]:
# All events of the people who converted at least once, and the search terms
# they typed, most frequent first.
is_conversion = df['event'] == 'conversion'
person_conv = df.loc[is_conversion, 'person'].drop_duplicates()
df_conv = df[df['person'].isin(person_conv)]
df_conv['search_term'].value_counts()

Iphone 6                                                                                                                                 579
J5                                                                                                                                       569
iphone 6                                                                                                                                 551
Iphone                                                                                                                                   485
j5                                                                                                                                       473
J7                                                                                                                                       458
iphone 6s                                                                                                                                447
S6           

In [34]:
# Vocabulary of the 1000 most frequent search terms typed by converting people.
b = np.array(df_conv['search_term'].value_counts().nlargest(1000).index.values.tolist())
word_vectorizer = CountVectorizer()
tf_mat = word_vectorizer.fit_transform(b)
# scikit-learn replaced get_feature_names() with get_feature_names_out();
# use whichever the installed version provides and keep a plain list so it can
# be used directly as a column selector.
feat_names_top = list(
    word_vectorizer.get_feature_names_out()
    if hasattr(word_vectorizer, 'get_feature_names_out')
    else word_vectorizer.get_feature_names()
)

In [35]:
# Collect, for every person, the list of search terms they typed (rows
# without a search term are ignored).
df_search = (
    df[df['search_term'].notnull()]
    .groupby('person')['search_term']
    .apply(list)
    .reset_index()
)

In [36]:
# Preview the per-person search-term lists (rows without a search term were
# already filtered out when df_search was built).
df_search.head()

Unnamed: 0,person,search_term
0,000c79fe,"[Iphone 7, Galaxy a8, Iphone 7, Galaxy s8, Iph..."
1,000e619d,"[samsung rosa, sansung j7, sansung j7, sansung..."
2,001001be,"[IPhone 6, 5s, IPhone 6, IPhone 6, 5s, IPho..."
3,001802e4,"[Aiphone 6s, Aiphone 6s]"
4,0019e639,"[ON 7, ON 7, ON 7, ON 7, ON 7, ON 7, ON 7, ON ..."


In [37]:
#df_all = df_labels.merge(df_search, left_on = 'person', right_on = 'person', how = 'left')
#df_all.head()

In [38]:
# Flatten each person's list of search terms into one comma-separated string
# so it can be fed to CountVectorizer as one document.
# NOTE: not idempotent - re-running this cell joins the characters of the
# already-joined string.
df_search['search_term'] = df_search['search_term'].map(
    lambda terms: ', '.join(str(term) for term in terms)
)
df_search = df_search.fillna('')
df_search.shape

(12570, 2)

In [39]:
# Token counts over each person's joined search string: one row per person,
# one column per token. This rebinds word_vectorizer / tf_mat / tf_array.
word_vectorizer = CountVectorizer()

tf_mat = word_vectorizer.fit_transform(df_search['search_term'])

# Dense copy of the sparse count matrix; a later cell wraps it in a DataFrame.
tf_array = tf_mat.toarray()
tf_array.shape

(12570, 3020)

In [40]:
# Column names for the search-term count matrix.
# scikit-learn replaced get_feature_names() with get_feature_names_out();
# use whichever the installed version provides and keep a plain list.
feature_names = list(
    word_vectorizer.get_feature_names_out()
    if hasattr(word_vectorizer, 'get_feature_names_out')
    else word_vectorizer.get_feature_names()
)

In [41]:
# Search-token count table: one column per token plus the owning person. Rows
# line up with df_search because both carry the same default integer index.
df_countvect = pd.DataFrame(tf_array, columns=feature_names).assign(
    person=df_search['person']
)
df_countvect.head()

Unnamed: 0,00,000,02,06,09,0r,10,100,1000reais,10059,1016,1020,1033,1059,10k,10s,1100,12,127,128,128g,128gb,12gb,12mp,12x,...,zm,zmoto,znefone,zoamos,zomm,zoom,zplay,zply,zq,zqx,zte,zuk,zz,zénfone,ásia,çite,édge,ícone,íphobe,íphone,íplone,ótima,ótimas,ótimo,person
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,000c79fe
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,000e619d
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,001001be
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,001802e4
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0019e639


In [42]:
# Restrict the search-token counts to the vocabulary of the converters' top
# search terms, then re-attach the person id.
df_topete = df_countvect[feat_names_top].assign(person=df_search['person'])
df_topete.shape

(12570, 309)

In [43]:
# Attach the search-token counts. Token names that already exist among the
# model-token columns are disambiguated by pandas with _x / _y suffixes.
df_top = df_top.merge(df_topete, on='person', how='left')

In [44]:
# Preview the assembled per-person feature table.
df_top.head()

Unnamed: 0,person,2016_x,2017_x,3g_x,4g_x,4s_x,5c_x,5s_x,6s_x,a5_x,a7_x,a9_x,core_x,dtv_x,dual_x,duos_x,edge_x,edition,flat_x,g2_x,g3_x,g4_x,g5_x,galaxy_x,gran_x,...,titânio,todos,travesti,trocafone,tv_y,tvs,twist,usado,vendidos,wi,win_y,winduos,x2_y,x4,xiaomi,xiomi,xperia,xplay,z1,z2_y,z3,z5,zefone,zen,zenfone
0,4886f805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ad93850f,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,0297fc1e,0.0,0.0,0.0,2.0,0.0,0.0,1.0,127.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2d681dd8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,cccea85e,0.0,8.0,0.0,53.0,0.0,0.0,2.0,0.0,5.0,3.0,0.0,0.0,9.0,43.0,0.0,3.0,1.0,1.0,1.0,28.0,386.0,128.0,34.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## XGboost

In [45]:
# Training table: every labelled person with their features attached.
df_train = df_labels.merge(df_top, on='person', how='left')

In [46]:
# Size of the training table (rows = labelled people).
df_train.shape

(19414, 380)

In [47]:
# Preview the training table: person, label, then the feature columns.
df_train.head()

Unnamed: 0,person,label,2016_x,2017_x,3g_x,4g_x,4s_x,5c_x,5s_x,6s_x,a5_x,a7_x,a9_x,core_x,dtv_x,dual_x,duos_x,edge_x,edition,flat_x,g2_x,g3_x,g4_x,g5_x,galaxy_x,...,titânio,todos,travesti,trocafone,tv_y,tvs,twist,usado,vendidos,wi,win_y,winduos,x2_y,x4,xiaomi,xiomi,xperia,xplay,z1,z2_y,z3,z5,zefone,zen,zenfone
0,0566e9c1,0,1.0,0.0,0.0,1.0,6.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,11.0,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,6ec7ee77,0,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,abe7a2fb,0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,34728364,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,2.0,0.0,0.0,2.0,0.0,0.0,3.0,0.0,9.0,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,87ed62de,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [48]:
# Split by position: the second column is the target, everything from the
# third column onwards is a feature (the first column is skipped).
y = df_train.iloc[:, 1]
X = df_train.iloc[:, 2:]
X.head()

Unnamed: 0,2016_x,2017_x,3g_x,4g_x,4s_x,5c_x,5s_x,6s_x,a5_x,a7_x,a9_x,core_x,dtv_x,dual_x,duos_x,edge_x,edition,flat_x,g2_x,g3_x,g4_x,g5_x,galaxy_x,gran_x,grand,...,titânio,todos,travesti,trocafone,tv_y,tvs,twist,usado,vendidos,wi,win_y,winduos,x2_y,x4,xiaomi,xiomi,xperia,xplay,z1,z2_y,z3,z5,zefone,zen,zenfone
0,1.0,0.0,0.0,1.0,6.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,11.0,0.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,2.0,0.0,0.0,2.0,0.0,0.0,3.0,0.0,9.0,0.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [49]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [50]:
# Native xgboost container for the training data.
# NOTE(review): no later cell in this notebook reads data_dmatrix - confirm it
# is still needed.
data_dmatrix = xgb.DMatrix(X, label=y)

In [51]:
# Hyper-parameters of the regressor used to produce the submission scores.
# Author's recorded scores: n_estimators=150 -> 0.61730, 1500 -> 0.61296.
# NOTE(review): newer xgboost releases appear to name this objective
# 'reg:squarederror' - confirm against the installed version before changing.
xg_reg_params = dict(
    objective='reg:linear',
    colsample_bytree=1,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    gamma=1,
    n_estimators=150,
)
xg_reg = xgb.XGBRegressor(**xg_reg_params)

In [52]:
# Train the submission model on every labelled row (no hold-out here).
xg_reg.fit(X,y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=150,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [53]:
# Build the scoring set: every person in the event log who has no label.
persons = df_labels['person']
unlabeled_features = df_top[~df_top['person'].isin(persons)]

# Distinct unlabelled people, in order of first appearance in the event log.
ppl_to_predict = (
    df.loc[~df['person'].isin(persons), ['person']]
    .drop_duplicates('person')
)

# Attach the features, then drop the id so the columns line up with X.
df_predict = ppl_to_predict.merge(unlabeled_features, on='person', how='left')
X_predict = df_predict.drop(['person'], axis=1)

In [54]:
# Score every unlabelled person with the fitted regressor.
entrie = xg_reg.predict(X_predict)

In [55]:
# Wrap the raw predictions in a Series (default integer index) so they can be
# attached as a column by index alignment.
seriesita = pd.Series(entrie)

In [56]:
# Submission table: person id plus predicted score, matched through the shared
# integer index of df_predict and seriesita.
df_entrie = df_predict['person'].to_frame().assign(label=seriesita)

In [57]:
# Preview the submission table.
df_entrie.head()

Unnamed: 0,person,label
0,4886f805,0.042229
1,0297fc1e,0.153189
2,2d681dd8,0.053668
3,cccea85e,0.094267
4,4c8a8b93,0.038895


In [58]:
# Replace any missing value in the submission table with 0.
df_entrie = df_entrie.fillna(0)

In [59]:
# Floor negative scores at 0. The clipped column is assigned back explicitly
# instead of mutating the frame returned by the private `_get_numeric_data()`
# helper, which only reaches df_entrie when that frame shares its memory.
df_entrie['label'] = df_entrie['label'].clip(lower=0)

In [60]:
# Write the submission as CSV without the index column.
df_entrie.to_csv('entrie2.0', index=False)

In [61]:
# Number of people in the submission.
df_entrie.shape

(19415, 2)

In [62]:
# Show the highest predicted scores (nlargest defaults to the top 5).
df_entrie['label'].nlargest()

273      0.724125
14296    0.568077
9492     0.555338
17       0.546413
11087    0.530641
Name: label, dtype: float32

## Scoring

Para evaluar, usen esta medida, que me da valores muy parecidos a los de Kaggle; para hacer las predicciones, usen el otro modelo.

In [63]:
# Hold-out scoring model: same hyper-parameters as the submission regressor,
# wrapped as a classifier so that predict_proba is available.
# NOTE(review): the objective is a regression one ('reg:linear'), so the
# predict_proba values are presumably raw regression outputs rather than
# calibrated probabilities - fine for a rank-based metric, but confirm.
my_classifier1_params = dict(
    objective='reg:linear',
    colsample_bytree=1,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    gamma=1,
    n_estimators=150,
)
my_classifier1 = xgb.XGBClassifier(**my_classifier1_params)

In [64]:
from sklearn.model_selection import train_test_split

# Reproducible 80/20 split of the labelled data (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [65]:
# Fit on the training split only, then predict labels for the held-out split.
my_classifier1.fit(X_train,y_train)
predictions=my_classifier1.predict(X_test)

In [66]:
from sklearn.metrics import roc_auc_score
# ROC AUC on the held-out split, using the second predict_proba column as the
# score for each row.
holdout_scores = my_classifier1.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, holdout_scores)

0.6376784670973892

## Sin search 

0.642202895180183

## Con search
0.6376607409256182, aunque en Kaggle me da un poco mejor.