In [1]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features

import matplotlib #collection of functions for scientific and publication-ready visualization

import numpy as np #foundational package for scientific computing

import scipy as sp #collection of functions for scientific computing and advance mathematics

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook

import sklearn #collection of machine learning algorithms

#misc libraries
import random
import time
import datetime as dt

#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

import featuretools as ft
from sklearn.feature_extraction.text import CountVectorizer


-------------------------


In [2]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
import xgboost as xgb

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.model_selection import train_test_split

from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8



In [3]:
data_raw = pd.read_csv('events_up_to_01062018.csv')
data_val = pd.read_csv('labels_training_set.csv')

In [4]:
df_labels = data_val.copy(deep=True) 
df = data_raw.copy(deep=True)

In [5]:
pd.set_option('display.max_columns', 23)

In [6]:
df.describe(include= 'all')

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
count,2341681,2341681,2341681,191131,1320530.0,1321513,1320530,1320530,1320530,505949,113763,11201,191286,106406,204069,204069,204069,204069,204069,204069,204066,204069,204069
unique,1490912,11,38829,248,,208,5,8,63,52267,10964,14,23,4,7,2,2206,122,51,4,393,131,366
top,2018-05-31 01:59:16,viewed product,c76b8417,/,,iPhone 6,Bom,16GB,Preto,"2820,6706,6720,2750,6649,7251,6663,12604,7224,...",Iphone,CustomerService,google,Google,Paid,Returning,Unknown,Sao Paulo,Brazil,Smartphone,360x640,Windows 7,Chrome 66.0
freq,14,1248124,4438,64187,,107262,547617,442096,314925,2606,2577,5239,123354,105195,91753,165827,36866,57304,197699,103502,73234,46648,57953
mean,,,,,6899.178,,,,,,,,,,,,,,,,,,
std,,,,,4028.042,,,,,,,,,,,,,,,,,,
min,,,,,71.0,,,,,,,,,,,,,,,,,,
25%,,,,,2929.0,,,,,,,,,,,,,,,,,,
50%,,,,,7057.0,,,,,,,,,,,,,,,,,,
75%,,,,,10014.0,,,,,,,,,,,,,,,,,,


In [7]:
persons = (df.drop_duplicates('person'))['person'].to_frame()
persons_to_train = df_labels['person'].to_frame()
persons_to_predict = persons.loc[~persons['person'].isin(persons_to_train['person'])]


print(persons_to_train.shape)
print(persons_to_predict.shape)


(19414, 1)
(19415, 1)


In [8]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['month'] = df['timestamp'].dt.month
df_month = df.loc[df['month'] == 5]

In [9]:
df_month.shape

(1713920, 24)

# Converted

In [10]:
df_modelos_conversion = df_month.loc[df_month['event']=='conversion']
df_modelos_conversion = df_modelos_conversion['model'].value_counts().to_frame()

In [11]:
df_modelos_conversion['modelo'] =df_modelos_conversion.index
df_modelos_conversion = df_modelos_conversion.reset_index(drop=True)
df_modelos_conversion.columns = ['cantidad', 'modelo']

In [12]:
df_modelos_conversion['modelo']= df_modelos_conversion['modelo'].str.lower()

In [13]:
df_modelos_conversion.head()

Unnamed: 0,cantidad,modelo
0,371,iphone 5s
1,355,samsung galaxy j5
2,287,iphone 6
3,274,iphone 6s
4,221,motorola moto g2 3g dual


# Viewed

In [14]:
df_modelos_viewed = df_month.loc[df_month['event']=='viewed product']
df_modelos_viewed = df_modelos_viewed['model'].value_counts().to_frame()

In [15]:
df_modelos_viewed['modelo'] =df_modelos_viewed.index
df_modelos_viewed = df_modelos_viewed.reset_index(drop=True)
df_modelos_viewed.columns = ['cantidad', 'modelo']

In [16]:
df_modelos_viewed['modelo']= df_modelos_viewed['modelo'].str.lower()

In [17]:
df_modelos_viewed.head()

Unnamed: 0,cantidad,modelo
0,72321,iphone 6s
1,69669,iphone 6
2,64615,iphone 5s
3,40771,iphone 7
4,34401,samsung galaxy s7 edge


# Searched


In [18]:
df_modelos_searched = df_month.loc[df_month['event']=='searched products']
df_modelos_searched = df_modelos_searched['search_term'].value_counts().to_frame()

In [19]:
df_modelos_searched['term'] =df_modelos_searched.index
df_modelos_searched = df_modelos_searched.reset_index(drop=True)
df_modelos_searched.columns = ['cantidad', 'modelo']

In [20]:
df_modelos_searched['modelo']= df_modelos_searched['modelo'].str.lower()

In [21]:
df_modelos_searched = df_modelos_searched.groupby('modelo').agg({'cantidad':'sum'}).reset_index().sort_values(by='cantidad',ascending = False)

In [22]:
df_modelos_searched = df_modelos_searched.reset_index(drop= True)
df_modelos_searched.head()

Unnamed: 0,modelo,cantidad
0,iphone 6,5056
1,iphone 6s,4448
2,iphone,3701
3,iphone 7,2105
4,iphone 5s,2058


# Factor de CONVERSION / VISTO


In [23]:
df_factor_cv = df_modelos_conversion['modelo'].to_frame()

In [24]:
df_factor_cv['relacion c/v'] = df_modelos_conversion['cantidad'] / df_modelos_viewed ['cantidad']

In [25]:
df_factor_cv.sort_values(by='relacion c/v' , ascending=False).head()

Unnamed: 0,modelo,relacion c/v
48,samsung galaxy j3,0.011026
47,samsung galaxy j1 2016,0.010931
46,lg k10 tv,0.010819
44,motorola moto x play 4g dual,0.010299
42,samsung galaxy s8 plus,0.010213


In [26]:
df_factor_cv.index = df_factor_cv['modelo']
df_factor_cv = df_factor_cv.drop('modelo', axis = 1)
df_factor_cv.head()

Unnamed: 0_level_0,relacion c/v
modelo,Unnamed: 1_level_1
iphone 5s,0.00513
samsung galaxy j5,0.005096
iphone 6,0.004442
iphone 6s,0.00672
motorola moto g2 3g dual,0.006424


# Factor de CONVERSION / BUSCADO 

- Cuidado con este ya que al tener que buscar muy exacto genera ruido por ejemplo el  <b>motorola moto g2 3g dual <\b>

In [27]:
df_factor_cs = df_modelos_conversion.merge(df_modelos_searched, on='modelo', how='left')
df_factor_cs.head()

Unnamed: 0,cantidad_x,modelo,cantidad_y
0,371,iphone 5s,2058.0
1,355,samsung galaxy j5,63.0
2,287,iphone 6,5056.0
3,274,iphone 6s,4448.0
4,221,motorola moto g2 3g dual,


In [28]:
df_factor_cs['relacion c/s'] = df_factor_cs['cantidad_x'] / df_factor_cs ['cantidad_y']

In [29]:
df_factor_cs = df_factor_cs.drop(['cantidad_x','cantidad_y'],axis=1)

In [30]:
df_factor_cs.head()

Unnamed: 0,modelo,relacion c/s
0,iphone 5s,0.180272
1,samsung galaxy j5,5.634921
2,iphone 6,0.056764
3,iphone 6s,0.061601
4,motorola moto g2 3g dual,


In [31]:
df_factor_cs.index = df_factor_cs['modelo']
df_factor_cs = df_factor_cs.drop('modelo', axis = 1)
df_factor_cv.head()

Unnamed: 0_level_0,relacion c/v
modelo,Unnamed: 1_level_1
iphone 5s,0.00513
samsung galaxy j5,0.005096
iphone 6,0.004442
iphone 6s,0.00672
motorola moto g2 3g dual,0.006424


# Uso para mis features

## Viewed product

In [35]:
df_conversion = df_month.loc[df_month['event'] == 'conversion']
models_converted = (df_conversion['model'].value_counts())
df_top_models =models_converted.index

df_top_models = pd.Series(df_top_models)
models_converted = df_top_models.values
models_converted = np.array([x.lower() if isinstance(x, str) else x for x in models_converted])
models_converted = list(models_converted)

In [36]:
models_converted

['iphone 5s',
 'samsung galaxy j5',
 'iphone 6',
 'iphone 6s',
 'motorola moto g2 3g dual',
 'samsung galaxy j7 prime',
 'samsung galaxy s7',
 'motorola moto g4 plus',
 'samsung galaxy s6 flat',
 'samsung galaxy s8',
 'samsung galaxy s7 edge',
 'iphone 7 plus',
 'samsung galaxy a5 2017',
 'iphone 4s',
 'iphone 6 plus',
 'samsung galaxy s6 edge',
 'iphone 5c',
 'iphone 7',
 'samsung galaxy j7',
 'samsung galaxy j2 prime tv',
 'samsung galaxy gran prime duos tv',
 'samsung galaxy win duos',
 'lenovo vibe k5',
 'samsung galaxy s5',
 'samsung galaxy core plus duos tv',
 'motorola moto g5 plus',
 'samsung galaxy j2 4g duos tv',
 'motorola moto g5 ',
 'iphone se',
 'samsung galaxy a5',
 'samsung galaxy s5 duos',
 'samsung galaxy j5 prime',
 'samsung galaxy j1 mini',
 'iphone 6s plus',
 'samsung galaxy j7 2016 metal',
 'samsung galaxy a7 2017',
 'motorola moto g3 4g',
 'motorola moto g3 hdtv',
 'iphone 4g',
 'samsung galaxy s5 mini duos',
 'samsung galaxy a5 2016',
 'iphone 5',
 'samsung gala

In [37]:
df_model_viewed = df.loc[df['event'] == 'viewed product']
df_model_viewed = df_model_viewed.groupby('person')['model'].apply(list).reset_index()
df_model_viewed.head()

Unnamed: 0,person,model
0,00091926,"[iPhone 6 Plus, iPhone 6S, iPhone 6S, Motorola..."
1,00091a7a,"[iPhone SE, iPhone 6, iPhone 6S]"
2,000ba417,"[Samsung Galaxy A3 2016, Samsung Galaxy Gran P..."
3,000c79fe,"[iPhone 7, iPhone 7, iPhone 7]"
4,000e4d9e,"[Samsung Galaxy S4 i9505, Samsung Galaxy S6 Fl..."


In [38]:
df_model_viewed['model'] = df_model_viewed['model'].apply(lambda x: ', '.join(map(str, x)))
df_model_viewed = df_model_viewed.fillna('')
df_model_viewed.head()

Unnamed: 0,person,model
0,00091926,"iPhone 6 Plus, iPhone 6S, iPhone 6S, Motorola ..."
1,00091a7a,"iPhone SE, iPhone 6, iPhone 6S"
2,000ba417,"Samsung Galaxy A3 2016, Samsung Galaxy Gran Pr..."
3,000c79fe,"iPhone 7, iPhone 7, iPhone 7"
4,000e4d9e,"Samsung Galaxy S4 i9505, Samsung Galaxy S6 Fla..."


In [39]:
word_vectorizer = CountVectorizer(vocabulary = models_converted, tokenizer=lambda x: x.split(', '))

tf_mat = word_vectorizer.fit_transform(df_model_viewed['model'])

tf_array = tf_mat.toarray()
tf_array.shape

(37130, 144)

In [40]:
model_names = word_vectorizer.get_feature_names()

In [41]:
df_count = pd.DataFrame(tf_array, columns=model_names)
df_count.shape

(37130, 144)

In [42]:
df_count.head()

Unnamed: 0,iphone 5s,samsung galaxy j5,iphone 6,iphone 6s,motorola moto g2 3g dual,samsung galaxy j7 prime,samsung galaxy s7,motorola moto g4 plus,samsung galaxy s6 flat,samsung galaxy s8,samsung galaxy s7 edge,...,samsung galaxy s3 slim duos,ipad air wi-fi + 4g,samsung galaxy tab 4 10.1 wi-fi + 3g,samsung galaxy gran neo plus duos,samsung galaxy e5 4g duos,quantum go 4g,ipad mini wi-fi,samsung galaxy s3 duos,asus zenfone 3 max 32 gb,lg g3 beat d724,sony xperia z5 premium
0,0,1,5,94,0,7,1,0,15,5,9,...,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0
2,1,11,0,0,0,0,0,0,1,0,0,...,4,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0
4,7,1,0,1,0,9,22,0,139,9,2,...,0,0,0,0,0,0,0,0,0,0,0


In [43]:
df_count = df_count.transpose()

In [44]:
df_count_cv = df_count.mul(df_factor_cv.reindex(df_count.index)['relacion c/v'], axis=0)

In [45]:
df_count_cv = df_count_cv.transpose()

In [46]:
df_count_cv.head()

Unnamed: 0,iphone 5s,samsung galaxy j5,iphone 6,iphone 6s,motorola moto g2 3g dual,samsung galaxy j7 prime,samsung galaxy s7,motorola moto g4 plus,samsung galaxy s6 flat,samsung galaxy s8,samsung galaxy s7 edge,...,samsung galaxy s3 slim duos,ipad air wi-fi + 4g,samsung galaxy tab 4 10.1 wi-fi + 3g,samsung galaxy gran neo plus duos,samsung galaxy e5 4g duos,quantum go 4g,ipad mini wi-fi,samsung galaxy s3 duos,asus zenfone 3 max 32 gb,lg g3 beat d724,sony xperia z5 premium
0,0.0,0.005096,0.022208,0.631724,0.0,0.04227,0.00568,0.0,0.087901,0.020944,0.04071,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.004442,0.00672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.00513,0.056051,0.0,0.0,0.0,0.0,0.0,0.0,0.00586,0.0,0.0,...,0.015326,0.0,0.0,0.0,0.0,0.0,0.0,0.004878,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.035909,0.005096,0.0,0.00672,0.0,0.054347,0.12497,0.0,0.814545,0.037699,0.009047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
df_count_cv['total_cv_factor'] = df_count_cv.iloc[:, 1:].max(1)
df_count_cv.head()

Unnamed: 0,iphone 5s,samsung galaxy j5,iphone 6,iphone 6s,motorola moto g2 3g dual,samsung galaxy j7 prime,samsung galaxy s7,motorola moto g4 plus,samsung galaxy s6 flat,samsung galaxy s8,samsung galaxy s7 edge,...,ipad air wi-fi + 4g,samsung galaxy tab 4 10.1 wi-fi + 3g,samsung galaxy gran neo plus duos,samsung galaxy e5 4g duos,quantum go 4g,ipad mini wi-fi,samsung galaxy s3 duos,asus zenfone 3 max 32 gb,lg g3 beat d724,sony xperia z5 premium,total_cv_factor
0,0.0,0.005096,0.022208,0.631724,0.0,0.04227,0.00568,0.0,0.087901,0.020944,0.04071,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.631724
1,0.0,0.0,0.004442,0.00672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006957
2,0.00513,0.056051,0.0,0.0,0.0,0.0,0.0,0.0,0.00586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.004878,0.0,0.0,0.0,0.230843
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014998
4,0.035909,0.005096,0.0,0.00672,0.0,0.054347,0.12497,0.0,0.814545,0.037699,0.009047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.814545


In [48]:
df_count_cv['person'] = df_model_viewed['person']

In [49]:
df_factorcv_by_model = df_count_cv['total_cv_factor'].to_frame()
df_factorcv_by_model.sort_values(by='total_cv_factor',ascending=False).head()

Unnamed: 0,total_cv_factor
16084,4.867577
3338,4.563194
25691,3.854266
14702,3.582007
15478,3.454318


In [50]:
df_count_cv.head()

Unnamed: 0,iphone 5s,samsung galaxy j5,iphone 6,iphone 6s,motorola moto g2 3g dual,samsung galaxy j7 prime,samsung galaxy s7,motorola moto g4 plus,samsung galaxy s6 flat,samsung galaxy s8,samsung galaxy s7 edge,...,samsung galaxy tab 4 10.1 wi-fi + 3g,samsung galaxy gran neo plus duos,samsung galaxy e5 4g duos,quantum go 4g,ipad mini wi-fi,samsung galaxy s3 duos,asus zenfone 3 max 32 gb,lg g3 beat d724,sony xperia z5 premium,total_cv_factor,person
0,0.0,0.005096,0.022208,0.631724,0.0,0.04227,0.00568,0.0,0.087901,0.020944,0.04071,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.631724,00091926
1,0.0,0.0,0.004442,0.00672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006957,00091a7a
2,0.00513,0.056051,0.0,0.0,0.0,0.0,0.0,0.0,0.00586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.004878,0.0,0.0,0.0,0.230843,000ba417
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014998,000c79fe
4,0.035909,0.005096,0.0,0.00672,0.0,0.054347,0.12497,0.0,0.814545,0.037699,0.009047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.814545,000e4d9e


In [51]:
df_count_cv['person'] = df_model_viewed['person']
df_count_cv.head()

Unnamed: 0,iphone 5s,samsung galaxy j5,iphone 6,iphone 6s,motorola moto g2 3g dual,samsung galaxy j7 prime,samsung galaxy s7,motorola moto g4 plus,samsung galaxy s6 flat,samsung galaxy s8,samsung galaxy s7 edge,...,samsung galaxy tab 4 10.1 wi-fi + 3g,samsung galaxy gran neo plus duos,samsung galaxy e5 4g duos,quantum go 4g,ipad mini wi-fi,samsung galaxy s3 duos,asus zenfone 3 max 32 gb,lg g3 beat d724,sony xperia z5 premium,total_cv_factor,person
0,0.0,0.005096,0.022208,0.631724,0.0,0.04227,0.00568,0.0,0.087901,0.020944,0.04071,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.631724,00091926
1,0.0,0.0,0.004442,0.00672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006957,00091a7a
2,0.00513,0.056051,0.0,0.0,0.0,0.0,0.0,0.0,0.00586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.004878,0.0,0.0,0.0,0.230843,000ba417
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014998,000c79fe
4,0.035909,0.005096,0.0,0.00672,0.0,0.054347,0.12497,0.0,0.814545,0.037699,0.009047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.814545,000e4d9e


In [52]:
df_count_cv['total_cv_factor'] = df_count_cv.iloc[:, 0:151].max(1)

In [53]:
df_count_cv = df_count_cv [['person' , 'total_cv_factor']]
df_count_cv.head()

Unnamed: 0,person,total_cv_factor
0,00091926,0.631724
1,00091a7a,0.006957
2,000ba417,0.230843
3,000c79fe,0.014998
4,000e4d9e,0.814545


## Searched product

In [62]:
df_model_viewed = df_month.loc[df_month['event'] == 'searched products']
df_model_viewed = df_model_viewed.groupby('person')['search_term'].apply(list).reset_index()
df_model_viewed.head()

Unnamed: 0,person,search_term
0,000c79fe,"[Iphone 7, Galaxy a8, Iphone 7, nan, Galaxy s8..."
1,000e619d,"[samsung rosa, sansung j7, sansung j7, sansung..."
2,001001be,"[IPhone 6, 5s, IPhone 6, IPhone 6, 5s, IPho..."
3,001802e4,"[Aiphone 6s, nan, nan, Aiphone 6s]"
4,0019e639,[ON 7]


In [63]:
df_model_viewed['search_term'] = df_model_viewed['search_term'].apply(lambda x: ', '.join(map(str, x)))
df_model_viewed = df_model_viewed.fillna('')
df_model_viewed.head()

Unnamed: 0,person,search_term
0,000c79fe,"Iphone 7, Galaxy a8, Iphone 7, nan, Galaxy s8,..."
1,000e619d,"samsung rosa, sansung j7, sansung j7, sansung ..."
2,001001be,"IPhone 6, 5s, IPhone 6, IPhone 6, 5s, IPhon..."
3,001802e4,"Aiphone 6s, nan, nan, Aiphone 6s"
4,0019e639,ON 7


In [64]:
word_vectorizer = CountVectorizer(vocabulary = models_converted, tokenizer=lambda x: x.split(', '))

tf_mat = word_vectorizer.fit_transform(df_model_viewed['search_term'])

tf_array = tf_mat.toarray()
tf_array.shape

(11310, 144)

In [65]:
model_names = word_vectorizer.get_feature_names()

In [66]:
df_count = pd.DataFrame(tf_array, columns=model_names)
df_count.shape

(11310, 144)

In [67]:
df_count.head()

Unnamed: 0,iphone 5s,samsung galaxy j5,iphone 6,iphone 6s,motorola moto g2 3g dual,samsung galaxy j7 prime,samsung galaxy s7,motorola moto g4 plus,samsung galaxy s6 flat,samsung galaxy s8,samsung galaxy s7 edge,...,samsung galaxy s3 slim duos,ipad air wi-fi + 4g,samsung galaxy tab 4 10.1 wi-fi + 3g,samsung galaxy gran neo plus duos,samsung galaxy e5 4g duos,quantum go 4g,ipad mini wi-fi,samsung galaxy s3 duos,asus zenfone 3 max 32 gb,lg g3 beat d724,sony xperia z5 premium
0,0,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0


In [68]:
df_count = df_count.transpose()

In [69]:
df_count_cs = df_count.mul(df_factor_cs.reindex(df_count.index)['relacion c/s'], axis=0)

In [70]:
df_count_cs = df_count_cs.transpose()

In [71]:
df_count_cs.head()

Unnamed: 0,iphone 5s,samsung galaxy j5,iphone 6,iphone 6s,motorola moto g2 3g dual,samsung galaxy j7 prime,samsung galaxy s7,motorola moto g4 plus,samsung galaxy s6 flat,samsung galaxy s8,samsung galaxy s7 edge,...,samsung galaxy s3 slim duos,ipad air wi-fi + 4g,samsung galaxy tab 4 10.1 wi-fi + 3g,samsung galaxy gran neo plus duos,samsung galaxy e5 4g duos,quantum go 4g,ipad mini wi-fi,samsung galaxy s3 duos,asus zenfone 3 max 32 gb,lg g3 beat d724,sony xperia z5 premium
0,0.0,0.0,0.056764,0.061601,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,,
1,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,,
2,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,,
3,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,,
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,,


In [72]:
df_count_cs['total_cs_factor'] = df_count_cs.iloc[:, 1:].sum(1)
df_count_cs.head()

Unnamed: 0,iphone 5s,samsung galaxy j5,iphone 6,iphone 6s,motorola moto g2 3g dual,samsung galaxy j7 prime,samsung galaxy s7,motorola moto g4 plus,samsung galaxy s6 flat,samsung galaxy s8,samsung galaxy s7 edge,...,ipad air wi-fi + 4g,samsung galaxy tab 4 10.1 wi-fi + 3g,samsung galaxy gran neo plus duos,samsung galaxy e5 4g duos,quantum go 4g,ipad mini wi-fi,samsung galaxy s3 duos,asus zenfone 3 max 32 gb,lg g3 beat d724,sony xperia z5 premium,total_cs_factor
0,0.0,0.0,0.056764,0.061601,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,,0.217994
1,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,,0.0
2,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,,0.0
3,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,,0.0


In [73]:
df_count_cs['person'] = df_model_viewed['person']

In [74]:
df_factorcs_by_model = df_count_cs['total_cs_factor'].to_frame()
df_factorcs_by_model.sort_values(by='total_cs_factor',ascending=False).head()

Unnamed: 0,total_cs_factor
1077,185.163478
9932,95.727989
8130,78.0
5914,77.013552
6722,76.809802


In [75]:
df_count_cs['person'] = df_model_viewed['person']
df_count_cs.head()


Unnamed: 0,iphone 5s,samsung galaxy j5,iphone 6,iphone 6s,motorola moto g2 3g dual,samsung galaxy j7 prime,samsung galaxy s7,motorola moto g4 plus,samsung galaxy s6 flat,samsung galaxy s8,samsung galaxy s7 edge,...,samsung galaxy tab 4 10.1 wi-fi + 3g,samsung galaxy gran neo plus duos,samsung galaxy e5 4g duos,quantum go 4g,ipad mini wi-fi,samsung galaxy s3 duos,asus zenfone 3 max 32 gb,lg g3 beat d724,sony xperia z5 premium,total_cs_factor,person
0,0.0,0.0,0.056764,0.061601,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.217994,000c79fe
1,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0,000e619d
2,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0,001001be
3,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0,001802e4
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0,0019e639


In [76]:
df_count_cs['total_cs_factor'] = df_count_cs.iloc[:, 0:151].max(1)

In [77]:
df_count_cs = df_count_cs [['person' , 'total_cs_factor']]
df_count_cs.head()

Unnamed: 0,person,total_cs_factor
0,000c79fe,0.217994
1,000e619d,0.0
2,001001be,0.0
3,001802e4,0.0
4,0019e639,0.0


In [78]:
features = df_count_cs.merge(df_count_cv, on='person' , how='right')

In [79]:
features.head()

Unnamed: 0,person,total_cs_factor,total_cv_factor
0,000c79fe,0.217994,0.014998
1,000e619d,0.0,0.025697
2,001001be,0.0,0.115484
3,001802e4,0.0,0.013441
4,0019e639,0.0,0.372605


In [80]:
features = features.merge(persons, on='person', how='right')
features =features.fillna(0)
features.shape


(38829, 3)

## XGboost entrenamiento

In [81]:
df_train = df_labels.merge(features , left_on='person', right_on='person' , how='inner')

In [82]:
df_train.head()

Unnamed: 0,person,label,total_cs_factor,total_cv_factor
0,0566e9c1,0,0.0,0.02052
1,6ec7ee77,0,0.0,0.0
2,abe7a2fb,0,0.0,0.071067
3,34728364,0,0.0,0.040622
4,87ed62de,0,0.0,0.053111


In [83]:
df_train2 = pd.concat([df_train.loc[df_train['label'] == 1] , df_train.loc[df_train['label']==0].sample(7000)])

In [84]:
df_train2.head()

Unnamed: 0,person,label,total_cs_factor,total_cv_factor
5,db2c4d27,1,0.0,0.426403
25,8123457d,1,0.0,0.012888
46,e4b02ea2,1,0.0,0.058803
48,d8001b23,1,0.0,0.187843
54,7a472832,1,0.0,0.333444


In [85]:
df_train2.shape

(7980, 4)

Los labels me dan mi set para entrenar, los que no se encuentran en labels tengo que predecirlos

Si ven aca, de la columna label en adelante tenemos los features.

In [86]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X.head()

Unnamed: 0,total_cs_factor,total_cv_factor
0,0.0,0.02052
1,0.0,0.0
2,0.0,0.071067
3,0.0,0.040622
4,0.0,0.053111


## Xgboost

Para evaluar usen esta medida que me da valores muy parecidos a los de kaggle, para hacer las predicciones usen el otro

In [89]:
import xgboost as xgb
model = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 6,
                subsample = 0.8,
                gamma = 1,
                n_estimators = 10)



Este es el arbol con sus hiperparametros

In [90]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [91]:
model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [92]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,  model.predict_proba(X_test)[:,1])

0.5703828285865009

Obtengo un resultado con los que separe para el test mas arriba en :
```python
    X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)
    ```
###### Lo que hace es sacar las predicciones para X_test y evaluarlos con y_test
    

In [93]:
X.isnull().sum()

total_cs_factor    0
total_cv_factor    0
dtype: int64

In [94]:
X_predict = persons_to_predict.merge(features, on='person', how='left')
X_predict.head()


Unnamed: 0,person,total_cs_factor,total_cv_factor
0,4886f805,0.0,0.024154
1,0297fc1e,0.0,0.946081
2,2d681dd8,0.0,0.029996
3,cccea85e,0.0,1.37885
4,4c8a8b93,0.418776,0.368661


In [107]:
df_entrie = persons_to_predict
df_entrie['label'] = model.predict_proba(X_predict.iloc[:,1:])[:,1]
df_entrie.head()

Unnamed: 0,person,label
0,4886f805,0.208441
2,0297fc1e,0.228913
3,2d681dd8,0.208441
4,cccea85e,0.240742
5,4c8a8b93,0.23789


In [108]:
df_entrie2 = pd.read_csv('submit_kaggle.csv')

In [109]:
df_entrie = df_entrie.merge(df_entrie2, on='person', how='inner')
df_entrie['label'] = (df_entrie['label_x'] + df_entrie['label_y'])/2

In [110]:
df_entrie = df_entrie[['person','label']]

In [111]:
df_entrie.to_csv(path_or_buf = 'submittt.csv', index = False)

In [112]:
X_predict.shape

(19415, 3)

In [114]:
X_predict.head()

Unnamed: 0,person,total_cs_factor,total_cv_factor
0,4886f805,0.0,0.020335
1,0297fc1e,0.108023,0.917895
2,2d681dd8,0.0,0.040197
3,cccea85e,0.0,1.336512
4,4c8a8b93,0.399041,0.357677


In [116]:
entrie = pd.read_csv('submittt.csv')
entrie['label'].value_counts()

0.200037    2713
0.195907    1795
0.199535    1557
0.199478     917
0.198059     682
0.204246     566
0.204107     374
0.206815     360
0.205958     278
0.203744     243
0.211023     227
0.210167     207
0.203686     194
0.200116     193
0.204963     188
0.200894     140
0.208315     136
0.202268     107
0.209172      99
0.208570      97
0.204543      92
0.212778      90
0.202692      84
0.202190      76
0.201488      58
0.199081      56
0.208752      55
0.205103      54
0.201280      53
0.202614      52
            ... 
0.211799       1
0.250320       1
0.260446       1
0.223779       1
0.241093       1
0.202866       1
0.237225       1
0.210992       1
0.266138       1
0.233524       1
0.216047       1
0.224894       1
0.202139       1
0.237877       1
0.251865       1
0.256286       1
0.227782       1
0.235492       1
0.224716       1
0.286852       1
0.219141       1
0.229407       1
0.241152       1
0.223937       1
0.233890       1
0.211776       1
0.225932       1
0.230450      