In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer



In [2]:
df = pd.read_csv('events_up_to_01062018.csv', low_memory = False)
df_labels= pd.read_csv('labels_training_set.csv', low_memory = False)

In [3]:
df_conversion = df.loc[df['event'] == 'conversion']
models_converted = (df_conversion['model'].value_counts())
df_top_models =(models_converted.head(10)).index

df_top_models = pd.Series(df_top_models)
models_converted = df_top_models.values
models_converted = np.array([x.lower() if isinstance(x, str) else x for x in models_converted])
models_converted = list(list(models_converted))

In [4]:
df_model = df.loc[df['event'] == 'viewed product']
df_model = df_model.groupby('person')['model'].apply(list).reset_index()
df_model.head()

Unnamed: 0,person,model
0,00091926,"[iPhone 6 Plus, iPhone 6S, iPhone 6S, Motorola..."
1,00091a7a,"[iPhone SE, iPhone 6, iPhone 6S]"
2,000ba417,"[Samsung Galaxy A3 2016, Samsung Galaxy Gran P..."
3,000c79fe,"[iPhone 7, iPhone 7, iPhone 7]"
4,000e4d9e,"[Samsung Galaxy S4 i9505, Samsung Galaxy S6 Fl..."


In [5]:
df_model['cantidad_vistos'] = df_model['model'].apply(len)
df_model['model'] = df_model['model'].apply(lambda x: ', '.join(map(str, x)))
df_model = df_model.fillna('')
df_model.head()

Unnamed: 0,person,model,cantidad_vistos
0,00091926,"iPhone 6 Plus, iPhone 6S, iPhone 6S, Motorola ...",372
1,00091a7a,"iPhone SE, iPhone 6, iPhone 6S",3
2,000ba417,"Samsung Galaxy A3 2016, Samsung Galaxy Gran Pr...",153
3,000c79fe,"iPhone 7, iPhone 7, iPhone 7",3
4,000e4d9e,"Samsung Galaxy S4 i9505, Samsung Galaxy S6 Fla...",339


In [6]:
list(models_converted)

['samsung galaxy j5',
 'iphone 5s',
 'iphone 6',
 'iphone 6s',
 'motorola moto g2 3g dual',
 'samsung galaxy j7 prime',
 'motorola moto g4 plus',
 'samsung galaxy s7',
 'samsung galaxy s6 flat',
 'samsung galaxy s7 edge']

In [7]:
word_vectorizer = CountVectorizer(vocabulary = models_converted, tokenizer=lambda x: x.split(', '))

tf_mat = word_vectorizer.fit_transform(df_model['model'])

tf_array = tf_mat.toarray()
tf_array.shape

(37130, 10)

In [8]:
feature_names = word_vectorizer.get_feature_names()

In [9]:
df_tfidf = pd.DataFrame(tf_array, columns=feature_names)
df_tfidf['person'] = df_model['person']
df_tfidf['cantidad_vistos'] = df_model['cantidad_vistos']
df_tfidf.shape

(37130, 12)

In [10]:
df_searched = df.loc[df['search_term'].notnull()]

In [11]:
df_searched = df_searched.groupby('person')['search_term'].apply(list).reset_index()
df_searched.head()

Unnamed: 0,person,search_term
0,000c79fe,"[Iphone 7, Galaxy a8, Iphone 7, Galaxy s8, Iph..."
1,000e619d,"[samsung rosa, sansung j7, sansung j7, sansung..."
2,001001be,"[IPhone 6, 5s, IPhone 6, IPhone 6, 5s, IPho..."
3,001802e4,"[Aiphone 6s, Aiphone 6s]"
4,0019e639,"[ON 7, ON 7, ON 7, ON 7, ON 7, ON 7, ON 7, ON ..."


In [12]:
df_searched['cantidad_vistos'] = df_searched['search_term'].apply(len)
df_searched['search_term'] = df_searched['search_term'].apply(lambda x: ', '.join(map(str, x)))
df_searched = df_searched.fillna('')
df_searched.head()

Unnamed: 0,person,search_term,cantidad_vistos
0,000c79fe,"Iphone 7, Galaxy a8, Iphone 7, Galaxy s8, Ipho...",8
1,000e619d,"samsung rosa, sansung j7, sansung j7, sansung ...",6
2,001001be,"IPhone 6, 5s, IPhone 6, IPhone 6, 5s, IPhon...",16
3,001802e4,"Aiphone 6s, Aiphone 6s",2
4,0019e639,"ON 7, ON 7, ON 7, ON 7, ON 7, ON 7, ON 7, ON 7...",11


In [13]:
word_vectorizer = CountVectorizer(vocabulary = models_converted, tokenizer=lambda x: x.split(', '))

tf_mat = word_vectorizer.fit_transform(df_searched['search_term'])

tf_array = tf_mat.toarray()
tf_array.shape

(12570, 10)

In [14]:
feature_names_searched = word_vectorizer.get_feature_names()

In [16]:
df_tfidf_searched = pd.DataFrame(tf_array, columns=feature_names)
df_tfidf_searched['person'] = df_model['person']
df_tfidf_searched['cantidad_vistos'] = df_model['cantidad_vistos']
df_tfidf_searched.head()

Unnamed: 0,samsung galaxy j5,iphone 5s,iphone 6,iphone 6s,motorola moto g2 3g dual,samsung galaxy j7 prime,motorola moto g4 plus,samsung galaxy s7,samsung galaxy s6 flat,samsung galaxy s7 edge,person,cantidad_vistos
0,0,0,1,1,0,0,0,0,0,0,00091926,372
1,0,0,0,0,0,0,0,0,0,0,00091a7a,3
2,0,0,0,0,0,0,0,0,0,0,000ba417,153
3,0,0,0,0,0,0,0,0,0,0,000c79fe,3
4,0,0,0,0,0,0,0,0,0,0,000e4d9e,339


## XGboost entrenamiento

In [17]:
df_train = df_labels.merge(df_tfidf , left_on='person', right_on='person' , how='inner')

Los labels me dan mi set para entrenar, los que no se encuentran en labels tengo que predecirlos

Si ven aca, de la columna label en adelante tenemos los features.

In [18]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X = X.fillna(0)
X.head()

Unnamed: 0,samsung galaxy j5,iphone 5s,iphone 6,iphone 6s,motorola moto g2 3g dual,samsung galaxy j7 prime,motorola moto g4 plus,samsung galaxy s7,samsung galaxy s6 flat,samsung galaxy s7 edge,cantidad_vistos
0,0,4,0,0,0,2,0,0,2,0,23
1,0,0,16,0,0,0,0,2,8,0,31
2,0,0,0,0,0,2,0,0,2,0,24
3,0,0,0,0,0,0,0,0,0,0,9
4,0,0,96,0,0,0,0,0,0,0,121


Separamos los datos para hacer xgboost de la siguiente forma


|Variable |Contiene|
|------------------------|-----------------------------------------------------|
|X| features que usa xgboost son solo numeros es decir que sacamos a la persona   |
|y| label de cada persona|

## Xgboost

Para evaluar usen esta medida que me da valores muy parecidos a los de kaggle, para hacer las predicciones usen el otro

In [19]:
import xgboost as xgb
model = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 6,
                subsample = 0.8,
                gamma = 1,
                n_estimators = 10)

model.add(Dense(64, input_dim=64, kernel_regularizer=regularizers.l2(0.01)))

AttributeError: 'XGBClassifier' object has no attribute 'add'

Este es el arbol con sus hiperparametros

In [None]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
model.fit(X_train,y_train)

Aca vuelvo a separar los datos para poder realizar una metrica interna y ver masomenos como performan las cosas que hago

In [None]:
persons = df_labels['person']
persons_to_predict = (df.loc[~df['person'].isin(persons)])['person'].drop_duplicates().to_frame()
persons_to_predict.shape

Obtengo las personas a predecir!

In [None]:
X_predict = persons_to_predict.merge(df_tfidf_searched, on= 'person', how= 'left')
X_predict = X_predict.drop(['person'], axis=1)
X_predict.head()

Necesito tener el mismo dataframe que tenia cuando lo entrene pero ahora para predecir, en este caso <b>X</b>

Entreno al arbol!

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,  model.predict_proba(X_test)[:,1])

Obtengo un resultado con los que separe para el test mas arriba en :
```python
    X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)
    ```
###### Lo que hace es sacar las predicciones para X_test y evaluarlos con y_test
    