In [1]:
%%javascript

IPython.keyboard_manager.command_shortcuts.add_shortcut('Ctrl-k', {
    help : 'move up selected cells',
    help_index : 'jupyter-notebook:move-selection-up',
    handler : function (event) {
        IPython.notebook.move_selection_up();
        return false;
    }}
);

IPython.keyboard_manager.command_shortcuts.add_shortcut('Ctrl-j', {
    help : 'move down selected cells',
    help_index : 'jupyter-notebook:move-selection-down',
    handler :  function (event) {
        IPython.notebook.move_selection_down();
        return false;
    }}
);

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from scipy.sparse import hstack
from xgboost.sklearn import XGBClassifier # <3
from sklearn.model_selection import train_test_split
import gc
import matplotlib.pyplot as plt

from scipy.sparse.csr import csr_matrix

pd.options.mode.chained_assignment = None


In [3]:

pd.set_option('display.width', 400)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 200)


In [4]:
df = pd.read_csv('events_up_to_01062018.csv', low_memory = False)
df_labels= pd.read_csv('labels_training_set.csv', low_memory = False)


In [5]:
df.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,,,,,,


In [47]:
models_conversion = df['sku'].value_counts()
models_conversion.head()


2830.0    8601
2831.0    7777
8443.0    7442
6371.0    7053
2829.0    6708
Name: sku, dtype: int64

In [53]:
models_conversion = df['sku'].value_counts()
models_conversion = models_conversion.to_frame().reset_index()
print(models_conversion['sku'].describe())
models_conversion = models_conversion.loc[models_conversion['sku']>162]
models_conversion.shape

count    2328.000000
mean      567.237973
std       970.753092
min         1.000000
25%        19.000000
50%       162.000000
75%       644.000000
max      8601.000000
Name: sku, dtype: float64


(1163, 2)

In [54]:
#drop nans
df_model = df.loc[df['sku'].isin(models_conversion['sku'])]
df_model = df_model.groupby('person')['sku'].apply(list).reset_index()
df_model.shape

(13713, 2)

In [55]:
#df_all = df_labels.merge(df_model, left_on = 'person', right_on = 'person', how = 'left')
#df_all.head()

In [56]:
df_model['sku'] = df_model['sku'].apply(lambda x: ', '.join(map(str, x)))
df_model = df_model.fillna('')
df_model.shape

(13713, 2)

In [57]:
word_vectorizer = CountVectorizer()

tf_mat = word_vectorizer.fit_transform(df_model['sku'])

tf_array = tf_mat.toarray()
tf_array.shape

(13713, 120)

In [58]:
doc = 0
feature_names = word_vectorizer.get_feature_names()
feature_index = tf_mat[doc,:].nonzero()[1]
tfidf_scores = zip(feature_index, [tf_mat[doc, x] for x in feature_index])

In [59]:
for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
  print (w, s)

1398 1
1390 4
1406 1
1389 8


In [60]:
df_tfidf = pd.DataFrame(tf_array, columns=feature_names)
df_tfidf.shape

(13713, 120)

In [61]:
df_tfidf['person'] = df_model['person']
df_tfidf.head()

Unnamed: 0,1061,1066,1263,1302,1326,1333,1389,1390,1397,1398,1406,1437,1438,1469,1485,1525,1526,1541,1606,1621,1622,1629,1646,165,171,...,466,472,473,506,508,515,765,767,774,781,802,849,863,886,893,898,905,907,912,914,919,921,947,961,person
0,0,0,0,0,0,0,8,4,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,00091926
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,000ba417
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,5,0,0,0,0,0,5,1,0,0,0,0,0,0,0,0,0,000e4d9e
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,000e619d
4,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,001001be


## XgBoost

In [62]:
df_train = df_labels.merge(df_tfidf , left_on='person', right_on='person', how='left')

In [63]:
df_train.head()

Unnamed: 0,person,label,1061,1066,1263,1302,1326,1333,1389,1390,1397,1398,1406,1437,1438,1469,1485,1525,1526,1541,1606,1621,1622,1629,1646,...,465,466,472,473,506,508,515,765,767,774,781,802,849,863,886,893,898,905,907,912,914,919,921,947,961
0,0566e9c1,0,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,6ec7ee77,0,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,abe7a2fb,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,34728364,0,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,87ed62de,0,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [64]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X.head()

Unnamed: 0,1061,1066,1263,1302,1326,1333,1389,1390,1397,1398,1406,1437,1438,1469,1485,1525,1526,1541,1606,1621,1622,1629,1646,165,171,...,465,466,472,473,506,508,515,765,767,774,781,802,849,863,886,893,898,905,907,912,914,919,921,947,961
0,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [65]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [66]:
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [67]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 4,
                subsample = 0.8,
                gamma = 1,
                n_estimators = 150) #150 = 0.61730  #1500 = 0.61296 , 

In [68]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)


In [69]:
xg_reg.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=150,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [70]:
preds = xg_reg.predict(X_test)

In [71]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.216157


In [72]:
persons = df_labels['person']
df_predict = df_tfidf.loc[~df_tfidf.person.isin(persons)]
ppl_to_predict = (df.loc[~df['person'].isin(persons)])['person'].to_frame()
ppl_to_predict = ppl_to_predict.drop_duplicates('person')
df_predict = ppl_to_predict.merge(df_predict, left_on = 'person' , right_on = 'person', how='left')
X_predict = df_predict.drop(['person'], axis=1)

In [73]:
entrie = xg_reg.predict(X_predict)

In [74]:
seriesita = pd.Series(entrie)

In [75]:
df_entrie = df_predict['person'].to_frame()
df_entrie['label'] = seriesita

In [76]:
df_entrie.head()

Unnamed: 0,person,label
0,4886f805,0.043674
1,0297fc1e,0.038818
2,2d681dd8,0.043674
3,cccea85e,0.056778
4,4c8a8b93,0.056734


In [77]:
df_entrie = df_entrie.fillna(0)

In [78]:
num = df_entrie._get_numeric_data()
num[num < 0] = 0

In [79]:
df_entrie.to_csv(path_or_buf = 'entrie2.0', index = False)

In [80]:
df_entrie.shape

(19415, 2)

In [81]:
df_entrie['label'].nlargest(1)

8603    0.445831
Name: label, dtype: float32

## XgBoost 0.62

In [82]:
df_diff = pd.read_csv('diff.csv')
df_top = df_tfidf.merge(df_diff , left_on='person', right_on='person', how='left')
df_top.head()

Unnamed: 0,1061,1066,1263,1302,1326,1333,1389,1390,1397,1398,1406,1437,1438,1469,1485,1525,1526,1541,1606,1621,1622,1629,1646,165,171,...,472,473,506,508,515,765,767,774,781,802,849,863,886,893,898,905,907,912,914,919,921,947,961,person,diff
0,0,0,0,0,0,0,8,4,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,00091926,0.14276
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,000ba417,0.128333
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,5,0,0,0,0,0,5,1,0,0,0,0,0,0,0,0,0,000e4d9e,0.145844
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,000e619d,0.120185
4,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,001001be,0.323472


In [83]:
df_train = df_labels.merge(df_top , left_on='person', right_on='person', how='left')

In [84]:
df_train.shape

(19414, 123)

In [85]:
df_train.head()

Unnamed: 0,person,label,1061,1066,1263,1302,1326,1333,1389,1390,1397,1398,1406,1437,1438,1469,1485,1525,1526,1541,1606,1621,1622,1629,1646,...,466,472,473,506,508,515,765,767,774,781,802,849,863,886,893,898,905,907,912,914,919,921,947,961,diff
0,0566e9c1,0,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,6ec7ee77,0,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,abe7a2fb,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095119
3,34728364,0,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,87ed62de,0,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [86]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X.head()

Unnamed: 0,1061,1066,1263,1302,1326,1333,1389,1390,1397,1398,1406,1437,1438,1469,1485,1525,1526,1541,1606,1621,1622,1629,1646,165,171,...,466,472,473,506,508,515,765,767,774,781,802,849,863,886,893,898,905,907,912,914,919,921,947,961,diff
0,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095119
3,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [87]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [88]:
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [89]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 4,
                subsample = 0.8,
                gamma = 1,
                n_estimators = 150) #150 = 0.61730  #1500 = 0.61296 , 

In [90]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)


In [91]:
xg_reg.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=150,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [92]:
preds = xg_reg.predict(X_test)

In [93]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.215935


In [94]:
persons = df_labels['person']
df_predict = df_top.loc[~df_top.person.isin(persons)]
ppl_to_predict = (df.loc[~df['person'].isin(persons)])['person'].to_frame()
ppl_to_predict = ppl_to_predict.drop_duplicates('person')
df_predict = ppl_to_predict.merge(df_predict, left_on = 'person' , right_on = 'person', how='left')
X_predict = df_predict.drop(['person'], axis=1)

In [95]:
entrie = xg_reg.predict(X_predict)

In [96]:
seriesita = pd.Series(entrie)

In [97]:
df_entrie = df_predict['person'].to_frame()
df_entrie['label'] = seriesita

In [98]:
df_entrie.head()

Unnamed: 0,person,label
0,4886f805,0.042837
1,0297fc1e,0.056882
2,2d681dd8,0.042837
3,cccea85e,0.067634
4,4c8a8b93,0.06376


In [99]:
df_entrie = df_entrie.fillna(0)
df_entrie['label'].value_counts()

0.042837    12854
0.070652      762
0.049145      735
0.056582      652
0.059149      488
0.065743      456
0.063760      247
0.055237      166
0.069377      154
0.067634      147
0.052105      137
0.075333      114
0.073591       82
0.053826       81
0.052085       66
0.061263       61
0.059521       58
0.063830       54
0.070425       53
0.057034       45
0.065934       43
0.061399       40
0.062088       38
0.068441       38
0.075846       37
0.085059       36
0.054340       31
0.044428       31
0.068683       30
0.078994       27
0.061026       26
0.066699       26
0.058176       25
0.082120       24
0.054431       23
0.072919       23
0.074059       22
0.059918       22
0.067235       22
0.072315       21
0.051864       20
0.070938       17
0.055122       17
0.045776       16
0.064660       14
0.064343       14
0.058061       14
0.080151       14
0.056787       13
0.061776       13
0.070573       13
0.072317       13
0.068050       12
0.059042       12
0.077212       12
0.050519  

In [100]:
num = df_entrie._get_numeric_data()
num[num < 0] = 0
df_entrie['label'].value_counts()


0.042837    12854
0.070652      762
0.049145      735
0.056582      652
0.059149      488
0.065743      456
0.063760      247
0.055237      166
0.069377      154
0.067634      147
0.052105      137
0.075333      114
0.073591       82
0.053826       81
0.052085       66
0.061263       61
0.059521       58
0.063830       54
0.070425       53
0.057034       45
0.065934       43
0.061399       40
0.062088       38
0.068441       38
0.075846       37
0.085059       36
0.054340       31
0.044428       31
0.068683       30
0.078994       27
0.061026       26
0.066699       26
0.058176       25
0.082120       24
0.054431       23
0.072919       23
0.074059       22
0.059918       22
0.067235       22
0.072315       21
0.051864       20
0.070938       17
0.055122       17
0.045776       16
0.064660       14
0.064343       14
0.058061       14
0.080151       14
0.056787       13
0.061776       13
0.070573       13
0.072317       13
0.068050       12
0.059042       12
0.077212       12
0.050519  

In [101]:
df_entrie.to_csv(path_or_buf = 'entrie2.0', index = False)

In [102]:
df_entrie.shape

(19415, 2)

In [103]:
df_entrie['label'].nlargest(1)

8603    0.442841
Name: label, dtype: float32