# Automated Features


In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from scipy.sparse import hstack
from xgboost.sklearn import XGBClassifier # <3
from sklearn.model_selection import train_test_split
import gc
import matplotlib.pyplot as plt

from scipy.sparse.csr import csr_matrix

pd.options.mode.chained_assignment = None


In [2]:
import featuretools as ft

In [3]:

pd.set_option('display.width', 400)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 1000)


In [4]:
df = pd.read_csv('events_up_to_01062018.csv', low_memory = False)
df_labels= pd.read_csv('labels_training_set.csv', low_memory = False)
df

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,,,,,,
5,2018-05-18 00:44:27,searched products,4c8a8b93,,,,,,,"10240,9987,10322,10085,9944,9931,13404,10154,1...",iPhone se,,,,,,,,,,,,
6,2018-05-18 00:44:14,viewed product,1b9f7cf6,,2831.0,iPhone 6,Bom,16GB,Dourado,,,,,,,,,,,,,,
7,2018-05-18 00:44:02,viewed product,29ebb414,,2845.0,iPhone 6 Plus,Bom,128GB,Cinza espacial,,,,,,,,,,,,,,
8,2018-05-18 00:43:59,viewed product,de8fe91b,,12548.0,Motorola Moto G5 Plus,Bom,32GB,Platinum,,,,,,,,,,,,,,
9,2018-05-18 00:43:40,ad campaign hit,45baf068,/,,,,,,,,,google,,,,,,,,,,


In [5]:
df.iloc[-10:-1,:]

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
2341671,2018-05-31 18:21:01,lead,1fb641f3,,,LG K10 TV,,,,,,,,,,,,,,,,,
2341672,2018-05-31 19:34:12,lead,7bd89bdf,,,LG K10 TV,,,,,,,,,,,,,,,,,
2341673,2018-05-31 19:41:59,lead,4ead4504,,,iPhone 6,,,,,,,,,,,,,,,,,
2341674,2018-05-31 23:50:44,lead,73b5ddeb,,,iPhone 6S Plus,,,,,,,,,,,,,,,,,
2341675,2018-05-30 21:52:37,lead,1182c372,,,Motorola Moto G5S Plus,,,,,,,,,,,,,,,,,
2341676,2018-05-31 13:05:04,lead,9b1ba18d,,,Motorola Moto Z2 Play,,,,,,,,,,,,,,,,,
2341677,2018-05-31 14:49:59,lead,27b90284,,,Asus Zenfone 3 Max 16 GB,,,,,,,,,,,,,,,,,
2341678,2018-05-31 15:21:09,lead,a0b57323,,,Motorola Moto Z2 Play,,,,,,,,,,,,,,,,,
2341679,2018-05-31 10:34:49,lead,ff9fc164,,,Samsung Galaxy On 7,,,,,,,,,,,,,,,,,


In [6]:
df.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,,,,,,


In [7]:
df['session_id'] = df.index

In [8]:
df.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,session_id
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,,,,,0
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,,,,,,1
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,,,,,,,2
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,,,,,,,3
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,,,,,,,4


In [9]:
df_person = df['person'].drop_duplicates().to_frame()

In [10]:
df['session']=df.index

In [11]:
es = ft.EntitySet(id = 'person')
es = es.entity_from_dataframe(entity_id = 'person_id', dataframe = df_person, index = 'person')

In [12]:

df_session = df[['person','session']]

In [13]:
es = es.entity_from_dataframe(entity_id = 'session', dataframe = df_session, 
                            index = 'session_id')




In [14]:
es

Entityset: person
  Entities:
    person_id [Rows: 38829, Columns: 1]
    session [Rows: 2341681, Columns: 3]
  Relationships:
    No relationships

In [15]:
# Relationship between clients and previous loans
r_client_previous = ft.Relationship(es['person_id']['person'],
                                    es['session']['person'])

# Add the relationship to the entity set
es = es.add_relationship(r_client_previous)

In [16]:
es

Entityset: person
  Entities:
    person_id [Rows: 38829, Columns: 1]
    session [Rows: 2341681, Columns: 3]
  Relationships:
    session.person -> person_id.person

In [17]:

%time features, feature_names = ft.dfs(entityset=es, target_entity='person_id', max_depth = 2)

features.head()

CPU times: user 13.4 s, sys: 34.3 ms, total: 13.4 s
Wall time: 13.5 s


Unnamed: 0_level_0,SUM(session.session),STD(session.session),MAX(session.session),SKEW(session.session),MIN(session.session),MEAN(session.session),COUNT(session)
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0008ed71,11315957,422725.090373,2336761,0.1301,1505383,1885993.0,6
00091926,176032329,628799.773981,2146953,2.175637,129,392929.3,448
00091a7a,14303262,694823.466211,2219186,-0.41255,630080,1430326.0,10
000ba417,220069748,498486.409687,2229932,0.837486,244746,1068300.0,206
000c79fe,4924515,647464.28102,2174099,2.654337,57876,289677.4,17


In [18]:
features, feature_names = ft.encode_features(features, feature_names)
features.head()

Unnamed: 0_level_0,SUM(session.session),STD(session.session),MAX(session.session),SKEW(session.session),MIN(session.session),MEAN(session.session),COUNT(session)
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0008ed71,11315957,422725.090373,2336761,0.1301,1505383,1885993.0,6
00091926,176032329,628799.773981,2146953,2.175637,129,392929.3,448
00091a7a,14303262,694823.466211,2219186,-0.41255,630080,1430326.0,10
000ba417,220069748,498486.409687,2229932,0.837486,244746,1068300.0,206
000c79fe,4924515,647464.28102,2174099,2.654337,57876,289677.4,17


## Diff de lo de caro 0.62

In [19]:
df_not_numerical = features.select_dtypes(['number'])

In [20]:
df_diff = pd.read_csv('diff.csv')
df_not_numerical = df_not_numerical.merge(df_diff , left_on='person', right_on='person', how='left')


In [21]:
df_not_numerical.head()

Unnamed: 0,person,SUM(session.session),STD(session.session),MAX(session.session),SKEW(session.session),MIN(session.session),MEAN(session.session),COUNT(session),diff_mean,diff_max,session_count
0,0008ed71,11315957,422725.090373,2336761,0.1301,1505383,1885993.0,6,0.001389,0.001389,1.0
1,00091926,176032329,628799.773981,2146953,2.175637,129,392929.3,448,0.14276,0.501667,31.0
2,00091a7a,14303262,694823.466211,2219186,-0.41255,630080,1430326.0,10,0.171111,0.171111,1.0
3,000ba417,220069748,498486.409687,2229932,0.837486,244746,1068300.0,206,0.128333,0.464444,13.0
4,000c79fe,4924515,647464.28102,2174099,2.654337,57876,289677.4,17,0.184444,0.333889,3.0


In [65]:
df_not_numerical = df_not_numerical[feats_servibles]

# Last Date

In [23]:
df_last_date = pd.read_csv('last_date.csv')
df_not_numerical = df_not_numerical.merge(df_last_date , left_on='person', right_on='person', how='left')


df_not_numerical.head()

Unnamed: 0,person,SUM(session.session),STD(session.session),MAX(session.session),SKEW(session.session),MIN(session.session),MEAN(session.session),COUNT(session),diff_mean,diff_max,session_count,DAY(max),DAY(min),YEAR(max),YEAR(min),MONTH(max),MONTH(min),WEEKDAY(max),WEEKDAY(min)
0,0008ed71,11315957,422725.090373,2336761,0.1301,1505383,1885993.0,6,0.001389,0.001389,1.0,17,17,2018,2018,5,5,3,3
1,00091926,176032329,628799.773981,2146953,2.175637,129,392929.3,448,0.14276,0.501667,31.0,31,3,2018,2018,5,5,3,3
2,00091a7a,14303262,694823.466211,2219186,-0.41255,630080,1430326.0,10,0.171111,0.171111,1.0,26,26,2018,2018,3,3,0,0
3,000ba417,220069748,498486.409687,2229932,0.837486,244746,1068300.0,206,0.128333,0.464444,13.0,26,17,2018,2018,5,5,5,3
4,000c79fe,4924515,647464.28102,2174099,2.654337,57876,289677.4,17,0.184444,0.333889,3.0,29,29,2018,2018,5,5,1,1


In [66]:
df_not_numerical['MAX(session.session)'] = np.log(df_not_numerical['MAX(session.session)']+1)
#df_not_numerical['MIN(session.session)'] = np.log(df_not_numerical['MIN(session.session)']+1)
#df_not_numerical['MEAN(session.session)'] = np.log(df_not_numerical['MEAN(session.session)']+1)
#df_not_numerical['SKEW(session.session)'] = np.log(df_not_numerical['SKEW(session.session)']+1)

## XGboost

In [67]:
df_train = df_labels.merge(df_not_numerical , left_on='person', right_on='person' , how='inner')

In [68]:
df_train.shape

(19414, 3)

In [69]:
df_train.head()

Unnamed: 0,person,label,MAX(session.session)
0,0566e9c1,0,2.750655
1,6ec7ee77,0,2.747361
2,abe7a2fb,0,2.750144
3,34728364,0,2.749201
4,87ed62de,0,2.750101


In [70]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X = X.fillna(0)
X.head()

Unnamed: 0,MAX(session.session)
0,2.750655
1,2.747361
2,2.750144
3,2.749201
4,2.750101


In [71]:

import xgboost as xgb
from sklearn.linear_model import LinearRegression

In [72]:
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [73]:
reg = xgb.XGBRegressor(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 4,
                subsample = 0.8,
                gamma = 1,
                n_estimators = 15)

In [74]:
reg.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=15,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [75]:
persons = df_labels['person']
df_predict = df_not_numerical.loc[~df_not_numerical.person.isin(persons)]
ppl_to_predict = (df.loc[~df['person'].isin(persons)])['person'].to_frame()
ppl_to_predict = ppl_to_predict.drop_duplicates('person')
df_predict = ppl_to_predict.merge(df_predict, left_on = 'person' , right_on = 'person', how='left')
X_predict = df_predict.drop(['person'], axis=1)

In [76]:
X_predict = X_predict.fillna(0)

In [77]:
entrie = reg.predict(X_predict)

In [78]:
seriesita = pd.Series(entrie)

In [79]:
df_entrie = df_predict['person'].to_frame()
df_entrie['label'] = seriesita

In [80]:
df_entrie.head()

Unnamed: 0,person,label
0,4886f805,0.137356
1,0297fc1e,0.190934
2,2d681dd8,0.137356
3,cccea85e,0.137356
4,4c8a8b93,0.137356


In [81]:
df_entrie = df_entrie.fillna(0)

In [82]:
num = df_entrie._get_numeric_data()
num[num < 0] = 0

In [83]:
df_entrie.to_csv(path_or_buf = 'automated.csv', index = False)

In [84]:
df_entrie.shape

(19415, 2)

In [85]:
df_entrie['label'].nlargest()

1685    0.348157
20      0.318202
38      0.318202
40      0.318202
123     0.318202
Name: label, dtype: float32

## Scoring

Para evaluar usen esta medida que me da valores muy parecidos a los de kaggle, para hacer las predicciones usen el otro

In [86]:
my_classifier1 = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 6,
                subsample = 0.8,
                gamma = 1,
                n_estimators = 10)

In [87]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [88]:
my_classifier1.fit(X_train,y_train)
entrie = my_classifier1.predict_proba(X_predict)[:,1]

In [89]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,  my_classifier1.predict_proba(X_test)[:,1])

0.8432467539834254

In [90]:
entrie

array([0.20114738, 0.31764188, 0.20114738, ..., 0.25081128, 0.24373767,
       0.24373767], dtype=float32)

In [91]:
seriesita = pd.Series(entrie)

In [92]:
df_entrie = df_predict['person'].to_frame()
df_entrie['label'] = seriesita

In [93]:
df_entrie.head()

Unnamed: 0,person,label
0,4886f805,0.201147
1,0297fc1e,0.317642
2,2d681dd8,0.201147
3,cccea85e,0.201147
4,4c8a8b93,0.201147


In [94]:
df_entrie = df_entrie.fillna(0)

In [95]:
num = df_entrie._get_numeric_data()
num[num < 0] = 0

In [96]:
df_entrie.to_csv(path_or_buf = 'Yo ya no se que da bien aca.csv', index = False)

In [55]:
df_entrie.shape

(19415, 2)

In [56]:
df_entrie['label'].nlargest()

4312    0.610147
4310    0.609266
4814    0.578030
4702    0.560159
1636    0.552554
Name: label, dtype: float32

 # Random Forest feature importance
    -TEST THIS

In [57]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
#Load boston housing dataset as an example

names = X.columns
rf = RandomForestRegressor()
rf.fit(X, y)
print ("Features sorted by their score:")
zipped = zip(map(lambda x: round(x, 4), rf.feature_importances_), names)
feature = sorted(zipped, key=lambda x: x[1])



Features sorted by their score:


In [58]:
X.shape

(19414, 18)

In [59]:
feat_importance = pd.DataFrame(feature, columns=['importance', 'feature'])
feat_importance.sort_values('importance', ascending=False).head(100)

Unnamed: 0,importance,feature
3,0.1328,MAX(session.session)
5,0.1154,MIN(session.session)
8,0.0995,SKEW(session.session)
9,0.0877,STD(session.session)
4,0.0786,MEAN(session.session)
16,0.0759,diff_mean
10,0.0737,SUM(session.session)
15,0.0651,diff_max
0,0.0575,COUNT(session)
2,0.0492,DAY(min)


In [60]:
feat_importance =feat_importance.loc[feat_importance['importance'].isin(feat_importance['importance'].nlargest(1))]
feats_servibles = feat_importance['feature'].tolist()
len(feats_servibles)

1

In [61]:
feats_servibles =[ x for x in feats_servibles if "sku" not in x ]

In [62]:
feats_servibles.append('person')

In [63]:
len(feats_servibles)

2

In [64]:
feats_servibles

['MAX(session.session)', 'person']

##  Feature Most Viewed and Searched in a day

In [181]:
df_v = df.loc[df['event'] == 'viewed product']
df_v['timestamp'] = pd.to_datetime(df_v['timestamp'])
df_v['count'] = 1
df_v['month'] = df_v['timestamp'].dt.month
df_v['day'] = df_v['timestamp'].dt.day

df_v = df_v.groupby(['person','month', 'day']).agg({'count':'sum'})
df_v = df_v.groupby(['person']).agg({'count':'mean'}).reset_index()
df_v.head()

Unnamed: 0,person,count
0,00091926,18.6
1,00091a7a,3.0
2,000ba417,51.0
3,000c79fe,3.0
4,000e4d9e,37.666667


In [182]:
df_s = df.loc[df['event'] == 'searched products']
df_s['timestamp'] = pd.to_datetime(df_s['timestamp'])
df_s['count'] = 1
df_s['month'] = df_s['timestamp'].dt.month
df_s['day'] = df_s['timestamp'].dt.day

df_s = df_s.groupby(['person','month', 'day']).agg({'count':'sum'})
df_s = df_s.groupby(['person']).agg({'count':'mean'}).reset_index()
df_s['person'].describe()

count        13093
unique       13093
top       387aeb76
freq             1
Name: person, dtype: object

In [183]:
df_s.columns = ['person','searched']
df_v.columns = ['person', 'viewed']
df_s.head()

Unnamed: 0,person,searched
0,000c79fe,9.0
1,000e619d,3.0
2,001001be,17.0
3,001802e4,4.0
4,0019e639,3.666667


In [184]:
df_vs_feat = df_v.merge(df_s,  on='person', how='left')
df_vs_feat.head()

Unnamed: 0,person,viewed,searched
0,00091926,18.6,
1,00091a7a,3.0,
2,000ba417,51.0,
3,000c79fe,3.0,9.0
4,000e4d9e,37.666667,


In [185]:
df_not_numerical = df_not_numerical.merge(df_vs_feat, left_on='person', right_on='person', how='left')
df_not_numerical.head()

Unnamed: 0,SUM(session.session),STD(session.session),MAX(session.session),SKEW(session.session),MIN(session.session),MEAN(session.session),COUNT(session),person,diff_mean,diff_max,session_count,viewed,searched
0,11315957,422725.090373,2336761,0.1301,1505383,1885993.0,6,0008ed71,0.001389,0.001389,1.0,,
1,176032329,628799.773981,2146953,2.175637,129,392929.3,448,00091926,0.14276,0.501667,31.0,18.6,
2,14303262,694823.466211,2219186,-0.41255,630080,1430326.0,10,00091a7a,0.171111,0.171111,1.0,3.0,
3,220069748,498486.409687,2229932,0.837486,244746,1068300.0,206,000ba417,0.128333,0.464444,13.0,51.0,
4,4924515,647464.28102,2174099,2.654337,57876,289677.4,17,000c79fe,0.184444,0.333889,3.0,3.0,9.0
