In [1]:
import numpy as np
import pandas as pd
import feather
from statistics import mode

# location target encoding

In [2]:
# Session-level table; later cells read country, city, user_id_hash, session_id
# and previous_sessions_duration from it.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this file exists.
# NOTE(review): with pandas' default NA parsing the literal string "NA" is read
# as NaN, so a country code "NA" (Namibia) would be indistinguishable from a
# missing country — confirm whether keep_default_na=False is needed.
sessions = pd.read_csv("/Users/annazeng/Downloads/Data/sessions.csv")

In [3]:
# Target table, joined to the per-user features on user_id_hash further down;
# the label_14 and label_7 columns used by the encoders come from here.
# NOTE(review): absolute, machine-specific path.
labels = pd.read_csv("/Users/annazeng/Downloads/Data/labels.csv")

In [4]:
# keep only the location columns plus the user key
df = sessions.loc[:, ['country', 'city', 'user_id_hash']]

In [5]:
from collections import Counter

In [6]:
def mode_location(x):
    """Return the most frequent value in ``x`` (a user's modal location).

    ``x`` is any iterable of hashable values, e.g. all country or city values
    of one user. When several values share the highest count, the one that
    appears first wins. An empty ``x`` raises ``IndexError``.
    """
    ranked = Counter(x).most_common(1)
    top_value, _top_count = ranked[0]
    return top_value

In [7]:
# start from an empty frame; the first assigned column supplies the
# user_id_hash index
df_clean = pd.DataFrame()
df_clean['mode_country'] = (
    df.groupby('user_id_hash')['country']
    .apply(mode_location)
)

In [8]:
# same per-user aggregation for city; the assignment aligns on user_id_hash
df_clean['mode_city'] = (
    df.groupby('user_id_hash')['city']
    .apply(mode_location)
)

In [9]:
# move user_id_hash out of the index into a regular column
df_clean = df_clean.reset_index()

In [10]:
# preview: one row per user_id_hash with its modal country and city
df_clean.head()

Unnamed: 0,user_id_hash,mode_country,mode_city
0,00000db35682058bb7916cb90f85709f54c1a0f7a3b6de...,AU,brisbane
1,000059859ec188af6035870faf885c3038cedda05b3a54...,US,apple valley
2,000062e9be78f3da274fec338e78f89d12000e781967f2...,US,campbell
3,00006ad44c0d33a2a526caf75514643639b7b7070dfffd...,ID,bandung
4,000081a7d841b0f953bdba6c185fd5cc335fd6015a30c3...,US,chesapeake


In [11]:
# left-join the labels onto the per-user locations via user_id_hash;
# users without a matching labels row get NaN labels
df_labeled = (
    df_clean
    .set_index('user_id_hash')
    .join(labels.set_index('user_id_hash'), how='left')
    .reset_index()
)

In [12]:
# Replaces every NaN in the frame with 0: users without a labels row get
# label_14 = label_7 = 0, and a missing modal country/city also becomes the
# integer 0 (the missing-country check further down relies on that).
df_labeled = df_labeled.fillna(0)

In [13]:
# preview: labels now sit next to each user's modal location
df_labeled.head()

Unnamed: 0,user_id_hash,mode_country,mode_city,label_14,label_7
0,00000db35682058bb7916cb90f85709f54c1a0f7a3b6de...,AU,brisbane,0.0,0.0
1,000059859ec188af6035870faf885c3038cedda05b3a54...,US,apple valley,0.0,0.0
2,000062e9be78f3da274fec338e78f89d12000e781967f2...,US,campbell,0.0,0.0
3,00006ad44c0d33a2a526caf75514643639b7b7070dfffd...,ID,bandung,0.0,0.0
4,000081a7d841b0f953bdba6c185fd5cc335fd6015a30c3...,US,chesapeake,0.0,0.0


In [17]:
# fraction (not percent) of users whose modal country was missing;
# those NaN values were turned into 0 by the fillna(0) above
len(df_labeled.loc[df_labeled['mode_country'] == 0]) / len(df_labeled)

0.0013685264673018776

In [18]:
def enc_cat_cols(df, cols):
    """Label-encode the given columns of ``df`` in place.

    Every distinct value of a column is replaced by an integer code assigned
    in order of first appearance (0, 1, 2, ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the listed columns are overwritten with their codes.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    dict
        ``{column name: {original value: integer code}}`` for each column in
        ``cols``.
    """
    encoders = {}
    for col in cols:
        unique_values = df[col].unique()
        enc = {val: idx for idx, val in enumerate(unique_values)}

        # int16 tops out at 32767; a column with more distinct values (e.g.
        # cities) would overflow in the cast, so widen the dtype only then.
        n_codes = len(unique_values)
        if n_codes <= np.iinfo(np.int16).max + 1:
            code_dtype = "int16"
        elif n_codes <= np.iinfo(np.int32).max + 1:
            code_dtype = "int32"
        else:
            code_dtype = "int64"

        df[col] = df[col].map(enc).astype(code_dtype)
        encoders[col] = enc
    return encoders

In [19]:
# Overwrites both columns of df_labeled with integer codes (in place); the
# returned value -> code mappings are only displayed here, not stored.
enc_cat_cols(df_labeled,['mode_country', 'mode_city'])

{'mode_country': {'AU': 0,
  'US': 1,
  'ID': 2,
  'RO': 3,
  'AT': 4,
  'TT': 5,
  'GB': 6,
  'SE': 7,
  'ZA': 8,
  'CA': 9,
  'BR': 10,
  'JO': 11,
  'CH': 12,
  'ZM': 13,
  'LB': 14,
  'EG': 15,
  'PH': 16,
  'VN': 17,
  'AE': 18,
  'IN': 19,
  'SV': 20,
  'JP': 21,
  'IT': 22,
  'TN': 23,
  'MU': 24,
  'NZ': 25,
  'GR': 26,
  'MY': 27,
  'PK': 28,
  'AL': 29,
  'HK': 30,
  'MT': 31,
  'MK': 32,
  'NG': 33,
  'HU': 34,
  'PL': 35,
  'JM': 36,
  'SA': 37,
  'IL': 38,
  'CM': 39,
  'OM': 40,
  'BD': 41,
  'FR': 42,
  'FI': 43,
  'NO': 44,
  'BA': 45,
  'LT': 46,
  'EE': 47,
  'NL': 48,
  'LK': 49,
  'DK': 50,
  'MX': 51,
  'BE': 52,
  'TR': 53,
  'UA': 54,
  'MD': 55,
  'SI': 56,
  'HR': 57,
  'GU': 58,
  'SG': 59,
  'MM': 60,
  'RS': 61,
  'NP': 62,
  'CZ': 63,
  'TZ': 64,
  'IE': 65,
  'KR': 66,
  'GD': 67,
  'DE': 68,
  'SK': 69,
  'KE': 70,
  'BH': 71,
  'CR': 72,
  'CL': 73,
  'BJ': 74,
  'KH': 75,
  'AR': 76,
  'TH': 77,
  'MA': 78,
  'ZZ': 79,
  'CO': 80,
  'PT': 81,
  'KZ': 82

In [20]:
# preview: mode_country / mode_city now hold integer codes instead of strings
df_labeled.head()

Unnamed: 0,user_id_hash,mode_country,mode_city,label_14,label_7
0,00000db35682058bb7916cb90f85709f54c1a0f7a3b6de...,0,0,0.0,0.0
1,000059859ec188af6035870faf885c3038cedda05b3a54...,1,1,0.0,0.0
2,000062e9be78f3da274fec338e78f89d12000e781967f2...,1,2,0.0,0.0
3,00006ad44c0d33a2a526caf75514643639b7b7070dfffd...,2,3,0.0,0.0
4,000081a7d841b0f953bdba6c185fd5cc335fd6015a30c3...,1,4,0.0,0.0


In [21]:
from sklearn.model_selection import KFold
def reg_target_encoding_14(df, col, n):
    """Add an out-of-fold mean encoding of ``label_14`` for ``col`` to ``df``.

    The rows are split into ``n`` unshuffled folds. Each row receives the mean
    ``label_14`` of its ``col`` value computed on the *other* folds only, so a
    row's own label never contributes to its encoding. Values of ``col`` that
    do not occur in the other folds get the overall ``label_14`` mean.

    The result is written in place to a new column ``<col>_mean_enc14``;
    nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col`` and ``label_14``. Any index is accepted.
    col : str
        Name of the categorical column to encode.
    n : int
        Number of folds handed to ``KFold``.
    """
    kf = KFold(n_splits=n, shuffle=False)
    new_col = col + "_" + "mean_enc14"
    global_mean = df["label_14"].mean()

    # KFold yields positional indices, so results are collected in a positional
    # array instead of being assigned through label-based .loc (which is only
    # correct when the index happens to be 0..len(df)-1).
    encoded = np.full(len(df), np.nan)
    for train_pos, val_pos in kf.split(df):
        train, val = df.iloc[train_pos], df.iloc[val_pos]
        fold_means = train.groupby(col)["label_14"].mean()
        encoded[val_pos] = val[col].map(fold_means).to_numpy(dtype=float)

    # Categories unseen in the training folds have no fold mean; fall back to
    # the global mean. Assigning the finished array avoids the chained
    # `df[new_col].fillna(..., inplace=True)`, which may act on a copy.
    encoded[np.isnan(encoded)] = global_mean
    df[new_col] = encoded

In [22]:
# 5-fold out-of-fold mean encoding of label_14 for both location columns;
# each call adds a "<col>_mean_enc14" column to df_labeled in place
for c in ['mode_country', 'mode_city']:
    reg_target_encoding_14(df_labeled, c, 5)

In [23]:
# preview: the two *_mean_enc14 columns have been appended
df_labeled.head()

Unnamed: 0,user_id_hash,mode_country,mode_city,label_14,label_7,mode_country_mean_enc14,mode_city_mean_enc14
0,00000db35682058bb7916cb90f85709f54c1a0f7a3b6de...,0,0,0.0,0.0,0.012547,0.017707
1,000059859ec188af6035870faf885c3038cedda05b3a54...,1,1,0.0,0.0,0.015144,0.0
2,000062e9be78f3da274fec338e78f89d12000e781967f2...,1,2,0.0,0.0,0.015144,0.0
3,00006ad44c0d33a2a526caf75514643639b7b7070dfffd...,2,3,0.0,0.0,0.003237,0.002188
4,000081a7d841b0f953bdba6c185fd5cc335fd6015a30c3...,1,4,0.0,0.0,0.015144,0.02069


In [24]:
def reg_target_encoding_7(df, col, n):
    """Add an out-of-fold mean encoding of ``label_7`` for ``col`` to ``df``.

    The rows are split into ``n`` unshuffled folds. Each row receives the mean
    ``label_7`` of its ``col`` value computed on the *other* folds only, so a
    row's own label never contributes to its encoding. Values of ``col`` that
    do not occur in the other folds get the overall ``label_7`` mean.

    The result is written in place to a new column ``<col>_mean_enc7``;
    nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col`` and ``label_7``. Any index is accepted.
    col : str
        Name of the categorical column to encode.
    n : int
        Number of folds handed to ``KFold``.
    """
    kf = KFold(n_splits=n, shuffle=False)
    new_col = col + "_" + "mean_enc7"
    global_mean = df["label_7"].mean()

    # KFold yields positional indices, so results are collected in a positional
    # array instead of being assigned through label-based .loc (which is only
    # correct when the index happens to be 0..len(df)-1).
    encoded = np.full(len(df), np.nan)
    for train_pos, val_pos in kf.split(df):
        train, val = df.iloc[train_pos], df.iloc[val_pos]
        fold_means = train.groupby(col)["label_7"].mean()
        encoded[val_pos] = val[col].map(fold_means).to_numpy(dtype=float)

    # Categories unseen in the training folds have no fold mean; fall back to
    # the global mean. Assigning the finished array avoids the chained
    # `df[new_col].fillna(..., inplace=True)`, which may act on a copy.
    encoded[np.isnan(encoded)] = global_mean
    df[new_col] = encoded

In [25]:
# 5-fold out-of-fold mean encoding of label_7 for both location columns;
# each call adds a "<col>_mean_enc7" column to df_labeled in place
for c in ['mode_country', 'mode_city']:
    reg_target_encoding_7(df_labeled, c, 5)

In [26]:
# preview: the two *_mean_enc7 columns have been appended as well
df_labeled.head()

Unnamed: 0,user_id_hash,mode_country,mode_city,label_14,label_7,mode_country_mean_enc14,mode_city_mean_enc14,mode_country_mean_enc7,mode_city_mean_enc7
0,00000db35682058bb7916cb90f85709f54c1a0f7a3b6de...,0,0,0.0,0.0,0.012547,0.017707,0.006857,0.01054
1,000059859ec188af6035870faf885c3038cedda05b3a54...,1,1,0.0,0.0,0.015144,0.0,0.008077,0.0
2,000062e9be78f3da274fec338e78f89d12000e781967f2...,1,2,0.0,0.0,0.015144,0.0,0.008077,0.0
3,00006ad44c0d33a2a526caf75514643639b7b7070dfffd...,2,3,0.0,0.0,0.003237,0.002188,0.00112,0.0
4,000081a7d841b0f953bdba6c185fd5cc335fd6015a30c3...,1,4,0.0,0.0,0.015144,0.02069,0.008077,0.02069


In [28]:
# keep the user key and the four mean-encoded features only
df_enc = df_labeled.loc[
    :,
    [
        'user_id_hash',
        'mode_country_mean_enc14',
        'mode_city_mean_enc14',
        'mode_country_mean_enc7',
        'mode_city_mean_enc7',
    ],
]

In [29]:
# preview of the feature table that is written to disk next
df_enc.head()

Unnamed: 0,user_id_hash,mode_country_mean_enc14,mode_city_mean_enc14,mode_country_mean_enc7,mode_city_mean_enc7
0,00000db35682058bb7916cb90f85709f54c1a0f7a3b6de...,0.012547,0.017707,0.006857,0.01054
1,000059859ec188af6035870faf885c3038cedda05b3a54...,0.015144,0.0,0.008077,0.0
2,000062e9be78f3da274fec338e78f89d12000e781967f2...,0.015144,0.0,0.008077,0.0
3,00006ad44c0d33a2a526caf75514643639b7b7070dfffd...,0.003237,0.002188,0.00112,0.0
4,000081a7d841b0f953bdba6c185fd5cc335fd6015a30c3...,0.015144,0.02069,0.008077,0.02069


In [30]:
# persist the encoded location features (user_id_hash is a regular column
# here); the relative path resolves against the current working directory
feather.write_dataframe(df_enc,"location_mean_enc.feather")

# previous session duration

In [31]:
# Column dtypes for events.csv. Only session_id, event and user_id_hash are
# loaded below (usecols), so the event_timestamp / event_value entries refer
# to columns that are not read in this notebook.
datatype = {'session_id': np.uint64, 'event':'object',
            'event_timestamp':str,'event_value':str,'user_id_hash':'category'}

# NOTE(review): session_id is forced to uint64 here, while sessions.csv was
# read with inferred dtypes — confirm both key columns compare equal in the
# join on session_id further down.
# NOTE(review): absolute, machine-specific path.
events = pd.read_csv('/Users/annazeng/Downloads/Data/events.csv',
                     usecols=["session_id", "event", "user_id_hash"],dtype=datatype)

In [35]:
# all the session_ids when the purchase is made
# ('event' is loaded as strings, hence the comparison with '8';
#  NOTE(review): assumes event code '8' denotes a purchase — confirm)
evnent_session = events[events.event=='8']

In [36]:
# only the session key and the event code are needed for the join
df_sevent = evnent_session.loc[:, ['session_id', 'event']]

In [37]:
# preview: filtered rows keep their original row labels from events
df_sevent.head()

Unnamed: 0,session_id,event
279,5159335150551901701,8
2898,845986603555615931,8
2970,2846996524173831068,8
3002,6141788581998894012,8
3597,6974128812803113234,8


In [38]:
# session attributes to attach to the purchase events
df_sssion = sessions.loc[:, ['session_id', 'user_id_hash', 'previous_sessions_duration']]

In [39]:
# attach each purchase event's session attributes (user and previous
# sessions duration) by matching on session_id; events stay on the left
df_join = (
    df_sevent
    .set_index('session_id')
    .join(df_sssion.set_index('session_id'), how='left')
)

In [40]:
# turn session_id back into a regular column
df_join = df_join.reset_index()

In [41]:
# preview: purchase events with the owning user and previous_sessions_duration
df_join.head()

Unnamed: 0,session_id,event,user_id_hash,previous_sessions_duration
0,44097414271980,8,e1ca4b4371fab3442652313818df4decdc5ab3f5f11ab3...,34381845
1,139239025219926,8,da47944d5920073aab4d7daa7b73e0a73360307b312131...,124487748
2,198661066907208,8,98625390474bdd559a53af3fe9b3ee92d2ac08f17e7c0a...,4161545
3,277838121971337,8,519bab06524dea84a247668c3566adc8d68c5cba9de2fd...,45124770
4,330581615202681,8,0e6aa0563a5e6756e60ac4271015d9b4164d6660c845c9...,19928604


In [42]:
# empty frame; the per-user mean_duration column is assigned to it below
df_duration = pd.DataFrame()

In [43]:
def mean_duration(x):
    """Return the mean of ``x`` as computed by its own ``mean()`` method.

    Used as the per-user aggregation of ``previous_sessions_duration``.
    """
    average = x.mean()
    return average

# Mean previous_sessions_duration per user over the rows of df_join; the
# result (and therefore df_duration) is indexed by user_id_hash.
# NOTE(review): df_join has one row per matched purchase-event record, so a
# session with several such events contributes several times — confirm this
# weighting is intended.
df_duration['mean_duration'] = df_join.groupby('user_id_hash')['previous_sessions_duration'].apply(mean_duration)

In [44]:
# not sure how to transform mean_duration, so it stays in the original ms
# (reset_index() only affects this preview; it is not assigned, so df_duration
# itself keeps user_id_hash as its index)
df_duration.reset_index().head()

Unnamed: 0,user_id_hash,mean_duration
0,00000db35682058bb7916cb90f85709f54c1a0f7a3b6de...,9388056.0
1,000059859ec188af6035870faf885c3038cedda05b3a54...,1867869.0
2,000d99c8e82878915b33ffe27ac3585ce9fb7cd4b82ace...,32056150.0
3,00124c21b3ec87a2f17f884c5eee25462b67d489ebad09...,3000952.0
4,0012e392350f0f6408b8b1a03bc5ee292e29dc735ee24d...,13518630.0


In [45]:
# df_duration still carries user_id_hash as its index (the reset_index() in the
# preview above was not assigned). Feather has no support for a non-default
# index, so the key is moved into a regular column before writing; otherwise
# it is not stored alongside mean_duration.
feather.write_dataframe(df_duration.reset_index(), 'duration.feather')