In [1]:
import pandas as pd
import numpy as np

# Constructing the target variable

In [2]:
# Load the events table, keeping only the columns that have a declared dtype below.
event_path = '../events.csv'

# Compact dtypes: unsigned 64-bit ids/timestamps, 32-bit floats, categoricals for
# the repeated string-valued columns.
types = {
    'session_id': np.uint64,
    'event': 'category',
    'event_timestamp': np.uint64,
    'event_value': np.float32,
    'user_id_hash': 'category',
}
events = pd.read_csv(event_path, dtype=types, usecols=list(types))

In [3]:
# Users with at least one purchase event (event == '8') strictly after the
# Dec 1st 12 AM cutoff; an event stamped exactly at the cutoff is not included.
dec_1_epoch = 1543622400000  # 2018-12-01 00:00:00 UTC when read as epoch milliseconds

after_dec_1 = events['event_timestamp'] > dec_1_epoch  # boolean row mask
val = events.loc[after_dec_1]
purchase_after_dec_1 = val.loc[val['event'] == '8']
user_purchased_after_dec_1 = purchase_after_dec_1.loc[:, 'user_id_hash'].unique()

In [4]:
# Users with at least one purchase event (event == '8') strictly after the
# Dec 8th 12 AM cutoff; an event stamped exactly at the cutoff is not included.
dec_8_epoch = 1544227200000  # 2018-12-08 00:00:00 UTC when read as epoch milliseconds

after_dec_8 = events['event_timestamp'] > dec_8_epoch  # boolean row mask
val = events.loc[after_dec_8]
purchase_after_dec_8 = val.loc[val['event'] == '8']
user_purchased_after_dec_8 = purchase_after_dec_8.loc[:, 'user_id_hash'].unique()

In [5]:
# Calculate the necessary values for the target dataframe: one
# (user_id_hash, purchased_after_dec_1, purchased_after_dec_8) tuple per user.
all_users = events['user_id_hash'].unique()

# Build hash sets once so each membership test below is O(1). Testing
# `x in <array of purchasers>` for every user rescans that array each time,
# which makes this cell quadratic in the number of users.
purchasers_after_dec_1 = set(user_purchased_after_dec_1)
purchasers_after_dec_8 = set(user_purchased_after_dec_8)

# NOTE(review): the second flag (purchase after Dec 1) is later labelled
# 'two_week_purchase' and the third (purchase after Dec 8) 'one_week_purchase',
# so the one-week label covers the window that starts at Dec 8, not the first
# week after Dec 1 -- confirm this is the intended definition.
target = [(x, x in purchasers_after_dec_1, x in purchasers_after_dec_8)
          for x in all_users]

In [6]:
# One row per user: the id followed by the two boolean purchase labels.
target_columns = ['user_id_hash', 'two_week_purchase', 'one_week_purchase']
target_df = pd.DataFrame(data=target, columns=target_columns)

In [8]:
# Persist the label table to the working directory, without the row index column.
validation_path = 'validation_set.csv'
target_df.to_csv(validation_path, index=False)

# Basic Prediction
### Based on whether or not a user has made a purchase in the past

In [10]:
# Users with at least one purchase event (event == '8') strictly before the
# Dec 1 cutoff.
dec_1_epoch = 1543622400000  # 2018-12-01 00:00:00 UTC when read as epoch milliseconds

# Rows strictly before the cutoff (the row mask is built inline).
before_dec_1 = events[events['event_timestamp'] < dec_1_epoch]
purchased_before_dec_1 = before_dec_1.loc[before_dec_1['event'] == '8']

user_purchased_before_dec_1 = purchased_before_dec_1.loc[:, 'user_id_hash'].unique()

In [32]:
# Users with at least one purchase event (event == '8') strictly before the
# Dec 15 cutoff.
dec_15_epoch = 1544832000000  # 2018-12-15 00:00:00 UTC when read as epoch milliseconds

# Rows strictly before the cutoff (the row mask is built inline).
before_dec_15 = events[events['event_timestamp'] < dec_15_epoch]
purchased_before_dec_15 = before_dec_15.loc[before_dec_15['event'] == '8']

user_purchased_before_dec_15 = purchased_before_dec_15.loc[:, 'user_id_hash'].unique()

In [34]:
# Flag, per user, whether they already had a purchase before each cutoff.
# `Series.isin` performs the membership test as one vectorised hash lookup;
# `apply(lambda x: x in <array>)` re-scanned the purchaser array for every row.

# Feature as of the Dec 1 cutoff -- stored alongside the labels in target_df.
target_df['previously_purchased'] = target_df['user_id_hash'].isin(user_purchased_before_dec_1)

# Feature as of the Dec 15 cutoff, in the same row order as target_df.
# NOTE: despite the name, `train_df` is not what the classifiers are fitted on;
# it is the frame later passed to predict_proba.
train_df = pd.DataFrame(
    {'previously_purchased': target_df['user_id_hash'].isin(user_purchased_before_dec_15)})

In [16]:
# target_df

In [18]:
# Preview the first five rows of the label/feature table.
target_df.head(n=5)

Unnamed: 0,user_id_hash,two_week_purchase,one_week_purchase,previously_purchased
0,9943447915df3a45fd6720a026af905b6da6b56a37701b...,False,False,True
1,43f75f8042d3c80c45e222bdd09267f4584684c54d6fae...,False,False,False
2,999524249720812f2d8c0390293efd58e1ac84d587a01c...,False,False,False
3,4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2...,False,False,False
4,dc009148ee26d658e0240c7b7f6a258790a457737f96e8...,False,False,False


In [24]:
# Design matrix with a single column (previously_purchased), shaped (n_rows, 1),
# plus one label vector per prediction horizon.
X = target_df['previously_purchased'].values[:, np.newaxis]
y_one_week = target_df.loc[:, 'one_week_purchase']
y_two_week = target_df.loc[:, 'two_week_purchase']

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import auc

In [26]:
# Fit a default-parameter decision tree on the one-week label. fit() returns the
# estimator itself, so construction and fitting are chained; the bare name on the
# last line displays the fitted estimator.
dt_cf = DecisionTreeClassifier().fit(X, y_one_week)
dt_cf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [35]:
# Class probabilities from the one-week model for the train_df feature,
# passed as an (n_rows, 1) array.
one_week_pred = dt_cf.predict_proba(train_df[['previously_purchased']].values)

In [38]:
# Display the array returned by dt_cf.predict_proba above.
one_week_pred

array([[0.92428006, 0.07571994],
       [0.99861147, 0.00138853],
       [0.99861147, 0.00138853],
       ...,
       [0.99861147, 0.00138853],
       [0.99861147, 0.00138853],
       [0.99861147, 0.00138853]])

In [39]:
# Fit a second default-parameter decision tree, this time on the two-week label.
# fit() returns the estimator itself; the bare name on the last line displays it.
dt_cf2 = DecisionTreeClassifier().fit(X, y_two_week)
dt_cf2

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [40]:
# Class probabilities from the two-week model for the train_df feature,
# passed as an (n_rows, 1) array.
two_week_pred = dt_cf2.predict_proba(train_df[['previously_purchased']].values)

In [41]:
# Display the array returned by dt_cf2.predict_proba above.
two_week_pred

array([[0.86098077, 0.13901923],
       [0.99729263, 0.00270737],
       [0.99729263, 0.00270737],
       ...,
       [0.99729263, 0.00270737],
       [0.99729263, 0.00270737],
       [0.99729263, 0.00270737]])

In [43]:
# Keep only column 1 of each predict_proba result (one value per user).
# The slices get their own names so the 2-D `one_week_pred` / `two_week_pred`
# arrays stay intact and this cell can be re-run: rebinding those names to
# 1-D lists made a second run fail when indexing `pair[1]` on a scalar.
# NOTE(review): column 1 is assumed to be the "purchased" (True) class, i.e.
# classes_ == [False, True] on both models -- confirm.
one_week_purchase_prob = np.asarray(one_week_pred)[:, 1]
two_week_purchase_prob = np.asarray(two_week_pred)[:, 1]

# Submission frame: one row per user, in the order of `all_users`.
prediction = pd.DataFrame({'user_id_hash': all_users,
                           'user_purchase_binary_7_days': one_week_purchase_prob,
                           'user_purchase_binary_14_days': two_week_purchase_prob})

In [45]:
# Write the submission to the working directory (as is done for validation_set.csv)
# rather than to an absolute path inside one user's home directory, which fails on
# any machine where that directory does not exist.
prediction.to_csv('prediction.csv', index=False)

In [46]:
# Sanity check: number of rows in the submission frame.
prediction.shape[0]

621001

In [48]:
# Sanity check: number of distinct users collected from the events table.
all_users.shape[0]

621001

In [53]:
# Sanity check: distinct user ids in the events table (a missing value, if any,
# is counted as one distinct entry, matching len(unique())).
events['user_id_hash'].nunique(dropna=False)

621001

In [55]:
# Display the distinct user ids found in the events table.
events.loc[:, 'user_id_hash'].unique()

[9943447915df3a45fd6720a026af905b6da6b56a37701b..., 43f75f8042d3c80c45e222bdd09267f4584684c54d6fae..., 999524249720812f2d8c0390293efd58e1ac84d587a01c..., 4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2..., dc009148ee26d658e0240c7b7f6a258790a457737f96e8..., ..., b8df5637f865257165270f06128d9f363db70dafbe1921..., 48c437d19ea514f48e794f387d30b943b8177c7801f14f..., 3634f7c91added0bee9e3516ddeb865cae4a2cd9bf994f..., 3af850ac2e9ee39ae40f0e1297cef53eb97e383ba92959..., 62f2865f7fb8782548d151de539e8d93cf8a5aac1b380f...]
Length: 621001
Categories (621001, object): [9943447915df3a45fd6720a026af905b6da6b56a37701b..., 43f75f8042d3c80c45e222bdd09267f4584684c54d6fae..., 999524249720812f2d8c0390293efd58e1ac84d587a01c..., 4e6bc35cf7fd79a5312047651e7865915f4a6bec193cf2..., ..., 48c437d19ea514f48e794f387d30b943b8177c7801f14f..., 3634f7c91added0bee9e3516ddeb865cae4a2cd9bf994f..., 3af850ac2e9ee39ae40f0e1297cef53eb97e383ba92959..., 62f2865f7fb8782548d151de539e8d93cf8a5aac1b380f...]

In [56]:
# Set of user ids present in the events table; de-duplicating with unique()
# first yields the same set without walking every event row in Python.
event_users = set(events['user_id_hash'].unique())

In [57]:
# Load the sample submission file.
# NOTE(review): this is an absolute, user-specific path and the frame name is a
# placeholder; both are kept as-is because a later cell reads `asdfsa` and the
# file's portable location is not known from this notebook.
sample_submission_path = '/Users/jon_ross/Downloads/sample_submission_2.csv'
asdfsa = pd.read_csv(sample_submission_path)

In [61]:
# Set of user ids listed in the sample submission.
sample_users = {user for user in asdfsa['user_id_hash']}