In [1]:
# Reference: https://www.kaggle.com/waxbabi/light-gbm-benchmark-0-3692/code
# Implements basic content-based prediction.

In [2]:
import pandas as pd
import numpy as np
from scipy import sparse
import gensim
import re

In [9]:
# Product catalogue. Later cells read its product_id, product_name,
# aisle_id and department_id columns.
products = pd.read_csv(filepath_or_buffer='../csv/products.csv')

In [10]:
# Word2vec embedding of product names: one W2V_DIM-wide vector per product,
# indexed by product_id.
W2V_DIM = 10  # embedding width; also the width of the zero-vector fallback below

# Tokenise each name: every character other than '.', digits and ASCII letters
# becomes a space, then the name is lower-cased and split on whitespace.
pnames = [re.sub(r'[^.0-9a-zA-Z]', ' ', line).lower().split() for line in products.product_name.astype(str).values]

# NOTE(review): `size` is the gensim < 4 keyword (gensim 4 renamed it to
# `vector_size`) -- confirm against the installed gensim version.
# No seed is passed, so the vectors are presumably not reproducible between runs.
w2v = gensim.models.Word2Vec(pnames, size=W2V_DIM, window=5, workers=4, min_count=1)

# A product's vector is the mean of its token vectors. A name that tokenises
# to an empty list has no vectors to average (the lookup itself would fail),
# so it is mapped to an all-zero vector instead.
_no_token_vector = np.zeros(W2V_DIM, dtype=np.float32)
pname_vector = np.array([
    w2v.wv[pname].mean(axis=0) if pname else _no_token_vector
    for pname in pnames
])

product_w2v = pd.DataFrame(pname_vector).add_prefix('w2v_')
# Rows of pname_vector are in the same order as the rows of `products`.
product_w2v['product_id'] = products['product_id']
# Rebind rather than mutate in place so re-running the cell is harmless.
product_w2v = product_w2v.set_index('product_id')

In [11]:
# Order lines of the "prior" orders (one row per product in an order).
order_products_prior = pd.read_csv(filepath_or_buffer='../csv/order_products__prior.csv')

In [12]:
# Order lines of the "train" orders; used further down to build the labels.
order_products_train = pd.read_csv(filepath_or_buffer='../csv/order_products__train.csv')

In [41]:
# Attach the catalogue columns from `products` to every prior order line,
# matching on product_id, then turn product_id back into a regular column.
product_detail = (
    order_products_prior.set_index(keys='product_id')
    .join(other=products.set_index(keys='product_id'), how='left')
    .reset_index()
)

In [43]:
# Per-aisle reorder statistics over the prior order lines:
# sum of `reordered`, number of rows, and their ratio.
aisle_aggregations = {'reordered': ['sum', 'count']}
aisle = product_detail.groupby('aisle_id').agg(aisle_aggregations)

aisle_reorder_sum = aisle[('reordered', 'sum')]
aisle_row_count = aisle[('reordered', 'count')]
aisle[('reordered', 'ratio')] = aisle_reorder_sum / aisle_row_count

# add_prefix is applied to the two-level column labels produced by agg().
aisle = aisle.add_prefix('aisle_').reset_index()

In [50]:
# Per-department reorder statistics over the prior order lines:
# sum of `reordered`, number of rows, and their ratio.
department_aggregations = {'reordered': ['sum', 'count']}
department = product_detail.groupby('department_id').agg(department_aggregations)

department_reorder_sum = department[('reordered', 'sum')]
department_row_count = department[('reordered', 'count')]
department[('reordered', 'ratio')] = department_reorder_sum / department_row_count

# add_prefix is applied to the two-level column labels produced by agg().
department = department.add_prefix('department_').reset_index()

In [13]:
# Order headers. Later cells read order_id, user_id, eval_set and the
# per-order columns (order_number, order_dow, order_hour_of_day,
# days_since_prior_order).
orders = pd.read_csv(filepath_or_buffer='../csv/orders.csv')

In [14]:
# Prior order lines enriched with their order header, matched on order_id.
# The result stays indexed by order_id.
df = (
    order_products_prior.set_index(keys='order_id')
    .join(other=orders.set_index(keys='order_id'), how='left')
)

In [37]:
# Product x day-of-week table: number of prior order lines per
# (product_id, order_dow), 0 where a combination never occurs.
product_dow = (
    pd.pivot_table(
        df[['product_id', 'order_dow']],
        index=['product_id'],
        columns=['order_dow'],
        aggfunc=len,
        fill_value=0,
    )
    .add_prefix('pdow_')
    .reset_index()
    .set_index('product_id')
)

In [38]:
# Product x hour-of-day table: number of prior order lines per
# (product_id, order_hour_of_day), 0 where a combination never occurs.
product_hod = (
    pd.pivot_table(
        df[['product_id', 'order_hour_of_day']],
        index=['product_id'],
        columns=['order_hour_of_day'],
        aggfunc=len,
        fill_value=0,
    )
    .add_prefix('phod_')
    .reset_index()
    .set_index('product_id')
)

In [21]:
# Statistics for every (product, user) pair seen in the prior order lines.
product_user_aggregations = {
    'reordered': ['sum', 'count'],
    'order_number': ['mean'],
    'days_since_prior_order': ['mean'],
    'add_to_cart_order': ['mean'],
}
product_user = (
    df.groupby(['product_id', 'user_id'])
    .agg(product_user_aggregations)
    .reset_index()
    .set_index(['product_id', 'user_id'])
)

# Share of the pair's order lines that are flagged as reorders.
product_user[('reordered', 'ratio')] = (
    product_user[('reordered', 'sum')] / product_user[('reordered', 'count')]
)

# The prefix is 'pu' with no underscore, unlike 'p_' / 'u_' used elsewhere.
product_user = product_user.add_prefix('pu')

In [9]:
# Number of prior order lines per (product_id, user_id) pair, as a flat frame
# with columns product_id, user_id and count.
pu = pd.DataFrame(
    df.groupby(['product_id', 'user_id']).size().rename('count')
).reset_index()

In [10]:
# Copy of `pu` with the per-pair count capped at 10, so that every pair with
# more than 10 order lines falls into a single "10" bucket.
truncted_product_user = pu.copy()
exceeds_cap = truncted_product_user['count'] > 10
truncted_product_user.loc[exceeds_cap, 'count'] = 10

In [11]:
# For each product: how many (product, user) pairs fall into each capped
# count bucket, 0 where a bucket is empty.
product_level = (
    pd.pivot_table(
        truncted_product_user,
        index=['product_id'],
        columns=['count'],
        aggfunc=len,
        fill_value=0,
    )
    .reset_index()
    .set_index('product_id')
    .add_prefix('plevel_')
)

In [12]:
# For each user: how many (product, user) pairs fall into each capped
# count bucket, 0 where a bucket is empty.
user_level = (
    pd.pivot_table(
        truncted_product_user,
        index=['user_id'],
        columns=['count'],
        aggfunc=len,
        fill_value=0,
    )
    .reset_index()
    .set_index('user_id')
    .add_prefix('ulevel_')
)

In [13]:
# Per-product statistics over the prior order lines.
product_aggregations = {
    'order_id': ['nunique'],
    'user_id': ['nunique'],
    'reordered': ['sum', 'count'],
    'add_to_cart_order': ['mean'],
    'days_since_prior_order': ['mean'],
}
product_v = (
    df.reset_index()  # order_id is the index of df; make it a column so it can be aggregated
    .groupby('product_id')
    .agg(product_aggregations)
    .reset_index()
    .set_index('product_id')
)

# Share of the product's order lines that are flagged as reorders.
product_v[('reordered', 'ratio')] = (
    product_v[('reordered', 'sum')] / product_v[('reordered', 'count')]
)
product_v = product_v.add_prefix('p_')

In [14]:
# Per-user statistics over the prior order lines.
user_aggregations = {
    'days_since_prior_order': ['mean'],
    'order_id': ['nunique'],
    'product_id': ['nunique'],
    'reordered': ['sum', 'count'],
}
user_v = (
    df.reset_index()  # order_id is the index of df; make it a column so it can be aggregated
    .groupby('user_id')
    .agg(user_aggregations)
    .reset_index()
    .set_index('user_id')
)

# Share of the user's order lines that are flagged as reorders.
user_v[('reordered', 'ratio')] = (
    user_v[('reordered', 'sum')] / user_v[('reordered', 'count')]
)
user_v = user_v.add_prefix('u_')

In [15]:
# Positive labels: every (product_id, user_id) pair that appears in a
# 'train' order gets label = 1. The frame is indexed by that pair.
is_train_order = orders.eval_set == 'train'
label = (
    orders[is_train_order]
    .set_index('order_id')
    .join(order_products_train.set_index('order_id'))[['product_id', 'user_id']]
    .set_index(['product_id', 'user_id'])
)
label = label.assign(label=1)

In [16]:
# Training matrix: one row per (train-order user, product the user has in
# their prior history), with the pair, product and user feature tables
# joined on.
train = (
    orders[orders.eval_set == 'train']
    .set_index('user_id')
    # Candidate products: every product_user row belonging to the user.
    .join(product_user.reset_index().set_index('user_id'))
    .rename(columns={('product_id', ''): 'product_id'})
    .reset_index()
    # Labels: pairs missing from `label` stay NaN here and become 0 in the
    # final fillna.
    .set_index(['product_id', 'user_id'])
    .join(label)
    .reset_index()
    # Product-level features.
    .set_index('product_id')
    .join(product_level)
    .join(product_v)
    .join(products[['product_id', 'aisle_id', 'department_id']].set_index('product_id'))
    .reset_index()
    # User-level features.
    .set_index('user_id')
    .join(user_level)
    .join(user_v)
    .reset_index()
    # Every remaining missing value (not only the label) is zero-filled.
    .fillna(0)
)



In [17]:
# Persist the training matrix as CSV (the row index is written as well).
train.to_csv(path_or_buf='../data/train.data')

In [18]:
# Test matrix: same construction as the training matrix but for 'test'
# orders, and without a label column.
test = (
    orders[orders.eval_set == 'test']
    .set_index('user_id')
    # Candidate products: every product_user row belonging to the user.
    .join(product_user.reset_index().set_index('user_id'))
    .rename(columns={('product_id', ''): 'product_id'})
    .reset_index()
    # Product-level features.
    .set_index('product_id')
    .join(product_level)
    .join(product_v)
    .join(products[['product_id', 'aisle_id', 'department_id']].set_index('product_id'))
    .reset_index()
    # User-level features.
    .set_index('user_id')
    .join(user_level)
    .join(user_v)
    .reset_index()
    # Every remaining missing value is zero-filled.
    .fillna(0)
)

In [19]:
# Persist the test matrix as CSV (the row index is written as well).
test.to_csv(path_or_buf='../data/test.data')