# Reference: https://www.kaggle.com/waxbabi/light-gbm-benchmark-0-3692/code
# Implement basic content-based prediction.

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import gensim
import re

In [2]:
# Load the product catalogue; later cells read its product_id, product_name,
# aisle_id and department_id columns.
products = pd.read_csv(filepath_or_buffer='../csv/products.csv')

In [3]:
# Build a small word2vec embedding of the product names and average the word
# vectors of each name into one fixed-length vector per product.

# Tokenise: every character other than '.', digits and ASCII letters becomes
# a space, then the name is lower-cased and split on whitespace.
pnames = [re.sub(r'[^.0-9a-zA-Z]', ' ', line).lower().split() for line in products.product_name.astype(str).values]
# NOTE(review): no seed is passed, so the embedding is presumably not
# reproducible between runs -- confirm whether that matters downstream.
w2v = gensim.models.Word2Vec(pnames, size=10, window=5, workers=4, min_count=1)
# A name consisting solely of separator characters tokenises to an empty
# list, and looking up an empty list of words fails; such products get an
# all-zero vector instead of aborting the whole cell.
empty_name_vector = np.zeros(w2v.vector_size, dtype=np.float32)
pname_vector = np.array([
    w2v.wv[pname].mean(axis=0) if pname else empty_name_vector
    for pname in pnames
])
product_w2v = pd.DataFrame(pname_vector).add_prefix('w2v_')
# Both frames carry the default positional index at this point, so this
# assignment lines the product ids up row by row.
product_w2v['product_id'] = products['product_id']
product_w2v.set_index('product_id', inplace=True)

In [4]:
# Load the order lines of all prior orders (one row per product in an order).
order_products_prior = pd.read_csv(filepath_or_buffer='../csv/order_products__prior.csv')

In [5]:
# Load the order lines of the train orders; they are used to build the label.
order_products_train = pd.read_csv(filepath_or_buffer='../csv/order_products__train.csv')

In [6]:
# Attach the catalogue columns of each product to every prior order line,
# then turn product_id back into an ordinary column.
product_detail = (
    order_products_prior
    .set_index('product_id')
    .join(products.set_index('product_id'))
    .reset_index()
)

In [78]:
# Per-aisle reorder statistics over the prior order lines: sum and count of
# the `reordered` flag, plus their ratio.
aisle = product_detail.groupby('aisle_id').agg({'reordered': ['sum', 'count']})
# The aggregation yields two-level column labels, hence the tuple keys.
aisle[('reordered', 'ratio')] = (
    aisle[('reordered', 'sum')] / aisle[('reordered', 'count')]
)
aisle = (
    aisle
    .add_prefix('aisle_')
    .reset_index()
    .set_index('aisle_id')
)

In [79]:
# Per-department reorder statistics over the prior order lines: sum and count
# of the `reordered` flag, plus their ratio.
department = product_detail.groupby('department_id').agg({'reordered': ['sum', 'count']})
# The aggregation yields two-level column labels, hence the tuple keys.
department[('reordered', 'ratio')] = (
    department[('reordered', 'sum')] / department[('reordered', 'count')]
)
department = (
    department
    .add_prefix('department_')
    .reset_index()
    .set_index('department_id')
)

In [9]:
# Load the order headers; `eval_set` is used further down to separate the
# train and test orders.
orders = pd.read_csv(filepath_or_buffer='../csv/orders.csv')

In [10]:
# One row per prior order line, enriched with the columns of its order.
# The result stays indexed by order_id.
df = (
    order_products_prior
    .set_index('order_id')
    .join(orders.set_index('order_id'))
)

In [11]:
# Share of each product's prior purchases that fell on each day of the week.
# The rows are counted explicitly with groupby().size() rather than with
# pivot_table(aggfunc=len) on a frame holding only the two key columns: that
# call has no value column left to aggregate, so what it returns depends on
# the installed pandas version.
temp = df.groupby(['product_id', 'order_dow']).size().unstack(fill_value=0)
# Normalise every product's row so that its pdow_* columns sum to 1; the
# result is indexed by product_id.
product_dow = temp.div(temp.sum(axis=1), axis=0).add_prefix('pdow_')

In [14]:
# Share of each product's prior purchases that fell in each hour of the day.
# The rows are counted explicitly with groupby().size() rather than with
# pivot_table(aggfunc=len) on a frame holding only the two key columns: that
# call has no value column left to aggregate, so what it returns depends on
# the installed pandas version.
temp = df.groupby(['product_id', 'order_hour_of_day']).size().unstack(fill_value=0)
# Normalise every product's row so that its phod_* columns sum to 1; the
# result is indexed by product_id.
product_hod = temp.div(temp.sum(axis=1), axis=0).add_prefix('phod_')

In [16]:
# Statistics for every (product, user) pair seen in the prior orders.
product_user = (
    df.groupby(['product_id', 'user_id'])
    .agg({
        'reordered': ['sum', 'count'],
        'order_number': ['mean'],
        'days_since_prior_order': ['mean'],
        'add_to_cart_order': ['mean'],
    })
    .reset_index()
    .set_index(['product_id', 'user_id'])
)
# Fraction of the pair's purchases that were flagged as reorders.
product_user[('reordered', 'ratio')] = (
    product_user[('reordered', 'sum')] / product_user[('reordered', 'count')]
)
# NOTE(review): the prefix is 'pu' without a trailing underscore, unlike the
# 'p_' / 'u_' prefixes used for the product and user tables. It is kept as-is
# because the names of the exported columns depend on it.
product_user = product_user.add_prefix('pu')

In [17]:
# Number of prior order lines for every (product, user) pair, as a flat frame
# with the columns product_id, user_id and count.
pu = df.groupby(['product_id', 'user_id']).size().reset_index(name='count')

In [18]:
# Copy of the pair counts with every count above 10 capped at 10, so the
# level tables built from it have at most ten count buckets.
truncted_product_user = pu.copy()
truncted_product_user['count'] = truncted_product_user['count'].clip(upper=10)

In [19]:
# For every product: how many (product, user) pairs fall into each capped
# purchase-count bucket.
product_level = (
    pd.pivot_table(
        truncted_product_user,
        index=['product_id'],
        columns=['count'],
        aggfunc=len,
        fill_value=0,
    )
    .reset_index()
    .set_index('product_id')
    .add_prefix('plevel_')
)

In [20]:
# For every user: how many (product, user) pairs fall into each capped
# purchase-count bucket.
user_level = (
    pd.pivot_table(
        truncted_product_user,
        index=['user_id'],
        columns=['count'],
        aggfunc=len,
        fill_value=0,
    )
    .reset_index()
    .set_index('user_id')
    .add_prefix('ulevel_')
)

In [21]:
# Product-level statistics over the prior orders. reset_index() turns
# order_id (the index of df) into a column so it can be aggregated too.
product_v = (
    df.reset_index()
    .groupby('product_id')
    .agg({
        'order_id': ['nunique'],
        'user_id': ['nunique'],
        'reordered': ['sum', 'count'],
        'add_to_cart_order': ['mean'],
        'days_since_prior_order': ['mean'],
    })
    .reset_index()
    .set_index('product_id')
)
# Fraction of the product's purchases that were flagged as reorders.
product_v[('reordered', 'ratio')] = (
    product_v[('reordered', 'sum')] / product_v[('reordered', 'count')]
)
product_v = product_v.add_prefix('p_')

In [22]:
# User-level statistics over the prior orders. reset_index() turns order_id
# (the index of df) into a column so it can be aggregated too.
user_v = (
    df.reset_index()
    .groupby('user_id')
    .agg({
        'days_since_prior_order': ['mean'],
        'order_id': ['nunique'],
        'product_id': ['nunique'],
        'reordered': ['sum', 'count'],
    })
    .reset_index()
    .set_index('user_id')
)
# Fraction of the user's purchases that were flagged as reorders.
user_v[('reordered', 'ratio')] = (
    user_v[('reordered', 'sum')] / user_v[('reordered', 'count')]
)
user_v = user_v.add_prefix('u_')

In [23]:
# Positive examples: every (product, user) pair that occurs in a train order.
# The frame is indexed by that pair and holds a single constant column.
label = (
    orders[orders.eval_set == 'train']
    .set_index('order_id')
    .join(order_products_train.set_index('order_id'))
    .loc[:, ['product_id', 'user_id']]
    .set_index(['product_id', 'user_id'])
)
label['label'] = 1

In [None]:
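# Feature tables built above, grouped by the level they describe:
#   (product, user) pair: product_user
#   product: product_v, product_level, product_w2v, aisle, department, product_dow, product_hod
#   user: user_level, user_v
#   target: label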

In [87]:
# Gather every product-level feature table into one frame indexed by
# product_id. The aisle and department statistics are attached through the
# product's aisle_id / department_id, after which the raw catalogue columns
# are dropped.
product_all = (
    products.set_index('product_id')
    .join(product_v)
    .join(product_level)
    .join(product_w2v)
    .join(product_dow)
    .join(product_hod)
    .reset_index()
    .set_index('aisle_id')
    .join(aisle)
    .reset_index()
    .set_index('department_id')
    .join(department)
    .reset_index()
    .set_index('product_id')
    .drop(['product_name', 'aisle_id', 'department_id'], axis=1)
)

In [89]:
# Gather the user-level feature tables into one frame indexed by user_id.
user_all = user_v.join(user_level, how='left')

In [90]:
# Training matrix: for every user with a train order, one row per product the
# user bought in the prior orders, with the label and the product-level and
# user-level features joined on.
train = (
    orders[orders.eval_set == 'train']
    .set_index('user_id')
    .join(product_user.reset_index().set_index('user_id'))
    # The join leaves the product id under a tuple label; give it a plain name.
    .rename(columns={('product_id', ''): 'product_id'})
    .reset_index()
    .set_index(['product_id', 'user_id'])
    .join(label)
    .reset_index()
    .set_index('product_id')
    .join(product_all)
    .reset_index()
    .set_index('user_id')
    .join(user_all)
    .reset_index()
    # Pairs missing from `label` become label 0 here; every other missing
    # value is filled with 0 as well.
    .fillna(0)
)

In [91]:
# Write the training matrix to disk, formatting floats with four decimals.
train.to_csv(path_or_buf='../data/train.data', float_format='%.4f')

In [93]:
# Test matrix: for every user with a test order, one row per product the user
# bought in the prior orders, with the product-level and user-level features
# joined on. No label is joined here.
test = (
    orders[orders.eval_set == 'test']
    .set_index('user_id')
    .join(product_user.reset_index().set_index('user_id'))
    # The join leaves the product id under a tuple label; give it a plain name.
    .rename(columns={('product_id', ''): 'product_id'})
    .reset_index()
    .set_index('product_id')
    .join(product_all)
    .reset_index()
    .set_index('user_id')
    .join(user_all)
    .reset_index()
    # Every missing value is filled with 0.
    .fillna(0)
)

In [94]:
# Write the test matrix to disk, formatting floats with four decimals.
test.to_csv(path_or_buf='../data/test.data', float_format='%.4f')