### Instacart - Model Prep

###### Lily Elizabeth John

In [39]:
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [40]:
#Read data
aisles=pd.read_csv('aisles.csv')
department=pd.read_csv('departments.csv')
products=pd.read_csv('products.csv')
order_products_train=pd.read_csv('order_products__train.csv')
order_products_prior=pd.read_csv('order_products__prior.csv')
orders=pd.read_csv('orders.csv')

In [41]:
order_products_test=orders[orders.eval_set=='test'][['user_id']]
prior_orders=orders[orders.eval_set=='prior'][['order_id','user_id']]
prior_orders_products=pd.merge(prior_orders,order_products_prior,on='order_id',how='left')
prior_orders_products=prior_orders_products.drop(['order_id','add_to_cart_order','reordered'],axis=1)
prior_orders_products.head()

Unnamed: 0,user_id,product_id
0,1,196
1,1,14084
2,1,12427
3,1,26088
4,1,26405


In [42]:
del prior_orders

In [43]:
prior_orders_products.shape

(32434489, 2)

In [44]:
prior_orders_products=prior_orders_products.drop_duplicates() 

In [45]:
prior_orders_products.shape

(13307953, 2)

In [46]:
test_user_products=pd.merge(order_products_test,prior_orders_products,on='user_id',how='left')

In [47]:
test_user_products.head()

Unnamed: 0,user_id,product_id
0,3,9387
1,3,17668
2,3,15143
3,3,16797
4,3,39190


In [48]:
del prior_orders_products,order_products_test

In [49]:
test_order_user=orders[orders.eval_set=='test'][['order_id','user_id']]
test_user_products=pd.merge(test_user_products,test_order_user,on='user_id',how='left')

In [50]:
order_products_test=test_user_products[['order_id','product_id']]
order_products_test['reordered']=''

In [51]:
products=pd.merge(products,aisles,how='left',on='aisle_id')
products=pd.merge(products,department,how='left',on='department_id')
order_products_train.drop('add_to_cart_order',axis=1)
order_products_prior.drop('add_to_cart_order',axis=1)
order_products_train['eval_set']='train'
order_products_prior['eval_set']='prior'
order_products_test['eval_set']='test'
order_products=pd.concat([order_products_train,order_products_prior])
order_products=pd.merge(order_products,products,how='left',on='product_id')

In [52]:
del aisles,department,products

In [53]:
combined=pd.merge(orders,order_products,on=['eval_set','order_id'],how='outer')

In [54]:
combined.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,2539329,1,prior,1,2,8,,196.0,1.0,0.0,Soda,77.0,7.0,soft drinks,beverages
1,2539329,1,prior,1,2,8,,14084.0,2.0,0.0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,soy lactosefree,dairy eggs
2,2539329,1,prior,1,2,8,,12427.0,3.0,0.0,Original Beef Jerky,23.0,19.0,popcorn jerky,snacks
3,2539329,1,prior,1,2,8,,26088.0,4.0,0.0,Aged White Cheddar Popcorn,23.0,19.0,popcorn jerky,snacks
4,2539329,1,prior,1,2,8,,26405.0,5.0,0.0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,paper goods,household


###### Feature Engineering

In [55]:
#Prior orders per user
prior_order_user=orders[orders.eval_set=='prior']
prior_order_count=prior_order_user.groupby(['user_id'])['order_id'].count().reset_index()
prior_order_count.columns=['user_id','order_count']
prior_order_count.head()

Unnamed: 0,user_id,order_count
0,1,10
1,2,14
2,3,12
3,4,5
4,5,4


In [56]:
#Products per order
products_per_order=order_products_prior.groupby(['order_id'])['product_id'].count().reset_index()
products_per_order.columns=['order_id','product_count']
products_per_order.head()

Unnamed: 0,order_id,product_count
0,2,9
1,3,8
2,4,13
3,5,26
4,6,3


In [57]:
#Average Time between orders per user
prior_orders=orders[orders.eval_set=='prior']
prior_order_days=prior_orders.groupby(['user_id'])['days_since_prior_order'].mean().reset_index()
prior_order_days.columns=['user_id','average_days_between_orders']
prior_order_days.head()

Unnamed: 0,user_id,average_days_between_orders
0,1,19.555556
1,2,15.230769
2,3,12.090909
3,4,13.75
4,5,13.333333


In [58]:
del order_products_train,order_products_prior,order_products_test,order_products,orders

In [59]:
combined=pd.merge(combined,prior_order_count,on='user_id',how='left')
del prior_order_count

In [60]:
combined=pd.merge(combined,products_per_order,on='order_id',how='left')
del products_per_order

In [61]:
combined=pd.merge(combined,prior_order_days,on='user_id',how='left')
del prior_order_days

###### Build XGBoost

In [62]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [63]:
combined.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,order_count,product_count,average_days_between_orders
0,2539329,1,prior,1,2,8,,196.0,1.0,0.0,Soda,77.0,7.0,soft drinks,beverages,10,5.0,19.555556
1,2539329,1,prior,1,2,8,,14084.0,2.0,0.0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,soy lactosefree,dairy eggs,10,5.0,19.555556
2,2539329,1,prior,1,2,8,,12427.0,3.0,0.0,Original Beef Jerky,23.0,19.0,popcorn jerky,snacks,10,5.0,19.555556
3,2539329,1,prior,1,2,8,,26088.0,4.0,0.0,Aged White Cheddar Popcorn,23.0,19.0,popcorn jerky,snacks,10,5.0,19.555556
4,2539329,1,prior,1,2,8,,26405.0,5.0,0.0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,paper goods,household,10,5.0,19.555556


In [64]:
train_test=combined[(combined.eval_set=='train')|(combined.eval_set=='test')]

In [65]:
del combined

In [66]:
train_test=train_test.drop(['user_id','order_number','product_id','add_to_cart_order','aisle_id','department_id'],axis=1)

In [67]:
train=train_test[train_test.eval_set=='train']
test=train_test[train_test.eval_set=='test']
test_order_id=test['order_id','product_name']
train=train.drop(['order_id','eval_set','product_name'],axis=1)
test=test.drop(['order_id','eval_set','product_name'],axis=1)

In [68]:
del train_test

In [69]:
train.shape

(1384617, 9)

In [70]:
test.shape

(75000, 9)

In [71]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1384617 entries, 59 to 33894105
Data columns (total 9 columns):
order_dow                      1384617 non-null int64
order_hour_of_day              1384617 non-null int64
days_since_prior_order         1384617 non-null float64
reordered                      1384617 non-null float64
aisle                          1384617 non-null object
department                     1384617 non-null object
order_count                    1384617 non-null int64
product_count                  0 non-null float64
average_days_between_orders    1384617 non-null float64
dtypes: float64(4), int64(3), object(2)
memory usage: 105.6+ MB


In [72]:
import gc
gc.collect()

454

In [73]:
train=pd.get_dummies(train)

In [74]:
X=train.drop(['reordered'],axis=1)
y=train['reordered']

In [75]:
seed=7
test_size=.3
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=test_size,random_state=seed)

In [76]:
model=XGBClassifier()

In [77]:
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

In [78]:
accuracy=accuracy_score(y_test,y_pred)

In [79]:
accuracy

0.66555685555122224

In [81]:
from sklearn.metrics import f1_score
f1=f1_score(y_test,y_pred)

In [82]:
f1

0.74725510636633408