In [16]:
%matplotlib inline
import pyspark.sql.functions as F

In [17]:
aisles = spark.read \
    .options(header=True, inferSchema=True) \
    .csv("Instacart Kaggle/aisles.csv")
dptmts = spark.read \
    .options(header=True, inferSchema=True) \
    .csv("Instacart Kaggle/departments.csv")
prod_in_orders = spark.read \
    .options(header=True, inferSchema=True) \
    .csv("Instacart Kaggle/order_products__prior.csv")
all_orders = spark.read \
    .options(header=True, inferSchema=True) \
    .csv("Instacart Kaggle/orders.csv")
train = spark.read \
    .options(header=True, inferSchema=True) \
    .csv("Instacart Kaggle/order_products__train.csv")
products = spark.read \
    .options(header=True, inferSchema=True) \
    .csv("Instacart Kaggle/products.csv")
sample_sub = spark.read \
    .options(header=True, inferSchema=True) \
    .csv("Instacart Kaggle/sample_submission.csv")

### 1. Days since first order

The idea here was to have a feature that count how many days has passed since the user first order.

In [18]:
from pyspark.sql import Window

windowval = Window.partitionBy('user_id').orderBy('order_number').rangeBetween(Window.unboundedPreceding, 0)
all_orders = all_orders.withColumn('dspo_cum_sum', F.sum('days_since_prior_order').over(windowval))
all_orders.show(10)

+--------+-------+--------+------------+---------+-----------------+----------------------+------------+
|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_order|dspo_cum_sum|
+--------+-------+--------+------------+---------+-----------------+----------------------+------------+
| 3266887|    148|   prior|           1|        5|               16|                  null|        null|
| 3169439|    148|   prior|           2|        1|                7|                   3.0|         3.0|
| 2175302|    148|   prior|           3|        4|                5|                   3.0|         6.0|
| 3221240|    148|   prior|           4|        6|               13|                   2.0|         8.0|
|  415062|    148|   prior|           5|        2|               13|                   3.0|        11.0|
| 1759549|    148|   prior|           6|        3|               12|                   1.0|        12.0|
| 3300653|    148|   prior|           7|        6|     

### 2. Frequency of products purchased by user

Frequency in days of how often the user historicaly has purchased the product
Along with the calculations I've included the **number of times the user has purchased the product**

In [19]:
#The first join will connect the products tables
prod_full = products.join(aisles, on='aisle_id')
prod_full = prod_full.join(dptmts, on='department_id')
#The second join will connect the orders tables
order_full = prod_in_orders.join(all_orders, on='order_id')
# The third join will connect products and orders
order_prod_full = order_full.join(prod_full, on='product_id')

In [20]:
from pyspark.sql import Window
opf = order_prod_full
windowval = Window.partitionBy('user_id','product_id').orderBy('order_number').rangeBetween(Window.unboundedPreceding, 0)
opf = opf.withColumn('MIN_dspo_cum_sum', F.min('dspo_cum_sum').over(windowval))
opf = opf.withColumn('MAX_dspo_cum_sum', F.max('dspo_cum_sum').over(windowval))
opf = opf.withColumn('COUNT_dspo_cum_sum', F.count('dspo_cum_sum').over(windowval))

In [22]:
opf = opf.withColumn('DIFF_dspo_cum_sum', opf.MAX_dspo_cum_sum - opf.MIN_dspo_cum_sum)

In [23]:
opf = opf.drop('MAX_dspo_cum_sum')
opf = opf.drop('MIN_dspo_cum_sum')

In [25]:
opf = opf.withColumn('freq_user_prod', opf.DIFF_dspo_cum_sum / (opf.COUNT_dspo_cum_sum - 1))

In [26]:
opf = opf.withColumnRenamed('COUNT_dspo_cum_sum', 'count_user_prod')

In [27]:
opf.show()

+----------+--------+-----------------+---------+-------+--------+------------+---------+-----------------+----------------------+------------+-------------+--------+--------------------+------------------+----------+---------------+-----------------+------------------+
|product_id|order_id|add_to_cart_order|reordered|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_order|dspo_cum_sum|department_id|aisle_id|        product_name|             aisle|department|count_user_prod|DIFF_dspo_cum_sum|    freq_user_prod|
+----------+--------+-----------------+---------+-------+--------+------------+---------+-----------------+----------------------+------------+-------------+--------+--------------------+------------------+----------+---------------+-----------------+------------------+
|     29894| 2984707|                7|        0|      7|   prior|          18|        0|                9|                   7.0|       193.0|           13|      17|Organic Dark Brow...|