## Install Spark and Packages, Initialize Spark

In [None]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark3.0.0
!wget -q https://www-us.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
# unzip it
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
# install findspark 
!pip install -q findspark

In [None]:
import os

# Tell the JVM launcher and findspark where the Java 8 and Spark 3.0.1 installs live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.1-bin-hadoop2.7",
})

In [None]:
import findspark
# Locate the Spark installation (via SPARK_HOME) and make the pyspark package importable.
findspark.init()

In [None]:
import numpy as np
import pandas as pd 
import warnings
import zipfile
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.font_manager import FontProperties
from pyspark.sql import functions as F
from pyspark.sql import Window

# Show every column and every row whenever a pandas frame is displayed.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session in local mode, with the
# broadcast timeout set to "1000".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Feature Engineering(Counts & Ratio)")
    .config("spark.sql.broadcastTimeout", "1000")
    .getOrCreate()
)

## Read Data

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (the data files below are read from there);
# force_remount=True requests a fresh mount even when one already exists.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# All four input tables are CSV files with a header row in the same Drive folder.
_DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/data/data_format1"


def _load_table(file_name):
    """Read one header-bearing CSV from the data folder; no schema is inferred, so every column is a string."""
    return spark.read.option("header", True).csv(_DATA_DIR + "/" + file_name)


train = _load_table("train_format1.csv")
test = _load_table("test_format1.csv")
train_info = _load_table("user_info_format1.csv")
# The log calls the merchant "seller_id"; rename it so it matches the other tables.
train_log = _load_table("user_log_format1.csv").withColumnRenamed("seller_id", "merchant_id")

In [None]:
# Fold age bucket 8 into bucket 7, then default a missing age to 0 and a
# missing gender to 2, and drop the raw age_range column.
_age_range = F.col("age_range")
train_info_0 = (
    train_info
    .withColumn("age", F.when(_age_range == 8, 7).otherwise(_age_range))
    .fillna({"age": 0, "gender": 2})
    .drop("age_range")
)

In [None]:
# Merge action type 0 into action type 1; every other action type is kept as is.
# The recoded column replaces the original and ends up as the last column.
_recoded_action = F.when(F.col("action_type") == 0, 1).otherwise(F.col("action_type"))
train_log_0 = train_log.select(
    *[name for name in train_log.columns if name != "action_type"],
    _recoded_action.alias("action_type"),
)

## Show Data

In [None]:
# Preview the labelled (user_id, merchant_id) training pairs.
train.show()

+-------+-----------+-----+
|user_id|merchant_id|label|
+-------+-----------+-----+
|  34176|       3906|    0|
|  34176|        121|    0|
|  34176|       4356|    1|
|  34176|       2217|    0|
| 230784|       4818|    0|
| 362112|       2618|    0|
|  34944|       2051|    0|
| 231552|       3828|    1|
| 231552|       2124|    0|
| 232320|       1168|    0|
| 232320|       4270|    0|
| 167040|        671|    0|
| 101760|       1760|    0|
| 298368|       2981|    0|
|  36480|       4730|    0|
| 299136|       2935|    0|
|  37248|       2615|    0|
| 103296|       2482|    0|
| 299904|       1742|    0|
|  38016|       1028|    0|
+-------+-----------+-----+
only showing top 20 rows



In [None]:
# Preview the raw user profile table (age_range and gender, before cleaning).
train_info.show()

+-------+---------+------+
|user_id|age_range|gender|
+-------+---------+------+
| 376517|        6|     1|
| 234512|        5|     0|
| 344532|        5|     0|
| 186135|        5|     0|
|  30230|        5|     0|
| 272389|        6|     1|
| 281071|        4|     0|
| 139859|        7|     0|
| 198411|        5|     1|
|  67037|        4|     1|
| 149002|        5|     2|
|   7468|        4|     0|
|  94292|        4|     0|
| 347414|        6|     1|
| 191719|        4|     0|
| 391524|        5|     1|
| 153790|        6|     0|
| 349112|        3|     1|
| 344766|        6|     0|
|  81816|        5|     0|
+-------+---------+------+
only showing top 20 rows



In [None]:
# Preview the raw activity log ordered by merchant, then user. The ids were read
# as strings (no schema inference), so the ordering is lexicographic, not numeric.
train_log.orderBy("merchant_id", "user_id").show()

+-------+-------+------+-----------+--------+----------+-----------+
|user_id|item_id|cat_id|merchant_id|brand_id|time_stamp|action_type|
+-------+-------+------+-----------+--------+----------+-----------+
|    100|1041304|   420|          1|    1662|      1108|          0|
|    100| 472260|   420|          1|    1662|      1108|          0|
|    100| 912479|   993|          1|    1662|      1018|          0|
|    100|  24620|   420|          1|    1662|      1008|          3|
|    100| 912479|   993|          1|    1662|      1016|          0|
|    100|1008023|   629|          1|    1662|      1008|          0|
|    100| 912479|   993|          1|    1662|      1016|          0|
|    100| 918789|   629|          1|    1662|      1008|          0|
|    100| 912479|   993|          1|    1662|      1016|          0|
|    100| 912479|   993|          1|    1662|      1020|          0|
|    100|  83998|   420|          1|    1662|      1108|          0|
|    100|1008023|   629|          

## Feature Engineering (Merchant Profile)

### Count Features

In [51]:
# rows to columns (pivot)
from pyspark.sql import functions as F

def dfPivot(df, keys, column):
    '''
    Pivot `column` into one count column per distinct value (rows to columns).

    params:
      df: dataframe to pivot
      keys: key column(s) of the source table to keep as the row identity, passed as a list
      column: name of the column whose values become the new columns

    returns: one row per `keys` combination; each new column holds the count of
      `column` for that value, with 0 where the combination has no rows
    '''
    grouped = df.groupBy(keys)
    per_value_counts = grouped.pivot(column).agg(F.count(column))
    # A key combination with no rows for some value comes out of the pivot as
    # null; report those cells as 0. Drop the fillna if nulls should be kept.
    return per_value_counts.fillna(0)

In [52]:
keys = ["merchant_id", "time_stamp"]
column = "action_type"

# One row per (merchant, day) with the number of log rows of each action type.
# The pivot names its columns after the action codes; give them readable names
# (code 1 also contains the former code 0, which was merged into it earlier).
_action_cnt_names = {"1": "clk_atc_cnt", "2": "buy_cnt", "3": "atf_cnt"}

act_cnt_ratio_0 = dfPivot(train_log_0, keys, column)
for _action_code, _cnt_name in _action_cnt_names.items():
    act_cnt_ratio_0 = act_cnt_ratio_0.withColumnRenamed(_action_code, _cnt_name)

In [53]:
def monthly_action_counts(profile, profile_id, act_cnt_log):
  """
  Aggregate per-day action counts into total and per-month counts per profile.

  params:
    profile: prefix for the generated column names (e.g. "m" for merchant)
    profile_id: column name (or list of names) to group by
    act_cnt_log: dataframe with a `time_stamp` column holding zero-padded
      "MMDD" strings and the count columns `clk_atc_cnt`, `buy_cnt`, `atf_cnt`

  returns: one row per `profile_id` with, in this order,
    `<profile>_ttl_<action>_cnt` for the three actions, followed by
    `<profile>_<action>_cnt_m5` ... `_m11` for each action. Any other column
    that `groupBy(...).sum()` aggregates keeps its generated `sum(...)` name.

  All monthly columns are added in a single select and all renames are applied
  in a single toDF, instead of one chained withColumn / withColumnRenamed call
  per column, so the query plan does not grow by one projection per column.
  """
  action_cols = ("clk_atc_cnt", "buy_cnt", "atf_cnt")
  # (column suffix, first day, last day) as "MMDD" strings, compared as text.
  # November has no upper bound: everything from "1101" onwards counts as m11.
  month_windows = (
      ("m5", "0501", "0531"),
      ("m6", "0601", "0630"),
      ("m7", "0701", "0731"),
      ("m8", "0801", "0831"),
      ("m9", "0901", "0930"),
      ("m10", "1001", "1031"),
      ("m11", "1101", None),
  )
  day = F.col("time_stamp")

  def in_window(first_day, last_day):
    # Boolean column: does the row's day fall inside the given month window?
    condition = day >= first_day
    if last_day is not None:
      condition = condition & (day <= last_day)
    return condition

  # Monthly copy of each count: the day's count inside the window, 0 outside it.
  monthly_names = []
  monthly_cols = []
  for action in action_cols:
    for suffix, first_day, last_day in month_windows:
      name = "{}_{}_{}".format(profile, action, suffix)
      monthly_names.append(name)
      monthly_cols.append(
          F.when(in_window(first_day, last_day), F.col(action)).otherwise(0).alias(name)
      )

  summed = act_cnt_log.select("*", *monthly_cols).groupBy(profile_id).sum()

  # groupBy().sum() names its outputs "sum(<column>)"; map them to final names.
  renames = {}
  for action in action_cols:
    renames["sum({})".format(action)] = "{}_ttl_{}".format(profile, action)
  for name in monthly_names:
    renames["sum({})".format(name)] = name

  return summed.toDF(*[renames.get(name, name) for name in summed.columns])

In [54]:
# Collapse the per-day counts into one row per merchant: totals plus monthly counts (m5..m11).
# merchant_id is a string column, so orderBy sorts it as text (1, 10, 100, 1000, ...), as the sample below shows.
act_cnt_ratio_1 = monthly_action_counts("m", "merchant_id", act_cnt_ratio_0).orderBy("merchant_id")

# +-----------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+
# |merchant_id|m_ttl_clk_atc_cnt|m_ttl_buy_cnt|m_ttl_atf_cnt|m_clk_atc_cnt_m5|m_clk_atc_cnt_m6|m_clk_atc_cnt_m7|m_clk_atc_cnt_m8|m_clk_atc_cnt_m9|m_clk_atc_cnt_m10|m_clk_atc_cnt_m11|m_buy_cnt_m5|m_buy_cnt_m6|m_buy_cnt_m7|m_buy_cnt_m8|m_buy_cnt_m9|m_buy_cnt_m10|m_buy_cnt_m11|m_atf_cnt_m5|m_atf_cnt_m6|m_atf_cnt_m7|m_atf_cnt_m8|m_atf_cnt_m9|m_atf_cnt_m10|m_atf_cnt_m11|
# +-----------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+
# |          1|           308680|        17705|        12755|           24290|           25416|           25408|           25102|           35260|            38013|           135191|        2329|        1408|        1320|        1353|        2143|         1558|         7594|        1521|         828|         868|        1220|        1789|         2235|         4294|
# |         10|            19189|         1133|          866|             141|             323|              68|             302|            1407|             2008|            14940|          23|          24|           1|          14|          29|           45|          997|          17|           6|           3|          26|          73|          143|          598|
# |        100|             4062|          538|          181|               0|              23|              23|               7|             494|             1723|             1792|           0|           1|           2|           1|          68|          215|          251|           0|           2|           2|           1|          25|           87|           64|
# |       1000|            10513|          959|          781|             172|             798|             962|            1394|            1010|             1383|             4794|          27|          30|          80|         111|          83|           84|          544|          51|          70|          83|         100|          90|          137|          250|
# |       1001|             2412|          196|          116|             281|             494|             146|             280|             268|              244|              699|          68|          34|          15|           8|          15|           12|           44|          27|          20|          12|          18|           9|           11|           19|
# |       1002|             4788|          386|          222|             239|             680|             538|             617|             489|              317|             1908|          35|          58|          55|          49|          42|           33|          114|          19|          27|          16|          32|          22|           20|           86|
# |       1003|             1896|          173|          125|              80|             256|              55|              84|             263|              397|              761|          10|          17|           4|           8|          19|           38|           77|           7|          16|           3|           2|          19|           36|           42|
# |       1004|             9370|          337|          729|            1081|            2354|            1071|             536|             905|             2059|             1364|          36|          71|          32|          16|          29|           76|           77|          60|         165|          86|          48|          78|          208|           84|
# |       1005|             2315|          305|          136|              88|             205|             292|             228|             153|              843|              506|          21|          26|          28|          27|          11|          162|           30|          10|          12|           7|          12|           6|           59|           30|
# |       1006|             4432|          447|          233|               0|               0|               0|              17|             513|             2576|             1326|           0|           0|           0|           0|          38|          273|          136|           0|           0|           0|           8|          40|          139|           46|
# +-----------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+


### Ratio Features

In [55]:
def monthly_action_ratio(profile, act_cnt_log):
  """
  Append, for every period, each action's share of that period's total actions.

  params:
    profile: column-name prefix used by the count columns (e.g. "m")
    act_cnt_log: dataframe holding `<profile>_ttl_<action>_cnt` and
      `<profile>_<action>_cnt_m5` ... `_m11` for the actions clk_atc, buy, atf

  returns: the input columns followed by `<profile>_ttl_<action>_ratio` for
    the three actions and then `<profile>_<action>_ratio_m5` ... `_m11` per
    action. Each ratio is count / (sum of the three action counts of the same
    period), rounded to 2 decimals. Nulls in numeric columns (including the
    null a zero denominator produces) are replaced by 0.

  The denominator is built from the three explicitly named count columns of
  this profile and period. Selecting columns by substring (e.g. "_cnt_m5")
  would also pick up another profile's counts or "..._day_cnt_m5" columns
  when they are present in the frame, inflating the denominator.
  """
  actions = ("clk_atc", "buy", "atf")
  months = ("m5", "m6", "m7", "m8", "m9", "m10", "m11")

  def count_name(action, month):
    # month=None addresses the whole-period ("ttl") count column.
    if month is None:
      return "{}_ttl_{}_cnt".format(profile, action)
    return "{}_{}_cnt_{}".format(profile, action, month)

  def ratio_name(action, month):
    if month is None:
      return "{}_ttl_{}_ratio".format(profile, action)
    return "{}_{}_ratio_{}".format(profile, action, month)

  def ratio_column(action, month):
    # Share of `action` among all three actions in the given period.
    period_total = sum(F.col(count_name(other, month)) for other in actions)
    share = F.col(count_name(action, month)) / period_total
    return F.round(share, 2).alias(ratio_name(action, month))

  ratio_cols = [ratio_column(action, None) for action in actions]
  for action in actions:
    for month in months:
      ratio_cols.append(ratio_column(action, month))

  # A single select adds all 24 ratios at once; no helper total columns are
  # materialised, so there is nothing to drop afterwards.
  return act_cnt_log.select("*", *ratio_cols).fillna(0)

In [56]:
# Append the total and monthly action-share ratios to the merchant count features.
# merchant_id is a string column, so orderBy sorts it as text (1, 10, 100, 1000, ...), as the sample below shows.
act_cnt_ratio_2 = monthly_action_ratio("m", act_cnt_ratio_1).orderBy("merchant_id")

# +-----------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+
# |merchant_id|m_ttl_clk_atc_cnt|m_ttl_buy_cnt|m_ttl_atf_cnt|m_clk_atc_cnt_m5|m_clk_atc_cnt_m6|m_clk_atc_cnt_m7|m_clk_atc_cnt_m8|m_clk_atc_cnt_m9|m_clk_atc_cnt_m10|m_clk_atc_cnt_m11|m_buy_cnt_m5|m_buy_cnt_m6|m_buy_cnt_m7|m_buy_cnt_m8|m_buy_cnt_m9|m_buy_cnt_m10|m_buy_cnt_m11|m_atf_cnt_m5|m_atf_cnt_m6|m_atf_cnt_m7|m_atf_cnt_m8|m_atf_cnt_m9|m_atf_cnt_m10|m_atf_cnt_m11|m_ttl_clk_atc_ratio|m_ttl_buy_ratio|m_ttl_atf_ratio|m_clk_atc_ratio_m5|m_clk_atc_ratio_m6|m_clk_atc_ratio_m7|m_clk_atc_ratio_m8|m_clk_atc_ratio_m9|m_clk_atc_ratio_m10|m_clk_atc_ratio_m11|m_buy_ratio_m5|m_buy_ratio_m6|m_buy_ratio_m7|m_buy_ratio_m8|m_buy_ratio_m9|m_buy_ratio_m10|m_buy_ratio_m11|m_atf_ratio_m5|m_atf_ratio_m6|m_atf_ratio_m7|m_atf_ratio_m8|m_atf_ratio_m9|m_atf_ratio_m10|m_atf_ratio_m11|
# +-----------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+
# |          1|           308680|        17705|        12755|           24290|           25416|           25408|           25102|           35260|            38013|           135191|        2329|        1408|        1320|        1353|        2143|         1558|         7594|        1521|         828|         868|        1220|        1789|         2235|         4294|               0.91|           0.05|           0.04|              0.86|              0.92|              0.92|              0.91|               0.9|               0.91|               0.92|          0.08|          0.05|          0.05|          0.05|          0.05|           0.04|           0.05|          0.05|          0.03|          0.03|          0.04|          0.05|           0.05|           0.03|
# |         10|            19189|         1133|          866|             141|             323|              68|             302|            1407|             2008|            14940|          23|          24|           1|          14|          29|           45|          997|          17|           6|           3|          26|          73|          143|          598|               0.91|           0.05|           0.04|              0.78|              0.92|              0.94|              0.88|              0.93|               0.91|                0.9|          0.13|          0.07|          0.01|          0.04|          0.02|           0.02|           0.06|          0.09|          0.02|          0.04|          0.08|          0.05|           0.07|           0.04|
# |        100|             4062|          538|          181|               0|              23|              23|               7|             494|             1723|             1792|           0|           1|           2|           1|          68|          215|          251|           0|           2|           2|           1|          25|           87|           64|               0.85|           0.11|           0.04|               0.0|              0.88|              0.85|              0.78|              0.84|               0.85|               0.85|           0.0|          0.04|          0.07|          0.11|          0.12|           0.11|           0.12|           0.0|          0.08|          0.07|          0.11|          0.04|           0.04|           0.03|
# |       1000|            10513|          959|          781|             172|             798|             962|            1394|            1010|             1383|             4794|          27|          30|          80|         111|          83|           84|          544|          51|          70|          83|         100|          90|          137|          250|               0.86|           0.08|           0.06|              0.69|              0.89|              0.86|              0.87|              0.85|               0.86|               0.86|          0.11|          0.03|          0.07|          0.07|          0.07|           0.05|            0.1|           0.2|          0.08|          0.07|          0.06|          0.08|           0.09|           0.04|
# |       1001|             2412|          196|          116|             281|             494|             146|             280|             268|              244|              699|          68|          34|          15|           8|          15|           12|           44|          27|          20|          12|          18|           9|           11|           19|               0.89|           0.07|           0.04|              0.75|               0.9|              0.84|              0.92|              0.92|               0.91|               0.92|          0.18|          0.06|          0.09|          0.03|          0.05|           0.04|           0.06|          0.07|          0.04|          0.07|          0.06|          0.03|           0.04|           0.02|
# |       1002|             4788|          386|          222|             239|             680|             538|             617|             489|              317|             1908|          35|          58|          55|          49|          42|           33|          114|          19|          27|          16|          32|          22|           20|           86|               0.89|           0.07|           0.04|              0.82|              0.89|              0.88|              0.88|              0.88|               0.86|               0.91|          0.12|          0.08|          0.09|          0.07|          0.08|           0.09|           0.05|          0.06|          0.04|          0.03|          0.05|          0.04|           0.05|           0.04|
# |       1003|             1896|          173|          125|              80|             256|              55|              84|             263|              397|              761|          10|          17|           4|           8|          19|           38|           77|           7|          16|           3|           2|          19|           36|           42|               0.86|           0.08|           0.06|              0.82|              0.89|              0.89|              0.89|              0.87|               0.84|               0.86|           0.1|          0.06|          0.06|          0.09|          0.06|           0.08|           0.09|          0.07|          0.06|          0.05|          0.02|          0.06|           0.08|           0.05|
# |       1004|             9370|          337|          729|            1081|            2354|            1071|             536|             905|             2059|             1364|          36|          71|          32|          16|          29|           76|           77|          60|         165|          86|          48|          78|          208|           84|                0.9|           0.03|           0.07|              0.92|              0.91|               0.9|              0.89|              0.89|               0.88|               0.89|          0.03|          0.03|          0.03|          0.03|          0.03|           0.03|           0.05|          0.05|          0.06|          0.07|          0.08|          0.08|           0.09|           0.06|
# |       1005|             2315|          305|          136|              88|             205|             292|             228|             153|              843|              506|          21|          26|          28|          27|          11|          162|           30|          10|          12|           7|          12|           6|           59|           30|               0.84|           0.11|           0.05|              0.74|              0.84|              0.89|              0.85|               0.9|               0.79|               0.89|          0.18|          0.11|          0.09|           0.1|          0.06|           0.15|           0.05|          0.08|          0.05|          0.02|          0.04|          0.04|           0.06|           0.05|
# |       1006|             4432|          447|          233|               0|               0|               0|              17|             513|             2576|             1326|           0|           0|           0|           0|          38|          273|          136|           0|           0|           0|           8|          40|          139|           46|               0.87|           0.09|           0.05|               0.0|               0.0|               0.0|              0.68|              0.87|               0.86|               0.88|           0.0|           0.0|           0.0|           0.0|          0.06|           0.09|           0.09|           0.0|           0.0|           0.0|          0.32|          0.07|           0.05|           0.03|
# +-----------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+


### Day Counts Features

In [57]:
# Number of distinct days on which each merchant saw each action type.
# The aggregate is aliased directly instead of renaming Spark's auto-generated
# column afterwards: that generated name has differed between Spark releases,
# and withColumnRenamed silently does nothing when the old name is not found,
# which would leave "day_cnt" missing for the cells below.
act_day_cnt_0 = (
    train_log_0
    .groupBy("merchant_id", "action_type")
    .agg(F.countDistinct("time_stamp").alias("day_cnt"))
)

In [58]:
from pyspark.sql import functions as F

def dfPivot_value(df, keys, column, column_value):
    '''
    Pivot a long dataframe into a wide one (rows to columns).

    params:
      df: dataframe
      keys: key column(s) kept as the row identifier of the pivoted table
            (a column name or a list of column names)
      column: name of the column whose distinct values become the new columns
      column_value: name of the column that supplies the cell values
    '''
    grouped = df.groupBy(keys).pivot(column)
    # Each (keys, column) cell takes the first non-null column_value it sees.
    wide = grouped.agg(F.first(column_value, ignorenulls=True))
    # A key with no row for some pivoted value ends up with null in that cell;
    # fillna replaces those nulls with 0. Drop the call if nulls should be kept.
    return wide.fillna(0)

In [59]:
keys = "merchant_id"
column = "action_type"
column_value = "day_cnt"

# Pivot action_type into columns, then give each action code its feature name.
act_day_cnt_1 = dfPivot_value(act_day_cnt_0, keys, column, column_value)
for action_code, feature_name in (
    ("1", "m_clk_atc_day_cnt"),
    ("2", "m_buy_day_cnt"),
    ("3", "m_atf_day_cnt"),
):
    act_day_cnt_1 = act_day_cnt_1.withColumnRenamed(action_code, feature_name)

In [60]:
from pyspark.sql.types import *
def dt_mth(column):
    """Return a Column holding the first two characters of ``column``.

    Uses Spark's built-in ``substring`` instead of a Python UDF, so rows are not
    shipped to a Python worker and a null input yields null instead of raising.

    Params:
      column: column name or Column to take the prefix from.
    """
    return F.substring(column, 1, 2)

# Attach the month prefix of time_stamp and keep only the columns needed for
# the monthly day counts.
train_log_1 = train_log_0 \
        .withColumn("time_stamp_mth", dt_mth("time_stamp")) \
        .select("merchant_id", "time_stamp_mth", "time_stamp", "action_type")

# Distinct time_stamp values per merchant, month and action type.
# The aggregate is aliased directly: withColumnRenamed is a no-op when the
# source name does not exist, so renaming Spark's auto-generated aggregate
# column could silently leave the frame without "day_cnt".
act_day_cnt_2 = train_log_1 \
            .groupBy("merchant_id", "time_stamp_mth", "action_type") \
            .agg(F.countDistinct("time_stamp").alias("day_cnt")) \
            .orderBy("merchant_id", "time_stamp_mth")

In [61]:
def monthly_day_counts(profile, profile_id, df):
  """Spread (month, action) day counts into one wide row per ``profile_id``.

  Params:
    profile: prefix of the generated column names (e.g. "m").
    profile_id: name of the column to group by (e.g. "merchant_id").
    df: dataframe with the columns ``profile_id``, ``time_stamp_mth``
        (compared against "05".."11"), ``action_type`` (compared against
        "1", "2", "3") and ``day_cnt``.

  Returns:
    A dataframe with ``profile_id`` followed by 21 columns named
    ``<profile>_<action>_day_cnt_m<month>``: clk_atc for months 5..11, then
    buy, then atf. A (month, action) pair with no input row contributes 0.
  """
  action_names = (("1", "clk_atc"), ("2", "buy"), ("3", "atf"))
  months = ("05", "06", "07", "08", "09", "10", "11")

  # One conditional sum per output column. Aggregating explicitly (instead of a
  # blanket .sum() followed by renames of the generated "sum(...)" names) keeps
  # the output schema independent of any other numeric columns present in df.
  agg_exprs = []
  for action_code, action_name in action_names:
    for month in months:
      is_cell = (df["time_stamp_mth"] == month) & (df["action_type"] == action_code)
      # "05" -> "m5", "10" -> "m10"
      out_name = profile + "_" + action_name + "_day_cnt_m" + str(int(month))
      agg_exprs.append(F.sum(F.when(is_cell, df["day_cnt"]).otherwise(0)).alias(out_name))

  day_cnt_1 = df.groupBy(profile_id).agg(*agg_exprs)
  return day_cnt_1

In [62]:
# Wide per-merchant table of monthly day counts, sorted by merchant_id.
act_day_cnt_3 = (
    monthly_day_counts(profile="m", profile_id="merchant_id", df=act_day_cnt_2)
    .orderBy("merchant_id")
)

# +-----------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+
# |merchant_id|m_clk_atc_day_cnt_m5|m_clk_atc_day_cnt_m6|m_clk_atc_day_cnt_m7|m_clk_atc_day_cnt_m8|m_clk_atc_day_cnt_m9|m_clk_atc_day_cnt_m10|m_clk_atc_day_cnt_m11|m_buy_day_cnt_m5|m_buy_day_cnt_m6|m_buy_day_cnt_m7|m_buy_day_cnt_m8|m_buy_day_cnt_m9|m_buy_day_cnt_m10|m_buy_day_cnt_m11|m_atf_day_cnt_m5|m_atf_day_cnt_m6|m_atf_day_cnt_m7|m_atf_day_cnt_m8|m_atf_day_cnt_m9|m_atf_day_cnt_m10|m_atf_day_cnt_m11|
# +-----------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+
# |          1|                  15|                  30|                  31|                  31|                  30|                   31|                   11|              21|              30|              31|              31|              30|               31|               11|              21|              30|              31|              31|              30|               31|               11|
# |         10|                  12|                  26|                  18|                  30|                  30|                   31|                   11|               9|              11|               1|              10|              18|               21|               11|               9|               5|               3|              13|              28|               29|               11|
# |        100|                   0|                  13|                  11|                   6|                  24|                   31|                   11|               0|               1|               1|               1|              13|               31|               11|               0|               2|               2|               1|              11|               28|               11|
# |       1000|                  12|                  30|                  31|                  31|                  30|                   31|                   11|              13|              16|              26|              27|              27|               27|               11|              16|              22|              29|              27|              27|               30|               11|
# |       1001|                  12|                  30|                  21|                  29|                  27|                   28|                   11|              12|              16|               5|               2|               6|                7|                5|              11|              12|               1|               8|               9|                8|                8|
# |       1002|                  12|                  30|                  31|                  31|                  29|                   30|                   11|              14|              17|              19|              18|              20|               13|                4|               9|              17|               9|              16|              11|               10|               11|
# |       1003|                  12|                  29|                  23|                  23|                  29|                   30|                   11|               8|              10|               3|               7|               9|               15|                4|               7|              11|               3|               2|              12|               16|               11|
# |       1004|                  12|                  30|                  31|                  31|                  30|                   31|                   11|               7|              15|              11|              10|              14|               27|               10|              10|              30|              26|              25|              29|               31|               11|
# |       1005|                  12|                  29|                  31|                  31|                  29|                   31|                   11|              12|              12|              19|              17|               9|               18|                5|               8|               6|               5|              10|               5|               11|                9|
# |       1006|                   0|                   0|                   0|                   2|                  29|                   31|                   11|               0|               0|               0|               0|              11|               31|               11|               0|               0|               0|               1|              14|               31|               11|
# +-----------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+


In [63]:
# Attach the total and the monthly day-count features to the count/ratio
# features; left joins keep every merchant of act_cnt_ratio_2.
mer_feature_0 = (
    act_cnt_ratio_2
    .join(act_day_cnt_1, on="merchant_id", how="left")
    .join(act_day_cnt_3, on="merchant_id", how="left")
    .orderBy("merchant_id")
)

### Product Diversity Features

In [64]:
# Distinct items, categories, brands and merchants per (merchant, action type).
# mer_cnt counts distinct merchant_id inside a merchant_id group, so it is 1
# for every row of this table.
act_prod_div0 = (
    train_log_0
    .groupBy("merchant_id", "action_type")
    .agg(
        F.countDistinct("item_id").alias("item_cnt"),
        F.countDistinct("cat_id").alias("cat_cnt"),
        F.countDistinct("brand_id").alias("brd_cnt"),
        F.countDistinct("merchant_id").alias("mer_cnt"),
    )
)


In [65]:
def prod_diversity(profile, profile_id, df):
  """Spread per-action distinct counts into one wide row per ``profile_id``.

  Params:
    profile: prefix of the generated column names (e.g. "m").
    profile_id: name of the column to group by (e.g. "merchant_id").
    df: dataframe with the columns ``profile_id``, ``action_type`` (compared
        against "1", "2", "3"), ``item_cnt``, ``cat_cnt``, ``brd_cnt`` and
        ``mer_cnt``.

  Returns:
    A dataframe with ``profile_id`` followed by 12 columns named
    ``<profile>_<action>_<dim>_div``: for each of item, cat, brd, mer (in that
    order) the clk_atc, buy and atf values. An action with no input row
    contributes 0.
  """
  action_names = (("1", "clk_atc"), ("2", "buy"), ("3", "atf"))
  count_columns = (("item_cnt", "item"), ("cat_cnt", "cat"), ("brd_cnt", "brd"), ("mer_cnt", "mer"))

  # One conditional sum per output column. Aggregating explicitly (instead of a
  # blanket .sum() followed by renames and drops of the generated "sum(...)"
  # names) keeps the output schema independent of other numeric columns in df.
  agg_exprs = []
  for count_col, dim_name in count_columns:
    for action_code, action_name in action_names:
      out_name = profile + "_" + action_name + "_" + dim_name + "_div"
      picked = F.when(df["action_type"] == action_code, F.col(count_col)).otherwise(0)
      agg_exprs.append(F.sum(picked).alias(out_name))

  prod_div_1 = df.groupBy(profile_id).agg(*agg_exprs)
  return prod_div_1

In [66]:
# Wide per-merchant table of product-diversity features, sorted by merchant_id.
act_prod_div1 = (
    prod_diversity(profile="m", profile_id="merchant_id", df=act_prod_div0)
    .orderBy("merchant_id")
)

# +-----------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+
# |merchant_id|m_clk_atc_item_div|m_buy_item_div|m_atf_item_div|m_clk_atc_cat_div|m_buy_cat_div|m_atf_cat_div|m_clk_atc_brd_div|m_buy_brd_div|m_atf_brd_div|m_clk_atc_mer_div|m_buy_mer_div|m_atf_mer_div|
# +-----------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+
# |          1|              2969|          1810|          1924|               44|           35|           36|                2|            2|            2|                1|            1|            1|
# |         10|               461|           149|           154|               13|           12|           12|                1|            1|            1|                1|            1|            1|
# |        100|                38|            19|            22|               13|            9|           10|                3|            2|            3|                1|            1|            1|
# |       1000|               189|            93|            89|               45|           30|           30|               13|            9|            8|                1|            1|            1|
# |       1001|                62|            25|            24|                4|            3|            3|                1|            1|            1|                1|            1|            1|
# |       1002|               233|            61|            58|               29|           15|           15|               27|            7|           14|                1|            1|            1|
# |       1003|                93|            27|            33|               10|            7|            6|                1|            1|            1|                1|            1|            1|
# |       1004|               207|            46|            91|               17|           12|           14|                1|            1|            1|                1|            1|            1|
# |       1005|               120|            27|            27|               14|            7|            6|                8|            5|            3|                1|            1|            1|
# |       1006|                45|            11|            18|                3|            2|            2|                1|            1|            1|                1|            1|            1|
# +-----------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+


In [67]:
# Add the product-diversity features; the left join keeps every merchant of mer_feature_0.
mer_feature_1 = mer_feature_0.join(act_prod_div1, on="merchant_id", how="left")

### Monthly Aggregation Features

In [68]:
# df = spark.createDataFrame([(1, 2, 3, 4), (1, 4, 100, 5), (20, 30, 50, 10)],['a', 'b', 'c', 'd'])
# df1 = df.withColumn("mean", maximum(*(df.columns[0:4])))
# df1.show()

In [69]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import array, udf, array_sort, floor, col, size, sqrt, greatest
from pyspark.sql import Column

# row-wise mean of the given columns, rounded to 2 decimals
def ssy_mean(*args):
    """Return a Column with the row-wise average of ``args``.

    Each argument is a column name or a Column; anything else raises TypeError.
    The result is rounded to 2 decimal places.
    """
    def as_column(c):
        if isinstance(c, str):
            return col(c)
        if isinstance(c, Column):
            return c
        raise TypeError("args should be str or Column, got {}".format(type(c)))

    columns = [as_column(x) for x in args]
    total = sum(columns)
    return F.round(total / len(columns), 2)

# function that calculates the row-wise percentile
def ssy_percentile(p, *args):
    """Return a Column with the row-wise ``p``-quantile of ``args``.

    The values of each row are sorted and the quantile is linearly interpolated
    between the two neighbouring sorted values; the result is rounded to 2
    decimal places.

    Params:
      p: quantile as a fraction, e.g. 0.5 for the median.
      args: column names or Columns; anything else raises TypeError.
    """
    def col_(c):
        if isinstance(c, Column):
            return c
        elif isinstance(c, str):
            return col(c)
        else:
            raise TypeError("args should be str or Column, got {}".format(type(c)))

    xs = array_sort(array(*[col_(x) for x in args]))
    n = size(xs)
    h = (n - 1) * p          # fractional position of the quantile in the sorted array
    i = floor(h).cast("int")
    # Clamp the upper neighbour to the last element. Without this, p == 1.0 (or a
    # single input column) indexes one past the end of the array, and the
    # resulting null turns the whole expression into null.
    j = F.least(i + 1, n - 1)
    x0, x1 = xs[i], xs[j]
    return F.round((x0 + (h - i) * (x1 - x0)), 2)

# row-wise sample standard deviation of the given columns, rounded to 2 decimals
def ssy_std(*args):
    """Return a Column with the row-wise standard deviation of ``args``.

    Uses the sample formula (sum of squared deviations divided by n - 1, where
    n is the number of arguments). Each argument is a column name or a Column;
    anything else raises TypeError. The result is rounded to 2 decimal places.
    """
    def as_column(c):
        if isinstance(c, str):
            return col(c)
        if isinstance(c, Column):
            return c
        raise TypeError("args should be str or Column, got {}".format(type(c)))

    columns = [as_column(x) for x in args]
    count = len(columns)
    mean = sum(columns) / count
    squared_deviations = [(c - mean) ** 2 for c in columns]
    variance = sum(squared_deviations) / (count - 1)
    return F.round(sqrt(variance), 2)

# row-wise maximum of the given columns, rounded to 2 decimals
def ssy_maximum(*args):
    """Return a Column with the row-wise maximum of ``args``.

    Each argument is a column name or a Column; anything else raises TypeError.
    The result of ``greatest`` is rounded to 2 decimal places.
    """
    def as_column(c):
        if isinstance(c, str):
            return col(c)
        if isinstance(c, Column):
            return c
        raise TypeError("args should be str or Column, got {}".format(type(c)))

    columns = [as_column(x) for x in args]
    return F.round(greatest(*columns), 2)

In [70]:
def monthly_aggregation(profile, df):
  """Append row-wise mean / median / std / max features over monthly columns.

  The input columns are picked by POSITION in ``df``: [4:10] for clk_atc,
  [11:17] for buy and [18:24] for atf. For the merchant table printed in this
  notebook those slices are the ``*_cnt_m5`` .. ``*_cnt_m10`` columns, i.e.
  month 11 is left out. NOTE(review): any other ``df`` must follow the same
  column layout - confirm before reuse.

  Params:
    profile: prefix of the generated column names (e.g. "m").
    df: dataframe whose columns follow the layout described above.

  Returns:
    ``df`` with 12 extra columns ``<profile>_<action>_<stat>``, ordered by
    statistic (mean, median, std, max) and, within each, clk_atc, buy, atf.
  """
  monthly_columns = (
      ("clk_atc", df.columns[4:10]),
      ("buy", df.columns[11:17]),
      ("atf", df.columns[18:24]),
  )
  statistics = (
      ("mean", ssy_mean),
      ("median", lambda *cols: ssy_percentile(0.5, *cols)),
      ("std", ssy_std),
      ("max", ssy_maximum),
  )

  mth_agg_0 = df
  for stat_name, stat_fn in statistics:
    for act_name, cols in monthly_columns:
      mth_agg_0 = mth_agg_0.withColumn(profile + "_" + act_name + "_" + stat_name, stat_fn(*cols))
  return mth_agg_0

In [71]:
# Add the row-wise monthly statistics and sort by merchant_id.
mer_feature_2 = (
    monthly_aggregation(profile="m", df=mer_feature_1)
    .orderBy("merchant_id")
)

### User Aggregation Features

#### User-action-day-aggregation

In [72]:
from pyspark.sql import DataFrameStatFunctions as statFunc
from pyspark.sql.functions import col
import numpy as np
from pyspark.sql.types import FloatType

def median(values_list):
    """Return the median of ``values_list`` as a built-in float."""
    return float(np.median(values_list))
# Spark UDF wrapper around median(); declared to return a FloatType column.
udf_median = F.udf(f=median, returnType=FloatType())

def user_day_cnt_aggregation(profile, profile_id, act_id, act_name):
  """Summarise, per ``profile_id``, how many distinct days each user was active.

  Reads the notebook-level ``train_log_0``. For the rows whose action_type
  equals ``act_id`` it counts distinct time_stamp values per
  (merchant_id, user_id, action_type), then aggregates those per-user counts
  by ``profile_id``.

  Params:
    profile: prefix of the generated column names (e.g. "m").
    profile_id: name of the column to group the per-user counts by.
    act_id: action_type value to keep.
    act_name: action label used in the column names (e.g. "buy").

  Returns:
    A dataframe with ``profile_id`` and the columns
    ``<profile>_<act_name>_day_cnt_{mean,max,std,med}``, each rounded to 2 decimals.
  """
  # Alias the aggregate directly: withColumnRenamed is a no-op when the source
  # name is absent, so renaming Spark's auto-generated aggregate column could
  # silently leave the frame without "day_cnt".
  day_agg_0 = train_log_0 \
            .filter(train_log_0.action_type == act_id) \
            .groupBy("merchant_id", "user_id","action_type") \
            .agg(F.countDistinct("time_stamp").alias("day_cnt"))

  day_agg_1 = day_agg_0 \
            .groupBy(profile_id) \
            .agg(F.round(F.mean("day_cnt"), 2).alias(profile + "_" + act_name + "_day_cnt_mean"), 
                 F.round(F.max("day_cnt"), 2).alias(profile + "_" + act_name + "_day_cnt_max"), 
                 F.round(F.stddev("day_cnt"), 2).alias(profile + "_" + act_name + "_day_cnt_std"), 
                 # median has no direct aggregate here: collect the group's values and use the Python UDF
                 F.round(udf_median(F.collect_list(col("day_cnt"))), 2).alias(profile + "_" + act_name + "_day_cnt_med")) 
  return day_agg_1

In [73]:
# One per-merchant summary of user active-day counts per action type
# (1 -> clk_atc, 2 -> buy, 3 -> atf).
user_day_agg_0, user_day_agg_1, user_day_agg_2 = (
    user_day_cnt_aggregation("m", "merchant_id", act_id, act_name)
    for act_id, act_name in ((1, "clk_atc"), (2, "buy"), (3, "atf"))
)
# Full outer joins keep merchants that appear in only some of the three tables.
user_day_agg_3 = (
    user_day_agg_0
    .join(user_day_agg_1, on="merchant_id", how="full")
    .join(user_day_agg_2, on="merchant_id", how="full")
    .orderBy("merchant_id")
)

In [74]:
# Add the user active-day aggregates; the left join keeps every merchant of mer_feature_2.
mer_feature_3 = mer_feature_2.join(user_day_agg_3, on="merchant_id", how="left")

#### User-action-item-aggregation

In [75]:
from pyspark.sql import DataFrameStatFunctions as statFunc
from pyspark.sql.functions import col
import numpy as np
from pyspark.sql.types import FloatType

# NOTE: same definition as the median() in the User-action-day-aggregation cell.
def median(values_list):
    """Return the median of ``values_list`` as a built-in float."""
    middle = np.median(values_list)
    return float(middle)
# Spark UDF wrapper around median(); declared to return a FloatType column.
udf_median = F.udf(f=median, returnType=FloatType())

def user_item_cnt_aggregation(profile, profile_id, act_id, act_name):
  """Summarise, per ``profile_id``, how many distinct items each user touched.

  Reads the notebook-level ``train_log_0``. For the rows whose action_type
  equals ``act_id`` it counts distinct item_id values per
  (merchant_id, user_id, action_type), then aggregates those per-user counts
  by ``profile_id``.

  Params:
    profile: prefix of the generated column names (e.g. "m").
    profile_id: name of the column to group the per-user counts by.
    act_id: action_type value to keep.
    act_name: action label used in the column names (e.g. "buy").

  Returns:
    A dataframe with ``profile_id`` and the columns
    ``<profile>_<act_name>_item_cnt_{mean,max,std,med}``, each rounded to 2 decimals.
  """
  # Alias the aggregate directly: withColumnRenamed is a no-op when the source
  # name is absent, so renaming Spark's auto-generated aggregate column could
  # silently leave the frame without "item_cnt".
  item_agg_0 = train_log_0 \
            .filter(train_log_0.action_type == act_id) \
            .groupBy("merchant_id", "user_id","action_type") \
            .agg(F.countDistinct("item_id").alias("item_cnt"))

  item_agg_1 = item_agg_0 \
            .groupBy(profile_id) \
            .agg(F.round(F.mean("item_cnt"), 2).alias(profile + "_" + act_name + "_item_cnt_mean"), 
                 F.round(F.max("item_cnt"), 2).alias(profile + "_" + act_name + "_item_cnt_max"), 
                 F.round(F.stddev("item_cnt"), 2).alias(profile + "_" + act_name + "_item_cnt_std"), 
                 # median has no direct aggregate here: collect the group's values and use the Python UDF
                 F.round(udf_median(F.collect_list(col("item_cnt"))), 2).alias(profile + "_" + act_name + "_item_cnt_med")) 
  return item_agg_1

In [76]:
# One per-merchant summary of user distinct-item counts per action type
# (1 -> clk_atc, 2 -> buy, 3 -> atf).
user_item_agg_0, user_item_agg_1, user_item_agg_2 = (
    user_item_cnt_aggregation("m", "merchant_id", act_id, act_name)
    for act_id, act_name in ((1, "clk_atc"), (2, "buy"), (3, "atf"))
)
# Full outer joins keep merchants that appear in only some of the three tables.
user_item_agg_3 = (
    user_item_agg_0
    .join(user_item_agg_1, on="merchant_id", how="full")
    .join(user_item_agg_2, on="merchant_id", how="full")
    .orderBy("merchant_id")
)

In [77]:
# Add the user distinct-item aggregates (left join keeps every merchant of
# mer_feature_3) and sort by merchant_id.
mer_feature_4 = (
    mer_feature_3
    .join(user_item_agg_3, on="merchant_id", how="left")
    .orderBy("merchant_id")
)

# +-----------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+---------------------+---------------------+---------------------+------------------+-----------------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+-----------------------+----------------------+----------------------+----------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+---------------
---+------------------+
# |merchant_id|m_ttl_clk_atc_cnt|m_ttl_buy_cnt|m_ttl_atf_cnt|m_clk_atc_cnt_m5|m_clk_atc_cnt_m6|m_clk_atc_cnt_m7|m_clk_atc_cnt_m8|m_clk_atc_cnt_m9|m_clk_atc_cnt_m10|m_clk_atc_cnt_m11|m_buy_cnt_m5|m_buy_cnt_m6|m_buy_cnt_m7|m_buy_cnt_m8|m_buy_cnt_m9|m_buy_cnt_m10|m_buy_cnt_m11|m_atf_cnt_m5|m_atf_cnt_m6|m_atf_cnt_m7|m_atf_cnt_m8|m_atf_cnt_m9|m_atf_cnt_m10|m_atf_cnt_m11|m_ttl_clk_atc_ratio|m_ttl_buy_ratio|m_ttl_atf_ratio|m_clk_atc_ratio_m5|m_clk_atc_ratio_m6|m_clk_atc_ratio_m7|m_clk_atc_ratio_m8|m_clk_atc_ratio_m9|m_clk_atc_ratio_m10|m_clk_atc_ratio_m11|m_buy_ratio_m5|m_buy_ratio_m6|m_buy_ratio_m7|m_buy_ratio_m8|m_buy_ratio_m9|m_buy_ratio_m10|m_buy_ratio_m11|m_atf_ratio_m5|m_atf_ratio_m6|m_atf_ratio_m7|m_atf_ratio_m8|m_atf_ratio_m9|m_atf_ratio_m10|m_atf_ratio_m11|m_clk_atc_day_cnt|m_buy_day_cnt|m_atf_day_cnt|m_clk_atc_day_cnt_m5|m_clk_atc_day_cnt_m6|m_clk_atc_day_cnt_m7|m_clk_atc_day_cnt_m8|m_clk_atc_day_cnt_m9|m_clk_atc_day_cnt_m10|m_clk_atc_day_cnt_m11|m_buy_day_cnt_m5|m_buy_day_cnt_m6|m_buy_day_cnt_m7|m_buy_day_cnt_m8|m_buy_day_cnt_m9|m_buy_day_cnt_m10|m_buy_day_cnt_m11|m_atf_day_cnt_m5|m_atf_day_cnt_m6|m_atf_day_cnt_m7|m_atf_day_cnt_m8|m_atf_day_cnt_m9|m_atf_day_cnt_m10|m_atf_day_cnt_m11|m_clk_atc_item_div|m_buy_item_div|m_atf_item_div|m_clk_atc_cat_div|m_buy_cat_div|m_atf_cat_div|m_clk_atc_brd_div|m_buy_brd_div|m_atf_brd_div|m_clk_atc_mer_div|m_buy_mer_div|m_atf_mer_div|m_clk_atc_mean|m_buy_mean|m_atf_mean|m_clk_atc_median|m_buy_median|m_atf_median|m_clk_atc_std|m_buy_std|m_atf_std|m_clk_atc_max|m_buy_max|m_atf_max|m_clk_atc_day_cnt_mean|m_clk_atc_day_cnt_max|m_clk_atc_day_cnt_std|m_clk_atc_day_cnt_med|m_buy_day_cnt_mean|m_buy_day_cnt_max|m_buy_day_cnt_std|m_buy_day_cnt_med|m_atf_day_cnt_mean|m_atf_day_cnt_max|m_atf_day_cnt_std|m_atf_day_cnt_med|m_clk_atc_item_cnt_mean|m_clk_atc_item_cnt_max|m_clk_atc_item_cnt_std|m_clk_atc_item_cnt_med|m_buy_item_cnt_mean|m_buy_item_cnt_max|m_buy_item_cnt_std|m_buy_item_cnt_med|m_atf_item_cnt_mean|m_atf_item_cnt_max|m_atf_item_cnt_
std|m_atf_item_cnt_med|
# +-----------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+---------------------+---------------------+---------------------+------------------+-----------------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+-----------------------+----------------------+----------------------+----------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+---------------
---+------------------+
# |          1|           308680|        17705|        12755|           24290|           25416|           25408|           25102|           35260|            38013|           135191|        2329|        1408|        1320|        1353|        2143|         1558|         7594|        1521|         828|         868|        1220|        1789|         2235|         4294|               0.91|           0.05|           0.04|              0.86|              0.92|              0.92|              0.91|               0.9|               0.91|               0.92|          0.08|          0.05|          0.05|          0.05|          0.05|           0.04|           0.05|          0.05|          0.03|          0.03|          0.04|          0.05|           0.05|           0.03|              179|          185|          185|                  15|                  30|                  31|                  31|                  30|                   31|                   11|              21|              30|              31|              31|              30|               31|               11|              21|              30|              31|              31|              30|               31|               11|              2969|          1810|          1924|               44|           35|           36|                2|            2|            2|                1|            1|            1|      28914.83|   1685.17|   1410.17|         25412.0|      1483.0|      1370.5|      6058.12|   438.36|   548.67|        38013|     2329|     2235|                  2.02|                   48|                 2.27|                  1.0|              1.25|                8|             0.63|              1.0|              1.51|               16|             1.19|              1.0|                   6.68|                   358|                 13.02|                   2.0|               2.18|                25|              1.97|               1.0|               2.52|                51|              
3.34|               1.0|
# |         10|            19189|         1133|          866|             141|             323|              68|             302|            1407|             2008|            14940|          23|          24|           1|          14|          29|           45|          997|          17|           6|           3|          26|          73|          143|          598|               0.91|           0.05|           0.04|              0.78|              0.92|              0.94|              0.88|              0.93|               0.91|                0.9|          0.13|          0.07|          0.01|          0.04|          0.02|           0.02|           0.06|          0.09|          0.02|          0.04|          0.08|          0.05|           0.07|           0.04|              158|           81|           98|                  12|                  26|                  18|                  30|                  30|                   31|                   11|               9|              11|               1|              10|              18|               21|               11|               9|               5|               3|              13|              28|               29|               11|               461|           149|           154|               13|           12|           12|                1|            1|            1|                1|            1|            1|        708.17|     22.67|     44.67|           312.5|        23.5|        21.5|       802.84|    14.73|    54.44|         2008|       45|      143|                  1.23|                   15|                 0.69|                  1.0|              1.01|                3|             0.12|              1.0|               1.1|                3|             0.33|              1.0|                   2.13|                    51|                  2.68|                   1.0|               1.11|                 4|              0.38|               1.0|               1.42|                 7|              
0.96|               1.0|
# |        100|             4062|          538|          181|               0|              23|              23|               7|             494|             1723|             1792|           0|           1|           2|           1|          68|          215|          251|           0|           2|           2|           1|          25|           87|           64|               0.85|           0.11|           0.04|               0.0|              0.88|              0.85|              0.78|              0.84|               0.85|               0.85|           0.0|          0.04|          0.07|          0.11|          0.12|           0.11|           0.12|           0.0|          0.08|          0.07|          0.11|          0.04|           0.04|           0.03|               96|           58|           55|                   0|                  13|                  11|                   6|                  24|                   31|                   11|               0|               1|               1|               1|              13|               31|               11|               0|               2|               2|               1|              11|               28|               11|                38|            19|            22|               13|            9|           10|                3|            2|            3|                1|            1|            1|        378.33|     47.83|      19.5|            23.0|         1.5|         2.0|        686.3|    86.17|    34.41|         1723|      215|       87|                  1.19|                    7|                 0.59|                  1.0|              1.09|                3|             0.32|              1.0|              1.07|                2|             0.25|              1.0|                   1.26|                    11|                  0.77|                   1.0|               1.15|                 5|              0.48|               1.0|               1.06|                 2|              
0.24|               1.0|
# |       1000|            10513|          959|          781|             172|             798|             962|            1394|            1010|             1383|             4794|          27|          30|          80|         111|          83|           84|          544|          51|          70|          83|         100|          90|          137|          250|               0.86|           0.08|           0.06|              0.69|              0.89|              0.86|              0.87|              0.85|               0.86|               0.86|          0.11|          0.03|          0.07|          0.07|          0.07|           0.05|            0.1|           0.2|          0.08|          0.07|          0.06|          0.08|           0.09|           0.04|              176|          147|          162|                  12|                  30|                  31|                  31|                  30|                   31|                   11|              13|              16|              26|              27|              27|               27|               11|              16|              22|              29|              27|              27|               30|               11|               189|            93|            89|               45|           30|           30|               13|            9|            8|                1|            1|            1|        953.17|     69.17|      88.5|           986.0|        81.5|        86.5|       450.93|    33.44|    29.21|         1394|      111|      137|                  1.22|                   10|                 0.62|                  1.0|              1.02|                4|             0.19|              1.0|              1.09|                4|             0.34|              1.0|                   1.38|                    22|                  1.03|                   1.0|               1.09|                 4|              0.35|               1.0|               1.18|                10|              
0.65|               1.0|
# |       1001|             2412|          196|          116|             281|             494|             146|             280|             268|              244|              699|          68|          34|          15|           8|          15|           12|           44|          27|          20|          12|          18|           9|           11|           19|               0.89|           0.07|           0.04|              0.75|               0.9|              0.84|              0.92|              0.92|               0.91|               0.92|          0.18|          0.06|          0.09|          0.03|          0.05|           0.04|           0.06|          0.07|          0.04|          0.07|          0.06|          0.03|           0.04|           0.02|              158|           53|           57|                  12|                  30|                  21|                  29|                  27|                   28|                   11|              12|              16|               5|               2|               6|                7|                5|              11|              12|               1|               8|               9|                8|                8|                62|            25|            24|                4|            3|            3|                1|            1|            1|                1|            1|            1|         285.5|     25.33|     16.17|           274.0|        15.0|        15.0|       114.03|    22.75|     6.79|          494|       68|       27|                  1.17|                   17|                 0.73|                  1.0|              1.04|                2|             0.19|              1.0|              1.05|                3|             0.25|              1.0|                   1.51|                    33|                  1.54|                   1.0|               1.16|                 3|              0.38|               1.0|               1.09|                 3|              
0.35|               1.0|
# |       1002|             4788|          386|          222|             239|             680|             538|             617|             489|              317|             1908|          35|          58|          55|          49|          42|           33|          114|          19|          27|          16|          32|          22|           20|           86|               0.89|           0.07|           0.04|              0.82|              0.89|              0.88|              0.88|              0.88|               0.86|               0.91|          0.12|          0.08|          0.09|          0.07|          0.08|           0.09|           0.05|          0.06|          0.04|          0.03|          0.05|          0.04|           0.05|           0.04|              174|          105|           83|                  12|                  30|                  31|                  31|                  29|                   30|                   11|              14|              17|              19|              18|              20|               13|                4|               9|              17|               9|              16|              11|               10|               11|               233|            61|            58|               29|           15|           15|               27|            7|           14|                1|            1|            1|         480.0|     45.33|     22.67|           513.5|        45.5|        21.0|       171.38|    10.37|     5.85|          680|       58|       32|                  1.15|                    9|                 0.53|                  1.0|              1.08|                4|             0.32|              1.0|              1.05|                3|             0.24|              1.0|                   1.69|                    15|                  1.66|                   1.0|               1.74|                 9|              1.22|               1.0|               1.43|                 8|              
1.04|               1.0|
# |       1003|             1896|          173|          125|              80|             256|              55|              84|             263|              397|              761|          10|          17|           4|           8|          19|           38|           77|           7|          16|           3|           2|          19|           36|           42|               0.86|           0.08|           0.06|              0.82|              0.89|              0.89|              0.89|              0.87|               0.84|               0.86|           0.1|          0.06|          0.06|          0.09|          0.06|           0.08|           0.09|          0.07|          0.06|          0.05|          0.02|          0.06|           0.08|           0.05|              157|           56|           62|                  12|                  29|                  23|                  23|                  29|                   30|                   11|               8|              10|               3|               7|               9|               15|                4|               7|              11|               3|               2|              12|               16|               11|                93|            27|            33|               10|            7|            6|                1|            1|            1|                1|            1|            1|        189.17|      16.0|     13.83|           170.0|        13.5|        11.5|       137.18|    12.15|    12.86|          397|       38|       36|                  1.15|                    6|                  0.5|                  1.0|              1.03|                2|             0.17|              1.0|              1.01|                2|             0.09|              1.0|                   1.31|                    19|                  1.05|                   1.0|               1.08|                 3|              0.35|               1.0|               1.06|                 3|              
0.27|               1.0|
# |       1004|             9370|          337|          729|            1081|            2354|            1071|             536|             905|             2059|             1364|          36|          71|          32|          16|          29|           76|           77|          60|         165|          86|          48|          78|          208|           84|                0.9|           0.03|           0.07|              0.92|              0.91|               0.9|              0.89|              0.89|               0.88|               0.89|          0.03|          0.03|          0.03|          0.03|          0.03|           0.03|           0.05|          0.05|          0.06|          0.07|          0.08|          0.08|           0.09|           0.06|              176|           94|          162|                  12|                  30|                  31|                  31|                  30|                   31|                   11|               7|              15|              11|              10|              14|               27|               10|              10|              30|              26|              25|              29|               31|               11|               207|            46|            91|               17|           12|           14|                1|            1|            1|                1|            1|            1|       1334.33|     43.33|     107.5|          1076.0|        34.0|        82.0|       709.94|    24.36|    64.09|         2354|       76|      208|                   1.3|                   25|                 1.05|                  1.0|              1.11|                6|             0.47|              1.0|              1.07|                7|              0.4|              1.0|                   1.56|                    68|                  2.25|                   1.0|               1.16|                 4|              0.45|               1.0|               1.16|                 9|              
0.66|               1.0|
# |       1005|             2315|          305|          136|              88|             205|             292|             228|             153|              843|              506|          21|          26|          28|          27|          11|          162|           30|          10|          12|           7|          12|           6|           59|           30|               0.84|           0.11|           0.05|              0.74|              0.84|              0.89|              0.85|               0.9|               0.79|               0.89|          0.18|          0.11|          0.09|           0.1|          0.06|           0.15|           0.05|          0.08|          0.05|          0.02|          0.04|          0.04|           0.06|           0.05|              174|           92|           54|                  12|                  29|                  31|                  31|                  29|                   31|                   11|              12|              12|              19|              17|               9|               18|                5|               8|               6|               5|              10|               5|               11|                9|               120|            27|            27|               14|            7|            6|                8|            5|            3|                1|            1|            1|         301.5|     45.83|     17.67|           216.5|        26.5|        11.0|        274.1|    57.26|     20.4|          843|      162|       59|                   1.2|                   26|                 0.98|                  1.0|              1.04|                4|             0.24|              1.0|              1.09|                3|             0.35|              1.0|                   1.33|                    11|                  0.92|                   1.0|               1.04|                 3|              0.22|               1.0|               1.12|                 3|              
0.35|               1.0|
# |       1006|             4432|          447|          233|               0|               0|               0|              17|             513|             2576|             1326|           0|           0|           0|           0|          38|          273|          136|           0|           0|           0|           8|          40|          139|           46|               0.87|           0.09|           0.05|               0.0|               0.0|               0.0|              0.68|              0.87|               0.86|               0.88|           0.0|           0.0|           0.0|           0.0|          0.06|           0.09|           0.09|           0.0|           0.0|           0.0|          0.32|          0.07|           0.05|           0.03|               73|           53|           57|                   0|                   0|                   0|                   2|                  29|                   31|                   11|               0|               0|               0|               0|              11|               31|               11|               0|               0|               0|               1|              14|               31|               11|                45|            11|            18|                3|            2|            2|                1|            1|            1|                1|            1|            1|        517.67|     51.83|     31.17|             8.5|         0.0|         4.0|      1028.72|   109.41|    55.06|         2576|      273|      139|                  1.22|                    8|                 0.68|                  1.0|              1.07|                3|              0.3|              1.0|              1.02|                3|             0.18|              1.0|                   1.23|                    14|                  0.79|                   1.0|               1.05|                 3|              0.23|               1.0|               1.08|                11|              
0.72|               1.0|
# +-----------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+---------------------+---------------------+---------------------+------------------+-----------------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+-----------------------+----------------------+----------------------+----------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+---------------
---+------------------+


### Recent Features

In [78]:
def recent_action_counts(profile, profile_id, act_cnt_log):
  """Sum the action counts that fall on Double 11 and in the week before it.

  Args:
    profile: Prefix used for the generated column names (the notebook
      passes "m").
    profile_id: Name of the column to group by (e.g. "merchant_id").
    act_cnt_log: Spark DataFrame providing `profile_id`, `time_stamp` and the
      count columns `clk_atc_cnt`, `buy_cnt` and `atf_cnt`.

  Returns:
    A DataFrame with one row per `profile_id` holding
    `<profile>_<action>_cnt_double11` and
    `<profile>_<action>_cnt_1wpre_double11` for the actions `clk_atc`, `buy`
    and `atf`. Because the aggregation is a bare `.sum()`, any other numeric
    column of `act_cnt_log` is summed as well and stays in the result under
    its default `sum(<name>)` name.
  """
  actions = ("clk_atc", "buy", "atf")
  time_stamp = act_cnt_log.time_stamp
  # time_stamp is compared against string literals; presumably it holds
  # zero-padded MMDD values -- TODO confirm against the log schema.
  windows = (
      ("double11", time_stamp == "1111"),
      ("1wpre_double11", (time_stamp >= "1104") & (time_stamp <= "1110")),
  )

  # Window filter: keep a row's count only when it falls inside the window,
  # otherwise contribute 0 to the sum below.
  act_cnt_0 = act_cnt_log
  for suffix, in_window in windows:
    for action in actions:
      act_cnt_0 = act_cnt_0.withColumn(
          profile + "_" + action + "_cnt_" + suffix,
          F.when(in_window, act_cnt_log[action + "_cnt"]).otherwise(0))

  act_cnt_1 = act_cnt_0.groupBy(profile_id).sum()

  # Strip the "sum(...)" wrapper Spark puts around the aggregated names.
  for suffix, _ in windows:
    for action in actions:
      feature = profile + "_" + action + "_cnt_" + suffix
      act_cnt_1 = act_cnt_1.withColumnRenamed("sum(" + feature + ")", feature)

  # The all-time totals are not part of the result. They are dropped under
  # their aggregate names so this works for every `profile`, not only "m".
  return act_cnt_1.drop(*["sum(" + action + "_cnt)" for action in actions])

In [79]:
# Left-join the Double 11 / week-before action counts onto the merchant
# features so that every row of mer_feature_4 is kept.
mer_feature_5 = (
    mer_feature_4
    .join(recent_action_counts("m", "merchant_id", act_cnt_ratio_0), on="merchant_id", how="left")
    .orderBy("merchant_id")
)

In [80]:
def recent_action_ratio(profile, act_cnt_log):
  """Turn the recent action counts into per-window action ratios.

  For each window (`double11`, `1wpre_double11`) the ratio of an action is its
  count divided by the sum of the three action counts of the same window,
  rounded to two decimals.

  Args:
    profile: Prefix of the count columns to read and of the ratio columns to
      create (the notebook passes "m").
    act_cnt_log: Spark DataFrame containing the columns
      `<profile>_<action>_cnt_<window>` for the actions `clk_atc`, `buy` and
      `atf`.

  Returns:
    `act_cnt_log` extended with `<profile>_<action>_ratio_<window>` columns.
    Nulls are replaced through `fillna(0)`; note that this call has no
    `subset`, so it is applied to the whole frame rather than only to the new
    ratio columns.
  """
  actions = ("clk_atc", "buy", "atf")
  windows = ("double11", "1wpre_double11")

  # Denominators: add up exactly this profile's three counts per window. A
  # substring scan over all column names would also pick up unrelated
  # columns that happen to contain "_cnt_double11".
  monthly_ttl_act_cnt_0 = act_cnt_log
  for window in windows:
    monthly_ttl_act_cnt_0 = monthly_ttl_act_cnt_0.withColumn(
        "ttl_cnt_" + window,
        sum(act_cnt_log[profile + "_" + action + "_cnt_" + window] for action in actions))

  # Ratios are appended action by action, window by window.
  act_ratio_0 = monthly_ttl_act_cnt_0
  for action in actions:
    for window in windows:
      act_ratio_0 = act_ratio_0.withColumn(
          profile + "_" + action + "_ratio_" + window,
          F.round(F.col(profile + "_" + action + "_cnt_" + window) / F.col("ttl_cnt_" + window), 2))

  # The helper totals are only needed for the division above.
  act_ratio_1 = act_ratio_0 \
               .drop(*["ttl_cnt_" + window for window in windows]) \
               .fillna(0)
  return act_ratio_1

In [81]:
# Add the Double 11 / week-before action ratios for the merchant profile.
mer_feature_6 = recent_action_ratio(profile="m", act_cnt_log=mer_feature_5)

### Repeat Buyer Features

#### Repeat buyer number

In [82]:
def repeat_buy_number(profile, profile_id):
  """Count, per `profile_id`, the users with two or more purchase rows.

  Reads the notebook-level `train_log_0` frame and keeps rows whose
  `action_type` equals 2 (treated as purchases in this notebook).

  Args:
    profile: Prefix for the output column name.
    profile_id: Name of the column to group by (e.g. "merchant_id").

  Returns:
    A DataFrame with `profile_id` and `<profile>_rpt_byr_num`, ordered by
    `profile_id`. Groups without any such user do not appear in the result.
  """
  purchases = train_log_0.filter(train_log_0.action_type == 2)

  # Number of purchase rows for every (profile_id, user_id) pair.
  per_user = (
      purchases
      .groupBy(profile_id, "user_id")
      .agg(F.count("action_type").alias("act_cnt"))
      .orderBy(profile_id)
  )

  # Keep the pairs with at least two rows and count the users per group.
  repeat_users = per_user.filter(per_user.act_cnt >= 2)
  return (
      repeat_users
      .groupBy(profile_id)
      .agg(F.countDistinct("user_id").alias(profile + "_rpt_byr_num"))
      .orderBy(profile_id)
  )

In [83]:
# Left join: merchants absent from the helper's result get a null
# m_rpt_byr_num.
mer_feature_7 = (
    mer_feature_6
    .join(repeat_buy_number("m", "merchant_id"), on="merchant_id", how="left")
    .orderBy("merchant_id")
)

#### Repeat day number

In [84]:
def repeat_day_number(profile, profile_id):
  """Sum, per `profile_id`, the distinct purchase time stamps of repeat users.

  Reads the notebook-level `train_log_0` frame and keeps rows whose
  `action_type` equals 2 (treated as purchases in this notebook). Only
  (profile_id, user_id) pairs with two or more such rows contribute.

  Args:
    profile: Prefix for the output column name.
    profile_id: Name of the column to group by (e.g. "merchant_id").

  Returns:
    A DataFrame with `profile_id` and `<profile>_rpt_byr_day`, ordered by
    `profile_id`.
  """
  purchases = train_log_0.filter(train_log_0.action_type == 2)

  # Per (profile_id, user_id): number of purchase rows and number of
  # distinct time_stamp values they fall on.
  per_user = (
      purchases
      .groupBy(profile_id, "user_id")
      .agg(
          F.count("action_type").alias("act_cnt"),
          F.countDistinct("time_stamp").alias("day_cnt"),
      )
      .orderBy(profile_id)
  )

  repeat_users = per_user.filter(per_user.act_cnt >= 2)
  return (
      repeat_users
      .groupBy(profile_id)
      .agg(F.sum("day_cnt").alias(profile + "_rpt_byr_day"))
      .orderBy(profile_id)
  )

In [85]:
# Left join: merchants absent from the helper's result get a null
# m_rpt_byr_day.
mer_feature_8 = (
    mer_feature_7
    .join(repeat_day_number("m", "merchant_id"), on="merchant_id", how="left")
    .orderBy("merchant_id")
)

### Age Related Features

In [86]:
def popular_age_gender_purchase(profile, profile_id, related_name):
  """Find, per `profile_id`, the most common `related_name` value among buyers.

  Reads the notebook-level `train_log_0` (rows with `action_type == 2`,
  treated as purchases in this notebook) and joins `train_info_0` on
  `user_id` to obtain the user attribute. "Most common" is measured by the
  number of distinct users carrying each value.

  Args:
    profile: Prefix for the output column name.
    profile_id: Name of the column to group by (e.g. "merchant_id").
    related_name: User attribute to take the mode of; the notebook passes
      "age" and "gender".

  Returns:
    A DataFrame with `profile_id` and `<profile>_popular_<related_name>`,
    ordered by `profile_id`. When several values tie for the highest user
    count, the smallest non-null one is returned.
  """
  # Select via profile_id (not a fixed "merchant_id") so the grouping below
  # works for any id column; dict.fromkeys removes duplicate names.
  needed_cols = list(dict.fromkeys([profile_id, "user_id", related_name]))

  train_log_cnt = train_log_0.filter(train_log_0.action_type == 2) \
                .join(train_info_0, "user_id", "left") \
                .select(*needed_cols) \
                .groupBy(profile_id, related_name) \
                .agg(F.countDistinct("user_id").alias("count"))

  train_log_cnt_max = train_log_cnt \
                    .groupBy(profile_id) \
                    .agg(F.max("count").alias("max_"))

  # Keep only the value(s) that reach the per-group maximum. F.min breaks
  # ties deterministically, whereas F.first depends on row order after the
  # shuffle and can change between runs.
  train_log_mode = train_log_cnt \
                  .join(train_log_cnt_max, profile_id, "left") \
                  .filter(F.col("count") == F.col("max_")) \
                  .groupBy(profile_id) \
                  .agg(F.min(related_name).alias(profile + "_popular_" + related_name)) \
                  .orderBy(profile_id)
  return train_log_mode

In [87]:
# Attach the most common buyer age value per merchant (m_popular_age).
mer_feature_9 = (
    mer_feature_8
    .join(popular_age_gender_purchase("m", "merchant_id", "age"), on="merchant_id", how="left")
    .orderBy("merchant_id")
)

### Gender Related Features

In [88]:
# Attach the most common buyer gender value per merchant (m_popular_gender).
mer_feature_10 = (
    mer_feature_9
    .join(popular_age_gender_purchase("m", "merchant_id", "gender"), on="merchant_id", how="left")
    .orderBy("merchant_id")
)

## Gathering

In [89]:
# Preview the first five rows of the assembled merchant feature table.
mer_feature_10.show(n=5)

+-----------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----

## Save Parquet

In [90]:
# Persist the merchant features as Parquet, replacing any previous export.
# NOTE(review): the target path is relative to the working directory, while
# the inputs were read through /content/drive/... -- confirm the notebook's
# cwd is /content when this cell runs.
(
    mer_feature_10
    .coalesce(50)  # reduce the number of partitions before writing
    .write
    .format("parquet")
    .mode("overwrite")
    .save("./drive/MyDrive/Colab Notebooks/data/feature_mer_new")
)

# Release the local Spark session once the export has finished.
spark.stop()