## Install Spark & Install Packages & Initialize Spark

In [3]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark3.0.0
!wget -q https://www-us.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
# unzip it
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
# install findspark 
!pip install -q findspark

In [4]:
import os

# Point JAVA_HOME and SPARK_HOME at the JDK and at the Spark distribution
# unpacked by the install cell.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.1-bin-hadoop2.7",
})

In [5]:
import findspark
# Make the Spark installation importable from this kernel
# (NOTE(review): findspark is expected to pick it up from SPARK_HOME set above).
findspark.init()

In [6]:
import numpy as np
import pandas as pd 
import warnings
import zipfile
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.font_manager import FontProperties
from pyspark.sql import functions as F
from pyspark.sql import Window

# Lift pandas' display limits so every column and every row is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [7]:
from pyspark.sql import SparkSession

# Build (or reuse) the single local Spark session used by the whole notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Feature Engineering(Counts & Ratio)")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

## Read Data

In [8]:
from google.colab import drive
# Mount Google Drive under /content/drive so the CSV files below are reachable;
# force_remount=True asks Colab to mount again even if a mount already exists.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [9]:
# Load the four input CSVs; each file's first line is treated as the header.
train = spark.read.csv(
    "/content/drive/MyDrive/Colab Notebooks/data/data_format1/train_format1.csv",
    header=True,
)
test = spark.read.csv(
    "/content/drive/MyDrive/Colab Notebooks/data/data_format1/test_format1.csv",
    header=True,
)
train_info = spark.read.csv(
    "/content/drive/MyDrive/Colab Notebooks/data/data_format1/user_info_format1.csv",
    header=True,
)
# The log calls the merchant "seller_id"; rename it to match the other tables.
train_log = (
    spark.read.csv(
        "/content/drive/MyDrive/Colab Notebooks/data/data_format1/user_log_format1.csv",
        header=True,
    )
    .withColumnRenamed("seller_id", "merchant_id")
)

In [10]:
# Clean the user profile: age_range 8 is folded into 7 and stored as "age",
# missing age becomes 0, missing gender becomes 2, and age_range is dropped.
train_info_0 = (
    train_info
    .withColumn("age", F.when(F.col("age_range") == 8, 7).otherwise(F.col("age_range")))
    .fillna({"age": 0, "gender": 2})
    .drop("age_range")
)

In [11]:
# Recode action_type 0 as 1 so both values are counted together downstream
# (the "clk_atc" features). The recoded column is built under a temporary name,
# the original is dropped, and the temporary column takes over its name.
train_log_0 = (
    train_log
    .withColumn(
        "action_type_tmp",
        F.when(F.col("action_type") == 0, 1).otherwise(F.col("action_type")),
    )
    .drop("action_type")
    .withColumnRenamed("action_type_tmp", "action_type")
)

## Show Data

In [None]:
# Preview the training pairs (user_id, merchant_id, label).
train.show()

+-------+-----------+-----+
|user_id|merchant_id|label|
+-------+-----------+-----+
|  34176|       3906|    0|
|  34176|        121|    0|
|  34176|       4356|    1|
|  34176|       2217|    0|
| 230784|       4818|    0|
| 362112|       2618|    0|
|  34944|       2051|    0|
| 231552|       3828|    1|
| 231552|       2124|    0|
| 232320|       1168|    0|
| 232320|       4270|    0|
| 167040|        671|    0|
| 101760|       1760|    0|
| 298368|       2981|    0|
|  36480|       4730|    0|
| 299136|       2935|    0|
|  37248|       2615|    0|
| 103296|       2482|    0|
| 299904|       1742|    0|
|  38016|       1028|    0|
+-------+-----------+-----+
only showing top 20 rows



In [None]:
# Preview the raw user profile table (before the age/gender clean-up above).
train_info.show()

+-------+---------+------+
|user_id|age_range|gender|
+-------+---------+------+
| 376517|        6|     1|
| 234512|        5|     0|
| 344532|        5|     0|
| 186135|        5|     0|
|  30230|        5|     0|
| 272389|        6|     1|
| 281071|        4|     0|
| 139859|        7|     0|
| 198411|        5|     1|
|  67037|        4|     1|
| 149002|        5|     2|
|   7468|        4|     0|
|  94292|        4|     0|
| 347414|        6|     1|
| 191719|        4|     0|
| 391524|        5|     1|
| 153790|        6|     0|
| 349112|        3|     1|
| 344766|        6|     0|
|  81816|        5|     0|
+-------+---------+------+
only showing top 20 rows



In [None]:
# Preview the raw log ordered by merchant, then user.
# NOTE(review): the CSV is read without a schema, so the ids are presumably
# strings and this ordering is lexicographic, not numeric — confirm.
train_log.orderBy("merchant_id", "user_id").show()

+-------+-------+------+-----------+--------+----------+-----------+
|user_id|item_id|cat_id|merchant_id|brand_id|time_stamp|action_type|
+-------+-------+------+-----------+--------+----------+-----------+
|    100|1041304|   420|          1|    1662|      1108|          0|
|    100| 472260|   420|          1|    1662|      1108|          0|
|    100| 912479|   993|          1|    1662|      1018|          0|
|    100|  24620|   420|          1|    1662|      1008|          3|
|    100| 912479|   993|          1|    1662|      1016|          0|
|    100|1008023|   629|          1|    1662|      1008|          0|
|    100| 912479|   993|          1|    1662|      1016|          0|
|    100| 918789|   629|          1|    1662|      1008|          0|
|    100| 912479|   993|          1|    1662|      1016|          0|
|    100| 912479|   993|          1|    1662|      1020|          0|
|    100|  83998|   420|          1|    1662|      1108|          0|
|    100|1008023|   629|          

## Feature Engineering (U-M Profile)

### Count Features

In [12]:
# rows to columns (pivot)
from pyspark.sql import functions as F

def dfPivot(df, keys, column):
    '''
    Count rows per key and per distinct value of `column`, producing one
    output column per distinct value.

    params:
      df: input dataframe
      keys: key columns to keep (group by), passed as a list
      column: name of the column whose values become the new columns

    returns:
      dataframe with the key columns plus one count column per value of `column`
    '''
    grouped = df.groupBy(keys)
    counted = grouped.pivot(column).agg(F.count(column))
    # A key that never had a given value comes out of the pivot as null;
    # report that as a count of 0.
    return counted.fillna(0)

In [13]:
keys = ["user_id", "merchant_id", "time_stamp"]
column = "action_type"

# Action counts per (user, merchant, time_stamp). The pivoted columns are named
# after the action_type values, so give them readable names.
act_cnt_ratio_0 = (
    dfPivot(train_log_0, keys, column)
    .withColumnRenamed("1", "clk_atc_cnt")
    .withColumnRenamed("2", "buy_cnt")
    .withColumnRenamed("3", "atf_cnt")
)

In [14]:
def monthly_action_counts(profile, profile_id, act_cnt_log):
  """Sum the action counts per profile, overall and per calendar month.

  params:
    profile: prefix for the generated column names (e.g. "um")
    profile_id: column name, or list of column names, to group by
    act_cnt_log: dataframe with a `time_stamp` column (compared as a string
      against "MMDD" bounds such as "0501") and the count columns
      `clk_atc_cnt`, `buy_cnt` and `atf_cnt`

  returns:
    dataframe with one row per `profile_id` value, holding
    `<profile>_ttl_<action>_cnt` (sum over all rows) followed by
    `<profile>_<action>_cnt_m5` ... `_m11` (sum over the rows of that month),
    for the actions clk_atc, buy and atf.
  """
  actions = ("clk_atc", "buy", "atf")
  # (column suffix, first day, last day); None means "no upper bound",
  # which is how month 11 is selected.
  month_windows = (
      ("m5", "0501", "0531"),
      ("m6", "0601", "0630"),
      ("m7", "0701", "0731"),
      ("m8", "0801", "0831"),
      ("m9", "0901", "0930"),
      ("m10", "1001", "1031"),
      ("m11", "1101", None),
  )

  # Monthly filter: copy the count into the month's column when the row falls
  # inside that month, otherwise contribute 0.
  day = act_cnt_log.time_stamp
  monthly = act_cnt_log
  for action in actions:
    source = act_cnt_log[action + "_cnt"]
    for suffix, first_day, last_day in month_windows:
      in_month = day >= first_day
      if last_day is not None:
        in_month = in_month & (day <= last_day)
      monthly = monthly.withColumn(
          profile + "_" + action + "_cnt_" + suffix,
          F.when(in_month, source).otherwise(0))

  # sum() without arguments sums every numeric non-grouping column and names
  # each result "sum(<column>)"; the loops below restore readable names.
  summed = monthly.groupBy(profile_id).sum()
  for action in actions:
    summed = summed.withColumnRenamed(
        "sum(" + action + "_cnt)", profile + "_ttl_" + action + "_cnt")
  for action in actions:
    for suffix, _, _ in month_windows:
      name = profile + "_" + action + "_cnt_" + suffix
      summed = summed.withColumnRenamed("sum(" + name + ")", name)
  return summed

In [15]:
# Total and monthly action counts for the "um" profile, grouped by
# (user_id, merchant_id) and ordered by merchant for display.
act_cnt_ratio_1 = monthly_action_counts("um", ["user_id","merchant_id"], act_cnt_ratio_0).orderBy("merchant_id")

act_cnt_ratio_1.show(10)

+-------+-----------+------------------+--------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+
|user_id|merchant_id|um_ttl_clk_atc_cnt|um_ttl_buy_cnt|um_ttl_atf_cnt|um_clk_atc_cnt_m5|um_clk_atc_cnt_m6|um_clk_atc_cnt_m7|um_clk_atc_cnt_m8|um_clk_atc_cnt_m9|um_clk_atc_cnt_m10|um_clk_atc_cnt_m11|um_buy_cnt_m5|um_buy_cnt_m6|um_buy_cnt_m7|um_buy_cnt_m8|um_buy_cnt_m9|um_buy_cnt_m10|um_buy_cnt_m11|um_atf_cnt_m5|um_atf_cnt_m6|um_atf_cnt_m7|um_atf_cnt_m8|um_atf_cnt_m9|um_atf_cnt_m10|um_atf_cnt_m11|
+-------+-----------+------------------+--------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+----

### Ratio Features

In [16]:
def monthly_action_ratio(profile, act_cnt_log):
  """Add each action's share of the profile's activity, overall and per month.

  params:
    profile: column-name prefix used when the counts were built (e.g. "um")
    act_cnt_log: dataframe holding `<profile>_ttl_<action>_cnt` and
      `<profile>_<action>_cnt_m5` ... `_m11` for the actions clk_atc, buy, atf

  returns:
    act_cnt_log plus `<profile>_ttl_<action>_ratio` and
    `<profile>_<action>_ratio_m5` ... `_m11`, each rounded to 2 decimals.
    Remaining nulls in numeric columns are replaced by 0.
  """
  actions = ("clk_atc", "buy", "atf")
  months = ("m5", "m6", "m7", "m8", "m9", "m10", "m11")

  def cnt_col(action, month=None):
    # Count column of one action: all-time when month is None, else that month.
    if month is None:
      return profile + "_ttl_" + action + "_cnt"
    return profile + "_" + action + "_cnt_" + month

  def ratio_col(action, month=None):
    # Ratio column matching cnt_col(action, month).
    if month is None:
      return profile + "_ttl_" + action + "_ratio"
    return profile + "_" + action + "_ratio_" + month

  def share(action, month=None):
    # The denominator adds exactly this profile's three count columns for the
    # period. Matching columns by substring (e.g. "_cnt_m5") would also pick up
    # unrelated columns such as "<x>_day_cnt_m5" or another profile's counts.
    total = sum(F.col(cnt_col(other, month)) for other in actions)
    return F.round(F.col(cnt_col(action, month)) / total, 2)

  # All-time ratios first, then each action's months, so the output column
  # order stays the same as before.
  periods = [(action, None) for action in actions]
  periods += [(action, month) for action in actions for month in months]

  with_ratios = act_cnt_log
  for action, month in periods:
    with_ratios = with_ratios.withColumn(ratio_col(action, month), share(action, month))

  # A period without any action has a zero denominator; the resulting null
  # ratio is reported as 0.
  return with_ratios.fillna(0)

In [17]:
# Append the ratio features to the "um" count features, ordered by merchant.
act_cnt_ratio_2 = monthly_action_ratio("um", act_cnt_ratio_1).orderBy("merchant_id")

# +-------+-----------+------------------+--------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------------+----------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+
# |user_id|merchant_id|um_ttl_clk_atc_cnt|um_ttl_buy_cnt|um_ttl_atf_cnt|um_clk_atc_cnt_m5|um_clk_atc_cnt_m6|um_clk_atc_cnt_m7|um_clk_atc_cnt_m8|um_clk_atc_cnt_m9|um_clk_atc_cnt_m10|um_clk_atc_cnt_m11|um_buy_cnt_m5|um_buy_cnt_m6|um_buy_cnt_m7|um_buy_cnt_m8|um_buy_cnt_m9|um_buy_cnt_m10|um_buy_cnt_m11|um_atf_cnt_m5|um_atf_cnt_m6|um_atf_cnt_m7|um_atf_cnt_m8|um_atf_cnt_m9|um_atf_cnt_m10|um_atf_cnt_m11|um_ttl_clk_atc_ratio|um_ttl_buy_ratio|um_ttl_atf_ratio|um_clk_atc_ratio_m5|um_clk_atc_ratio_m6|um_clk_atc_ratio_m7|um_clk_atc_ratio_m8|um_clk_atc_ratio_m9|um_clk_atc_ratio_m10|um_clk_atc_ratio_m11|um_buy_ratio_m5|um_buy_ratio_m6|um_buy_ratio_m7|um_buy_ratio_m8|um_buy_ratio_m9|um_buy_ratio_m10|um_buy_ratio_m11|um_atf_ratio_m5|um_atf_ratio_m6|um_atf_ratio_m7|um_atf_ratio_m8|um_atf_ratio_m9|um_atf_ratio_m10|um_atf_ratio_m11|
# +-------+-----------+------------------+--------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------------+----------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+
# | 145924|          1|                 7|             0|             0|                0|                0|                0|                1|                2|                 4|                 0|            0|            0|            0|            0|            0|             0|             0|            0|            0|            0|            0|            0|             0|             0|                 1.0|             0.0|             0.0|                0.0|                0.0|                0.0|                1.0|                1.0|                 1.0|                 0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|
# | 351067|          1|                 4|             6|             0|                0|                0|                0|                4|                0|                 0|                 0|            0|            0|            0|            3|            0|             0|             3|            0|            0|            0|            0|            0|             0|             0|                 0.4|             0.6|             0.0|                0.0|                0.0|                0.0|               0.57|                0.0|                 0.0|                 0.0|            0.0|            0.0|            0.0|           0.43|            0.0|             0.0|             1.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|
# | 108303|          1|                 3|             0|             0|                0|                0|                0|                0|                2|                 0|                 1|            0|            0|            0|            0|            0|             0|             0|            0|            0|            0|            0|            0|             0|             0|                 1.0|             0.0|             0.0|                0.0|                0.0|                0.0|                0.0|                1.0|                 0.0|                 1.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|
# | 286262|          1|                 4|             0|             0|                0|                0|                0|                0|                0|                 0|                 4|            0|            0|            0|            0|            0|             0|             0|            0|            0|            0|            0|            0|             0|             0|                 1.0|             0.0|             0.0|                0.0|                0.0|                0.0|                0.0|                0.0|                 0.0|                 1.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|
# |  20195|          1|                13|             0|             0|                0|                0|                0|                0|                0|                 0|                13|            0|            0|            0|            0|            0|             0|             0|            0|            0|            0|            0|            0|             0|             0|                 1.0|             0.0|             0.0|                0.0|                0.0|                0.0|                0.0|                0.0|                 0.0|                 1.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|
# | 137847|          1|               125|             9|             2|                0|                0|                0|                0|                0|                65|                60|            0|            0|            0|            0|            0|             3|             6|            0|            0|            0|            0|            0|             2|             0|                0.92|            0.07|            0.01|                0.0|                0.0|                0.0|                0.0|                0.0|                0.93|                0.91|            0.0|            0.0|            0.0|            0.0|            0.0|            0.04|            0.09|            0.0|            0.0|            0.0|            0.0|            0.0|            0.03|             0.0|
# |  55482|          1|                19|             0|             0|                1|                0|                5|               13|                0|                 0|                 0|            0|            0|            0|            0|            0|             0|             0|            0|            0|            0|            0|            0|             0|             0|                 1.0|             0.0|             0.0|                1.0|                0.0|                1.0|                1.0|                0.0|                 0.0|                 0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|
# |  27553|          1|                43|             3|             0|                5|                2|               13|                0|                0|                 0|                23|            2|            0|            0|            0|            0|             0|             1|            0|            0|            0|            0|            0|             0|             0|                0.93|            0.07|             0.0|               0.71|                1.0|                1.0|                0.0|                0.0|                 0.0|                0.96|           0.29|            0.0|            0.0|            0.0|            0.0|             0.0|            0.04|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|
# | 142023|          1|                 4|             0|             0|                0|                4|                0|                0|                0|                 0|                 0|            0|            0|            0|            0|            0|             0|             0|            0|            0|            0|            0|            0|             0|             0|                 1.0|             0.0|             0.0|                0.0|                1.0|                0.0|                0.0|                0.0|                 0.0|                 0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|
# |   4394|          1|                29|             1|             1|                0|               10|                1|                2|                4|                 6|                 6|            0|            1|            0|            0|            0|             0|             0|            1|            0|            0|            0|            0|             0|             0|                0.94|            0.03|            0.03|                0.0|               0.91|                1.0|                1.0|                1.0|                 1.0|                 1.0|            0.0|           0.09|            0.0|            0.0|            0.0|             0.0|             0.0|            1.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|
# +-------+-----------+------------------+--------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------------+----------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+


+-------+-----------+------------------+--------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------------+----------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+
|user_id|merchant_id|um_ttl_clk_atc_cnt|um_ttl_buy_cnt|um_ttl_atf_cnt|um_clk_atc_cnt_m5|um_clk_atc_cnt_m6|um_clk_atc_cnt_m7|um_clk_atc_cnt_m8|um_clk_atc_cnt_m9|um_clk_atc_cnt_m

### Day Counts Features

In [18]:
# Number of distinct time_stamp values on which each (user, merchant) pair
# performed each action type.
# The aggregate is aliased directly: Spark names a countDistinct column
# "count(DISTINCT time_stamp)", so renaming "count(time_stamp)" afterwards
# would silently match nothing and leave no "day_cnt" column.
act_day_cnt_0 = (
    train_log_0
    .groupBy("user_id", "merchant_id", "action_type")
    .agg(F.countDistinct("time_stamp").alias("day_cnt"))
)

In [19]:
from pyspark.sql import functions as F

def dfPivot_value(df, keys, column, column_value):
    '''
    Pivot `column` into columns, filling each cell from `column_value`.

    params:
      df: input dataframe
      keys: key columns to keep (group by), passed as a list
      column: name of the column whose values become the new columns
      column_value: name of the column supplying the cell values; the first
        non-null value of each group is used

    returns:
      dataframe with the key columns plus one column per value of `column`
    '''
    first_value = F.first(column_value, ignorenulls=True)
    pivoted = df.groupBy(keys).pivot(column).agg(first_value)
    # A key/value combination that never occurs comes out of the pivot as
    # null; replace those nulls with 0.
    return pivoted.fillna(0)

In [20]:
keys = ["user_id", "merchant_id"]
column = "action_type"
column_value = "day_cnt"

# One row per (user, merchant) with the day count of each action type in its
# own column; the pivoted columns are named after the action_type values.
act_day_cnt_1 = (
    dfPivot_value(act_day_cnt_0, keys, column, column_value)
    .withColumnRenamed("1", "um_clk_atc_day_cnt")
    .withColumnRenamed("2", "um_buy_day_cnt")
    .withColumnRenamed("3", "um_atf_day_cnt")
)


In [21]:
from pyspark.sql.types import *
def dt_mth(time_stamp_col):
    """Return a Column holding the first two characters of `time_stamp_col`.

    Uses the built-in substring instead of a Python UDF, so a null time stamp
    yields null rather than raising inside the UDF.
    `time_stamp_col` may be a column name or a Column.
    NOTE(review): presumably time_stamp is an "MMDD" string, making this the month - confirm.
    """
    return F.substring(time_stamp_col, 1, 2)

# Attach the two-character prefix of time_stamp as time_stamp_mth.
train_log_1 = (
    train_log_0
    .withColumn("time_stamp_mth", dt_mth("time_stamp"))
    .select("user_id", "merchant_id", "time_stamp_mth", "time_stamp", "action_type")
)

# Distinct time_stamp values per (user, merchant, month prefix, action type).
# The aggregate is aliased directly rather than renamed from its generated name,
# because withColumnRenamed silently does nothing if that name is absent.
act_day_cnt_2 = (
    train_log_1
    .groupBy("user_id", "merchant_id", "time_stamp_mth", "action_type")
    .agg(F.countDistinct("time_stamp").alias("day_cnt"))
    .orderBy("user_id", "merchant_id", "time_stamp_mth")
)

In [22]:
def monthly_day_counts(profile, profile_id, df, months=("05", "06", "07", "08", "09", "10", "11")):
  """Sum `day_cnt` per month and action type into one wide row per `profile_id`.

  params:
    profile: prefix for the generated feature names (e.g. "um")
    profile_id: grouping column(s), passed as a list
    df: dataframe with columns time_stamp_mth, action_type and day_cnt
    months: time_stamp_mth values to build features for; a value such as "05"
            produces the column suffix "_m5"
  returns:
    dataframe with the profile_id columns followed by
    <profile>_<action>_day_cnt_m<month> for action in clk_atc ("1"), buy ("2"),
    atf ("3"), in that order, each action listing the months in the given order

  Only the listed features are aggregated; the original summed every numeric
  column and dropped the unwanted "sum(day_cnt)" afterwards.
  """
  actions = (("clk_atc", "1"), ("buy", "2"), ("atf", "3"))

  aggregations = []
  for action_name, action_code in actions:
    for month in months:
      feature = "{}_{}_day_cnt_m{}".format(profile, action_name, int(month))
      matches = (F.col("time_stamp_mth") == month) & (F.col("action_type") == action_code)
      # Rows of other months/actions contribute 0 to this feature's sum.
      aggregations.append(F.sum(F.when(matches, F.col("day_cnt")).otherwise(0)).alias(feature))

  day_cnt_1 = df.groupBy(profile_id).agg(*aggregations)
  return day_cnt_1

In [23]:
# Monthly day-count features per (user, merchant), sorted by the key columns.
act_day_cnt_3 = (
    monthly_day_counts("um", ["user_id", "merchant_id"], act_day_cnt_2)
    .orderBy("user_id", "merchant_id")
)

act_day_cnt_3.show(n=10)

+-------+-----------+---------------------+---------------------+---------------------+---------------------+---------------------+----------------------+----------------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+
|user_id|merchant_id|um_clk_atc_day_cnt_m5|um_clk_atc_day_cnt_m6|um_clk_atc_day_cnt_m7|um_clk_atc_day_cnt_m8|um_clk_atc_day_cnt_m9|um_clk_atc_day_cnt_m10|um_clk_atc_day_cnt_m11|um_buy_day_cnt_m5|um_buy_day_cnt_m6|um_buy_day_cnt_m7|um_buy_day_cnt_m8|um_buy_day_cnt_m9|um_buy_day_cnt_m10|um_buy_day_cnt_m11|um_atf_day_cnt_m5|um_atf_day_cnt_m6|um_atf_day_cnt_m7|um_atf_day_cnt_m8|um_atf_day_cnt_m9|um_atf_day_cnt_m10|um_atf_day_cnt_m11|
+-------+-----------+---------------------+---------------------+---------------------+---------------------+---------------------+-

In [24]:
# Left-join both day-count tables onto the count/ratio features by (user, merchant).
um_feature_0 = (
    act_cnt_ratio_2
    .join(act_day_cnt_1, on=["user_id", "merchant_id"], how="left")
    .join(act_day_cnt_3, on=["user_id", "merchant_id"], how="left")
    .orderBy("merchant_id")
)

### Product Diversity Features

In [25]:
# Distinct items / categories / brands / merchants per (user, merchant, action type).
# merchant_id is itself a grouping key here, so mer_cnt is 1 for every row at this level.
act_prod_div0 = (
    train_log_0
    .groupBy("user_id", "merchant_id", "action_type")
    .agg(
        F.countDistinct("item_id").alias("item_cnt"),
        F.countDistinct("cat_id").alias("cat_cnt"),
        F.countDistinct("brand_id").alias("brd_cnt"),
        F.countDistinct("merchant_id").alias("mer_cnt"),
    )
)

In [26]:
def prod_diversity(profile, profile_id, df):
  """Sum the distinct-count columns per action type into one wide row per `profile_id`.

  params:
    profile: prefix for the generated feature names (e.g. "um")
    profile_id: grouping column(s), passed as a list
    df: dataframe with columns action_type, item_cnt, cat_cnt, brd_cnt and mer_cnt
  returns:
    dataframe with the profile_id columns followed by
    <profile>_<action>_<entity>_div for entity in item, cat, brd, mer and, within
    each entity, action in clk_atc ("1"), buy ("2"), atf ("3")

  Only the listed features are aggregated; the original summed every numeric
  column and dropped the raw "sum(*_cnt)" columns afterwards.
  """
  actions = (("clk_atc", "1"), ("buy", "2"), ("atf", "3"))
  entities = ("item", "cat", "brd", "mer")

  aggregations = []
  for entity in entities:
    for action_name, action_code in actions:
      feature = "{}_{}_{}_div".format(profile, action_name, entity)
      # Rows of other action types contribute 0 to this feature's sum.
      selected = F.when(F.col("action_type") == action_code, F.col(entity + "_cnt")).otherwise(0)
      aggregations.append(F.sum(selected).alias(feature))

  prod_div_1 = df.groupBy(profile_id).agg(*aggregations)
  return prod_div_1

In [27]:
# Diversity features per (user, merchant), sorted by the key columns.
act_prod_div1 = (
    prod_diversity("um", ["user_id", "merchant_id"], act_prod_div0)
    .orderBy("user_id", "merchant_id")
)

In [28]:
# Left-join the diversity features onto the (user, merchant) feature table.
um_feature_1 = um_feature_0.join(
    act_prod_div1,
    on=["user_id", "merchant_id"],
    how="left",
)

um_feature_1.show(n=10)

+-------+-----------+------------------+--------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------------+----------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+------------------+--------------+--------------+---------------------+---------------------+---------------------+---------------------+---------------------+------------------

### Monthly Aggregation Features

In [29]:
# Scratch example (left commented out): apply a row-wise helper to a toy DataFrame.
# df = spark.createDataFrame([(1, 2, 3, 4), (1, 4, 100, 5), (20, 30, 50, 10)], ['a', 'b', 'c', 'd'])
# df1 = df.withColumn("max", ssy_maximum(*(df.columns[0:4])))
# df1.show()

In [30]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import array, udf, array_sort, floor, col, size, sqrt, greatest
from pyspark.sql import Column

# row-wise average over the given columns
def ssy_mean(*args):
    """Return a Column with the mean of `args` per row, rounded to 2 decimals.

    Each argument is a column name (str) or a Column; anything else raises TypeError.
    """
    def _as_column(value):
        if isinstance(value, Column):
            return value
        if isinstance(value, str):
            return col(value)
        raise TypeError("args should be str or Column, got {}".format(type(value)))

    columns = [_as_column(arg) for arg in args]
    total = sum(columns)
    return F.round(total / len(columns), 2)

# function that calculates the row-wise percentile
def ssy_percentile(p, *args):
    """Return a Column with the p-th quantile of `args` per row, rounded to 2 decimals.

    params:
      p: quantile as a fraction (0.5 is the median)
      args: column names (str) or Columns; anything else raises TypeError

    The values are sorted per row and the result is linearly interpolated
    between the two elements around the fractional rank (n - 1) * p.
    """
    def col_(c):
        if isinstance(c, Column):
            return c
        elif isinstance(c, str):
            return col(c)
        else:
            raise TypeError("args should be str or Column, got {}".format(type(c)))

    xs = array_sort(array(*[col_(x) for x in args]))
    n = size(xs)
    h = (n - 1) * p
    i = floor(h).cast("int")
    # Clamp the upper neighbour to the last element: when the rank lands on the
    # final position (e.g. p = 1.0) xs[i + 1] would be past the end of the array
    # and turn the whole result into null.
    j = F.least(i + 1, n - 1)
    x0, x1 = xs[i], xs[j]
    return F.round((x0 + (h - i) * (x1 - x0)), 2)

# row-wise sample standard deviation over the given columns
def ssy_std(*args):
    """Return a Column with the standard deviation of `args` per row, rounded to 2 decimals.

    The squared deviations are divided by (number of columns - 1).
    Each argument is a column name (str) or a Column; anything else raises TypeError.
    """
    def _as_column(value):
        if isinstance(value, Column):
            return value
        if isinstance(value, str):
            return col(value)
        raise TypeError("args should be str or Column, got {}".format(type(value)))

    columns = [_as_column(arg) for arg in args]
    count = len(columns)
    mean = sum(columns) / count
    squared_deviations = [(column - mean) ** 2 for column in columns]
    variance = sum(squared_deviations) / (count - 1)
    return F.round(sqrt(variance), 2)

# row-wise maximum over the given columns
def ssy_maximum(*args):
    """Return a Column with the largest of `args` per row, rounded to 2 decimals.

    Each argument is a column name (str) or a Column; anything else raises TypeError.
    """
    def _as_column(value):
        if isinstance(value, Column):
            return value
        if isinstance(value, str):
            return col(value)
        raise TypeError("args should be str or Column, got {}".format(type(value)))

    columns = [_as_column(arg) for arg in args]
    return F.round(greatest(*columns), 2)

In [31]:
def monthly_aggregation(profile, df):
  """Append row-wise mean / median / std / max of the monthly count columns.

  params:
    profile: prefix for the generated feature names (e.g. "um")
    df: feature dataframe; the monthly columns are picked by position
  returns:
    df with <profile>_<action>_<stat> appended, for stat in mean, median, std,
    max and, within each stat, action in clk_atc, buy, atf

  All four statistics of an action now use the same columns. The original
  computed the clk_atc median over df.columns[4:10] while the other clk_atc
  statistics used df.columns[5:11].
  """
  # Positional slices of the per-action monthly columns.
  # NOTE(review): this depends on the column order of df. If each action has
  # seven monthly columns (m5..m11) these slices stop one short of the last
  # month - confirm that is intended.
  monthly_cols = {
      "clk_atc": df.columns[5:11],
      "buy": df.columns[12:18],
      "atf": df.columns[19:25],
  }
  statistics = (
      ("mean", ssy_mean),
      ("median", lambda *cols: ssy_percentile(0.5, *cols)),
      ("std", ssy_std),
      ("max", ssy_maximum),
  )

  mth_agg_0 = df
  for stat_name, stat_fn in statistics:
    for action in ("clk_atc", "buy", "atf"):
      feature = "{}_{}_{}".format(profile, action, stat_name)
      mth_agg_0 = mth_agg_0.withColumn(feature, stat_fn(*monthly_cols[action]))
  return mth_agg_0

In [32]:
# Add the row-wise monthly statistics and sort by the key columns.
um_feature_2 = (
    monthly_aggregation("um", um_feature_1)
    .orderBy("user_id", "merchant_id")
)

um_feature_2.show(n=10)

+-------+-----------+------------------+--------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------------+----------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+------------------+--------------+--------------+---------------------+---------------------+---------------------+---------------------+---------------------+------------------

### Recent Features

In [33]:
def recent_action_counts(profile, profile_id, act_cnt_log):
  """Sum the action counts on time_stamp "1111" and on "1104".."1110" per `profile_id`.

  params:
    profile: prefix for the generated feature names (e.g. "um")
    profile_id: grouping column(s), passed as a list
    act_cnt_log: dataframe with columns time_stamp, clk_atc_cnt, buy_cnt and atf_cnt
  returns:
    dataframe with the profile_id columns followed by
    <profile>_<action>_cnt_double11 and then <profile>_<action>_cnt_1wpre_double11,
    each for action in clk_atc, buy, atf

  Only these six features are aggregated. The original summed every numeric
  column and then dropped the totals under hard-coded "um_ttl_*" names, so the
  totals were left in the result for any other profile.
  """
  time_stamp = F.col("time_stamp")
  windows = (
      ("double11", time_stamp == "1111"),
      ("1wpre_double11", (time_stamp >= "1104") & (time_stamp <= "1110")),
  )
  actions = ("clk_atc", "buy", "atf")

  aggregations = []
  for window_name, in_window in windows:
    for action in actions:
      feature = "{}_{}_cnt_{}".format(profile, action, window_name)
      # Rows outside the window contribute 0 to this feature's sum.
      selected = F.when(in_window, F.col(action + "_cnt")).otherwise(0)
      aggregations.append(F.sum(selected).alias(feature))

  act_cnt_1 = act_cnt_log.groupBy(profile_id).agg(*aggregations)
  return act_cnt_1

In [34]:
# Left-join the double-11 / week-before counts onto the feature table.
um_feature_3 = (
    um_feature_2
    .join(
        recent_action_counts("um", ["user_id", "merchant_id"], act_cnt_ratio_0),
        on=["user_id", "merchant_id"],
        how="left",
    )
    .orderBy("user_id", "merchant_id")
)

In [35]:
def _sum_profile_count_columns(frame, profile, tag):
  """Return a Column adding up the `<profile>_...<tag>...` columns of `frame`.

  Only columns whose name starts with `profile + "_"` are considered, so count
  columns belonging to another profile in the same frame never leak into the
  total.

  Raises:
    ValueError: if no column of `frame` matches.
  """
  prefix = profile + "_"
  matching = [name for name in frame.columns
              if name.startswith(prefix) and tag in name]
  if not matching:
    raise ValueError(
        "no '" + prefix + "*" + tag + "' columns found to build the ratio denominator")
  # Built-in sum() starts from 0, which yields the expression 0 + col_a + col_b + ...
  return sum(frame[name] for name in matching)


def recent_action_ratio(profile, act_cnt_log):
  """Add per-action share columns for Double 11 and the week before it.

  For each action in (clk_atc, buy, atf) and each window in
  (double11, 1wpre_double11) a column `<profile>_<action>_ratio_<window>` is
  appended, holding `<profile>_<action>_cnt_<window>` divided by the sum of the
  profile's `_cnt_<window>` columns, rounded to 2 decimals.

  Args:
    profile: column-name prefix (e.g. "um") of the count columns to use.
    act_cnt_log: DataFrame that already carries the `<profile>_*_cnt_double11`
      and `<profile>_*_cnt_1wpre_double11` columns.

  Returns:
    `act_cnt_log` plus the six ratio columns, with nulls replaced by 0.

  Raises:
    ValueError: if `act_cnt_log` has no count columns for `profile`.
  """
  windows = ("double11", "1wpre_double11")
  actions = ("clk_atc", "buy", "atf")

  # Temporary denominators: total number of actions inside each window.
  with_totals = act_cnt_log
  for window in windows:
    with_totals = with_totals.withColumn(
        "ttl_cnt_" + window,
        _sum_profile_count_columns(act_cnt_log, profile, "_cnt_" + window))

  # Action-major / window-minor order keeps the output schema unchanged.
  with_ratios = with_totals
  for action in actions:
    for window in windows:
      share = (with_totals[profile + "_" + action + "_cnt_" + window]
               / with_totals["ttl_cnt_" + window])
      with_ratios = with_ratios.withColumn(
          profile + "_" + action + "_ratio_" + window, F.round(share, 2))

  # A zero total makes the division null (Spark's default, non-ANSI behaviour),
  # which fillna(0) turns into a 0 ratio. NOTE: fillna(0) is applied to the whole
  # frame, so nulls in any other numeric column are replaced with 0 as well.
  return with_ratios \
      .drop("ttl_cnt_double11", "ttl_cnt_1wpre_double11") \
      .fillna(0)

In [36]:
# Append the Double 11 / week-before ratio columns and keep rows sorted by key.
um_feature_4 = (
    recent_action_ratio("um", um_feature_3)
    .orderBy("user_id", "merchant_id")
)

## Gathering

In [37]:
# Preview the first five rows of the assembled user-merchant feature table.
um_feature_4.show(n=5)

# +-------+-----------+------------------+--------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------------+----------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+------------------+--------------+--------------+---------------------+---------------------+---------------------+---------------------+---------------------+----------------------+----------------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------------+---------------+---------------+------------------+--------------+--------------+------------------+--------------+--------------+------------------+--------------+--------------+---------------+-----------+-----------+-----------------+-------------+-------------+--------------+----------+----------+--------------+----------+----------+-----------------------+-------------------+-------------------+-----------------------------+-------------------------+-------------------------+-------------------------+-------------------------------+---------------------+---------------------------+---------------------+---------------------------+
# |user_id|merchant_id|um_ttl_clk_atc_cnt|um_ttl_buy_cnt|um_ttl_atf_cnt|um_clk_atc_cnt_m5|um_clk_atc_cnt_m6|um_clk_atc_cnt_m7|um_clk_atc_cnt_m8|um_clk_atc_cnt_m9|um_clk_atc_cnt_m10|um_clk_atc_cnt_m11|um_buy_cnt_m5|um_buy_cnt_m6|um_buy_cnt_m7|um_buy_cnt_m8|um_buy_cnt_m9|um_buy_cnt_m10|um_buy_cnt_m11|um_atf_cnt_m5|um_atf_cnt_m6|um_atf_cnt_m7|um_atf_cnt_m8|um_atf_cnt_m9|um_atf_cnt_m10|um_atf_cnt_m11|um_ttl_clk_atc_ratio|um_ttl_buy_ratio|um_ttl_atf_ratio|um_clk_atc_ratio_m5|um_clk_atc_ratio_m6|um_clk_atc_ratio_m7|um_clk_atc_ratio_m8|um_clk_atc_ratio_m9|um_clk_atc_ratio_m10|um_clk_atc_ratio_m11|um_buy_ratio_m5|um_buy_ratio_m6|um_buy_ratio_m7|um_buy_ratio_m8|um_buy_ratio_m9|um_buy_ratio_m10|um_buy_ratio_m11|um_atf_ratio_m5|um_atf_ratio_m6|um_atf_ratio_m7|um_atf_ratio_m8|um_atf_ratio_m9|um_atf_ratio_m10|um_atf_ratio_m11|um_clk_atc_day_cnt|um_buy_day_cnt|um_atf_day_cnt|um_clk_atc_day_cnt_m5|um_clk_atc_day_cnt_m6|um_clk_atc_day_cnt_m7|um_clk_atc_day_cnt_m8|um_clk_atc_day_cnt_m9|um_clk_atc_day_cnt_m10|um_clk_atc_day_cnt_m11|um_buy_day_cnt_m5|um_buy_day_cnt_m6|um_buy_day_cnt_m7|um_buy_day_cnt_m8|um_buy_day_cnt_m9|um_buy_day_cnt_m10|um_buy_day_cnt_m11|um_atf_day_cnt_m5|um_atf_day_cnt_m6|um_atf_day_cnt_m7|um_atf_day_cnt_m8|um_atf_day_cnt_m9|um_atf_day_cnt_m10|um_atf_day_cnt_m11|um_clk_atc_item_div|um_buy_item_div|um_atf_item_div|um_clk_atc_cat_div|um_buy_cat_div|um_atf_cat_div|um_clk_atc_brd_div|um_buy_brd_div|um_atf_brd_div|um_clk_atc_mer_div|um_buy_mer_div|um_atf_mer_div|um_clk_atc_mean|um_buy_mean|um_atf_mean|um_clk_atc_median|um_buy_median|um_atf_median|um_clk_atc_std|um_buy_std|um_atf_std|um_clk_atc_max|um_buy_max|um_atf_max|um_clk_atc_cnt_double11|um_buy_cnt_double11|um_atf_cnt_double11|um_clk_atc_cnt_1wpre_double11|um_buy_cnt_1wpre_double11|um_atf_cnt_1wpre_double11|um_clk_atc_ratio_double11|um_clk_atc_ratio_1wpre_double11|um_buy_ratio_double11|um_buy_ratio_1wpre_double11|um_atf_ratio_double11|um_atf_ratio_1wpre_double11|
# +-------+-----------+------------------+--------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------------+----------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+------------------+--------------+--------------+---------------------+---------------------+---------------------+---------------------+---------------------+----------------------+----------------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------------+---------------+---------------+------------------+--------------+--------------+------------------+--------------+--------------+------------------+--------------+--------------+---------------+-----------+-----------+-----------------+-------------+-------------+--------------+----------+----------+--------------+----------+----------+-----------------------+-------------------+-------------------+-----------------------------+-------------------------+-------------------------+-------------------------+-------------------------------+---------------------+---------------------------+---------------------+---------------------------+
# |      1|       1019|                10|             4|             0|                0|                0|                0|                0|                0|                 0|                10|            0|            0|            0|            0|            0|             0|             4|            0|            0|            0|            0|            0|             0|             0|                0.71|            0.29|             0.0|                0.0|                0.0|                0.0|                0.0|                0.0|                 0.0|                0.71|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|            0.29|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|                 1|             1|             0|                    0|                    0|                    0|                    0|                    0|                     0|                     1|                0|                0|                0|                0|                0|                 0|                 1|                0|                0|                0|                0|                0|                 0|                 0|                  1|              1|              0|                 1|             1|             0|                 1|             1|             0|                 1|             1|             0|            0.0|        0.0|        0.0|              0.0|          0.0|          0.0|           0.0|       0.0|       0.0|             0|         0|         0|                     10|                  4|                  0|                            0|                        0|                        0|                     0.71|                            0.0|                 0.29|                        0.0|                  0.0|                        0.0|
# |      1|       1156|                 1|             0|             0|                0|                0|                0|                0|                0|                 0|                 1|            0|            0|            0|            0|            0|             0|             0|            0|            0|            0|            0|            0|             0|             0|                 1.0|             0.0|             0.0|                0.0|                0.0|                0.0|                0.0|                0.0|                 0.0|                 1.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|                 1|             0|             0|                    0|                    0|                    0|                    0|                    0|                     0|                     1|                0|                0|                0|                0|                0|                 0|                 0|                0|                0|                0|                0|                0|                 0|                 0|                  1|              0|              0|                 1|             0|             0|                 1|             0|             0|                 1|             0|             0|            0.0|        0.0|        0.0|              0.0|          0.0|          0.0|           0.0|       0.0|       0.0|             0|         0|         0|                      1|                  0|                  0|                            0|                        0|                        0|                      1.0|                            0.0|                  0.0|                        0.0|                  0.0|                        0.0|
# |      1|       2245|                 5|             0|             0|                0|                0|                0|                0|                0|                 5|                 0|            0|            0|            0|            0|            0|             0|             0|            0|            0|            0|            0|            0|             0|             0|                 1.0|             0.0|             0.0|                0.0|                0.0|                0.0|                0.0|                0.0|                 1.0|                 0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|                 1|             0|             0|                    0|                    0|                    0|                    0|                    0|                     1|                     0|                0|                0|                0|                0|                0|                 0|                 0|                0|                0|                0|                0|                0|                 0|                 0|                  4|              0|              0|                 1|             0|             0|                 1|             0|             0|                 1|             0|             0|           0.83|        0.0|        0.0|              0.0|          0.0|          0.0|          2.04|       0.0|       0.0|             5|         0|         0|                      0|                  0|                  0|                            0|                        0|                        0|                      0.0|                            0.0|                  0.0|                        0.0|                  0.0|                        0.0|
# |      1|       4026|                 4|             1|             0|                0|                0|                0|                0|                0|                 4|                 0|            0|            0|            0|            0|            0|             1|             0|            0|            0|            0|            0|            0|             0|             0|                 0.8|             0.2|             0.0|                0.0|                0.0|                0.0|                0.0|                0.0|                 0.8|                 0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.2|             0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|                 2|             1|             0|                    0|                    0|                    0|                    0|                    0|                     2|                     0|                0|                0|                0|                0|                0|                 1|                 0|                0|                0|                0|                0|                0|                 0|                 0|                  1|              1|              0|                 1|             1|             0|                 1|             1|             0|                 1|             1|             0|           0.67|       0.17|        0.0|              0.0|          0.0|          0.0|          1.63|      0.41|       0.0|             4|         1|         0|                      0|                  0|                  0|                            0|                        0|                        0|                      0.0|                            0.0|                  0.0|                        0.0|                  0.0|                        0.0|
# |      1|       4177|                 1|             0|             0|                0|                0|                0|                0|                0|                 1|                 0|            0|            0|            0|            0|            0|             0|             0|            0|            0|            0|            0|            0|             0|             0|                 1.0|             0.0|             0.0|                0.0|                0.0|                0.0|                0.0|                0.0|                 1.0|                 0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|            0.0|            0.0|            0.0|            0.0|            0.0|             0.0|             0.0|                 1|             0|             0|                    0|                    0|                    0|                    0|                    0|                     1|                     0|                0|                0|                0|                0|                0|                 0|                 0|                0|                0|                0|                0|                0|                 0|                 0|                  1|              0|              0|                 1|             0|             0|                 1|             0|             0|                 1|             0|             0|           0.17|        0.0|        0.0|              0.0|          0.0|          0.0|          0.41|       0.0|       0.0|             1|         0|         0|                      0|                  0|                  0|                            0|                        0|                        0|                      0.0|                            0.0|                  0.0|                        0.0|                  0.0|                        0.0|
# +-------+-----------+------------------+--------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------------+----------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+------------------+--------------+--------------+---------------------+---------------------+---------------------+---------------------+---------------------+----------------------+----------------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------------+---------------+---------------+------------------+--------------+--------------+------------------+--------------+--------------+------------------+--------------+--------------+---------------+-----------+-----------+-----------------+-------------+-------------+--------------+----------+----------+--------------+----------+----------+-----------------------+-------------------+-------------------+-----------------------------+-------------------------+-------------------------+-------------------------+-------------------------------+---------------------+---------------------------+---------------------+---------------------------+


+-------+-----------+------------------+--------------+--------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------------+----------------+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+------------------+--------------+--------------+---------------------+---------------------+---------------------+---------------------+---------------------+------------------

## Save Parquet

In [38]:
# Persist the final user-merchant features as parquet in at most 50 part files.
# The target is the absolute path under the "/content/drive" mount used by the
# reads above, so the write no longer depends on the current working directory.
(
    um_feature_4
    .coalesce(50)
    .write.format("parquet")
    .mode("overwrite")
    .save("/content/drive/MyDrive/Colab Notebooks/data/feature_um_new")
)