## Install Spark, Install Packages & Initialize Spark

In [1]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark3.0.0
!wget -q https://www-us.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
# unzip it
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
# install findspark 
!pip install -q findspark

In [2]:
import os

# Tell findspark / pyspark where the JDK and the unpacked Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.1-bin-hadoop2.7",
})

In [3]:
import findspark
# Locate the Spark installation and make its bundled pyspark importable;
# presumably this relies on the SPARK_HOME variable exported in the previous cell.
findspark.init()

In [4]:
import numpy as np
import pandas as pd 
import warnings
import zipfile
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.font_manager import FontProperties
from pyspark.sql import functions as F
from pyspark.sql import Window

# Lift the pandas display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

%matplotlib inline

In [5]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session: local master, UI pinned to port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Feature Engineering(Counts & Ratio)")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

## Read Data

In [6]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV files read below are reachable;
# force_remount=True asks for a fresh mount even if one is already present.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [7]:
# Load the four source tables. Only the header row is interpreted; no schema is
# supplied or inferred here.
train = spark.read.csv(
    "/content/drive/MyDrive/Colab Notebooks/data/data_format1/train_format1.csv",
    header=True,
)
test = spark.read.csv(
    "/content/drive/MyDrive/Colab Notebooks/data/data_format1/test_format1.csv",
    header=True,
)
train_info = spark.read.csv(
    "/content/drive/MyDrive/Colab Notebooks/data/data_format1/user_info_format1.csv",
    header=True,
)
train_log = spark.read.csv(
    "/content/drive/MyDrive/Colab Notebooks/data/data_format1/user_log_format1.csv",
    header=True,
)

In [8]:
# Clean the user profile table:
#   * age_range 8 is folded into 7 and the result stored as "age"
#   * missing age becomes 0, missing gender becomes 2
#   * the original age_range column is dropped
train_info_0 = (
    train_info
    .withColumn(
        "age",
        F.when(F.col("age_range") == 8, 7).otherwise(F.col("age_range")),
    )
    .fillna({"age": 0, "gender": 2})
    .drop("age_range")
)

In [9]:
# Merge action_type 0 into action_type 1; every other value is kept as is.
# The value is built in a temporary column which then takes over the original name.
train_log_0 = (
    train_log
    .withColumn(
        "action_type_tmp",
        F.when(F.col("action_type") == 0, 1).otherwise(F.col("action_type")),
    )
    .drop("action_type")
    .withColumnRenamed("action_type_tmp", "action_type")
)

## Show Data

In [10]:
# Preview the labelled (user_id, merchant_id, label) training pairs.
train.show()

+-------+-----------+-----+
|user_id|merchant_id|label|
+-------+-----------+-----+
|  34176|       3906|    0|
|  34176|        121|    0|
|  34176|       4356|    1|
|  34176|       2217|    0|
| 230784|       4818|    0|
| 362112|       2618|    0|
|  34944|       2051|    0|
| 231552|       3828|    1|
| 231552|       2124|    0|
| 232320|       1168|    0|
| 232320|       4270|    0|
| 167040|        671|    0|
| 101760|       1760|    0|
| 298368|       2981|    0|
|  36480|       4730|    0|
| 299136|       2935|    0|
|  37248|       2615|    0|
| 103296|       2482|    0|
| 299904|       1742|    0|
|  38016|       1028|    0|
+-------+-----------+-----+
only showing top 20 rows



In [11]:
# Preview the raw user profile table (before the age/gender clean-up).
train_info.show()

+-------+---------+------+
|user_id|age_range|gender|
+-------+---------+------+
| 376517|        6|     1|
| 234512|        5|     0|
| 344532|        5|     0|
| 186135|        5|     0|
|  30230|        5|     0|
| 272389|        6|     1|
| 281071|        4|     0|
| 139859|        7|     0|
| 198411|        5|     1|
|  67037|        4|     1|
| 149002|        5|     2|
|   7468|        4|     0|
|  94292|        4|     0|
| 347414|        6|     1|
| 191719|        4|     0|
| 391524|        5|     1|
| 153790|        6|     0|
| 349112|        3|     1|
| 344766|        6|     0|
|  81816|        5|     0|
+-------+---------+------+
only showing top 20 rows



In [12]:
# Preview the raw behaviour log; time_stamp looks like a zero-padded "mmdd" string.
train_log.show()

+-------+-------+------+---------+--------+----------+-----------+
|user_id|item_id|cat_id|seller_id|brand_id|time_stamp|action_type|
+-------+-------+------+---------+--------+----------+-----------+
| 328862| 323294|   833|     2882|    2661|      0829|          0|
| 328862| 844400|  1271|     2882|    2661|      0829|          0|
| 328862| 575153|  1271|     2882|    2661|      0829|          0|
| 328862| 996875|  1271|     2882|    2661|      0829|          0|
| 328862|1086186|  1271|     1253|    1049|      0829|          0|
| 328862| 623866|  1271|     2882|    2661|      0829|          0|
| 328862| 542871|  1467|     2882|    2661|      0829|          0|
| 328862| 536347|  1095|      883|    1647|      0829|          0|
| 328862| 364513|  1271|     2882|    2661|      0829|          0|
| 328862| 575153|  1271|     2882|    2661|      0829|          0|
| 328862| 239288|   602|      420|    4953|      0801|          0|
| 328862| 950862|   962|     4605|    7622|      0801|        

In [13]:
# Preview the log after the remapping: rows that had action_type 0 now show 1.
train_log_0.show()

+-------+-------+------+---------+--------+----------+-----------+
|user_id|item_id|cat_id|seller_id|brand_id|time_stamp|action_type|
+-------+-------+------+---------+--------+----------+-----------+
| 328862| 323294|   833|     2882|    2661|      0829|          1|
| 328862| 844400|  1271|     2882|    2661|      0829|          1|
| 328862| 575153|  1271|     2882|    2661|      0829|          1|
| 328862| 996875|  1271|     2882|    2661|      0829|          1|
| 328862|1086186|  1271|     1253|    1049|      0829|          1|
| 328862| 623866|  1271|     2882|    2661|      0829|          1|
| 328862| 542871|  1467|     2882|    2661|      0829|          1|
| 328862| 536347|  1095|      883|    1647|      0829|          1|
| 328862| 364513|  1271|     2882|    2661|      0829|          1|
| 328862| 575153|  1271|     2882|    2661|      0829|          1|
| 328862| 239288|   602|      420|    4953|      0801|          1|
| 328862| 950862|   962|     4605|    7622|      0801|        

## Feature Engineering (User Profile)

### Count Features

In [14]:
# rows to columns (pivot)
from pyspark.sql import functions as F

def dfPivot(df, keys, column):
    """Pivot ``column`` into one occurrence-count column per distinct value.

    Args:
        df: DataFrame to pivot.
        keys: key column(s) of the input to keep as the grouping key,
            passed as a list.
        column: name of the column whose values become the new columns.

    Returns:
        A DataFrame grouped by ``keys`` with one column per value of
        ``column`` holding how many times it occurs; nulls are replaced by 0.
    """
    grouped = df.groupBy(keys)
    occurrence_counts = grouped.pivot(column).agg(F.count(column))
    # A key/value combination that never occurs comes out of the pivot as
    # null; report it as 0 (drop the fillna if nulls should be kept).
    return occurrence_counts.fillna(0)

In [17]:
# Count every action type per (user, day). The pivot names its columns after
# the action_type values "1", "2" and "3", which are given readable names here.
keys = ["user_id", "time_stamp"]
column = "action_type"

act_cnt_ratio_0 = (
    dfPivot(train_log_0, keys, column)
    .withColumnRenamed("1", "clk_atc_cnt")
    .withColumnRenamed("2", "buy_cnt")
    .withColumnRenamed("3", "atf_cnt")
)

# +-------+----------+-----------+-------+-------+
# |user_id|time_stamp|clk_atc_cnt|buy_cnt|atf_cnt|
# +-------+----------+-----------+-------+-------+
# |      1|      1111|         13|      4|      0|
# |      1|      1018|          5|      0|      0|
# |      1|      1021|          1|      1|      0|
# |      1|      1009|          5|      0|      0|
# |      1|      1011|          3|      1|      0|
# |     10|      1101|          4|      0|      0|
# |     10|      0821|          2|      0|      0|
# |     10|      0901|          2|      0|      0|
# |     10|      1102|          1|      0|      0|
# |     10|      0624|          3|      0|      0|
# +-------+----------+-----------+-------+-------+

In [20]:
def monthly_action_counts(profile, profile_id, act_cnt_log):
    """Aggregate per-day action counts into total and per-month counts.

    Args:
        profile: prefix for the generated feature columns (e.g. "u").
        profile_id: column to group by (e.g. "user_id").
        act_cnt_log: DataFrame with a ``time_stamp`` column compared against
            "mmdd" strings and the count columns ``clk_atc_cnt``, ``buy_cnt``
            and ``atf_cnt``.

    Returns:
        One row per ``profile_id`` with ``<profile>_ttl_<action>_cnt`` totals
        followed by ``<profile>_<action>_cnt_m5`` ... ``_m11`` monthly counts.
        Any other numeric column of the input is summed as well and keeps
        Spark's default ``sum(<name>)`` column name.
    """
    actions = ("clk_atc", "buy", "atf")
    # (month label, first day, last day) as "mmdd" strings. The last window
    # has no upper bound: everything from "1101" onwards counts as m11.
    month_windows = (
        ("m5", "0501", "0531"),
        ("m6", "0601", "0630"),
        ("m7", "0701", "0731"),
        ("m8", "0801", "0831"),
        ("m9", "0901", "0930"),
        ("m10", "1001", "1031"),
        ("m11", "1101", None),
    )

    day = act_cnt_log.time_stamp

    # monthly filter: copy each day's count into the column of the month it
    # falls in, and 0 into every other month column
    per_month = act_cnt_log
    for action in actions:
        day_count = act_cnt_log[action + "_cnt"]
        for label, first_day, last_day in month_windows:
            in_window = day >= first_day
            if last_day is not None:
                in_window = in_window & (day <= last_day)
            per_month = per_month.withColumn(
                profile + "_" + action + "_cnt_" + label,
                F.when(in_window, day_count).otherwise(0),
            )

    # Sum every numeric column per profile id, then strip the "sum(...)"
    # wrapper Spark puts around the aggregated column names.
    summed = per_month.groupBy(profile_id).sum()
    for action in actions:
        summed = summed.withColumnRenamed(
            "sum(" + action + "_cnt)", profile + "_ttl_" + action + "_cnt"
        )
    for action in actions:
        for label, _, _ in month_windows:
            feature = profile + "_" + action + "_cnt_" + label
            summed = summed.withColumnRenamed("sum(" + feature + ")", feature)
    return summed

In [21]:
# Roll the per-day counts up to one row per user ("u" feature prefix). The rows are
# ordered by user_id as stored; the sample output below shows it sorting as text (1, 10, 100, ...).
act_cnt_ratio_1 = monthly_action_counts("u", "user_id", act_cnt_ratio_0).orderBy("user_id")

# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+
# |user_id|u_ttl_clk_atc_cnt|u_ttl_buy_cnt|u_ttl_atf_cnt|u_clk_atc_cnt_m5|u_clk_atc_cnt_m6|u_clk_atc_cnt_m7|u_clk_atc_cnt_m8|u_clk_atc_cnt_m9|u_clk_atc_cnt_m10|u_clk_atc_cnt_m11|u_buy_cnt_m5|u_buy_cnt_m6|u_buy_cnt_m7|u_buy_cnt_m8|u_buy_cnt_m9|u_buy_cnt_m10|u_buy_cnt_m11|u_atf_cnt_m5|u_atf_cnt_m6|u_atf_cnt_m7|u_atf_cnt_m8|u_atf_cnt_m9|u_atf_cnt_m10|u_atf_cnt_m11|
# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+
# |      1|               27|            6|            0|               0|               0|               0|               0|               0|               14|               13|           0|           0|           0|           0|           0|            2|            4|           0|           0|           0|           0|           0|            0|            0|
# |     10|               56|            7|            1|               0|              18|               0|               2|               2|               18|               16|           0|           0|           0|           0|           0|            5|            2|           0|           0|           0|           0|           0|            1|            0|
# |    100|              291|           10|            1|               2|              26|               4|               0|              36|              138|               85|           0|           0|           0|           0|           2|            3|            5|           0|           0|           0|           0|           0|            1|            0|
# |   1000|               66|            7|            6|               0|               0|               0|               0|               0|                0|               66|           0|           0|           0|           0|           0|            0|            7|           0|           0|           0|           0|           0|            0|            6|
# |  10000|              402|           17|            0|              28|             160|              22|              13|              10|               45|              124|           2|           3|           2|           2|           0|            2|            6|           0|           0|           0|           0|           0|            0|            0|
# | 100000|              207|           10|            4|               0|               0|               4|              43|              86|               24|               50|           0|           0|           0|           6|           3|            0|            1|           0|           0|           0|           1|           0|            1|            2|
# | 100001|               87|            3|            0|               0|               0|               0|               1|               1|                1|               84|           0|           0|           0|           0|           0|            0|            3|           0|           0|           0|           0|           0|            0|            0|
# | 100002|               90|            5|            1|              20|               1|               5|              25|               7|                3|               29|           0|           0|           0|           2|           1|            0|            2|           0|           0|           0|           0|           0|            0|            1|
# | 100003|               88|            1|            0|               2|               1|               8|               6|               3|                9|               59|           0|           0|           0|           0|           0|            0|            1|           0|           0|           0|           0|           0|            0|            0|
# | 100004|               54|            3|            0|               0|               0|               1|              17|               0|                0|               36|           0|           0|           0|           1|           0|            0|            2|           0|           0|           0|           0|           0|            0|            0|
# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+


### Ratio Features

In [23]:
def monthly_action_ratio(profile, act_cnt_log):
    """Add the share of each action type, overall and per month.

    For every period (the whole log, then months m5 ... m11) each action count
    is divided by the sum of the three action counts of that period and
    rounded to two decimals.

    Args:
        profile: prefix of the count columns to read and of the ratio columns
            to create (e.g. "u").
        act_cnt_log: DataFrame holding ``<profile>_ttl_<action>_cnt`` and
            ``<profile>_<action>_cnt_m5`` ... ``_m11`` columns for the actions
            ``clk_atc``, ``buy`` and ``atf``.

    Returns:
        The input plus ``<profile>_ttl_<action>_ratio`` and
        ``<profile>_<action>_ratio_m5`` ... ``_m11`` columns. Nulls in numeric
        columns (e.g. a ratio for a period without any action) are set to 0.
    """
    actions = ("clk_atc", "buy", "atf")
    months = ("m5", "m6", "m7", "m8", "m9", "m10", "m11")

    def count_col(action, month=None):
        # Name of the count column for one action, overall or for one month.
        if month is None:
            return profile + "_ttl_" + action + "_cnt"
        return profile + "_" + action + "_cnt_" + month

    def total_col(month=None):
        # Name of the temporary denominator column for one period.
        return "ttl_cnt" if month is None else "ttl_cnt_" + month

    # Denominators. They are built from exactly the columns used as numerators
    # rather than by substring matching on all column names: a substring test
    # such as `"_cnt_m5" in col` also picks up columns of other profiles, and
    # `"m_ttl_"` matches "um_ttl_..." too, which would inflate the totals.
    # NOTE: `sum` must be the Python builtin here, not pyspark.sql.functions.sum.
    with_totals = act_cnt_log
    for month in (None,) + months:
        with_totals = with_totals.withColumn(
            total_col(month),
            sum([act_cnt_log[count_col(action, month)] for action in actions]),
        )

    # Ratios: overall first, then month by month for each action, matching the
    # column order callers already rely on.
    with_ratios = with_totals
    for action in actions:
        with_ratios = with_ratios.withColumn(
            profile + "_ttl_" + action + "_ratio",
            F.round(with_totals[count_col(action)] / with_totals[total_col()], 2),
        )
    for action in actions:
        for month in months:
            with_ratios = with_ratios.withColumn(
                profile + "_" + action + "_ratio_" + month,
                F.round(
                    with_totals[count_col(action, month)] / with_totals[total_col(month)],
                    2,
                ),
            )

    # A period without any action has a zero denominator, which yields a null
    # ratio under Spark's default (non-ANSI) settings; store it as 0. The
    # helper denominators are not features, so drop them.
    helper_cols = [total_col()] + [total_col(month) for month in months]
    return with_ratios.fillna(0).drop(*helper_cols)

In [24]:
# Append the action-share (ratio) features to the per-user counts, ordered by user_id.
act_cnt_ratio_2 = monthly_action_ratio("u", act_cnt_ratio_1).orderBy("user_id")

# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+
# |user_id|u_ttl_clk_atc_cnt|u_ttl_buy_cnt|u_ttl_atf_cnt|u_clk_atc_cnt_m5|u_clk_atc_cnt_m6|u_clk_atc_cnt_m7|u_clk_atc_cnt_m8|u_clk_atc_cnt_m9|u_clk_atc_cnt_m10|u_clk_atc_cnt_m11|u_buy_cnt_m5|u_buy_cnt_m6|u_buy_cnt_m7|u_buy_cnt_m8|u_buy_cnt_m9|u_buy_cnt_m10|u_buy_cnt_m11|u_atf_cnt_m5|u_atf_cnt_m6|u_atf_cnt_m7|u_atf_cnt_m8|u_atf_cnt_m9|u_atf_cnt_m10|u_atf_cnt_m11|u_ttl_clk_atc_ratio|u_ttl_buy_ratio|u_ttl_atf_ratio|u_clk_atc_ratio_m5|u_clk_atc_ratio_m6|u_clk_atc_ratio_m7|u_clk_atc_ratio_m8|u_clk_atc_ratio_m9|u_clk_atc_ratio_m10|u_clk_atc_ratio_m11|u_buy_ratio_m5|u_buy_ratio_m6|u_buy_ratio_m7|u_buy_ratio_m8|u_buy_ratio_m9|u_buy_ratio_m10|u_buy_ratio_m11|u_atf_ratio_m5|u_atf_ratio_m6|u_atf_ratio_m7|u_atf_ratio_m8|u_atf_ratio_m9|u_atf_ratio_m10|u_atf_ratio_m11|
# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+
# |      1|               27|            6|            0|               0|               0|               0|               0|               0|               14|               13|           0|           0|           0|           0|           0|            2|            4|           0|           0|           0|           0|           0|            0|            0|               0.82|           0.18|            0.0|               0.0|               0.0|               0.0|               0.0|               0.0|               0.88|               0.76|           0.0|           0.0|           0.0|           0.0|           0.0|           0.13|           0.24|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|
# |     10|               56|            7|            1|               0|              18|               0|               2|               2|               18|               16|           0|           0|           0|           0|           0|            5|            2|           0|           0|           0|           0|           0|            1|            0|               0.88|           0.11|           0.02|               0.0|               1.0|               0.0|               1.0|               1.0|               0.75|               0.89|           0.0|           0.0|           0.0|           0.0|           0.0|           0.21|           0.11|           0.0|           0.0|           0.0|           0.0|           0.0|           0.04|            0.0|
# |    100|              291|           10|            1|               2|              26|               4|               0|              36|              138|               85|           0|           0|           0|           0|           2|            3|            5|           0|           0|           0|           0|           0|            1|            0|               0.96|           0.03|            0.0|               1.0|               1.0|               1.0|               0.0|              0.95|               0.97|               0.94|           0.0|           0.0|           0.0|           0.0|          0.05|           0.02|           0.06|           0.0|           0.0|           0.0|           0.0|           0.0|           0.01|            0.0|
# |   1000|               66|            7|            6|               0|               0|               0|               0|               0|                0|               66|           0|           0|           0|           0|           0|            0|            7|           0|           0|           0|           0|           0|            0|            6|               0.84|           0.09|           0.08|               0.0|               0.0|               0.0|               0.0|               0.0|                0.0|               0.84|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.09|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.08|
# |  10000|              402|           17|            0|              28|             160|              22|              13|              10|               45|              124|           2|           3|           2|           2|           0|            2|            6|           0|           0|           0|           0|           0|            0|            0|               0.96|           0.04|            0.0|              0.93|              0.98|              0.92|              0.87|               1.0|               0.96|               0.95|          0.07|          0.02|          0.08|          0.13|           0.0|           0.04|           0.05|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|
# | 100000|              207|           10|            4|               0|               0|               4|              43|              86|               24|               50|           0|           0|           0|           6|           3|            0|            1|           0|           0|           0|           1|           0|            1|            2|               0.94|           0.05|           0.02|               0.0|               0.0|               1.0|              0.86|              0.97|               0.96|               0.94|           0.0|           0.0|           0.0|          0.12|          0.03|            0.0|           0.02|           0.0|           0.0|           0.0|          0.02|           0.0|           0.04|           0.04|
# | 100001|               87|            3|            0|               0|               0|               0|               1|               1|                1|               84|           0|           0|           0|           0|           0|            0|            3|           0|           0|           0|           0|           0|            0|            0|               0.97|           0.03|            0.0|               0.0|               0.0|               0.0|               1.0|               1.0|                1.0|               0.97|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.03|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|
# | 100002|               90|            5|            1|              20|               1|               5|              25|               7|                3|               29|           0|           0|           0|           2|           1|            0|            2|           0|           0|           0|           0|           0|            0|            1|               0.94|           0.05|           0.01|               1.0|               1.0|               1.0|              0.93|              0.88|                1.0|               0.91|           0.0|           0.0|           0.0|          0.07|          0.13|            0.0|           0.06|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.03|
# | 100003|               88|            1|            0|               2|               1|               8|               6|               3|                9|               59|           0|           0|           0|           0|           0|            0|            1|           0|           0|           0|           0|           0|            0|            0|               0.99|           0.01|            0.0|               1.0|               1.0|               1.0|               1.0|               1.0|                1.0|               0.98|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.02|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|
# | 100004|               54|            3|            0|               0|               0|               1|              17|               0|                0|               36|           0|           0|           0|           1|           0|            0|            2|           0|           0|           0|           0|           0|            0|            0|               0.95|           0.05|            0.0|               0.0|               0.0|               1.0|              0.94|               0.0|                0.0|               0.95|           0.0|           0.0|           0.0|          0.06|           0.0|            0.0|           0.05|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|
# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+


### Day Counts Features

In [26]:
# Number of distinct days on which each user performed each action type.
# The aggregate is aliased directly: Spark names a countDistinct column
# "count(DISTINCT time_stamp)", so renaming "count(time_stamp)" afterwards
# silently matches nothing and "day_cnt" would never exist.
act_day_cnt_0 = (
    train_log_0
    .groupBy("user_id", "action_type")
    .agg(F.countDistinct("time_stamp").alias("day_cnt"))
)

In [27]:
from pyspark.sql import functions as F

def dfPivot_value(df, keys, column, column_value, fill_value=0):
    '''
    Pivot a long-format DataFrame into wide format (rows to columns).

    params:
      df: input DataFrame
      keys: key column(s) kept as the row identifier; either a single
            column name or a list of column names
      column: name of the column whose distinct values become new columns
      column_value: name of the column that supplies the cell values
      fill_value: value written into cells that have no matching input row
                  (default 0); pass None to leave those cells as null
    returns:
      DataFrame with one row per `keys` combination and one column per
      distinct value of `column`
    '''
    # first(..., ignorenulls=True) keeps the first non-null column_value
    # found for each (keys, column) combination.
    pivoted = df.groupBy(keys).pivot(column).agg(F.first(column_value, ignorenulls=True))

    # A (keys, column) combination that never occurs in the input comes out
    # of the pivot as null; callers that want to keep the nulls opt out here.
    if fill_value is None:
        return pivoted
    return pivoted.fillna(fill_value)

In [28]:
keys = "user_id"
column = "action_type"
column_value = "day_cnt"

# Pivot the per-action day counts into one row per user. The pivoted columns
# are named after the action_type codes, so swap each code for a feature name.
act_day_cnt_1 = dfPivot_value(act_day_cnt_0, keys, column, column_value)
for _action_code, _feature_name in (
    ("1", "u_clk_atc_day_cnt"),
    ("2", "u_buy_day_cnt"),
    ("3", "u_atf_day_cnt"),
):
    act_day_cnt_1 = act_day_cnt_1.withColumnRenamed(_action_code, _feature_name)

# +-------+-----------------+-------------+-------------+
# |user_id|u_clk_atc_day_cnt|u_buy_day_cnt|u_atf_day_cnt|
# +-------+-----------------+-------------+-------------+
# |  74468|                5|            2|            1|
# | 406103|                9|            5|           10|
# | 215726|               39|            2|           18|
# | 336362|               18|           10|            1|
# | 269122|               16|            5|            0|
# | 415117|               13|            2|            0|
# |  30923|               21|            2|            7|
# |  22728|               12|            4|           19|
# | 236387|               20|           12|            1|
# | 284301|               33|            4|           40|
# +-------+-----------------+-------------+-------------+

In [30]:
from pyspark.sql.types import *
def dt_mth(col):
    '''
    Return a Column holding the first two characters of `col`.

    Uses the built-in substring (1-based position) rather than a Python UDF:
    the expression is evaluated by Spark itself and yields null for a null
    input, where slicing None inside a UDF would raise.
    `col` may be a column name or a Column, so dt_mth("time_stamp") keeps
    working exactly as it did with the UDF.
    '''
    return F.substring(col, 1, 2)

# Attach the two-character month prefix and keep only the columns needed for
# the monthly day counts.
train_log_1 = (
    train_log_0
    .withColumn("time_stamp_mth", dt_mth("time_stamp"))
    .select("user_id", "time_stamp_mth", "time_stamp", "action_type")
)

# Distinct time_stamp values per (user, month, action type).
# The aggregate is aliased directly: the auto-generated countDistinct column
# name depends on the Spark version, and withColumnRenamed does nothing when
# the name it is given does not exist.
act_day_cnt_2 = (
    train_log_1
    .groupBy("user_id", "time_stamp_mth", "action_type")
    .agg(F.countDistinct("time_stamp").alias("day_cnt"))
    .orderBy("user_id", "time_stamp_mth")
)

# +-------+--------------+-----------+-------+
# |user_id|time_stamp_mth|action_type|day_cnt|
# +-------+--------------+-----------+-------+
# |      1|            10|          1|      4|
# |      1|            10|          2|      2|
# |      1|            11|          2|      1|
# |      1|            11|          1|      1|
# |     10|            06|          1|      4|
# |     10|            08|          1|      1|
# |     10|            09|          1|      1|
# |     10|            10|          1|      1|
# |     10|            10|          3|      1|
# |     10|            10|          2|      1|
# +-------+--------------+-----------+-------+


In [32]:
def monthly_day_counts(profile, profile_id, df):
  '''
  Spread per-(month, action type) day counts into one wide row per profile id.

  params:
    profile: prefix for the generated column names (e.g. "u")
    profile_id: name of the column to group by (e.g. "user_id")
    df: DataFrame providing, besides profile_id, the columns
        time_stamp_mth (compared against "05".."11"),
        action_type (compared against "1", "2", "3") and day_cnt
  returns:
    DataFrame with profile_id followed by 21 columns named
    <profile>_<action>_day_cnt_m<month>, where <action> is clk_atc, buy or
    atf and <month> is 5..11. Each column is the sum of day_cnt over the
    rows matching that month and action type, or 0 when there are none.
  '''
  # action_type code -> label used in the output column names
  action_labels = (("1", "clk_atc"), ("2", "buy"), ("3", "atf"))
  # month value as stored in time_stamp_mth -> suffix used in the column names
  month_suffixes = (
      ("05", "m5"), ("06", "m6"), ("07", "m7"), ("08", "m8"),
      ("09", "m9"), ("10", "m10"), ("11", "m11"),
  )

  # One conditional sum per (action, month). Actions form the outer loop so
  # the columns come out as all clk_atc months, then buy, then atf.
  day_cnt_sums = [
      F.sum(
          F.when(
              (F.col("time_stamp_mth") == month) & (F.col("action_type") == action_code),
              F.col("day_cnt"),
          ).otherwise(0)
      ).alias(profile + "_" + label + "_day_cnt_" + suffix)
      for action_code, label in action_labels
      for month, suffix in month_suffixes
  ]

  # Aggregate only the named expressions. A bare groupBy(...).sum() would sum
  # every numeric column of df and force a rename from Spark's generated
  # "sum(...)" names, leaking any unrelated numeric column into the result.
  return df.groupBy(profile_id).agg(*day_cnt_sums)

In [None]:
# Monthly day counts for the user profile ("u" prefix), sorted by user_id.
act_day_cnt_3 = (
    monthly_day_counts("u", "user_id", act_day_cnt_2)
    .orderBy("user_id")
)

# +-------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+
# |user_id|u_clk_atc_day_cnt_m5|u_clk_atc_day_cnt_m6|u_clk_atc_day_cnt_m7|u_clk_atc_day_cnt_m8|u_clk_atc_day_cnt_m9|u_clk_atc_day_cnt_m10|u_clk_atc_day_cnt_m11|u_buy_day_cnt_m5|u_buy_day_cnt_m6|u_buy_day_cnt_m7|u_buy_day_cnt_m8|u_buy_day_cnt_m9|u_buy_day_cnt_m10|u_buy_day_cnt_m11|u_atf_day_cnt_m5|u_atf_day_cnt_m6|u_atf_day_cnt_m7|u_atf_day_cnt_m8|u_atf_day_cnt_m9|u_atf_day_cnt_m10|u_atf_day_cnt_m11|
# +-------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+
# |      1|                   0|                   0|                   0|                   0|                   0|                    4|                    1|               0|               0|               0|               0|               0|                2|                1|               0|               0|               0|               0|               0|                0|                0|
# |     10|                   0|                   4|                   0|                   1|                   1|                    1|                    5|               0|               0|               0|               0|               0|                1|                1|               0|               0|               0|               0|               0|                1|                0|
# |    100|                   1|                   1|                   2|                   0|                   3|                   15|                    2|               0|               0|               0|               0|               2|                2|                1|               0|               0|               0|               0|               0|                1|                0|
# |   1000|                   0|                   0|                   0|                   0|                   0|                    0|                    2|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                1|
# |  10000|                   2|                  12|                   6|                   5|                   4|                    7|                    7|               1|               1|               1|               1|               0|                1|                3|               0|               0|               0|               0|               0|                0|                0|
# | 100000|                   0|                   0|                   1|                   7|                   8|                    6|                    3|               0|               0|               0|               3|               1|                0|                1|               0|               0|               0|               1|               0|                1|                1|
# | 100001|                   0|                   0|                   0|                   1|                   1|                    1|                    1|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|
# | 100002|                   3|                   1|                   4|                   4|                   3|                    2|                    2|               0|               0|               0|               1|               1|                0|                1|               0|               0|               0|               0|               0|                0|                1|
# | 100003|                   1|                   1|                   3|                   2|                   3|                    5|                    2|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|
# | 100004|                   0|                   0|                   1|                   4|                   0|                    0|                    3|               0|               0|               0|               1|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|
# +-------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+


In [None]:
# Assemble the user-level feature table. act_cnt_ratio_2 is the base, and the
# overall and monthly day counts are attached with left joins, so every row of
# the base is kept even when a user has no matching day-count row.
user_feature_0 = (
    act_cnt_ratio_2
    .join(act_day_cnt_1, on="user_id", how="left")
    .join(act_day_cnt_3, on="user_id", how="left")
    .orderBy("user_id")
)

# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+
# |user_id|u_ttl_clk_atc_cnt|u_ttl_buy_cnt|u_ttl_atf_cnt|u_clk_atc_cnt_m5|u_clk_atc_cnt_m6|u_clk_atc_cnt_m7|u_clk_atc_cnt_m8|u_clk_atc_cnt_m9|u_clk_atc_cnt_m10|u_clk_atc_cnt_m11|u_buy_cnt_m5|u_buy_cnt_m6|u_buy_cnt_m7|u_buy_cnt_m8|u_buy_cnt_m9|u_buy_cnt_m10|u_buy_cnt_m11|u_atf_cnt_m5|u_atf_cnt_m6|u_atf_cnt_m7|u_atf_cnt_m8|u_atf_cnt_m9|u_atf_cnt_m10|u_atf_cnt_m11|u_ttl_clk_atc_ratio|u_ttl_buy_ratio|u_ttl_atf_ratio|u_clk_atc_ratio_m5|u_clk_atc_ratio_m6|u_clk_atc_ratio_m7|u_clk_atc_ratio_m8|u_clk_atc_ratio_m9|u_clk_atc_ratio_m10|u_clk_atc_ratio_m11|u_buy_ratio_m5|u_buy_ratio_m6|u_buy_ratio_m7|u_buy_ratio_m8|u_buy_ratio_m9|u_buy_ratio_m10|u_buy_ratio_m11|u_atf_ratio_m5|u_atf_ratio_m6|u_atf_ratio_m7|u_atf_ratio_m8|u_atf_ratio_m9|u_atf_ratio_m10|u_atf_ratio_m11|u_clk_atc_day_cnt|u_buy_day_cnt|u_atf_day_cnt|u_clk_atc_day_cnt_m5|u_clk_atc_day_cnt_m6|u_clk_atc_day_cnt_m7|u_clk_atc_day_cnt_m8|u_clk_atc_day_cnt_m9|u_clk_atc_day_cnt_m10|u_clk_atc_day_cnt_m11|u_buy_day_cnt_m5|u_buy_day_cnt_m6|u_buy_day_cnt_m7|u_buy_day_cnt_m8|u_buy_day_cnt_m9|u_buy_day_cnt_m10|u_buy_day_cnt_m11|u_atf_day_cnt_m5|u_atf_day_cnt_m6|u_atf_day_cnt_m7|u_atf_day_cnt_m8|u_atf_day_cnt_m9|u_atf_day_cnt_m10|u_atf_day_cnt_m11|
# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+
# |      1|               27|            6|            0|               0|               0|               0|               0|               0|               14|               13|           0|           0|           0|           0|           0|            2|            4|           0|           0|           0|           0|           0|            0|            0|               0.82|           0.18|            0.0|               0.0|               0.0|               0.0|               0.0|               0.0|               0.88|               0.76|           0.0|           0.0|           0.0|           0.0|           0.0|           0.13|           0.24|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                5|            3|            0|                   0|                   0|                   0|                   0|                   0|                    4|                    1|               0|               0|               0|               0|               0|                2|                1|               0|               0|               0|               0|               0|                0|                0|
# |     10|               56|            7|            1|               0|              18|               0|               2|               2|               18|               16|           0|           0|           0|           0|           0|            5|            2|           0|           0|           0|           0|           0|            1|            0|               0.88|           0.11|           0.02|               0.0|               1.0|               0.0|               1.0|               1.0|               0.75|               0.89|           0.0|           0.0|           0.0|           0.0|           0.0|           0.21|           0.11|           0.0|           0.0|           0.0|           0.0|           0.0|           0.04|            0.0|               12|            2|            1|                   0|                   4|                   0|                   1|                   1|                    1|                    5|               0|               0|               0|               0|               0|                1|                1|               0|               0|               0|               0|               0|                1|                0|
# |    100|              291|           10|            1|               2|              26|               4|               0|              36|              138|               85|           0|           0|           0|           0|           2|            3|            5|           0|           0|           0|           0|           0|            1|            0|               0.96|           0.03|            0.0|               1.0|               1.0|               1.0|               0.0|              0.95|               0.97|               0.94|           0.0|           0.0|           0.0|           0.0|          0.05|           0.02|           0.06|           0.0|           0.0|           0.0|           0.0|           0.0|           0.01|            0.0|               24|            5|            1|                   1|                   1|                   2|                   0|                   3|                   15|                    2|               0|               0|               0|               0|               2|                2|                1|               0|               0|               0|               0|               0|                1|                0|
# |   1000|               66|            7|            6|               0|               0|               0|               0|               0|                0|               66|           0|           0|           0|           0|           0|            0|            7|           0|           0|           0|           0|           0|            0|            6|               0.84|           0.09|           0.08|               0.0|               0.0|               0.0|               0.0|               0.0|                0.0|               0.84|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.09|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.08|                2|            1|            1|                   0|                   0|                   0|                   0|                   0|                    0|                    2|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                1|
# |  10000|              402|           17|            0|              28|             160|              22|              13|              10|               45|              124|           2|           3|           2|           2|           0|            2|            6|           0|           0|           0|           0|           0|            0|            0|               0.96|           0.04|            0.0|              0.93|              0.98|              0.92|              0.87|               1.0|               0.96|               0.95|          0.07|          0.02|          0.08|          0.13|           0.0|           0.04|           0.05|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|               43|            8|            0|                   2|                  12|                   6|                   5|                   4|                    7|                    7|               1|               1|               1|               1|               0|                1|                3|               0|               0|               0|               0|               0|                0|                0|
# | 100000|              207|           10|            4|               0|               0|               4|              43|              86|               24|               50|           0|           0|           0|           6|           3|            0|            1|           0|           0|           0|           1|           0|            1|            2|               0.94|           0.05|           0.02|               0.0|               0.0|               1.0|              0.86|              0.97|               0.96|               0.94|           0.0|           0.0|           0.0|          0.12|          0.03|            0.0|           0.02|           0.0|           0.0|           0.0|          0.02|           0.0|           0.04|           0.04|               25|            5|            3|                   0|                   0|                   1|                   7|                   8|                    6|                    3|               0|               0|               0|               3|               1|                0|                1|               0|               0|               0|               1|               0|                1|                1|
# | 100001|               87|            3|            0|               0|               0|               0|               1|               1|                1|               84|           0|           0|           0|           0|           0|            0|            3|           0|           0|           0|           0|           0|            0|            0|               0.97|           0.03|            0.0|               0.0|               0.0|               0.0|               1.0|               1.0|                1.0|               0.97|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.03|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                4|            1|            0|                   0|                   0|                   0|                   1|                   1|                    1|                    1|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|
# | 100002|               90|            5|            1|              20|               1|               5|              25|               7|                3|               29|           0|           0|           0|           2|           1|            0|            2|           0|           0|           0|           0|           0|            0|            1|               0.94|           0.05|           0.01|               1.0|               1.0|               1.0|              0.93|              0.88|                1.0|               0.91|           0.0|           0.0|           0.0|          0.07|          0.13|            0.0|           0.06|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.03|               19|            3|            1|                   3|                   1|                   4|                   4|                   3|                    2|                    2|               0|               0|               0|               1|               1|                0|                1|               0|               0|               0|               0|               0|                0|                1|
# | 100003|               88|            1|            0|               2|               1|               8|               6|               3|                9|               59|           0|           0|           0|           0|           0|            0|            1|           0|           0|           0|           0|           0|            0|            0|               0.99|           0.01|            0.0|               1.0|               1.0|               1.0|               1.0|               1.0|                1.0|               0.98|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.02|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|               17|            1|            0|                   1|                   1|                   3|                   2|                   3|                    5|                    2|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|
# | 100004|               54|            3|            0|               0|               0|               1|              17|               0|                0|               36|           0|           0|           0|           1|           0|            0|            2|           0|           0|           0|           0|           0|            0|            0|               0.95|           0.05|            0.0|               0.0|               0.0|               1.0|              0.94|               0.0|                0.0|               0.95|           0.0|           0.0|           0.0|          0.06|           0.0|            0.0|           0.05|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                8|            2|            0|                   0|                   0|                   1|                   4|                   0|                    0|                    3|               0|               0|               0|               1|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|
# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+


### Product Diversity Features

In [None]:
# For every (user, action type), count how many distinct items, categories,
# brands and sellers appear in the log.
act_prod_div0 = (
    train_log_0
    .groupBy("user_id", "action_type")
    .agg(*[
        F.countDistinct(source_col).alias(count_col)
        for source_col, count_col in (
            ("item_id", "item_cnt"),
            ("cat_id", "cat_cnt"),
            ("brand_id", "brd_cnt"),
            ("seller_id", "mer_cnt"),
        )
    ])
)

# +-------+-----------+--------+-------+-------+-------+
# |user_id|action_type|item_cnt|cat_cnt|brd_cnt|mer_cnt|
# +-------+-----------+--------+-------+-------+-------+
# | 105612|          2|       5|      5|      4|      4|
# | 105764|          1|      37|     10|     25|     26|
# | 106838|          1|     137|     39|     87|     87|
# | 107471|          1|     507|     52|    198|    203|
# | 116422|          1|      65|     15|     27|     27|
# |  12505|          2|      15|     12|      9|      9|
# | 130042|          1|      99|     37|     61|     63|
# | 134087|          2|      20|     10|      7|      7|
# | 136392|          1|     229|     37|     93|     93|
# | 139055|          1|      20|     14|     10|     10|
# +-------+-----------+--------+-------+-------+-------+


In [37]:
def prod_diversity(profile, profile_id, df):
  """Pivot per-action distinct counts into one row of features per profile_id.

  Args:
    profile: prefix put in front of every output feature name (e.g. "u").
    profile_id: name of the column to group by.
    df: frame with an ``action_type`` column and the count columns
      ``item_cnt``, ``cat_cnt``, ``brd_cnt`` and ``mer_cnt``.

  Returns:
    A frame with ``profile_id`` plus twelve columns named
    ``<profile>_<clk_atc|buy|atf>_<item|cat|brd|mer>_div``.
  """
  # action_type value -> feature-name prefix, in the order the columns are emitted.
  action_prefixes = (("1", "clk_atc"), ("2", "buy"), ("3", "atf"))
  count_kinds = ("item", "cat", "brd", "mer")

  # Copy each *_cnt value into the column of its own action type; rows of any
  # other action type contribute 0 to that column.
  spread = df
  feature_names = []
  for kind in count_kinds:
    for action_code, prefix in action_prefixes:
      feature = prefix + "_" + kind + "_div"
      spread = spread.withColumn(
          feature,
          F.when(df.action_type == action_code, F.col(kind + "_cnt")).otherwise(0))
      feature_names.append(feature)

  # Blanket sum over every numeric column, then rename the sums we want.
  summed = spread.groupBy(profile_id).sum()
  for feature in feature_names:
    summed = summed.withColumnRenamed("sum(" + feature + ")", profile + "_" + feature)

  # The sums of the raw *_cnt columns are a by-product of .sum(); discard them.
  return summed.drop(*["sum(" + kind + "_cnt)" for kind in count_kinds])

In [None]:
# User-level diversity features, sorted by user_id.
act_prod_div1 = (
    prod_diversity(profile="u", profile_id="user_id", df=act_prod_div0)
    .orderBy("user_id")
)

# +-------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+
# |user_id|u_clk_atc_item_div|u_buy_item_div|u_atf_item_div|u_clk_atc_cat_div|u_buy_cat_div|u_atf_cat_div|u_clk_atc_brd_div|u_buy_brd_div|u_atf_brd_div|u_clk_atc_mer_div|u_buy_mer_div|u_atf_mer_div|
# +-------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+
# |      1|                12|             3|             0|                6|            3|            0|                9|            3|            0|                9|            3|            0|
# |     10|                39|             5|             1|               17|            3|            1|               17|            2|            1|               17|            2|            1|
# |    100|               131|             9|             1|               37|            8|            1|               63|            6|            1|               66|            6|            1|
# |   1000|                21|             7|             6|                5|            4|            4|                9|            5|            3|                9|            5|            3|
# |  10000|               272|            14|             0|               76|           12|            0|              128|           13|            0|              128|           13|            0|
# | 100000|               122|             9|             4|               44|            9|            3|               68|            6|            4|               74|            6|            4|
# | 100001|                74|             3|             0|               20|            3|            0|               35|            3|            0|               40|            3|            0|
# | 100002|                69|             5|             1|               37|            5|            1|               29|            5|            1|               29|            4|            1|
# | 100003|                84|             1|             0|               26|            1|            0|               30|            1|            0|               31|            1|            0|
# | 100004|                28|             3|             0|                9|            3|            0|               15|            3|            0|               16|            3|            0|
# +-------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+


In [39]:
# Left join keeps every row of user_feature_0 even without diversity features.
user_feature_1 = user_feature_0.join(act_prod_div1, on="user_id", how="left")

### Monthly Aggregation Features

In [40]:
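# Scratch check of a row-wise helper on a toy frame (kept commented out;
# ssy_maximum is defined in the next cell):
# df = spark.createDataFrame([(1, 2, 3, 4), (1, 4, 100, 5), (20, 30, 50, 10)],['a', 'b', 'c', 'd'])
# df1 = df.withColumn("max", ssy_maximum(*(df.columns[0:4])))
# df1.show()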

In [43]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import array, udf, array_sort, floor, col, size, sqrt, greatest
from pyspark.sql import Column

def ssy_mean(*args):
    """Row-wise arithmetic mean of the given columns, rounded to 2 decimals.

    Args:
        *args: column names (str) or ``Column`` objects.

    Returns:
        A ``Column`` expression holding the rounded mean.

    Raises:
        TypeError: if an argument is neither a str nor a ``Column``.
    """
    columns = []
    for c in args:
        if isinstance(c, Column):
            columns.append(c)
        elif isinstance(c, str):
            columns.append(col(c))
        else:
            raise TypeError("args should be str or Column, got {}".format(type(c)))

    # Python's built-in sum() folds the Column objects with "+".
    total = sum(columns)
    return F.round(total / len(columns), 2)

# function that calculates the row-wise percentile
def ssy_percentile(p, *args):
    """Row-wise percentile of the given columns with linear interpolation.

    The column values of each row are sorted, the fractional rank
    ``h = (n - 1) * p`` is located, and the result is interpolated between
    the two neighbouring sorted values and rounded to 2 decimals.

    Args:
        p: percentile as a fraction; 0.5 gives the median.
        *args: column names (str) or ``Column`` objects.

    Returns:
        A ``Column`` expression holding the rounded percentile.

    Raises:
        ValueError: if ``p`` is a plain number outside [0, 1].
        TypeError: if an argument is neither a str nor a ``Column``.
    """
    # An out-of-range fraction would index outside the sorted array and
    # silently produce NULLs, so reject it up front.
    if isinstance(p, (int, float)) and not 0 <= p <= 1:
        raise ValueError("p should be within [0, 1], got {}".format(p))

    def col_(c):
        if isinstance(c, Column):
            return c
        elif isinstance(c, str):
            return col(c)
        else:
            raise TypeError("args should be str or Column, got {}".format(type(c)))

    xs = array_sort(array(*[col_(x) for x in args]))
    n = size(xs)
    h = (n - 1) * p
    i = floor(h).cast("int")
    # Clamp the upper neighbour to the last index: for p == 1 (or a single
    # column) i is already the last index and xs[i + 1] would be out of bounds.
    j = F.least(i + 1, n - 1)
    x0, x1 = xs[i], xs[j]
    return F.round((x0 + (h - i) * (x1 - x0)), 2)

def ssy_std(*args):
    """Row-wise standard deviation (n - 1 denominator), rounded to 2 decimals.

    Args:
        *args: column names (str) or ``Column`` objects.

    Returns:
        A ``Column`` expression holding the rounded standard deviation.

    Raises:
        TypeError: if an argument is neither a str nor a ``Column``.
    """
    def _as_column(c):
        if isinstance(c, Column):
            return c
        if isinstance(c, str):
            return col(c)
        raise TypeError("args should be str or Column, got {}".format(type(c)))

    columns = [_as_column(c) for c in args]
    count = len(columns)
    mean = sum(columns) / count
    squared_deviation = sum([(c - mean) ** 2 for c in columns])
    return F.round(sqrt(squared_deviation / (count - 1)), 2)

def ssy_maximum(*args):
    """Row-wise maximum of the given columns, rounded to 2 decimals.

    Args:
        *args: column names (str) or ``Column`` objects.

    Returns:
        A ``Column`` expression holding the rounded maximum.

    Raises:
        TypeError: if an argument is neither a str nor a ``Column``.
    """
    def _as_column(c):
        if isinstance(c, Column):
            return c
        if isinstance(c, str):
            return col(c)
        raise TypeError("args should be str or Column, got {}".format(type(c)))

    row_max = greatest(*[_as_column(c) for c in args])
    return F.round(row_max, 2)

In [49]:
def monthly_aggregation(profile, df):
  """Append row-wise mean / median / std / max features over monthly columns.

  Three groups of six columns are picked from ``df`` by position
  (``[4:10]``, ``[11:17]``, ``[18:24]``) and summarised per row.

  NOTE(review): with the user_feature layout printed in this notebook these
  slices are the ``*_cnt_m5`` .. ``*_cnt_m10`` columns, i.e. ``m11`` is left
  out of every statistic -- confirm that this is intended.

  Args:
    profile: prefix put in front of every new column name (e.g. "u").
    df: frame whose column positions match the slices above.

  Returns:
    ``df`` with twelve extra columns named
    ``<profile>_<clk_atc|buy|atf>_<mean|median|std|max>``.
  """
  # (feature-name infix, the six source columns selected by position)
  column_groups = (
      ("clk_atc", df.columns[4:10]),
      ("buy", df.columns[11:17]),
      ("atf", df.columns[18:24]),
  )
  # (feature-name suffix, row-wise statistic); the order fixes the output layout.
  statistics = (
      ("mean", ssy_mean),
      ("median", lambda *cols: ssy_percentile(0.5, *cols)),
      ("std", ssy_std),
      ("max", ssy_maximum),
  )

  result = df
  for suffix, row_stat in statistics:
    for infix, source_cols in column_groups:
      result = result.withColumn(
          profile + "_" + infix + "_" + suffix, row_stat(*source_cols))
  return result

In [50]:
# Add the row-wise monthly statistics to the user features, sorted by user_id.
user_feature_2 = (
    monthly_aggregation(profile="u", df=user_feature_1)
    .orderBy("user_id")
)

# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+
# |user_id|u_ttl_clk_atc_cnt|u_ttl_buy_cnt|u_ttl_atf_cnt|u_clk_atc_cnt_m5|u_clk_atc_cnt_m6|u_clk_atc_cnt_m7|u_clk_atc_cnt_m8|u_clk_atc_cnt_m9|u_clk_atc_cnt_m10|u_clk_atc_cnt_m11|u_buy_cnt_m5|u_buy_cnt_m6|u_buy_cnt_m7|u_buy_cnt_m8|u_buy_cnt_m9|u_buy_cnt_m10|u_buy_cnt_m11|u_atf_cnt_m5|u_atf_cnt_m6|u_atf_cnt_m7|u_atf_cnt_m8|u_atf_cnt_m9|u_atf_cnt_m10|u_atf_cnt_m11|u_ttl_clk_atc_ratio|u_ttl_buy_ratio|u_ttl_atf_ratio|u_clk_atc_ratio_m5|u_clk_atc_ratio_m6|u_clk_atc_ratio_m7|u_clk_atc_ratio_m8|u_clk_atc_ratio_m9|u_clk_atc_ratio_m10|u_clk_atc_ratio_m11|u_buy_ratio_m5|u_buy_ratio_m6|u_buy_ratio_m7|u_buy_ratio_m8|u_buy_ratio_m9|u_buy_ratio_m10|u_buy_ratio_m11|u_atf_ratio_m5|u_atf_ratio_m6|u_atf_ratio_m7|u_atf_ratio_m8|u_atf_ratio_m9|u_atf_ratio_m10|u_atf_ratio_m11|u_clk_atc_day_cnt|u_buy_day_cnt|u_atf_day_cnt|u_clk_atc_day_cnt_m5|u_clk_atc_day_cnt_m6|u_clk_atc_day_cnt_m7|u_clk_atc_day_cnt_m8|u_clk_atc_day_cnt_m9|u_clk_atc_day_cnt_m10|u_clk_atc_day_cnt_m11|u_buy_day_cnt_m5|u_buy_day_cnt_m6|u_buy_day_cnt_m7|u_buy_day_cnt_m8|u_buy_day_cnt_m9|u_buy_day_cnt_m10|u_buy_day_cnt_m11|u_atf_day_cnt_m5|u_atf_day_cnt_m6|u_atf_day_cnt_m7|u_atf_day_cnt_m8|u_atf_day_cnt_m9|u_atf_day_cnt_m10|u_atf_day_cnt_m11|u_clk_atc_item_div|u_buy_item_div|u_atf_item_div|u_clk_atc_cat_div|u_buy_cat_div|u_atf_cat_div|u_clk_atc_brd_div|u_buy_brd_div|u_atf_brd_div|u_clk_atc_mer_div|u_buy_mer_div|u_atf_mer_div|u_clk_atc_mean|u_buy_mean|u_atf_mean|u_clk_atc_median|u_buy_median|u_atf_median|u_clk_atc_std|u_buy_std|u_atf_std|u_clk_atc_max|u_buy_max|u_atf_max|
# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+
# |      1|               27|            6|            0|               0|               0|               0|               0|               0|               14|               13|           0|           0|           0|           0|           0|            2|            4|           0|           0|           0|           0|           0|            0|            0|               0.82|           0.18|            0.0|               0.0|               0.0|               0.0|               0.0|               0.0|               0.88|               0.76|           0.0|           0.0|           0.0|           0.0|           0.0|           0.13|           0.24|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                5|            3|            0|                   0|                   0|                   0|                   0|                   0|                    4|                    1|               0|               0|               0|               0|               0|                2|                1|               0|               0|               0|               0|               0|                0|                0|                12|             3|             0|                6|            3|            0|                9|            3|            0|                9|            3|            0|          2.33|      0.33|       0.0|             0.0|         0.0|         0.0|         5.72|     0.82|      0.0|           14|        2|        0|
# |     10|               56|            7|            1|               0|              18|               0|               2|               2|               18|               16|           0|           0|           0|           0|           0|            5|            2|           0|           0|           0|           0|           0|            1|            0|               0.88|           0.11|           0.02|               0.0|               1.0|               0.0|               1.0|               1.0|               0.75|               0.89|           0.0|           0.0|           0.0|           0.0|           0.0|           0.21|           0.11|           0.0|           0.0|           0.0|           0.0|           0.0|           0.04|            0.0|               12|            2|            1|                   0|                   4|                   0|                   1|                   1|                    1|                    5|               0|               0|               0|               0|               0|                1|                1|               0|               0|               0|               0|               0|                1|                0|                39|             5|             1|               17|            3|            1|               17|            2|            1|               17|            2|            1|          6.67|      0.83|      0.17|             2.0|         0.0|         0.0|         8.82|     2.04|     0.41|           18|        5|        1|
# |    100|              291|           10|            1|               2|              26|               4|               0|              36|              138|               85|           0|           0|           0|           0|           2|            3|            5|           0|           0|           0|           0|           0|            1|            0|               0.96|           0.03|            0.0|               1.0|               1.0|               1.0|               0.0|              0.95|               0.97|               0.94|           0.0|           0.0|           0.0|           0.0|          0.05|           0.02|           0.06|           0.0|           0.0|           0.0|           0.0|           0.0|           0.01|            0.0|               24|            5|            1|                   1|                   1|                   2|                   0|                   3|                   15|                    2|               0|               0|               0|               0|               2|                2|                1|               0|               0|               0|               0|               0|                1|                0|               131|             9|             1|               37|            8|            1|               63|            6|            1|               66|            6|            1|         34.33|      0.83|      0.17|            15.0|         0.0|         0.0|        52.85|     1.33|     0.41|          138|        3|        1|
# |   1000|               66|            7|            6|               0|               0|               0|               0|               0|                0|               66|           0|           0|           0|           0|           0|            0|            7|           0|           0|           0|           0|           0|            0|            6|               0.84|           0.09|           0.08|               0.0|               0.0|               0.0|               0.0|               0.0|                0.0|               0.84|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.09|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.08|                2|            1|            1|                   0|                   0|                   0|                   0|                   0|                    0|                    2|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                1|                21|             7|             6|                5|            4|            4|                9|            5|            3|                9|            5|            3|           0.0|       0.0|       0.0|             0.0|         0.0|         0.0|          0.0|      0.0|      0.0|            0|        0|        0|
# |  10000|              402|           17|            0|              28|             160|              22|              13|              10|               45|              124|           2|           3|           2|           2|           0|            2|            6|           0|           0|           0|           0|           0|            0|            0|               0.96|           0.04|            0.0|              0.93|              0.98|              0.92|              0.87|               1.0|               0.96|               0.95|          0.07|          0.02|          0.08|          0.13|           0.0|           0.04|           0.05|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|               43|            8|            0|                   2|                  12|                   6|                   5|                   4|                    7|                    7|               1|               1|               1|               1|               0|                1|                3|               0|               0|               0|               0|               0|                0|                0|               272|            14|             0|               76|           12|            0|              128|           13|            0|              128|           13|            0|         46.33|      1.83|       0.0|            25.0|         2.0|         0.0|        57.06|     0.98|      0.0|          160|        3|        0|
# | 100000|              207|           10|            4|               0|               0|               4|              43|              86|               24|               50|           0|           0|           0|           6|           3|            0|            1|           0|           0|           0|           1|           0|            1|            2|               0.94|           0.05|           0.02|               0.0|               0.0|               1.0|              0.86|              0.97|               0.96|               0.94|           0.0|           0.0|           0.0|          0.12|          0.03|            0.0|           0.02|           0.0|           0.0|           0.0|          0.02|           0.0|           0.04|           0.04|               25|            5|            3|                   0|                   0|                   1|                   7|                   8|                    6|                    3|               0|               0|               0|               3|               1|                0|                1|               0|               0|               0|               1|               0|                1|                1|               122|             9|             4|               44|            9|            3|               68|            6|            4|               74|            6|            4|         26.17|       1.5|      0.33|            14.0|         0.0|         0.0|        33.85|     2.51|     0.52|           86|        6|        1|
# | 100001|               87|            3|            0|               0|               0|               0|               1|               1|                1|               84|           0|           0|           0|           0|           0|            0|            3|           0|           0|           0|           0|           0|            0|            0|               0.97|           0.03|            0.0|               0.0|               0.0|               0.0|               1.0|               1.0|                1.0|               0.97|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.03|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                4|            1|            0|                   0|                   0|                   0|                   1|                   1|                    1|                    1|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|                74|             3|             0|               20|            3|            0|               35|            3|            0|               40|            3|            0|           0.5|       0.0|       0.0|             0.5|         0.0|         0.0|         0.55|      0.0|      0.0|            1|        0|        0|
# | 100002|               90|            5|            1|              20|               1|               5|              25|               7|                3|               29|           0|           0|           0|           2|           1|            0|            2|           0|           0|           0|           0|           0|            0|            1|               0.94|           0.05|           0.01|               1.0|               1.0|               1.0|              0.93|              0.88|                1.0|               0.91|           0.0|           0.0|           0.0|          0.07|          0.13|            0.0|           0.06|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.03|               19|            3|            1|                   3|                   1|                   4|                   4|                   3|                    2|                    2|               0|               0|               0|               1|               1|                0|                1|               0|               0|               0|               0|               0|                0|                1|                69|             5|             1|               37|            5|            1|               29|            5|            1|               29|            4|            1|         10.17|       0.5|       0.0|             6.0|         0.0|         0.0|         9.89|     0.84|      0.0|           25|        2|        0|
# | 100003|               88|            1|            0|               2|               1|               8|               6|               3|                9|               59|           0|           0|           0|           0|           0|            0|            1|           0|           0|           0|           0|           0|            0|            0|               0.99|           0.01|            0.0|               1.0|               1.0|               1.0|               1.0|               1.0|                1.0|               0.98|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.02|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|               17|            1|            0|                   1|                   1|                   3|                   2|                   3|                    5|                    2|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|                84|             1|             0|               26|            1|            0|               30|            1|            0|               31|            1|            0|          4.83|       0.0|       0.0|             4.5|         0.0|         0.0|         3.31|      0.0|      0.0|            9|        0|        0|
# | 100004|               54|            3|            0|               0|               0|               1|              17|               0|                0|               36|           0|           0|           0|           1|           0|            0|            2|           0|           0|           0|           0|           0|            0|            0|               0.95|           0.05|            0.0|               0.0|               0.0|               1.0|              0.94|               0.0|                0.0|               0.95|           0.0|           0.0|           0.0|          0.06|           0.0|            0.0|           0.05|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                8|            2|            0|                   0|                   0|                   1|                   4|                   0|                    0|                    3|               0|               0|               0|               1|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|                28|             3|             0|                9|            3|            0|               15|            3|            0|               16|            3|            0|           3.0|      0.17|       0.0|             0.0|         0.0|         0.0|         6.87|     0.41|      0.0|           17|        1|        0|
# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+


### Merchant Aggregation Features

#### Merchant-action-day-aggregation

In [52]:
from pyspark.sql import DataFrameStatFunctions as statFunc
from pyspark.sql.functions import col
import numpy as np
from pyspark.sql.types import FloatType

def median(values_list):
    """Return the median of ``values_list`` as a built-in float."""
    # np.median yields a numpy scalar; convert it to a plain Python float.
    return float(np.median(values_list))
# Spark UDF wrapper around median(); the result type is declared as FloatType.
udf_median = F.udf(f=median, returnType=FloatType())

def merchant_day_cnt_aggregation(profile, profile_id, act_id, act_name):
  """Summarise, per profile_id, the distinct time_stamp count of each user-seller pair.

  Reads the notebook-level ``train_log_0`` frame. Rows whose ``action_type``
  equals ``act_id`` are kept, the number of distinct ``time_stamp`` values is
  counted per (user_id, seller_id, action_type), and those counts are then
  aggregated per ``profile_id``.

  Args:
    profile: prefix put in front of every output column name (e.g. "u").
    profile_id: name of the column to aggregate by (e.g. "user_id").
    act_id: value compared against ``action_type`` to select rows.
    act_name: label for the action used inside output column names.

  Returns:
    A frame with ``profile_id`` and the columns
    ``<profile>_<act_name>_day_cnt_{mean,max,std,med}``, each rounded to 2 decimals.
  """
  # Alias the aggregate directly instead of renaming Spark's auto-generated
  # column name afterwards: that name is not stable across Spark versions, and
  # withColumnRenamed silently does nothing when the name does not match,
  # which would leave no "day_cnt" column for the aggregation below.
  day_agg_0 = train_log_0 \
            .filter(train_log_0.action_type == act_id) \
            .groupBy("user_id", "seller_id", "action_type") \
            .agg(F.countDistinct("time_stamp").alias("day_cnt"))

  out_prefix = profile + "_" + act_name + "_day_cnt_"
  day_agg_1 = day_agg_0 \
            .groupBy(profile_id) \
            .agg(F.round(F.mean("day_cnt"), 2).alias(out_prefix + "mean"),
                 F.round(F.max("day_cnt"), 2).alias(out_prefix + "max"),
                 F.round(F.stddev("day_cnt"), 2).alias(out_prefix + "std"),
                 # Median is computed in Python over the collected per-group counts.
                 F.round(udf_median(F.collect_list(col("day_cnt"))), 2).alias(out_prefix + "med"))
  return day_agg_1

In [53]:
# One day-count summary per action type (1 -> clk_atc, 2 -> buy, 3 -> atf).
mer_day_agg_0 = merchant_day_cnt_aggregation(
    profile="u", profile_id="user_id", act_id=1, act_name="clk_atc")
mer_day_agg_1 = merchant_day_cnt_aggregation(
    profile="u", profile_id="user_id", act_id=2, act_name="buy")
mer_day_agg_2 = merchant_day_cnt_aggregation(
    profile="u", profile_id="user_id", act_id=3, act_name="atf")

# Full outer joins keep users that appear in only some of the three summaries.
mer_day_agg_3 = (
    mer_day_agg_0
    .join(mer_day_agg_1, on="user_id", how="full")
    .join(mer_day_agg_2, on="user_id", how="full")
    .orderBy("user_id")
)

In [55]:
# Attach the merchant day-count summaries; the left join keeps all user rows.
user_feature_3 = user_feature_2.join(mer_day_agg_3, on="user_id", how="left")

#### Merchant-action-item-aggregation

In [56]:
from pyspark.sql import DataFrameStatFunctions as statFunc
from pyspark.sql.functions import col
import numpy as np
from pyspark.sql.types import FloatType

# NOTE(review): identical to the `median` helper already defined in the
# merchant-action-day-aggregation cell; redefining it here lets this cell run on its own.
def median(values_list):
    """Return the median of ``values_list`` as a built-in Python ``float``.

    The value computed by ``np.median`` is converted with ``float()`` before
    being returned, so callers always receive a plain Python float.
    """
    return float(np.median(values_list))
# Wrap `median` as a Spark UDF whose declared return type is FloatType.
udf_median = F.udf(f=median, returnType=FloatType())

def merchant_item_cnt_aggregation(profile, profile_id, act_id, act_name):
  """Summarise, per ``profile_id``, how many distinct items one action type touched.

  Step 1 filters the notebook-level ``train_log_0`` to rows whose
  ``action_type`` equals ``act_id`` and counts the distinct ``item_id``
  values of every (user_id, seller_id, action_type) group ("item_cnt").
  Step 2 groups those counts by ``profile_id`` and computes their mean, max,
  standard deviation (``F.stddev``) and median (via ``udf_median``), each
  wrapped in ``F.round(..., 2)``.

  Args:
    profile: Prefix used in the output column names (e.g. ``"u"``).
    profile_id: Name of the column the per-pair counts are grouped by
      (e.g. ``"user_id"``).
    act_id: Value of ``action_type`` to keep.
    act_name: Action label used in the output column names (e.g. ``"buy"``).

  Returns:
    A DataFrame with ``profile_id`` plus the columns
    ``<profile>_<act_name>_item_cnt_mean``, ``..._max``, ``..._std`` and
    ``..._med``.
  """
  # Alias the aggregate directly instead of renaming its auto-generated name:
  # that name is not stable across Spark versions ("count(item_id)" vs
  # "count(DISTINCT item_id)"), and withColumnRenamed silently does nothing
  # when the old name is absent, which would break the "item_cnt" lookups below.
  item_agg_0 = train_log_0 \
            .filter(train_log_0.action_type == act_id) \
            .groupBy("user_id", "seller_id", "action_type") \
            .agg(F.countDistinct("item_id").alias("item_cnt"))

  # Shared prefix of the four output column names.
  prefix = profile + "_" + act_name + "_item_cnt_"

  item_agg_1 = item_agg_0 \
            .groupBy(profile_id) \
            .agg(F.round(F.mean("item_cnt"), 2).alias(prefix + "mean"),
                 F.round(F.max("item_cnt"), 2).alias(prefix + "max"),
                 F.round(F.stddev("item_cnt"), 2).alias(prefix + "std"),
                 # The median has to see all counts of the group at once,
                 # hence collect_list feeding the Python UDF.
                 F.round(udf_median(F.collect_list(col("item_cnt"))), 2).alias(prefix + "med"))
  return item_agg_1

In [57]:
# Per-user item-count statistics, one frame per (action id, action label) pair.
mer_item_agg_0, mer_item_agg_1, mer_item_agg_2 = [
    merchant_item_cnt_aggregation("u", "user_id", action_id, action_label)
    for action_id, action_label in [(1, "clk_atc"), (2, "buy"), (3, "atf")]
]

# Full joins on user_id combine the three frames, then the result is sorted.
mer_item_agg_3 = (
    mer_item_agg_0
    .join(mer_item_agg_1, "user_id", "full")
    .join(mer_item_agg_2, "user_id", "full")
    .orderBy("user_id")
)

In [58]:
# Attach the item-count statistics to the user features (left join on user_id), sorted by user_id.
user_feature_4 = (
    user_feature_3
    .join(mer_item_agg_3, on="user_id", how="left")
    .orderBy("user_id")
)

# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+---------------------+---------------------+---------------------+------------------+-----------------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+-----------------------+----------------------+----------------------+----------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+
------------------+
# |user_id|u_ttl_clk_atc_cnt|u_ttl_buy_cnt|u_ttl_atf_cnt|u_clk_atc_cnt_m5|u_clk_atc_cnt_m6|u_clk_atc_cnt_m7|u_clk_atc_cnt_m8|u_clk_atc_cnt_m9|u_clk_atc_cnt_m10|u_clk_atc_cnt_m11|u_buy_cnt_m5|u_buy_cnt_m6|u_buy_cnt_m7|u_buy_cnt_m8|u_buy_cnt_m9|u_buy_cnt_m10|u_buy_cnt_m11|u_atf_cnt_m5|u_atf_cnt_m6|u_atf_cnt_m7|u_atf_cnt_m8|u_atf_cnt_m9|u_atf_cnt_m10|u_atf_cnt_m11|u_ttl_clk_atc_ratio|u_ttl_buy_ratio|u_ttl_atf_ratio|u_clk_atc_ratio_m5|u_clk_atc_ratio_m6|u_clk_atc_ratio_m7|u_clk_atc_ratio_m8|u_clk_atc_ratio_m9|u_clk_atc_ratio_m10|u_clk_atc_ratio_m11|u_buy_ratio_m5|u_buy_ratio_m6|u_buy_ratio_m7|u_buy_ratio_m8|u_buy_ratio_m9|u_buy_ratio_m10|u_buy_ratio_m11|u_atf_ratio_m5|u_atf_ratio_m6|u_atf_ratio_m7|u_atf_ratio_m8|u_atf_ratio_m9|u_atf_ratio_m10|u_atf_ratio_m11|u_clk_atc_day_cnt|u_buy_day_cnt|u_atf_day_cnt|u_clk_atc_day_cnt_m5|u_clk_atc_day_cnt_m6|u_clk_atc_day_cnt_m7|u_clk_atc_day_cnt_m8|u_clk_atc_day_cnt_m9|u_clk_atc_day_cnt_m10|u_clk_atc_day_cnt_m11|u_buy_day_cnt_m5|u_buy_day_cnt_m6|u_buy_day_cnt_m7|u_buy_day_cnt_m8|u_buy_day_cnt_m9|u_buy_day_cnt_m10|u_buy_day_cnt_m11|u_atf_day_cnt_m5|u_atf_day_cnt_m6|u_atf_day_cnt_m7|u_atf_day_cnt_m8|u_atf_day_cnt_m9|u_atf_day_cnt_m10|u_atf_day_cnt_m11|u_clk_atc_item_div|u_buy_item_div|u_atf_item_div|u_clk_atc_cat_div|u_buy_cat_div|u_atf_cat_div|u_clk_atc_brd_div|u_buy_brd_div|u_atf_brd_div|u_clk_atc_mer_div|u_buy_mer_div|u_atf_mer_div|u_clk_atc_mean|u_buy_mean|u_atf_mean|u_clk_atc_median|u_buy_median|u_atf_median|u_clk_atc_std|u_buy_std|u_atf_std|u_clk_atc_max|u_buy_max|u_atf_max|u_clk_atc_day_cnt_mean|u_clk_atc_day_cnt_max|u_clk_atc_day_cnt_std|u_clk_atc_day_cnt_med|u_buy_day_cnt_mean|u_buy_day_cnt_max|u_buy_day_cnt_std|u_buy_day_cnt_med|u_atf_day_cnt_mean|u_atf_day_cnt_max|u_atf_day_cnt_std|u_atf_day_cnt_med|u_clk_atc_item_cnt_mean|u_clk_atc_item_cnt_max|u_clk_atc_item_cnt_std|u_clk_atc_item_cnt_med|u_buy_item_cnt_mean|u_buy_item_cnt_max|u_buy_item_cnt_std|u_buy_item_cnt_med|u_atf_item_cnt_mean|u_atf_item_cnt_max|u_atf_item_cnt_std|
u_atf_item_cnt_med|
# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+---------------------+---------------------+---------------------+------------------+-----------------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+-----------------------+----------------------+----------------------+----------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+
------------------+
# |      1|               27|            6|            0|               0|               0|               0|               0|               0|               14|               13|           0|           0|           0|           0|           0|            2|            4|           0|           0|           0|           0|           0|            0|            0|               0.82|           0.18|            0.0|               0.0|               0.0|               0.0|               0.0|               0.0|               0.88|               0.76|           0.0|           0.0|           0.0|           0.0|           0.0|           0.13|           0.24|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                5|            3|            0|                   0|                   0|                   0|                   0|                   0|                    4|                    1|               0|               0|               0|               0|               0|                2|                1|               0|               0|               0|               0|               0|                0|                0|                12|             3|             0|                6|            3|            0|                9|            3|            0|                9|            3|            0|          2.33|      0.33|       0.0|             0.0|         0.0|         0.0|         5.72|     0.82|      0.0|           14|        2|        0|                  1.11|                    2|                 0.33|                  1.0|               1.0|                1|              0.0|              1.0|              null|             null|             null|             null|                   1.33|                     4|                   1.0|                   1.0|                1.0|                 1|               0.0|               1.0|               null|              null|              
null|              null|
# |     10|               56|            7|            1|               0|              18|               0|               2|               2|               18|               16|           0|           0|           0|           0|           0|            5|            2|           0|           0|           0|           0|           0|            1|            0|               0.88|           0.11|           0.02|               0.0|               1.0|               0.0|               1.0|               1.0|               0.75|               0.89|           0.0|           0.0|           0.0|           0.0|           0.0|           0.21|           0.11|           0.0|           0.0|           0.0|           0.0|           0.0|           0.04|            0.0|               12|            2|            1|                   0|                   4|                   0|                   1|                   1|                    1|                    5|               0|               0|               0|               0|               0|                1|                1|               0|               0|               0|               0|               0|                1|                0|                39|             5|             1|               17|            3|            1|               17|            2|            1|               17|            2|            1|          6.67|      0.83|      0.17|             2.0|         0.0|         0.0|         8.82|     2.04|     0.41|           18|        5|        1|                  1.41|                    3|                  0.8|                  1.0|               1.0|                1|              0.0|              1.0|               1.0|                1|              NaN|              1.0|                   2.29|                     7|                  1.76|                   1.0|                2.5|                 3|              0.71|               2.5|                1.0|                 1|               
NaN|               1.0|
# |    100|              291|           10|            1|               2|              26|               4|               0|              36|              138|               85|           0|           0|           0|           0|           2|            3|            5|           0|           0|           0|           0|           0|            1|            0|               0.96|           0.03|            0.0|               1.0|               1.0|               1.0|               0.0|              0.95|               0.97|               0.94|           0.0|           0.0|           0.0|           0.0|          0.05|           0.02|           0.06|           0.0|           0.0|           0.0|           0.0|           0.0|           0.01|            0.0|               24|            5|            1|                   1|                   1|                   2|                   0|                   3|                   15|                    2|               0|               0|               0|               0|               2|                2|                1|               0|               0|               0|               0|               0|                1|                0|               131|             9|             1|               37|            8|            1|               63|            6|            1|               66|            6|            1|         34.33|      0.83|      0.17|            15.0|         0.0|         0.0|        52.85|     1.33|     0.41|          138|        3|        1|                  1.38|                   11|                 1.32|                  1.0|              1.33|                3|             0.82|              1.0|               1.0|                1|              NaN|              1.0|                   1.98|                    28|                  3.44|                   1.0|                1.5|                 4|              1.22|               1.0|                1.0|                 1|               
NaN|               1.0|
# |   1000|               66|            7|            6|               0|               0|               0|               0|               0|                0|               66|           0|           0|           0|           0|           0|            0|            7|           0|           0|           0|           0|           0|            0|            6|               0.84|           0.09|           0.08|               0.0|               0.0|               0.0|               0.0|               0.0|                0.0|               0.84|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.09|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.08|                2|            1|            1|                   0|                   0|                   0|                   0|                   0|                    0|                    2|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                1|                21|             7|             6|                5|            4|            4|                9|            5|            3|                9|            5|            3|           0.0|       0.0|       0.0|             0.0|         0.0|         0.0|          0.0|      0.0|      0.0|            0|        0|        0|                  1.56|                    2|                 0.53|                  2.0|               1.0|                1|              0.0|              1.0|               1.0|                1|              0.0|              1.0|                   2.33|                    11|                  3.28|                   1.0|                1.4|                 3|              0.89|               1.0|                2.0|                 4|              
1.73|               1.0|
# |  10000|              402|           17|            0|              28|             160|              22|              13|              10|               45|              124|           2|           3|           2|           2|           0|            2|            6|           0|           0|           0|           0|           0|            0|            0|               0.96|           0.04|            0.0|              0.93|              0.98|              0.92|              0.87|               1.0|               0.96|               0.95|          0.07|          0.02|          0.08|          0.13|           0.0|           0.04|           0.05|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|               43|            8|            0|                   2|                  12|                   6|                   5|                   4|                    7|                    7|               1|               1|               1|               1|               0|                1|                3|               0|               0|               0|               0|               0|                0|                0|               272|            14|             0|               76|           12|            0|              128|           13|            0|              128|           13|            0|         46.33|      1.83|       0.0|            25.0|         2.0|         0.0|        57.06|     0.98|      0.0|          160|        3|        0|                  1.28|                    4|                 0.61|                  1.0|               1.0|                1|              0.0|              1.0|              null|             null|             null|             null|                   2.13|                    16|                  2.28|                   1.0|               1.08|                 2|              0.28|               1.0|               null|              null|              
null|              null|
# | 100000|              207|           10|            4|               0|               0|               4|              43|              86|               24|               50|           0|           0|           0|           6|           3|            0|            1|           0|           0|           0|           1|           0|            1|            2|               0.94|           0.05|           0.02|               0.0|               0.0|               1.0|              0.86|              0.97|               0.96|               0.94|           0.0|           0.0|           0.0|          0.12|          0.03|            0.0|           0.02|           0.0|           0.0|           0.0|          0.02|           0.0|           0.04|           0.04|               25|            5|            3|                   0|                   0|                   1|                   7|                   8|                    6|                    3|               0|               0|               0|               3|               1|                0|                1|               0|               0|               0|               1|               0|                1|                1|               122|             9|             4|               44|            9|            3|               68|            6|            4|               74|            6|            4|         26.17|       1.5|      0.33|            14.0|         0.0|         0.0|        33.85|     2.51|     0.52|           86|        6|        1|                   1.3|                    7|                 0.89|                  1.0|              1.17|                2|             0.41|              1.0|               1.0|                1|              0.0|              1.0|                   1.65|                     9|                  1.38|                   1.0|                1.5|                 4|              1.22|               1.0|                1.0|                 1|               
0.0|               1.0|
# | 100001|               87|            3|            0|               0|               0|               0|               1|               1|                1|               84|           0|           0|           0|           0|           0|            0|            3|           0|           0|           0|           0|           0|            0|            0|               0.97|           0.03|            0.0|               0.0|               0.0|               0.0|               1.0|               1.0|                1.0|               0.97|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.03|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                4|            1|            0|                   0|                   0|                   0|                   1|                   1|                    1|                    1|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|                74|             3|             0|               20|            3|            0|               35|            3|            0|               40|            3|            0|           0.5|       0.0|       0.0|             0.5|         0.0|         0.0|         0.55|      0.0|      0.0|            1|        0|        0|                   1.0|                    1|                  0.0|                  1.0|               1.0|                1|              0.0|              1.0|              null|             null|             null|             null|                   1.85|                    10|                  1.97|                   1.0|                1.0|                 1|               0.0|               1.0|               null|              null|              
null|              null|
# | 100002|               90|            5|            1|              20|               1|               5|              25|               7|                3|               29|           0|           0|           0|           2|           1|            0|            2|           0|           0|           0|           0|           0|            0|            1|               0.94|           0.05|           0.01|               1.0|               1.0|               1.0|              0.93|              0.88|                1.0|               0.91|           0.0|           0.0|           0.0|          0.07|          0.13|            0.0|           0.06|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.03|               19|            3|            1|                   3|                   1|                   4|                   4|                   3|                    2|                    2|               0|               0|               0|               1|               1|                0|                1|               0|               0|               0|               0|               0|                0|                1|                69|             5|             1|               37|            5|            1|               29|            5|            1|               29|            4|            1|         10.17|       0.5|       0.0|             6.0|         0.0|         0.0|         9.89|     0.84|      0.0|           25|        2|        0|                  1.21|                    3|                 0.49|                  1.0|               1.0|                1|              0.0|              1.0|               1.0|                1|              NaN|              1.0|                   2.38|                    12|                  2.72|                   1.0|               1.25|                 2|               0.5|               1.0|                1.0|                 1|               
NaN|               1.0|
# | 100003|               88|            1|            0|               2|               1|               8|               6|               3|                9|               59|           0|           0|           0|           0|           0|            0|            1|           0|           0|           0|           0|           0|            0|            0|               0.99|           0.01|            0.0|               1.0|               1.0|               1.0|               1.0|               1.0|                1.0|               0.98|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.02|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|               17|            1|            0|                   1|                   1|                   3|                   2|                   3|                    5|                    2|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|                84|             1|             0|               26|            1|            0|               30|            1|            0|               31|            1|            0|          4.83|       0.0|       0.0|             4.5|         0.0|         0.0|         3.31|      0.0|      0.0|            9|        0|        0|                   1.1|                    2|                  0.3|                  1.0|               1.0|                1|              NaN|              1.0|              null|             null|             null|             null|                   2.71|                    41|                   7.2|                   1.0|                1.0|                 1|               NaN|               1.0|               null|              null|              
null|              null|
# | 100004|               54|            3|            0|               0|               0|               1|              17|               0|                0|               36|           0|           0|           0|           1|           0|            0|            2|           0|           0|           0|           0|           0|            0|            0|               0.95|           0.05|            0.0|               0.0|               0.0|               1.0|              0.94|               0.0|                0.0|               0.95|           0.0|           0.0|           0.0|          0.06|           0.0|            0.0|           0.05|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                8|            2|            0|                   0|                   0|                   1|                   4|                   0|                    0|                    3|               0|               0|               0|               1|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|                28|             3|             0|                9|            3|            0|               15|            3|            0|               16|            3|            0|           3.0|      0.17|       0.0|             0.0|         0.0|         0.0|         6.87|     0.41|      0.0|           17|        1|        0|                  1.19|                    3|                 0.54|                  1.0|               1.0|                1|              0.0|              1.0|              null|             null|             null|             null|                   1.75|                     6|                  1.29|                   1.0|                1.0|                 1|               0.0|               1.0|               null|              null|              
null|              null|
# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+---------------------+---------------------+---------------------+------------------+-----------------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+-----------------------+----------------------+----------------------+----------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+
------------------+


+-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+--------

### Recent Features

In [59]:
def recent_action_counts(profile, profile_id, act_cnt_log):
  """Aggregate action counts for Double 11 and for the week before it.

  Args:
    profile: Prefix put in front of every generated column name (e.g. "u").
    profile_id: Name of the column to group by (e.g. "user_id").
    act_cnt_log: Spark DataFrame providing the columns `time_stamp`,
      `clk_atc_cnt`, `buy_cnt` and `atf_cnt`.

  Returns:
    A DataFrame with one row per `profile_id` holding
    `<profile>_{clk_atc,buy,atf}_cnt_double11` and
    `<profile>_{clk_atc,buy,atf}_cnt_1wpre_double11`.
    Because `groupBy(...).sum()` is called without arguments, any further
    numeric column of `act_cnt_log` also shows up as `sum(<name>)`.
  """
  actions = ("clk_atc_cnt", "buy_cnt", "atf_cnt")

  # time_stamp is compared against string literals, so the range test below is
  # lexicographic -- assumes zero-padded "MMDD" strings; TODO confirm upstream.
  time_stamp = act_cnt_log["time_stamp"]
  windows = (
      ("double11", time_stamp == "1111"),
      ("1wpre_double11", (time_stamp >= "1104") & (time_stamp <= "1110")),
  )

  # Recent-period filter: keep a row's count only when it falls in the window,
  # otherwise contribute 0 so that the later sum ignores it.
  flagged = act_cnt_log
  recent_cols = []
  for suffix, in_window in windows:
    for action in actions:
      name = profile + "_" + action + "_" + suffix
      flagged = flagged.withColumn(name, F.when(in_window, act_cnt_log[action]).otherwise(0))
      recent_cols.append(name)

  summed = flagged.groupBy(profile_id).sum()

  # Strip the "sum(...)" wrapper Spark puts around aggregated column names.
  for name in recent_cols:
    summed = summed.withColumnRenamed("sum(" + name + ")", name)

  # The all-time totals of the raw count columns are not part of this feature
  # set. Previously they were renamed to "<profile>_ttl_*" but only the
  # hard-coded "u_ttl_*" names were dropped, so every other profile leaked
  # them; dropping the aggregates directly works for any prefix.
  return summed.drop(*["sum(" + action + ")" for action in actions])

In [60]:
# Attach the Double 11 / week-before action counts to the user features built
# so far; the left join keeps every row of user_feature_4.
user_feature_5 = (
    user_feature_4
    .join(
        recent_action_counts("u", "user_id", act_cnt_ratio_0),
        on="user_id",
        how="left",
    )
    .orderBy("user_id")
)

In [61]:
def recent_action_ratio(profile, act_cnt_log):
  """Add each action's share of recent activity, rounded to two decimals.

  For both recent windows (Double 11 and the week before it) the three
  `<profile>_{clk_atc,buy,atf}_cnt_<window>` columns are summed, and each one
  is divided by that sum to give `<profile>_{clk_atc,buy,atf}_ratio_<window>`.

  Args:
    profile: Prefix of the count columns to read and of the ratio columns to
      create (e.g. "u").
    act_cnt_log: Spark DataFrame containing the six
      `<profile>_*_cnt_double11` / `<profile>_*_cnt_1wpre_double11` columns.

  Returns:
    `act_cnt_log` with six ratio columns appended. `fillna(0)` is applied to
    the whole frame, so nulls in every numeric column (not only the new
    ratios) become 0.
  """
  actions = ("clk_atc", "buy", "atf")
  windows = ("double11", "1wpre_double11")

  def count_col(action, window):
    # Column holding the recent count of `action` within `window`.
    return act_cnt_log[profile + "_" + action + "_cnt_" + window]

  # Denominator per window. Only this profile's three count columns are used;
  # matching column names by substring would also pick up look-alike columns
  # belonging to another profile and inflate the total.
  totals = {
      window: count_col("clk_atc", window) + count_col("buy", window) + count_col("atf", window)
      for window in windows
  }

  with_ratios = act_cnt_log
  for action in actions:
    for window in windows:
      with_ratios = with_ratios.withColumn(
          profile + "_" + action + "_ratio_" + window,
          F.round(count_col(action, window) / totals[window], 2))

  # A window without any activity has a zero (or null) total, which yields a
  # null ratio under Spark's default non-ANSI settings; report it as 0.
  return with_ratios.fillna(0)

In [62]:
# Turn the recent (Double 11 / week-before) counts into per-action ratios.
user_feature_6 = recent_action_ratio(profile="u", act_cnt_log=user_feature_5)

## Gathering

In [63]:
# Preview the first five rows of the finished user feature table.
user_feature_6.orderBy("user_id", ascending=True).show(n=5)

# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+---------------------+---------------------+---------------------+------------------+-----------------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+-----------------------+----------------------+----------------------+----------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+
------------------+----------------------+------------------+------------------+----------------------------+------------------------+------------------------+------------------------+------------------------------+--------------------+--------------------------+--------------------+--------------------------+
# |user_id|u_ttl_clk_atc_cnt|u_ttl_buy_cnt|u_ttl_atf_cnt|u_clk_atc_cnt_m5|u_clk_atc_cnt_m6|u_clk_atc_cnt_m7|u_clk_atc_cnt_m8|u_clk_atc_cnt_m9|u_clk_atc_cnt_m10|u_clk_atc_cnt_m11|u_buy_cnt_m5|u_buy_cnt_m6|u_buy_cnt_m7|u_buy_cnt_m8|u_buy_cnt_m9|u_buy_cnt_m10|u_buy_cnt_m11|u_atf_cnt_m5|u_atf_cnt_m6|u_atf_cnt_m7|u_atf_cnt_m8|u_atf_cnt_m9|u_atf_cnt_m10|u_atf_cnt_m11|u_ttl_clk_atc_ratio|u_ttl_buy_ratio|u_ttl_atf_ratio|u_clk_atc_ratio_m5|u_clk_atc_ratio_m6|u_clk_atc_ratio_m7|u_clk_atc_ratio_m8|u_clk_atc_ratio_m9|u_clk_atc_ratio_m10|u_clk_atc_ratio_m11|u_buy_ratio_m5|u_buy_ratio_m6|u_buy_ratio_m7|u_buy_ratio_m8|u_buy_ratio_m9|u_buy_ratio_m10|u_buy_ratio_m11|u_atf_ratio_m5|u_atf_ratio_m6|u_atf_ratio_m7|u_atf_ratio_m8|u_atf_ratio_m9|u_atf_ratio_m10|u_atf_ratio_m11|u_clk_atc_day_cnt|u_buy_day_cnt|u_atf_day_cnt|u_clk_atc_day_cnt_m5|u_clk_atc_day_cnt_m6|u_clk_atc_day_cnt_m7|u_clk_atc_day_cnt_m8|u_clk_atc_day_cnt_m9|u_clk_atc_day_cnt_m10|u_clk_atc_day_cnt_m11|u_buy_day_cnt_m5|u_buy_day_cnt_m6|u_buy_day_cnt_m7|u_buy_day_cnt_m8|u_buy_day_cnt_m9|u_buy_day_cnt_m10|u_buy_day_cnt_m11|u_atf_day_cnt_m5|u_atf_day_cnt_m6|u_atf_day_cnt_m7|u_atf_day_cnt_m8|u_atf_day_cnt_m9|u_atf_day_cnt_m10|u_atf_day_cnt_m11|u_clk_atc_item_div|u_buy_item_div|u_atf_item_div|u_clk_atc_cat_div|u_buy_cat_div|u_atf_cat_div|u_clk_atc_brd_div|u_buy_brd_div|u_atf_brd_div|u_clk_atc_mer_div|u_buy_mer_div|u_atf_mer_div|u_clk_atc_mean|u_buy_mean|u_atf_mean|u_clk_atc_median|u_buy_median|u_atf_median|u_clk_atc_std|u_buy_std|u_atf_std|u_clk_atc_max|u_buy_max|u_atf_max|u_clk_atc_day_cnt_mean|u_clk_atc_day_cnt_max|u_clk_atc_day_cnt_std|u_clk_atc_day_cnt_med|u_buy_day_cnt_mean|u_buy_day_cnt_max|u_buy_day_cnt_std|u_buy_day_cnt_med|u_atf_day_cnt_mean|u_atf_day_cnt_max|u_atf_day_cnt_std|u_atf_day_cnt_med|u_clk_atc_item_cnt_mean|u_clk_atc_item_cnt_max|u_clk_atc_item_cnt_std|u_clk_atc_item_cnt_med|u_buy_item_cnt_mean|u_buy_item_cnt_max|u_buy_item_cnt_std|u_buy_item_cnt_med|u_atf_item_cnt_mean|u_atf_item_cnt_max|u_atf_item_cnt_std|
u_atf_item_cnt_med|u_clk_atc_cnt_double11|u_buy_cnt_double11|u_atf_cnt_double11|u_clk_atc_cnt_1wpre_double11|u_buy_cnt_1wpre_double11|u_atf_cnt_1wpre_double11|u_clk_atc_ratio_double11|u_clk_atc_ratio_1wpre_double11|u_buy_ratio_double11|u_buy_ratio_1wpre_double11|u_atf_ratio_double11|u_atf_ratio_1wpre_double11|
# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+---------------------+---------------------+---------------------+------------------+-----------------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+-----------------------+----------------------+----------------------+----------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+
------------------+----------------------+------------------+------------------+----------------------------+------------------------+------------------------+------------------------+------------------------------+--------------------+--------------------------+--------------------+--------------------------+
# |      1|               27|            6|            0|               0|               0|               0|               0|               0|               14|               13|           0|           0|           0|           0|           0|            2|            4|           0|           0|           0|           0|           0|            0|            0|               0.82|           0.18|            0.0|               0.0|               0.0|               0.0|               0.0|               0.0|               0.88|               0.76|           0.0|           0.0|           0.0|           0.0|           0.0|           0.13|           0.24|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                5|            3|            0|                   0|                   0|                   0|                   0|                   0|                    4|                    1|               0|               0|               0|               0|               0|                2|                1|               0|               0|               0|               0|               0|                0|                0|                12|             3|             0|                6|            3|            0|                9|            3|            0|                9|            3|            0|          2.33|      0.33|       0.0|             0.0|         0.0|         0.0|         5.72|     0.82|      0.0|           14|        2|        0|                  1.11|                    2|                 0.33|                  1.0|               1.0|                1|              0.0|              1.0|               0.0|                0|              0.0|              0.0|                   1.33|                     4|                   1.0|                   1.0|                1.0|                 1|               0.0|               1.0|                0.0|                 0|               
0.0|               0.0|                    13|                 4|                 0|                           0|                       0|                       0|                    0.76|                           0.0|                0.24|                       0.0|                 0.0|                       0.0|
# |     10|               56|            7|            1|               0|              18|               0|               2|               2|               18|               16|           0|           0|           0|           0|           0|            5|            2|           0|           0|           0|           0|           0|            1|            0|               0.88|           0.11|           0.02|               0.0|               1.0|               0.0|               1.0|               1.0|               0.75|               0.89|           0.0|           0.0|           0.0|           0.0|           0.0|           0.21|           0.11|           0.0|           0.0|           0.0|           0.0|           0.0|           0.04|            0.0|               12|            2|            1|                   0|                   4|                   0|                   1|                   1|                    1|                    5|               0|               0|               0|               0|               0|                1|                1|               0|               0|               0|               0|               0|                1|                0|                39|             5|             1|               17|            3|            1|               17|            2|            1|               17|            2|            1|          6.67|      0.83|      0.17|             2.0|         0.0|         0.0|         8.82|     2.04|     0.41|           18|        5|        1|                  1.41|                    3|                  0.8|                  1.0|               1.0|                1|              0.0|              1.0|               1.0|                1|              0.0|              1.0|                   2.29|                     7|                  1.76|                   1.0|                2.5|                 3|              0.71|               2.5|                1.0|                 1|               
0.0|               1.0|                     9|                 2|                 0|                           1|                       0|                       0|                    0.82|                           1.0|                0.18|                       0.0|                 0.0|                       0.0|
# |    100|              291|           10|            1|               2|              26|               4|               0|              36|              138|               85|           0|           0|           0|           0|           2|            3|            5|           0|           0|           0|           0|           0|            1|            0|               0.96|           0.03|            0.0|               1.0|               1.0|               1.0|               0.0|              0.95|               0.97|               0.94|           0.0|           0.0|           0.0|           0.0|          0.05|           0.02|           0.06|           0.0|           0.0|           0.0|           0.0|           0.0|           0.01|            0.0|               24|            5|            1|                   1|                   1|                   2|                   0|                   3|                   15|                    2|               0|               0|               0|               0|               2|                2|                1|               0|               0|               0|               0|               0|                1|                0|               131|             9|             1|               37|            8|            1|               63|            6|            1|               66|            6|            1|         34.33|      0.83|      0.17|            15.0|         0.0|         0.0|        52.85|     1.33|     0.41|          138|        3|        1|                  1.38|                   11|                 1.32|                  1.0|              1.33|                3|             0.82|              1.0|               1.0|                1|              0.0|              1.0|                   1.98|                    28|                  3.44|                   1.0|                1.5|                 4|              1.22|               1.0|                1.0|                 1|               
0.0|               1.0|                    48|                 5|                 0|                          37|                       0|                       0|                    0.91|                           1.0|                0.09|                       0.0|                 0.0|                       0.0|
# |   1000|               66|            7|            6|               0|               0|               0|               0|               0|                0|               66|           0|           0|           0|           0|           0|            0|            7|           0|           0|           0|           0|           0|            0|            6|               0.84|           0.09|           0.08|               0.0|               0.0|               0.0|               0.0|               0.0|                0.0|               0.84|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.09|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.08|                2|            1|            1|                   0|                   0|                   0|                   0|                   0|                    0|                    2|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                1|                21|             7|             6|                5|            4|            4|                9|            5|            3|                9|            5|            3|           0.0|       0.0|       0.0|             0.0|         0.0|         0.0|          0.0|      0.0|      0.0|            0|        0|        0|                  1.56|                    2|                 0.53|                  2.0|               1.0|                1|              0.0|              1.0|               1.0|                1|              0.0|              1.0|                   2.33|                    11|                  3.28|                   1.0|                1.4|                 3|              0.89|               1.0|                2.0|                 4|              
1.73|               1.0|                    18|                 7|                 0|                          48|                       0|                       6|                    0.72|                          0.89|                0.28|                       0.0|                 0.0|                      0.11|
# |  10000|              402|           17|            0|              28|             160|              22|              13|              10|               45|              124|           2|           3|           2|           2|           0|            2|            6|           0|           0|           0|           0|           0|            0|            0|               0.96|           0.04|            0.0|              0.93|              0.98|              0.92|              0.87|               1.0|               0.96|               0.95|          0.07|          0.02|          0.08|          0.13|           0.0|           0.04|           0.05|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|               43|            8|            0|                   2|                  12|                   6|                   5|                   4|                    7|                    7|               1|               1|               1|               1|               0|                1|                3|               0|               0|               0|               0|               0|                0|                0|               272|            14|             0|               76|           12|            0|              128|           13|            0|              128|           13|            0|         46.33|      1.83|       0.0|            25.0|         2.0|         0.0|        57.06|     0.98|      0.0|          160|        3|        0|                  1.28|                    4|                 0.61|                  1.0|               1.0|                1|              0.0|              1.0|               0.0|                0|              0.0|              0.0|                   2.13|                    16|                  2.28|                   1.0|               1.08|                 2|              0.28|               1.0|                0.0|                 0|               
0.0|               0.0|                    18|                 4|                 0|                          98|                       2|                       0|                    0.82|                          0.98|                0.18|                      0.02|                 0.0|                       0.0|
# +-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------------+--------------+--------------+-----------------+-------------+-------------+-----------------+-------------+-------------+-----------------+-------------+-------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+---------------------+---------------------+---------------------+------------------+-----------------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+-----------------------+----------------------+----------------------+----------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+
------------------+----------------------+------------------+------------------+----------------------------+------------------------+------------------------+------------------------+------------------------------+--------------------+--------------------------+--------------------+--------------------------+


+-------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+--------

## Save Parquet

In [64]:
# Persist the user feature table as Parquet on the mounted Drive, replacing
# any previous export. The absolute /content/drive prefix matches the paths
# used to read the input CSVs, so the destination no longer depends on the
# working directory. coalesce(50) caps the output at 50 partitions.
user_feature_6.coalesce(50) \
              .write.format("parquet") \
              .mode("overwrite") \
              .save("/content/drive/MyDrive/Colab Notebooks/data/feature_user_new")