## Install Spark & Install Packages & Initial Spark

In [27]:
!pip install pyspark==3.0.1 py4j==0.10.9 

Collecting pyspark==3.0.1
[?25l  Downloading https://files.pythonhosted.org/packages/f0/26/198fc8c0b98580f617cb03cb298c6056587b8f0447e20fa40c5b634ced77/pyspark-3.0.1.tar.gz (204.2MB)
[K     |████████████████████████████████| 204.2MB 60kB/s 
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612242 sha256=65eee67960857df659d1f8b13e7cf999243c52e7400c25f6d7209def710b3808
  Stored in directory: /root/.cache/pip/wheels/5e/bd/07/031766ca628adec8435bb40f0bd83bb676ce65ff4007f8e73f
Successfully built pyspark
Installing collected packages: pyspark
  Found existing installation: pyspark 3.0.2
    Uninstalling pyspark-3.0.2:
      Successfully uninstalled pyspark-3.0.2
Successfully installed pyspark-3.0.1


In [3]:
import numpy as np
import pandas as pd 
import warnings
import zipfile
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.font_manager import FontProperties
from pyspark.sql import functions as F
from pyspark.sql import Window

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')

%matplotlib inline

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Feature Engineering(Counts & Ratio)")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

## Read Data

In [4]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [5]:
train = spark.read.option("header",True) \
    .csv("/content/drive/MyDrive/Colab Notebooks/data/data_format1/train_format1.csv")
test = spark.read.option("header",True) \
    .csv("/content/drive/MyDrive/Colab Notebooks/data/data_format1/test_format1.csv")
train_info = spark.read.option("header",True) \
    .csv("/content/drive/MyDrive/Colab Notebooks/data/data_format1/user_info_format1.csv") 
train_log = spark.read.option("header",True) \
    .csv("/content/drive/MyDrive/Colab Notebooks/data/data_format1/user_log_format1.csv") \
    .withColumnRenamed("seller_id", "merchant_id")

In [6]:
train_info_0 = train_info \
    .withColumn("age", F.when(train_info.age_range == 8, 7).otherwise(train_info.age_range)) \
    .fillna({"age": 0, "gender": 2}) \
    .drop("age_range")

In [7]:
train_log_0 = train_log \
            .withColumn("action_type_tmp", F.when(train_log.action_type == 0, 1).otherwise(train_log.action_type)) \
            .drop("action_type") \
            .withColumnRenamed("action_type_tmp", "action_type")

## Show Data

In [None]:
train.show()

+-------+-----------+-----+
|user_id|merchant_id|label|
+-------+-----------+-----+
|  34176|       3906|    0|
|  34176|        121|    0|
|  34176|       4356|    1|
|  34176|       2217|    0|
| 230784|       4818|    0|
| 362112|       2618|    0|
|  34944|       2051|    0|
| 231552|       3828|    1|
| 231552|       2124|    0|
| 232320|       1168|    0|
| 232320|       4270|    0|
| 167040|        671|    0|
| 101760|       1760|    0|
| 298368|       2981|    0|
|  36480|       4730|    0|
| 299136|       2935|    0|
|  37248|       2615|    0|
| 103296|       2482|    0|
| 299904|       1742|    0|
|  38016|       1028|    0|
+-------+-----------+-----+
only showing top 20 rows



In [None]:
train_info_0.show()

+-------+------+---+
|user_id|gender|age|
+-------+------+---+
| 376517|     1|  6|
| 234512|     0|  5|
| 344532|     0|  5|
| 186135|     0|  5|
|  30230|     0|  5|
| 272389|     1|  6|
| 281071|     0|  4|
| 139859|     0|  7|
| 198411|     1|  5|
|  67037|     1|  4|
| 149002|     2|  5|
|   7468|     0|  4|
|  94292|     0|  4|
| 347414|     1|  6|
| 191719|     0|  4|
| 391524|     1|  5|
| 153790|     0|  6|
| 349112|     1|  3|
| 344766|     0|  6|
|  81816|     0|  5|
+-------+------+---+
only showing top 20 rows



In [None]:
train_log.orderBy("merchant_id", "user_id").show()

+-------+-------+------+-----------+--------+----------+-----------+
|user_id|item_id|cat_id|merchant_id|brand_id|time_stamp|action_type|
+-------+-------+------+-----------+--------+----------+-----------+
|    100|1041304|   420|          1|    1662|      1108|          0|
|    100| 472260|   420|          1|    1662|      1108|          0|
|    100| 912479|   993|          1|    1662|      1018|          0|
|    100|  24620|   420|          1|    1662|      1008|          3|
|    100| 912479|   993|          1|    1662|      1016|          0|
|    100|1008023|   629|          1|    1662|      1008|          0|
|    100| 912479|   993|          1|    1662|      1016|          0|
|    100| 918789|   629|          1|    1662|      1008|          0|
|    100| 912479|   993|          1|    1662|      1016|          0|
|    100| 912479|   993|          1|    1662|      1020|          0|
|    100|  83998|   420|          1|    1662|      1108|          0|
|    100|1008023|   629|          

## Feature Engineering (Category Profile)

### Count Features

In [8]:
# columns to rows
from pyspark.sql import functions as F

def dfPivot(df, keys, column):
    '''
    params: 
      df: dataframe
      keys: 待转换表中需要保留的主键key，以list[]类型传入
      column: 待转换的列名
    '''
    # .fillna(-999.0): 行转列时有时对应的列没有值，就会产生null值，fillna会对null做处理，转换为其他值，如不需要可以删除
    return df.groupBy(keys).pivot(column).agg(F.count(column)).fillna(0)

In [9]:
keys = ["cat_id", "time_stamp"]
column = "action_type"

act_cnt_ratio_0 = dfPivot(train_log_0, keys, column) \
            .withColumnRenamed("1", "clk_atc_cnt") \
            .withColumnRenamed("2", "buy_cnt") \
            .withColumnRenamed("3", "atf_cnt")

# +------+----------+-----------+-------+-------+
# |cat_id|time_stamp|clk_atc_cnt|buy_cnt|atf_cnt|
# +------+----------+-----------+-------+-------+
# |     1|      0724|          2|      0|      0|
# |     1|      0611|          1|      1|      0|
# |     1|      1016|          1|      0|      0|
# |     1|      0614|          3|      1|      0|
# |     1|      0918|          1|      0|      0|
# |     1|      0921|          3|      2|      0|
# |     1|      0519|          0|      1|      1|
# |     1|      0818|          1|      0|      0|
# |     1|      0908|          1|      0|      0|
# |     1|      0811|          1|      1|      0|
# +------+----------+-----------+-------+-------+


In [11]:
def monthly_action_counts(profile, profile_id, act_cnt_log):
  # montly filter
  act_cnt_0 = act_cnt_log \
              .withColumn(profile + "_clk_atc_cnt_m5", F.when(( (act_cnt_log.time_stamp >= "0501") & (act_cnt_log.time_stamp <= "0531") ), act_cnt_log.clk_atc_cnt).otherwise(0)) \
              .withColumn(profile + "_clk_atc_cnt_m6", F.when(( (act_cnt_log.time_stamp >= "0601") & (act_cnt_log.time_stamp <= "0630") ), act_cnt_log.clk_atc_cnt).otherwise(0)) \
              .withColumn(profile + "_clk_atc_cnt_m7", F.when(( (act_cnt_log.time_stamp >= "0701") & (act_cnt_log.time_stamp <= "0731") ), act_cnt_log.clk_atc_cnt).otherwise(0)) \
              .withColumn(profile + "_clk_atc_cnt_m8", F.when(( (act_cnt_log.time_stamp >= "0801") & (act_cnt_log.time_stamp <= "0831") ), act_cnt_log.clk_atc_cnt).otherwise(0)) \
              .withColumn(profile + "_clk_atc_cnt_m9", F.when(( (act_cnt_log.time_stamp >= "0901") & (act_cnt_log.time_stamp <= "0930") ), act_cnt_log.clk_atc_cnt).otherwise(0)) \
              .withColumn(profile + "_clk_atc_cnt_m10", F.when(( (act_cnt_log.time_stamp >= "1001") & (act_cnt_log.time_stamp <= "1031") ), act_cnt_log.clk_atc_cnt).otherwise(0)) \
              .withColumn(profile + "_clk_atc_cnt_m11", F.when(( act_cnt_log.time_stamp >= "1101" ), act_cnt_log.clk_atc_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_cnt_m5", F.when(( (act_cnt_log.time_stamp >= "0501") & (act_cnt_log.time_stamp <= "0531") ), act_cnt_log.buy_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_cnt_m6", F.when(( (act_cnt_log.time_stamp >= "0601") & (act_cnt_log.time_stamp <= "0630") ), act_cnt_log.buy_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_cnt_m7", F.when(( (act_cnt_log.time_stamp >= "0701") & (act_cnt_log.time_stamp <= "0731") ), act_cnt_log.buy_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_cnt_m8", F.when(( (act_cnt_log.time_stamp >= "0801") & (act_cnt_log.time_stamp <= "0831") ), act_cnt_log.buy_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_cnt_m9", F.when(( (act_cnt_log.time_stamp >= "0901") & (act_cnt_log.time_stamp <= "0930") ), act_cnt_log.buy_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_cnt_m10", F.when(( (act_cnt_log.time_stamp >= "1001") & (act_cnt_log.time_stamp <= "1031") ), act_cnt_log.buy_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_cnt_m11", F.when(( act_cnt_log.time_stamp >= "1101" ), act_cnt_log.buy_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_cnt_m5", F.when(( (act_cnt_log.time_stamp >= "0501") & (act_cnt_log.time_stamp <= "0531") ), act_cnt_log.atf_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_cnt_m6", F.when(( (act_cnt_log.time_stamp >= "0601") & (act_cnt_log.time_stamp <= "0630") ), act_cnt_log.atf_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_cnt_m7", F.when(( (act_cnt_log.time_stamp >= "0701") & (act_cnt_log.time_stamp <= "0731") ), act_cnt_log.atf_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_cnt_m8", F.when(( (act_cnt_log.time_stamp >= "0801") & (act_cnt_log.time_stamp <= "0831") ), act_cnt_log.atf_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_cnt_m9", F.when(( (act_cnt_log.time_stamp >= "0901") & (act_cnt_log.time_stamp <= "0930") ), act_cnt_log.atf_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_cnt_m10", F.when(( (act_cnt_log.time_stamp >= "1001") & (act_cnt_log.time_stamp <= "1031") ), act_cnt_log.atf_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_cnt_m11", F.when(( act_cnt_log.time_stamp >= "1101" ), act_cnt_log.atf_cnt).otherwise(0))

  act_cnt_1 = act_cnt_0 \
              .groupBy(profile_id) \
              .sum() \
              .withColumnRenamed("sum(clk_atc_cnt)", profile + "_ttl_clk_atc_cnt") \
              .withColumnRenamed("sum(buy_cnt)", profile + "_ttl_buy_cnt") \
              .withColumnRenamed("sum(atf_cnt)", profile + "_ttl_atf_cnt") \
              .withColumnRenamed("sum(" + profile + "_clk_atc_cnt_m5)", profile + "_clk_atc_cnt_m5") \
              .withColumnRenamed("sum(" + profile + "_clk_atc_cnt_m6)", profile + "_clk_atc_cnt_m6") \
              .withColumnRenamed("sum(" + profile + "_clk_atc_cnt_m7)", profile + "_clk_atc_cnt_m7") \
              .withColumnRenamed("sum(" + profile + "_clk_atc_cnt_m8)", profile + "_clk_atc_cnt_m8") \
              .withColumnRenamed("sum(" + profile + "_clk_atc_cnt_m9)", profile + "_clk_atc_cnt_m9") \
              .withColumnRenamed("sum(" + profile + "_clk_atc_cnt_m10)", profile + "_clk_atc_cnt_m10") \
              .withColumnRenamed("sum(" + profile + "_clk_atc_cnt_m11)", profile + "_clk_atc_cnt_m11") \
              .withColumnRenamed("sum(" + profile + "_buy_cnt_m5)", profile + "_buy_cnt_m5") \
              .withColumnRenamed("sum(" + profile + "_buy_cnt_m6)", profile + "_buy_cnt_m6") \
              .withColumnRenamed("sum(" + profile + "_buy_cnt_m7)", profile + "_buy_cnt_m7") \
              .withColumnRenamed("sum(" + profile + "_buy_cnt_m8)", profile + "_buy_cnt_m8") \
              .withColumnRenamed("sum(" + profile + "_buy_cnt_m9)", profile + "_buy_cnt_m9") \
              .withColumnRenamed("sum(" + profile + "_buy_cnt_m10)", profile + "_buy_cnt_m10") \
              .withColumnRenamed("sum(" + profile + "_buy_cnt_m11)", profile + "_buy_cnt_m11") \
              .withColumnRenamed("sum(" + profile + "_atf_cnt_m5)", profile + "_atf_cnt_m5") \
              .withColumnRenamed("sum(" + profile + "_atf_cnt_m6)", profile + "_atf_cnt_m6") \
              .withColumnRenamed("sum(" + profile + "_atf_cnt_m7)", profile + "_atf_cnt_m7") \
              .withColumnRenamed("sum(" + profile + "_atf_cnt_m8)", profile + "_atf_cnt_m8") \
              .withColumnRenamed("sum(" + profile + "_atf_cnt_m9)", profile + "_atf_cnt_m9") \
              .withColumnRenamed("sum(" + profile + "_atf_cnt_m10)", profile + "_atf_cnt_m10") \
              .withColumnRenamed("sum(" + profile + "_atf_cnt_m11)", profile + "_atf_cnt_m11") 
  return act_cnt_1

In [12]:
act_cnt_ratio_1 = monthly_action_counts("c", "cat_id", act_cnt_ratio_0).orderBy("cat_id")

# +-----------+-------------+-------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+
# |merchant_id|m_ttl_clk_cnt|m_ttl_atc_cnt|m_ttl_buy_cnt|m_ttl_atf_cnt|m_clk_cnt_m5|m_clk_cnt_m6|m_clk_cnt_m7|m_clk_cnt_m8|m_clk_cnt_m9|m_clk_cnt_m10|m_clk_cnt_m11|m_atc_cnt_m5|m_atc_cnt_m6|m_atc_cnt_m7|m_atc_cnt_m8|m_atc_cnt_m9|m_atc_cnt_m10|m_atc_cnt_m11|m_buy_cnt_m5|m_buy_cnt_m6|m_buy_cnt_m7|m_buy_cnt_m8|m_buy_cnt_m9|m_buy_cnt_m10|m_buy_cnt_m11|m_atf_cnt_m5|m_atf_cnt_m6|m_atf_cnt_m7|m_atf_cnt_m8|m_atf_cnt_m9|m_atf_cnt_m10|m_atf_cnt_m11|
# +-----------+-------------+-------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+
# |          1|       308236|          444|        17705|        12755|       24285|       25416|       25405|       25070|       35245|        37968|       134847|           5|           0|           3|          32|          15|           45|          344|        2329|        1408|        1320|        1353|        2143|         1558|         7594|        1521|         828|         868|        1220|        1789|         2235|         4294|
# |         10|        19125|           64|         1133|          866|         141|         323|          68|         301|        1407|         2007|        14878|           0|           0|           0|           1|           0|            1|           62|          23|          24|           1|          14|          29|           45|          997|          17|           6|           3|          26|          73|          143|          598|
# |        100|         4055|            7|          538|          181|           0|          23|          23|           7|         494|         1722|         1786|           0|           0|           0|           0|           0|            1|            6|           0|           1|           2|           1|          68|          215|          251|           0|           2|           2|           1|          25|           87|           64|
# |       1000|        10499|           14|          959|          781|         172|         798|         962|        1393|        1009|         1380|         4785|           0|           0|           0|           1|           1|            3|            9|          27|          30|          80|         111|          83|           84|          544|          51|          70|          83|         100|          90|          137|          250|
# |       1001|         2410|            2|          196|          116|         281|         494|         146|         280|         268|          244|          697|           0|           0|           0|           0|           0|            0|            2|          68|          34|          15|           8|          15|           12|           44|          27|          20|          12|          18|           9|           11|           19|
# +-----------+-------------+-------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+


### Ratio Features

In [13]:
def monthly_action_ratio(profile, act_cnt_log):
  monthly_ttl_act_cnt_0 = act_cnt_log \
                         .withColumn("ttl_cnt", sum([act_cnt_log[col] for col in act_cnt_log.columns if profile + "_ttl_" in col])) \
                         .withColumn("ttl_cnt_m5", sum([act_cnt_log[col] for col in act_cnt_log.columns if "_cnt_m5" in col])) \
                         .withColumn("ttl_cnt_m6", sum([act_cnt_log[col] for col in act_cnt_log.columns if "_cnt_m6" in col])) \
                         .withColumn("ttl_cnt_m7", sum([act_cnt_log[col] for col in act_cnt_log.columns if "_cnt_m7" in col])) \
                         .withColumn("ttl_cnt_m8", sum([act_cnt_log[col] for col in act_cnt_log.columns if "_cnt_m8" in col])) \
                         .withColumn("ttl_cnt_m9", sum([act_cnt_log[col] for col in act_cnt_log.columns if "_cnt_m9" in col])) \
                         .withColumn("ttl_cnt_m10", sum([act_cnt_log[col] for col in act_cnt_log.columns if "_cnt_m10" in col])) \
                         .withColumn("ttl_cnt_m11", sum([act_cnt_log[col] for col in act_cnt_log.columns if "_cnt_m11" in col])) 

  act_ratio_0 = monthly_ttl_act_cnt_0 \
                    .withColumn(profile + "_ttl_clk_atc_ratio", F.round(monthly_ttl_act_cnt_0[profile + "_ttl_clk_atc_cnt"] / monthly_ttl_act_cnt_0["ttl_cnt"], 2)) \
                    .withColumn(profile + "_ttl_buy_ratio", F.round(monthly_ttl_act_cnt_0[profile + "_ttl_buy_cnt"] / monthly_ttl_act_cnt_0["ttl_cnt"], 2)) \
                    .withColumn(profile + "_ttl_atf_ratio", F.round(monthly_ttl_act_cnt_0[profile + "_ttl_atf_cnt"] / monthly_ttl_act_cnt_0["ttl_cnt"], 2)) \
                    .withColumn(profile + "_clk_atc_ratio_m5", F.round(monthly_ttl_act_cnt_0[profile + "_clk_atc_cnt_m5"] / monthly_ttl_act_cnt_0["ttl_cnt_m5"], 2)) \
                    .withColumn(profile + "_clk_atc_ratio_m6", F.round(monthly_ttl_act_cnt_0[profile + "_clk_atc_cnt_m6"] / monthly_ttl_act_cnt_0["ttl_cnt_m6"], 2)) \
                    .withColumn(profile + "_clk_atc_ratio_m7", F.round(monthly_ttl_act_cnt_0[profile + "_clk_atc_cnt_m7"] / monthly_ttl_act_cnt_0["ttl_cnt_m7"], 2)) \
                    .withColumn(profile + "_clk_atc_ratio_m8", F.round(monthly_ttl_act_cnt_0[profile + "_clk_atc_cnt_m8"] / monthly_ttl_act_cnt_0["ttl_cnt_m8"], 2)) \
                    .withColumn(profile + "_clk_atc_ratio_m9", F.round(monthly_ttl_act_cnt_0[profile + "_clk_atc_cnt_m9"] / monthly_ttl_act_cnt_0["ttl_cnt_m9"], 2)) \
                    .withColumn(profile + "_clk_atc_ratio_m10", F.round(monthly_ttl_act_cnt_0[profile + "_clk_atc_cnt_m10"] / monthly_ttl_act_cnt_0["ttl_cnt_m10"], 2)) \
                    .withColumn(profile + "_clk_atc_ratio_m11", F.round(monthly_ttl_act_cnt_0[profile + "_clk_atc_cnt_m11"] / monthly_ttl_act_cnt_0["ttl_cnt_m11"], 2)) \
                    .withColumn(profile + "_buy_ratio_m5", F.round(monthly_ttl_act_cnt_0[profile + "_buy_cnt_m5"] / monthly_ttl_act_cnt_0["ttl_cnt_m5"], 2)) \
                    .withColumn(profile + "_buy_ratio_m6", F.round(monthly_ttl_act_cnt_0[profile + "_buy_cnt_m6"] / monthly_ttl_act_cnt_0["ttl_cnt_m6"], 2)) \
                    .withColumn(profile + "_buy_ratio_m7", F.round(monthly_ttl_act_cnt_0[profile + "_buy_cnt_m7"] / monthly_ttl_act_cnt_0["ttl_cnt_m7"], 2)) \
                    .withColumn(profile + "_buy_ratio_m8", F.round(monthly_ttl_act_cnt_0[profile + "_buy_cnt_m8"] / monthly_ttl_act_cnt_0["ttl_cnt_m8"], 2)) \
                    .withColumn(profile + "_buy_ratio_m9", F.round(monthly_ttl_act_cnt_0[profile + "_buy_cnt_m9"] / monthly_ttl_act_cnt_0["ttl_cnt_m9"], 2)) \
                    .withColumn(profile + "_buy_ratio_m10", F.round(monthly_ttl_act_cnt_0[profile + "_buy_cnt_m10"] / monthly_ttl_act_cnt_0["ttl_cnt_m10"], 2)) \
                    .withColumn(profile + "_buy_ratio_m11", F.round(monthly_ttl_act_cnt_0[profile + "_buy_cnt_m11"] / monthly_ttl_act_cnt_0["ttl_cnt_m11"], 2)) \
                    .withColumn(profile + "_atf_ratio_m5", F.round(monthly_ttl_act_cnt_0[profile + "_atf_cnt_m5"] / monthly_ttl_act_cnt_0["ttl_cnt_m5"], 2)) \
                    .withColumn(profile + "_atf_ratio_m6", F.round(monthly_ttl_act_cnt_0[profile + "_atf_cnt_m6"] / monthly_ttl_act_cnt_0["ttl_cnt_m6"], 2)) \
                    .withColumn(profile + "_atf_ratio_m7", F.round(monthly_ttl_act_cnt_0[profile + "_atf_cnt_m7"] / monthly_ttl_act_cnt_0["ttl_cnt_m7"], 2)) \
                    .withColumn(profile + "_atf_ratio_m8", F.round(monthly_ttl_act_cnt_0[profile + "_atf_cnt_m8"] / monthly_ttl_act_cnt_0["ttl_cnt_m8"], 2)) \
                    .withColumn(profile + "_atf_ratio_m9", F.round(monthly_ttl_act_cnt_0[profile + "_atf_cnt_m9"] / monthly_ttl_act_cnt_0["ttl_cnt_m9"], 2)) \
                    .withColumn(profile + "_atf_ratio_m10", F.round(monthly_ttl_act_cnt_0[profile + "_atf_cnt_m10"] / monthly_ttl_act_cnt_0["ttl_cnt_m10"], 2)) \
                    .withColumn(profile + "_atf_ratio_m11", F.round(monthly_ttl_act_cnt_0[profile + "_atf_cnt_m11"] / monthly_ttl_act_cnt_0["ttl_cnt_m11"], 2)) 

  act_ratio_1 = act_ratio_0 \
               .fillna(0) \
               .drop(*["ttl_cnt", "ttl_cnt_m5", "ttl_cnt_m6", "ttl_cnt_m7", "ttl_cnt_m8", "ttl_cnt_m9", "ttl_cnt_m10", "ttl_cnt_m11"])
  return act_ratio_1

In [14]:
act_cnt_ratio_2 = monthly_action_ratio("c", act_cnt_ratio_1).orderBy("cat_id")

### Day Counts Features

In [15]:
act_day_cnt_0 = train_log_0 \
            .groupBy("cat_id", "action_type") \
            .agg(F.countDistinct("time_stamp")) \
            .withColumnRenamed("count(time_stamp)", "day_cnt")

In [16]:
from pyspark.sql import functions as F

def dfPivot_value(df, keys, column, column_value):
    '''
    params: 
      df: dataframe
      keys: 待转换表中需要保留的主键key，以list[]类型传入
      column: 待转换的列名
      column_value_list: 需要成列值
    '''
    # .fillna(-999.0): 行转列时有时对应的列没有值，就会产生null值，fillna会对null做处理，转换为其他值，如不需要可以删除
    return df.groupBy(keys).pivot(column).agg(F.first(column_value, ignorenulls=True)).fillna(0)

In [17]:
keys = "cat_id"
column = "action_type"
column_value = "day_cnt"

act_day_cnt_1 = dfPivot_value(act_day_cnt_0, keys, column, column_value) \
            .withColumnRenamed("1", "c_clk_atc_day_cnt") \
            .withColumnRenamed("2", "c_buy_day_cnt") \
            .withColumnRenamed("3", "c_atf_day_cnt")

In [18]:
from pyspark.sql.types import *
dt_mth = F.udf(lambda x: x[:2], StringType())
train_log_1 = train_log_0 \
        .withColumn("time_stamp_mth", dt_mth("time_stamp")) \
        .select("cat_id", "time_stamp_mth", "time_stamp", "action_type")

act_day_cnt_2 = train_log_1 \
            .groupBy("cat_id", "time_stamp_mth", "action_type") \
            .agg(F.countDistinct("time_stamp")) \
            .withColumnRenamed("count(time_stamp)", "day_cnt") \
            .orderBy("cat_id", "time_stamp_mth")

In [19]:
def monthly_day_counts(profile, profile_id, df):
  day_cnt_0 = df \
              .withColumn(profile + "_clk_atc_day_cnt_m5", F.when(( (df.time_stamp_mth == "05") & (df.action_type == "1") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_clk_atc_day_cnt_m6", F.when(( (df.time_stamp_mth == "06") & (df.action_type == "1") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_clk_atc_day_cnt_m7", F.when(( (df.time_stamp_mth == "07") & (df.action_type == "1") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_clk_atc_day_cnt_m8", F.when(( (df.time_stamp_mth == "08") & (df.action_type == "1") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_clk_atc_day_cnt_m9", F.when(( (df.time_stamp_mth == "09") & (df.action_type == "1") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_clk_atc_day_cnt_m10", F.when(( (df.time_stamp_mth == "10") & (df.action_type == "1") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_clk_atc_day_cnt_m11", F.when(( (df.time_stamp_mth == "11") & (df.action_type == "1") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_day_cnt_m5", F.when(( (df.time_stamp_mth == "05") & (df.action_type == "2") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_day_cnt_m6", F.when(( (df.time_stamp_mth == "06") & (df.action_type == "2") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_day_cnt_m7", F.when(( (df.time_stamp_mth == "07") & (df.action_type == "2") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_day_cnt_m8", F.when(( (df.time_stamp_mth == "08") & (df.action_type == "2") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_day_cnt_m9", F.when(( (df.time_stamp_mth == "09") & (df.action_type == "2") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_day_cnt_m10", F.when(( (df.time_stamp_mth == "10") & (df.action_type == "2") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_day_cnt_m11", F.when(( (df.time_stamp_mth == "11") & (df.action_type == "2") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_day_cnt_m5", F.when(( (df.time_stamp_mth == "05") & (df.action_type == "3") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_day_cnt_m6", F.when(( (df.time_stamp_mth == "06") & (df.action_type == "3") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_day_cnt_m7", F.when(( (df.time_stamp_mth == "07") & (df.action_type == "3") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_day_cnt_m8", F.when(( (df.time_stamp_mth == "08") & (df.action_type == "3") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_day_cnt_m9", F.when(( (df.time_stamp_mth == "09") & (df.action_type == "3") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_day_cnt_m10", F.when(( (df.time_stamp_mth == "10") & (df.action_type == "3") ), df.day_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_day_cnt_m11", F.when(( (df.time_stamp_mth == "11") & (df.action_type == "3") ), df.day_cnt).otherwise(0))

  day_cnt_1 = day_cnt_0 \
            .groupBy(profile_id) \
            .sum() \
            .withColumnRenamed("sum(" + profile + "_clk_atc_day_cnt_m5)", profile + "_clk_atc_day_cnt_m5") \
            .withColumnRenamed("sum(" + profile + "_clk_atc_day_cnt_m6)", profile + "_clk_atc_day_cnt_m6") \
            .withColumnRenamed("sum(" + profile + "_clk_atc_day_cnt_m7)", profile + "_clk_atc_day_cnt_m7") \
            .withColumnRenamed("sum(" + profile + "_clk_atc_day_cnt_m8)", profile + "_clk_atc_day_cnt_m8") \
            .withColumnRenamed("sum(" + profile + "_clk_atc_day_cnt_m9)", profile + "_clk_atc_day_cnt_m9") \
            .withColumnRenamed("sum(" + profile + "_clk_atc_day_cnt_m10)", profile + "_clk_atc_day_cnt_m10") \
            .withColumnRenamed("sum(" + profile + "_clk_atc_day_cnt_m11)", profile + "_clk_atc_day_cnt_m11") \
            .withColumnRenamed("sum(" + profile + "_buy_day_cnt_m5)", profile + "_buy_day_cnt_m5") \
            .withColumnRenamed("sum(" + profile + "_buy_day_cnt_m6)", profile + "_buy_day_cnt_m6") \
            .withColumnRenamed("sum(" + profile + "_buy_day_cnt_m7)", profile + "_buy_day_cnt_m7") \
            .withColumnRenamed("sum(" + profile + "_buy_day_cnt_m8)", profile + "_buy_day_cnt_m8") \
            .withColumnRenamed("sum(" + profile + "_buy_day_cnt_m9)", profile + "_buy_day_cnt_m9") \
            .withColumnRenamed("sum(" + profile + "_buy_day_cnt_m10)", profile + "_buy_day_cnt_m10") \
            .withColumnRenamed("sum(" + profile + "_buy_day_cnt_m11)", profile + "_buy_day_cnt_m11") \
            .withColumnRenamed("sum(" + profile + "_atf_day_cnt_m5)", profile + "_atf_day_cnt_m5") \
            .withColumnRenamed("sum(" + profile + "_atf_day_cnt_m6)", profile + "_atf_day_cnt_m6") \
            .withColumnRenamed("sum(" + profile + "_atf_day_cnt_m7)", profile + "_atf_day_cnt_m7") \
            .withColumnRenamed("sum(" + profile + "_atf_day_cnt_m8)", profile + "_atf_day_cnt_m8") \
            .withColumnRenamed("sum(" + profile + "_atf_day_cnt_m9)", profile + "_atf_day_cnt_m9") \
            .withColumnRenamed("sum(" + profile + "_atf_day_cnt_m10)", profile + "_atf_day_cnt_m10") \
            .withColumnRenamed("sum(" + profile + "_atf_day_cnt_m11)", profile + "_atf_day_cnt_m11") \
            .drop("sum(day_cnt)")
  return day_cnt_1

In [20]:
act_day_cnt_3 = monthly_day_counts("c", "cat_id", act_day_cnt_2).orderBy("cat_id")

In [21]:
cat_feature_0 = act_cnt_ratio_2 \
              .join(act_day_cnt_1, "cat_id", "left") \
              .join(act_day_cnt_3, "cat_id", "left") \
              .orderBy("cat_id")

# +------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+
# |cat_id|c_ttl_clk_atc_cnt|c_ttl_buy_cnt|c_ttl_atf_cnt|c_clk_atc_cnt_m5|c_clk_atc_cnt_m6|c_clk_atc_cnt_m7|c_clk_atc_cnt_m8|c_clk_atc_cnt_m9|c_clk_atc_cnt_m10|c_clk_atc_cnt_m11|c_buy_cnt_m5|c_buy_cnt_m6|c_buy_cnt_m7|c_buy_cnt_m8|c_buy_cnt_m9|c_buy_cnt_m10|c_buy_cnt_m11|c_atf_cnt_m5|c_atf_cnt_m6|c_atf_cnt_m7|c_atf_cnt_m8|c_atf_cnt_m9|c_atf_cnt_m10|c_atf_cnt_m11|c_ttl_clk_atc_ratio|c_ttl_buy_ratio|c_ttl_atf_ratio|c_clk_atc_ratio_m5|c_clk_atc_ratio_m6|c_clk_atc_ratio_m7|c_clk_atc_ratio_m8|c_clk_atc_ratio_m9|c_clk_atc_ratio_m10|c_clk_atc_ratio_m11|c_buy_ratio_m5|c_buy_ratio_m6|c_buy_ratio_m7|c_buy_ratio_m8|c_buy_ratio_m9|c_buy_ratio_m10|c_buy_ratio_m11|c_atf_ratio_m5|c_atf_ratio_m6|c_atf_ratio_m7|c_atf_ratio_m8|c_atf_ratio_m9|c_atf_ratio_m10|c_atf_ratio_m11|c_clk_atc_day_cnt|c_buy_day_cnt|c_atf_day_cnt|c_clk_atc_day_cnt_m5|c_clk_atc_day_cnt_m6|c_clk_atc_day_cnt_m7|c_clk_atc_day_cnt_m8|c_clk_atc_day_cnt_m9|c_clk_atc_day_cnt_m10|c_clk_atc_day_cnt_m11|c_buy_day_cnt_m5|c_buy_day_cnt_m6|c_buy_day_cnt_m7|c_buy_day_cnt_m8|c_buy_day_cnt_m9|c_buy_day_cnt_m10|c_buy_day_cnt_m11|c_atf_day_cnt_m5|c_atf_day_cnt_m6|c_atf_day_cnt_m7|c_atf_day_cnt_m8|c_atf_day_cnt_m9|c_atf_day_cnt_m10|c_atf_day_cnt_m11|
# +------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+
# |     1|              221|           48|           11|               8|              23|              23|              39|              28|               43|               57|           3|           7|           3|           9|          12|            9|            5|           3|           0|           0|           1|           1|            3|            3|               0.79|           0.17|           0.04|              0.57|              0.77|              0.88|               0.8|              0.68|               0.78|               0.88|          0.21|          0.23|          0.12|          0.18|          0.29|           0.16|           0.08|          0.21|           0.0|           0.0|          0.02|          0.02|           0.05|           0.05|               94|           36|            9|                   4|                  14|                  14|                  19|                  15|                   19|                    9|               3|               6|               3|               7|               9|                7|                1|               2|               0|               0|               1|               1|                3|                2|
# |    10|              249|           33|           14|              13|              20|              12|              16|              30|              122|               36|           4|           2|           0|           0|           8|           13|            6|           2|           2|           0|           3|           2|            3|            2|               0.84|           0.11|           0.05|              0.68|              0.83|               1.0|              0.84|              0.75|               0.88|               0.82|          0.21|          0.08|           0.0|           0.0|           0.2|           0.09|           0.14|          0.11|          0.08|           0.0|          0.16|          0.05|           0.02|           0.05|               77|           13|           13|                   7|                  12|                   6|                   9|                  14|                   23|                    6|               1|               1|               0|               0|               3|                7|                1|               2|               2|               0|               2|               2|                3|                2|
# |   100|              410|           55|           40|              21|              32|              35|              48|             119|               49|              106|           1|           3|           4|           3|          13|            9|           22|           2|           1|           8|           1|          18|            4|            6|               0.81|           0.11|           0.08|              0.88|              0.89|              0.74|              0.92|              0.79|               0.79|               0.79|          0.04|          0.08|          0.09|          0.06|          0.09|           0.15|           0.16|          0.08|          0.03|          0.17|          0.02|          0.12|           0.06|           0.04|              117|           31|           30|                   8|                  16|                  14|                  22|                  28|                   19|                   10|               1|               3|               4|               3|              11|                7|                2|               2|               1|               4|               1|              13|                4|                5|
# |  1000|               82|           10|            3|               0|              15|               9|               6|              14|                9|               29|           0|           3|           4|           2|           1|            0|            0|           0|           0|           0|           0|           1|            0|            2|               0.86|           0.11|           0.03|               0.0|              0.83|              0.69|              0.75|              0.88|                1.0|               0.94|           0.0|          0.17|          0.31|          0.25|          0.06|            0.0|            0.0|           0.0|           0.0|           0.0|           0.0|          0.06|            0.0|           0.06|               43|            9|            3|                   0|                   9|                   6|                   3|                   8|                    9|                    8|               0|               2|               4|               2|               1|                0|                0|               0|               0|               0|               0|               1|                0|                2|
# |  1001|              173|            9|            2|               1|              11|              13|              38|              47|               12|               51|           0|           2|           1|           3|           1|            0|            2|           0|           1|           0|           0|           0|            0|            1|               0.94|           0.05|           0.01|               1.0|              0.79|              0.93|              0.93|              0.98|                1.0|               0.94|           0.0|          0.14|          0.07|          0.07|          0.02|            0.0|           0.04|           0.0|          0.07|           0.0|           0.0|           0.0|            0.0|           0.02|               64|            8|            2|                   1|                   8|                   7|                  18|                  15|                    8|                    7|               0|               2|               1|               3|               1|                0|                1|               0|               1|               0|               0|               0|                0|                1|
# |  1002|                1|            0|            0|               0|               1|               0|               0|               0|                0|                0|           0|           0|           0|           0|           0|            0|            0|           0|           0|           0|           0|           0|            0|            0|                1.0|            0.0|            0.0|               0.0|               1.0|               0.0|               0.0|               0.0|                0.0|                0.0|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                1|            0|            0|                   0|                   1|                   0|                   0|                   0|                    0|                    0|               0|               0|               0|               0|               0|                0|                0|               0|               0|               0|               0|               0|                0|                0|
# |  1003|                2|            1|            0|               0|               0|               2|               0|               0|                0|                0|           0|           0|           1|           0|           0|            0|            0|           0|           0|           0|           0|           0|            0|            0|               0.67|           0.33|            0.0|               0.0|               0.0|              0.67|               0.0|               0.0|                0.0|                0.0|           0.0|           0.0|          0.33|           0.0|           0.0|            0.0|            0.0|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                1|            1|            0|                   0|                   0|                   1|                   0|                   0|                    0|                    0|               0|               0|               1|               0|               0|                0|                0|               0|               0|               0|               0|               0|                0|                0|
# |  1004|               58|            1|            0|               0|               0|               0|               0|               0|                0|               58|           0|           0|           0|           0|           0|            0|            1|           0|           0|           0|           0|           0|            0|            0|               0.98|           0.02|            0.0|               0.0|               0.0|               0.0|               0.0|               0.0|                0.0|               0.98|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.02|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                3|            1|            0|                   0|                   0|                   0|                   0|                   0|                    0|                    3|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|
# |  1005|             8232|         1455|          476|             497|            1171|             923|            1346|             987|             1046|             2262|         133|         181|         193|         278|         137|          148|          385|          33|          71|          61|          87|          38|           63|          123|               0.81|           0.14|           0.05|              0.75|              0.82|              0.78|              0.79|              0.85|               0.83|               0.82|           0.2|          0.13|          0.16|          0.16|          0.12|           0.12|           0.14|          0.05|          0.05|          0.05|          0.05|          0.03|           0.05|           0.04|              178|          184|          145|                  14|                  30|                  31|                  31|                  30|                   31|                   11|              21|              30|              31|              31|              30|               31|               10|              13|              23|              26|              27|              17|               28|               11|
# |  1006|              695|           76|           29|              32|              68|             150|             108|              79|              110|              148|           3|           9|          16|           6|           9|           10|           23|           5|           6|           2|           1|           6|            3|            6|               0.87|            0.1|           0.04|               0.8|              0.82|              0.89|              0.94|              0.84|               0.89|               0.84|          0.08|          0.11|           0.1|          0.05|           0.1|           0.08|           0.13|          0.13|          0.07|          0.01|          0.01|          0.06|           0.02|           0.03|              142|           39|           25|                   8|                  17|                  27|                  27|                  24|                   29|                   10|               2|               6|               9|               5|               7|                8|                2|               4|               4|               2|               1|               6|                3|                5|
# +------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+


+------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+---------

### Monthly Aggregation Features

In [22]:
# df = spark.createDataFrame([(1, 2, 3, 4), (1, 4, 100, 5), (20, 30, 50, 10)],['a', 'b', 'c', 'd'])
# df1 = df.withColumn("mean", maximum(*(df.columns[0:4])))
# df1.show()

In [23]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import array, udf, array_sort, floor, col, size, sqrt, greatest
from pyspark.sql import Column

# function that calculates the row-wise average
def ssy_mean(*args):
    def col_(c):
        if isinstance(c, Column):
            return c
        elif isinstance(c, str):
            return col(c)
        else:
            raise TypeError("args should be str or Column, got {}".format(type(c)))

    n = len([col_(x) for x in args])
    avg = sum([col_(x) for x in args]) / n
    return F.round(avg, 2)

# function that calculates the row-wise percentage 
def ssy_percentile(p, *args):
    def col_(c):
        if isinstance(c, Column):
            return c
        elif isinstance(c, str):
            return col(c)
        else:
            raise TypeError("args should be str or Column, got {}".format(type(c)))

    xs = array_sort(array(*[col_(x) for x in args]))
    n = size(xs)
    h = (n - 1) * p
    i = floor(h).cast("int")
    x0, x1 = xs[i], xs[i + 1]
    return F.round((x0 + (h - i) * (x1 - x0)), 2)

# function that calculates the row-wise std
def ssy_std(*args):
    def col_(c):
        if isinstance(c, Column):
            return c
        elif isinstance(c, str):
            return col(c)
        else:
            raise TypeError("args should be str or Column, got {}".format(type(c)))

    n = len([col_(x) for x in args])
    avg = sum([col_(x) for x in args]) / n
    std = sqrt(sum([(col_(x) - avg) ** 2 for x in args]) / (n - 1))
    return F.round(std, 2)

# function that calculates the row-wise maximum
def ssy_maximum(*args):
    def col_(c):
        if isinstance(c, Column):
            return c
        elif isinstance(c, str):
            return col(c)
        else:
            raise TypeError("args should be str or Column, got {}".format(type(c)))

    max = greatest(*[col_(x) for x in args])
    return F.round(max, 2)

In [24]:
def monthly_aggregation(profile, df):
  mth_agg_0 = df \
              .withColumn(profile + "_clk_atc_mean", ssy_mean(*(df.columns[4:10]))) \
              .withColumn(profile + "_buy_mean", ssy_mean(*(df.columns[11:17]))) \
              .withColumn(profile + "_atf_mean", ssy_mean(*(df.columns[18:24]))) \
              .withColumn(profile + "_clk_atc_median", ssy_percentile(0.5, *(df.columns[4:10]))) \
              .withColumn(profile + "_buy_median", ssy_percentile(0.5, *(df.columns[11:17]))) \
              .withColumn(profile + "_atf_median", ssy_percentile(0.5, *(df.columns[18:24]))) \
              .withColumn(profile + "_clk_atc_std", ssy_std(*(df.columns[4:10]))) \
              .withColumn(profile + "_buy_std", ssy_std(*(df.columns[11:17]))) \
              .withColumn(profile + "_atf_std", ssy_std(*(df.columns[18:24]))) \
              .withColumn(profile + "_clk_atc_max", ssy_maximum(*(df.columns[4:10]))) \
              .withColumn(profile + "_buy_max", ssy_maximum(*(df.columns[11:17]))) \
              .withColumn(profile + "_atf_max", ssy_maximum(*(df.columns[18:24])))
  return mth_agg_0

In [25]:
cat_feature_1 = monthly_aggregation("c", cat_feature_0).orderBy("cat_id")

# +------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+
# |cat_id|c_ttl_clk_atc_cnt|c_ttl_buy_cnt|c_ttl_atf_cnt|c_clk_atc_cnt_m5|c_clk_atc_cnt_m6|c_clk_atc_cnt_m7|c_clk_atc_cnt_m8|c_clk_atc_cnt_m9|c_clk_atc_cnt_m10|c_clk_atc_cnt_m11|c_buy_cnt_m5|c_buy_cnt_m6|c_buy_cnt_m7|c_buy_cnt_m8|c_buy_cnt_m9|c_buy_cnt_m10|c_buy_cnt_m11|c_atf_cnt_m5|c_atf_cnt_m6|c_atf_cnt_m7|c_atf_cnt_m8|c_atf_cnt_m9|c_atf_cnt_m10|c_atf_cnt_m11|c_ttl_clk_atc_ratio|c_ttl_buy_ratio|c_ttl_atf_ratio|c_clk_atc_ratio_m5|c_clk_atc_ratio_m6|c_clk_atc_ratio_m7|c_clk_atc_ratio_m8|c_clk_atc_ratio_m9|c_clk_atc_ratio_m10|c_clk_atc_ratio_m11|c_buy_ratio_m5|c_buy_ratio_m6|c_buy_ratio_m7|c_buy_ratio_m8|c_buy_ratio_m9|c_buy_ratio_m10|c_buy_ratio_m11|c_atf_ratio_m5|c_atf_ratio_m6|c_atf_ratio_m7|c_atf_ratio_m8|c_atf_ratio_m9|c_atf_ratio_m10|c_atf_ratio_m11|c_clk_atc_day_cnt|c_buy_day_cnt|c_atf_day_cnt|c_clk_atc_day_cnt_m5|c_clk_atc_day_cnt_m6|c_clk_atc_day_cnt_m7|c_clk_atc_day_cnt_m8|c_clk_atc_day_cnt_m9|c_clk_atc_day_cnt_m10|c_clk_atc_day_cnt_m11|c_buy_day_cnt_m5|c_buy_day_cnt_m6|c_buy_day_cnt_m7|c_buy_day_cnt_m8|c_buy_day_cnt_m9|c_buy_day_cnt_m10|c_buy_day_cnt_m11|c_atf_day_cnt_m5|c_atf_day_cnt_m6|c_atf_day_cnt_m7|c_atf_day_cnt_m8|c_atf_day_cnt_m9|c_atf_day_cnt_m10|c_atf_day_cnt_m11|c_clk_atc_mean|c_buy_mean|c_atf_mean|c_clk_atc_median|c_buy_median|c_atf_median|c_clk_atc_std|c_buy_std|c_atf_std|c_clk_atc_max|c_buy_max|c_atf_max|
# +------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+
# |     1|              221|           48|           11|               8|              23|              23|              39|              28|               43|               57|           3|           7|           3|           9|          12|            9|            5|           3|           0|           0|           1|           1|            3|            3|               0.79|           0.17|           0.04|              0.57|              0.77|              0.88|               0.8|              0.68|               0.78|               0.88|          0.21|          0.23|          0.12|          0.18|          0.29|           0.16|           0.08|          0.21|           0.0|           0.0|          0.02|          0.02|           0.05|           0.05|               94|           36|            9|                   4|                  14|                  14|                  19|                  15|                   19|                    9|               3|               6|               3|               7|               9|                7|                1|               2|               0|               0|               1|               1|                3|                2|         27.33|      7.17|      1.33|            25.5|         8.0|         1.0|         12.6|      3.6|     1.37|           43|       12|        3|
# |    10|              249|           33|           14|              13|              20|              12|              16|              30|              122|               36|           4|           2|           0|           0|           8|           13|            6|           2|           2|           0|           3|           2|            3|            2|               0.84|           0.11|           0.05|              0.68|              0.83|               1.0|              0.84|              0.75|               0.88|               0.82|          0.21|          0.08|           0.0|           0.0|           0.2|           0.09|           0.14|          0.11|          0.08|           0.0|          0.16|          0.05|           0.02|           0.05|               77|           13|           13|                   7|                  12|                   6|                   9|                  14|                   23|                    6|               1|               1|               0|               0|               3|                7|                1|               2|               2|               0|               2|               2|                3|                2|          35.5|       4.5|       2.0|            18.0|         3.0|         2.0|        42.88|     5.13|      1.1|          122|       13|        3|
# |   100|              410|           55|           40|              21|              32|              35|              48|             119|               49|              106|           1|           3|           4|           3|          13|            9|           22|           2|           1|           8|           1|          18|            4|            6|               0.81|           0.11|           0.08|              0.88|              0.89|              0.74|              0.92|              0.79|               0.79|               0.79|          0.04|          0.08|          0.09|          0.06|          0.09|           0.15|           0.16|          0.08|          0.03|          0.17|          0.02|          0.12|           0.06|           0.04|              117|           31|           30|                   8|                  16|                  14|                  22|                  28|                   19|                   10|               1|               3|               4|               3|              11|                7|                2|               2|               1|               4|               1|              13|                4|                5|         50.67|       5.5|      5.67|            41.5|         3.5|         3.0|        35.08|     4.55|     6.59|          119|       13|       18|
# |  1000|               82|           10|            3|               0|              15|               9|               6|              14|                9|               29|           0|           3|           4|           2|           1|            0|            0|           0|           0|           0|           0|           1|            0|            2|               0.86|           0.11|           0.03|               0.0|              0.83|              0.69|              0.75|              0.88|                1.0|               0.94|           0.0|          0.17|          0.31|          0.25|          0.06|            0.0|            0.0|           0.0|           0.0|           0.0|           0.0|          0.06|            0.0|           0.06|               43|            9|            3|                   0|                   9|                   6|                   3|                   8|                    9|                    8|               0|               2|               4|               2|               1|                0|                0|               0|               0|               0|               0|               1|                0|                2|          8.83|      1.67|      0.17|             9.0|         1.5|         0.0|         5.49|     1.63|     0.41|           15|        4|        1|
# |  1001|              173|            9|            2|               1|              11|              13|              38|              47|               12|               51|           0|           2|           1|           3|           1|            0|            2|           0|           1|           0|           0|           0|            0|            1|               0.94|           0.05|           0.01|               1.0|              0.79|              0.93|              0.93|              0.98|                1.0|               0.94|           0.0|          0.14|          0.07|          0.07|          0.02|            0.0|           0.04|           0.0|          0.07|           0.0|           0.0|           0.0|            0.0|           0.02|               64|            8|            2|                   1|                   8|                   7|                  18|                  15|                    8|                    7|               0|               2|               1|               3|               1|                0|                1|               0|               1|               0|               0|               0|                0|                1|         20.33|      1.17|      0.17|            12.5|         1.0|         0.0|        17.93|     1.17|     0.41|           47|        3|        1|
# |  1002|                1|            0|            0|               0|               1|               0|               0|               0|                0|                0|           0|           0|           0|           0|           0|            0|            0|           0|           0|           0|           0|           0|            0|            0|                1.0|            0.0|            0.0|               0.0|               1.0|               0.0|               0.0|               0.0|                0.0|                0.0|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                1|            0|            0|                   0|                   1|                   0|                   0|                   0|                    0|                    0|               0|               0|               0|               0|               0|                0|                0|               0|               0|               0|               0|               0|                0|                0|          0.17|       0.0|       0.0|             0.0|         0.0|         0.0|         0.41|      0.0|      0.0|            1|        0|        0|
# |  1003|                2|            1|            0|               0|               0|               2|               0|               0|                0|                0|           0|           0|           1|           0|           0|            0|            0|           0|           0|           0|           0|           0|            0|            0|               0.67|           0.33|            0.0|               0.0|               0.0|              0.67|               0.0|               0.0|                0.0|                0.0|           0.0|           0.0|          0.33|           0.0|           0.0|            0.0|            0.0|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                1|            1|            0|                   0|                   0|                   1|                   0|                   0|                    0|                    0|               0|               0|               1|               0|               0|                0|                0|               0|               0|               0|               0|               0|                0|                0|          0.33|      0.17|       0.0|             0.0|         0.0|         0.0|         0.82|     0.41|      0.0|            2|        1|        0|
# |  1004|               58|            1|            0|               0|               0|               0|               0|               0|                0|               58|           0|           0|           0|           0|           0|            0|            1|           0|           0|           0|           0|           0|            0|            0|               0.98|           0.02|            0.0|               0.0|               0.0|               0.0|               0.0|               0.0|                0.0|               0.98|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.02|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                3|            1|            0|                   0|                   0|                   0|                   0|                   0|                    0|                    3|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|           0.0|       0.0|       0.0|             0.0|         0.0|         0.0|          0.0|      0.0|      0.0|            0|        0|        0|
# |  1005|             8232|         1455|          476|             497|            1171|             923|            1346|             987|             1046|             2262|         133|         181|         193|         278|         137|          148|          385|          33|          71|          61|          87|          38|           63|          123|               0.81|           0.14|           0.05|              0.75|              0.82|              0.78|              0.79|              0.85|               0.83|               0.82|           0.2|          0.13|          0.16|          0.16|          0.12|           0.12|           0.14|          0.05|          0.05|          0.05|          0.05|          0.03|           0.05|           0.04|              178|          184|          145|                  14|                  30|                  31|                  31|                  30|                   31|                   11|              21|              30|              31|              31|              30|               31|               10|              13|              23|              26|              27|              17|               28|               11|         995.0|    178.33|     58.83|          1016.5|       164.5|        62.0|       286.37|    54.48|    20.32|         1346|      278|       87|
# |  1006|              695|           76|           29|              32|              68|             150|             108|              79|              110|              148|           3|           9|          16|           6|           9|           10|           23|           5|           6|           2|           1|           6|            3|            6|               0.87|            0.1|           0.04|               0.8|              0.82|              0.89|              0.94|              0.84|               0.89|               0.84|          0.08|          0.11|           0.1|          0.05|           0.1|           0.08|           0.13|          0.13|          0.07|          0.01|          0.01|          0.06|           0.02|           0.03|              142|           39|           25|                   8|                  17|                  27|                  27|                  24|                   29|                   10|               2|               6|               9|               5|               7|                8|                2|               4|               4|               2|               1|               6|                3|                5|         91.17|      8.83|      3.83|            93.5|         9.0|         4.0|        40.71|     4.36|     2.14|          150|       16|        6|
# +------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+


+------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+---------

### Recent Features

In [26]:
def recent_action_counts(profile, profile_id, act_cnt_log):
  # montly filter
  act_cnt_0 = act_cnt_log \
              .withColumn(profile + "_clk_atc_cnt_double11", F.when(act_cnt_log.time_stamp == "1111", act_cnt_log.clk_atc_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_cnt_double11", F.when(act_cnt_log.time_stamp == "1111", act_cnt_log.buy_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_cnt_double11", F.when(act_cnt_log.time_stamp == "1111", act_cnt_log.atf_cnt).otherwise(0)) \
              .withColumn(profile + "_clk_atc_cnt_1wpre_double11", F.when(((act_cnt_log.time_stamp >= "1104") & (act_cnt_log.time_stamp <= "1110")), act_cnt_log.clk_atc_cnt).otherwise(0)) \
              .withColumn(profile + "_buy_cnt_1wpre_double11", F.when(((act_cnt_log.time_stamp >= "1104") & (act_cnt_log.time_stamp <= "1110")), act_cnt_log.buy_cnt).otherwise(0)) \
              .withColumn(profile + "_atf_cnt_1wpre_double11", F.when(((act_cnt_log.time_stamp >= "1104") & (act_cnt_log.time_stamp <= "1110")), act_cnt_log.atf_cnt).otherwise(0))

  act_cnt_1 = act_cnt_0 \
              .groupBy(profile_id) \
              .sum() \
              .withColumnRenamed("sum(clk_atc_cnt)", profile + "_ttl_clk_atc_cnt") \
              .withColumnRenamed("sum(buy_cnt)", profile + "_ttl_buy_cnt") \
              .withColumnRenamed("sum(atf_cnt)", profile + "_ttl_atf_cnt") \
              .withColumnRenamed("sum(" + profile + "_clk_atc_cnt_double11)", profile + "_clk_atc_cnt_double11") \
              .withColumnRenamed("sum(" + profile + "_buy_cnt_double11)", profile + "_buy_cnt_double11") \
              .withColumnRenamed("sum(" + profile + "_atf_cnt_double11)", profile + "_atf_cnt_double11") \
              .withColumnRenamed("sum(" + profile + "_clk_atc_cnt_1wpre_double11)", profile + "_clk_atc_cnt_1wpre_double11") \
              .withColumnRenamed("sum(" + profile + "_buy_cnt_1wpre_double11)", profile + "_buy_cnt_1wpre_double11") \
              .withColumnRenamed("sum(" + profile + "_atf_cnt_1wpre_double11)", profile + "_atf_cnt_1wpre_double11") \
              .drop(*["c_ttl_clk_atc_cnt", "c_ttl_buy_cnt", "c_ttl_atf_cnt"])
  return act_cnt_1

In [27]:
cat_feature_2 = cat_feature_1 \
                .join(recent_action_counts("c", "cat_id", act_cnt_ratio_0), "cat_id", "left") \
                .orderBy("cat_id")

In [28]:
def recent_action_ratio(profile, act_cnt_log):
  monthly_ttl_act_cnt_0 = act_cnt_log \
                         .withColumn("ttl_cnt_double11", sum([act_cnt_log[col] for col in act_cnt_log.columns if "_cnt_double11" in col])) \
                         .withColumn("ttl_cnt_1wpre_double11", sum([act_cnt_log[col] for col in act_cnt_log.columns if "_cnt_1wpre_double11" in col])) 

  act_ratio_0 = monthly_ttl_act_cnt_0 \
                    .withColumn(profile + "_clk_atc_ratio_double11", F.round(monthly_ttl_act_cnt_0[profile + "_clk_atc_cnt_double11"] / monthly_ttl_act_cnt_0["ttl_cnt_double11"], 2)) \
                    .withColumn(profile + "_clk_atc_ratio_1wpre_double11", F.round(monthly_ttl_act_cnt_0[profile + "_clk_atc_cnt_1wpre_double11"] / monthly_ttl_act_cnt_0["ttl_cnt_1wpre_double11"], 2)) \
                    .withColumn(profile + "_buy_ratio_double11", F.round(monthly_ttl_act_cnt_0[profile + "_buy_cnt_double11"] / monthly_ttl_act_cnt_0["ttl_cnt_double11"], 2)) \
                    .withColumn(profile + "_buy_ratio_1wpre_double11", F.round(monthly_ttl_act_cnt_0[profile + "_buy_cnt_1wpre_double11"] / monthly_ttl_act_cnt_0["ttl_cnt_1wpre_double11"], 2)) \
                    .withColumn(profile + "_atf_ratio_double11", F.round(monthly_ttl_act_cnt_0[profile + "_atf_cnt_double11"] / monthly_ttl_act_cnt_0["ttl_cnt_double11"], 2)) \
                    .withColumn(profile + "_atf_ratio_1wpre_double11", F.round(monthly_ttl_act_cnt_0[profile + "_atf_cnt_1wpre_double11"] / monthly_ttl_act_cnt_0["ttl_cnt_1wpre_double11"], 2)) \
                    .drop(*["ttl_cnt_double11", "ttl_cnt_1wpre_double11"])

  act_ratio_1 = act_ratio_0 \
               .fillna(0) 
  return act_ratio_1

In [29]:
cat_feature_3 = recent_action_ratio("c", cat_feature_2)

# +------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+------------------+------------------+----------------------------+------------------------+------------------------+------------------------+------------------------------+--------------------+--------------------------+--------------------+--------------------------+
# |cat_id|c_ttl_clk_atc_cnt|c_ttl_buy_cnt|c_ttl_atf_cnt|c_clk_atc_cnt_m5|c_clk_atc_cnt_m6|c_clk_atc_cnt_m7|c_clk_atc_cnt_m8|c_clk_atc_cnt_m9|c_clk_atc_cnt_m10|c_clk_atc_cnt_m11|c_buy_cnt_m5|c_buy_cnt_m6|c_buy_cnt_m7|c_buy_cnt_m8|c_buy_cnt_m9|c_buy_cnt_m10|c_buy_cnt_m11|c_atf_cnt_m5|c_atf_cnt_m6|c_atf_cnt_m7|c_atf_cnt_m8|c_atf_cnt_m9|c_atf_cnt_m10|c_atf_cnt_m11|c_ttl_clk_atc_ratio|c_ttl_buy_ratio|c_ttl_atf_ratio|c_clk_atc_ratio_m5|c_clk_atc_ratio_m6|c_clk_atc_ratio_m7|c_clk_atc_ratio_m8|c_clk_atc_ratio_m9|c_clk_atc_ratio_m10|c_clk_atc_ratio_m11|c_buy_ratio_m5|c_buy_ratio_m6|c_buy_ratio_m7|c_buy_ratio_m8|c_buy_ratio_m9|c_buy_ratio_m10|c_buy_ratio_m11|c_atf_ratio_m5|c_atf_ratio_m6|c_atf_ratio_m7|c_atf_ratio_m8|c_atf_ratio_m9|c_atf_ratio_m10|c_atf_ratio_m11|c_clk_atc_day_cnt|c_buy_day_cnt|c_atf_day_cnt|c_clk_atc_day_cnt_m5|c_clk_atc_day_cnt_m6|c_clk_atc_day_cnt_m7|c_clk_atc_day_cnt_m8|c_clk_atc_day_cnt_m9|c_clk_atc_day_cnt_m10|c_clk_atc_day_cnt_m11|c_buy_day_cnt_m5|c_buy_day_cnt_m6|c_buy_day_cnt_m7|c_buy_day_cnt_m8|c_buy_day_cnt_m9|c_buy_day_cnt_m10|c_buy_day_cnt_m11|c_atf_day_cnt_m5|c_atf_day_cnt_m6|c_atf_day_cnt_m7|c_atf_day_cnt_m8|c_atf_day_cnt_m9|c_atf_day_cnt_m10|c_atf_day_cnt_m11|c_clk_atc_mean|c_buy_mean|c_atf_mean|c_clk_atc_median|c_buy_median|c_atf_median|c_clk_atc_std|c_buy_std|c_atf_std|c_clk_atc_max|c_buy_max|c_atf_max|c_clk_atc_cnt_double11|c_buy_cnt_double11|c_atf_cnt_double11|c_clk_atc_cnt_1wpre_double11|c_buy_cnt_1wpre_double11|c_atf_cnt_1wpre_double11|c_clk_atc_ratio_double11|c_clk_atc_ratio_1wpre_double11|c_buy_ratio_double11|c_buy_ratio_1wpre_double11|c_atf_ratio_double11|c_atf_ratio_1wpre_double11|
# +------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+------------------+------------------+----------------------------+------------------------+------------------------+------------------------+------------------------------+--------------------+--------------------------+--------------------+--------------------------+
# |     1|              221|           48|           11|               8|              23|              23|              39|              28|               43|               57|           3|           7|           3|           9|          12|            9|            5|           3|           0|           0|           1|           1|            3|            3|               0.79|           0.17|           0.04|              0.57|              0.77|              0.88|               0.8|              0.68|               0.78|               0.88|          0.21|          0.23|          0.12|          0.18|          0.29|           0.16|           0.08|          0.21|           0.0|           0.0|          0.02|          0.02|           0.05|           0.05|               94|           36|            9|                   4|                  14|                  14|                  19|                  15|                   19|                    9|               3|               6|               3|               7|               9|                7|                1|               2|               0|               0|               1|               1|                3|                2|         27.33|      7.17|      1.33|            25.5|         8.0|         1.0|         12.6|      3.6|     1.37|           43|       12|        3|                    28|                 5|                 0|                          28|                       0|                       3|                    0.85|                           0.9|                0.15|                       0.0|                 0.0|                       0.1|
# |    10|              249|           33|           14|              13|              20|              12|              16|              30|              122|               36|           4|           2|           0|           0|           8|           13|            6|           2|           2|           0|           3|           2|            3|            2|               0.84|           0.11|           0.05|              0.68|              0.83|               1.0|              0.84|              0.75|               0.88|               0.82|          0.21|          0.08|           0.0|           0.0|           0.2|           0.09|           0.14|          0.11|          0.08|           0.0|          0.16|          0.05|           0.02|           0.05|               77|           13|           13|                   7|                  12|                   6|                   9|                  14|                   23|                    6|               1|               1|               0|               0|               3|                7|                1|               2|               2|               0|               2|               2|                3|                2|          35.5|       4.5|       2.0|            18.0|         3.0|         2.0|        42.88|     5.13|      1.1|          122|       13|        3|                    19|                 6|                 1|                          15|                       0|                       0|                    0.73|                           1.0|                0.23|                       0.0|                0.04|                       0.0|
# |   100|              410|           55|           40|              21|              32|              35|              48|             119|               49|              106|           1|           3|           4|           3|          13|            9|           22|           2|           1|           8|           1|          18|            4|            6|               0.81|           0.11|           0.08|              0.88|              0.89|              0.74|              0.92|              0.79|               0.79|               0.79|          0.04|          0.08|          0.09|          0.06|          0.09|           0.15|           0.16|          0.08|          0.03|          0.17|          0.02|          0.12|           0.06|           0.04|              117|           31|           30|                   8|                  16|                  14|                  22|                  28|                   19|                   10|               1|               3|               4|               3|              11|                7|                2|               2|               1|               4|               1|              13|                4|                5|         50.67|       5.5|      5.67|            41.5|         3.5|         3.0|        35.08|     4.55|     6.59|          119|       13|       18|                    54|                21|                 0|                          45|                       0|                       4|                    0.72|                          0.92|                0.28|                       0.0|                 0.0|                      0.08|
# |  1000|               82|           10|            3|               0|              15|               9|               6|              14|                9|               29|           0|           3|           4|           2|           1|            0|            0|           0|           0|           0|           0|           1|            0|            2|               0.86|           0.11|           0.03|               0.0|              0.83|              0.69|              0.75|              0.88|                1.0|               0.94|           0.0|          0.17|          0.31|          0.25|          0.06|            0.0|            0.0|           0.0|           0.0|           0.0|           0.0|          0.06|            0.0|           0.06|               43|            9|            3|                   0|                   9|                   6|                   3|                   8|                    9|                    8|               0|               2|               4|               2|               1|                0|                0|               0|               0|               0|               0|               1|                0|                2|          8.83|      1.67|      0.17|             9.0|         1.5|         0.0|         5.49|     1.63|     0.41|           15|        4|        1|                    16|                 0|                 0|                           9|                       0|                       1|                     1.0|                           0.9|                 0.0|                       0.0|                 0.0|                       0.1|
# |  1001|              173|            9|            2|               1|              11|              13|              38|              47|               12|               51|           0|           2|           1|           3|           1|            0|            2|           0|           1|           0|           0|           0|            0|            1|               0.94|           0.05|           0.01|               1.0|              0.79|              0.93|              0.93|              0.98|                1.0|               0.94|           0.0|          0.14|          0.07|          0.07|          0.02|            0.0|           0.04|           0.0|          0.07|           0.0|           0.0|           0.0|            0.0|           0.02|               64|            8|            2|                   1|                   8|                   7|                  18|                  15|                    8|                    7|               0|               2|               1|               3|               1|                0|                1|               0|               1|               0|               0|               0|                0|                1|         20.33|      1.17|      0.17|            12.5|         1.0|         0.0|        17.93|     1.17|     0.41|           47|        3|        1|                    27|                 2|                 0|                          24|                       0|                       1|                    0.93|                          0.96|                0.07|                       0.0|                 0.0|                      0.04|
# |  1002|                1|            0|            0|               0|               1|               0|               0|               0|                0|                0|           0|           0|           0|           0|           0|            0|            0|           0|           0|           0|           0|           0|            0|            0|                1.0|            0.0|            0.0|               0.0|               1.0|               0.0|               0.0|               0.0|                0.0|                0.0|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                1|            0|            0|                   0|                   1|                   0|                   0|                   0|                    0|                    0|               0|               0|               0|               0|               0|                0|                0|               0|               0|               0|               0|               0|                0|                0|          0.17|       0.0|       0.0|             0.0|         0.0|         0.0|         0.41|      0.0|      0.0|            1|        0|        0|                     0|                 0|                 0|                           0|                       0|                       0|                     0.0|                           0.0|                 0.0|                       0.0|                 0.0|                       0.0|
# |  1003|                2|            1|            0|               0|               0|               2|               0|               0|                0|                0|           0|           0|           1|           0|           0|            0|            0|           0|           0|           0|           0|           0|            0|            0|               0.67|           0.33|            0.0|               0.0|               0.0|              0.67|               0.0|               0.0|                0.0|                0.0|           0.0|           0.0|          0.33|           0.0|           0.0|            0.0|            0.0|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                1|            1|            0|                   0|                   0|                   1|                   0|                   0|                    0|                    0|               0|               0|               1|               0|               0|                0|                0|               0|               0|               0|               0|               0|                0|                0|          0.33|      0.17|       0.0|             0.0|         0.0|         0.0|         0.82|     0.41|      0.0|            2|        1|        0|                     0|                 0|                 0|                           0|                       0|                       0|                     0.0|                           0.0|                 0.0|                       0.0|                 0.0|                       0.0|
# |  1004|               58|            1|            0|               0|               0|               0|               0|               0|                0|               58|           0|           0|           0|           0|           0|            0|            1|           0|           0|           0|           0|           0|            0|            0|               0.98|           0.02|            0.0|               0.0|               0.0|               0.0|               0.0|               0.0|                0.0|               0.98|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.02|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                3|            1|            0|                   0|                   0|                   0|                   0|                   0|                    0|                    3|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|           0.0|       0.0|       0.0|             0.0|         0.0|         0.0|          0.0|      0.0|      0.0|            0|        0|        0|                    55|                 1|                 0|                           3|                       0|                       0|                    0.98|                           1.0|                0.02|                       0.0|                 0.0|                       0.0|
# |  1005|             8232|         1455|          476|             497|            1171|             923|            1346|             987|             1046|             2262|         133|         181|         193|         278|         137|          148|          385|          33|          71|          61|          87|          38|           63|          123|               0.81|           0.14|           0.05|              0.75|              0.82|              0.78|              0.79|              0.85|               0.83|               0.82|           0.2|          0.13|          0.16|          0.16|          0.12|           0.12|           0.14|          0.05|          0.05|          0.05|          0.05|          0.03|           0.05|           0.04|              178|          184|          145|                  14|                  30|                  31|                  31|                  30|                   31|                   11|              21|              30|              31|              31|              30|               31|               10|              13|              23|              26|              27|              17|               28|               11|         995.0|    178.33|     58.83|          1016.5|       164.5|        62.0|       286.37|    54.48|    20.32|         1346|      278|       87|                  1164|               339|                25|                         895|                      38|                      80|                    0.76|                          0.88|                0.22|                      0.04|                0.02|                      0.08|
# |  1006|              695|           76|           29|              32|              68|             150|             108|              79|              110|              148|           3|           9|          16|           6|           9|           10|           23|           5|           6|           2|           1|           6|            3|            6|               0.87|            0.1|           0.04|               0.8|              0.82|              0.89|              0.94|              0.84|               0.89|               0.84|          0.08|          0.11|           0.1|          0.05|           0.1|           0.08|           0.13|          0.13|          0.07|          0.01|          0.01|          0.06|           0.02|           0.03|              142|           39|           25|                   8|                  17|                  27|                  27|                  24|                   29|                   10|               2|               6|               9|               5|               7|                8|                2|               4|               4|               2|               1|               6|                3|                5|         91.17|      8.83|      3.83|            93.5|         9.0|         4.0|        40.71|     4.36|     2.14|          150|       16|        6|                    91|                22|                 0|                          53|                       1|                       4|                    0.81|                          0.91|                0.19|                      0.02|                 0.0|                      0.07|
# +------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+------------------+------------------+----------------------------+------------------------+------------------------+------------------------+------------------------------+--------------------+--------------------------+--------------------+--------------------------+


+------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+---------

### Repeat Buyer Features

#### Repeat buyer number

In [30]:
def repeat_buy_number(profile, profile_id):
  rpt_byr_0 = train_log_0.filter(train_log_0.action_type == 2) \
          .groupBy(profile_id, "user_id") \
          .agg(F.count("action_type").alias("act_cnt")) \
          .orderBy(profile_id)

  rpt_byr_1 = rpt_byr_0 \
            .filter(rpt_byr_0.act_cnt >= 2) \
            .groupBy(profile_id) \
            .agg(F.countDistinct("user_id").alias(profile + "_rpt_byr_num")) \
            .orderBy(profile_id)
  return rpt_byr_1

In [31]:
cat_feature_4 = cat_feature_3 \
                .join(repeat_buy_number("c", "cat_id"), "cat_id", "left") \
                .orderBy("cat_id")

#### Repeat day number

In [32]:
def repeat_day_number(profile, profile_id):
  rpt_byr_0 = train_log_0.filter(train_log_0.action_type == 2) \
          .groupBy(profile_id, "user_id") \
          .agg(F.count("action_type").alias("act_cnt"), F.countDistinct("time_stamp").alias("day_cnt")) \
          .orderBy(profile_id)

  rpt_byr_1 = rpt_byr_0 \
            .filter(rpt_byr_0.act_cnt >= 2) \
            .groupBy(profile_id) \
            .agg(F.sum("day_cnt").alias(profile + "_rpt_byr_day")) \
            .orderBy(profile_id)
  return rpt_byr_1

In [33]:
cat_feature_5 = cat_feature_4 \
                .join(repeat_day_number("c", "cat_id"), "cat_id", "left") \
                .orderBy("cat_id")

# +------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+------------------+------------------+----------------------------+------------------------+------------------------+------------------------+------------------------------+--------------------+--------------------------+--------------------+--------------------------+-------------+-------------+
# |cat_id|c_ttl_clk_atc_cnt|c_ttl_buy_cnt|c_ttl_atf_cnt|c_clk_atc_cnt_m5|c_clk_atc_cnt_m6|c_clk_atc_cnt_m7|c_clk_atc_cnt_m8|c_clk_atc_cnt_m9|c_clk_atc_cnt_m10|c_clk_atc_cnt_m11|c_buy_cnt_m5|c_buy_cnt_m6|c_buy_cnt_m7|c_buy_cnt_m8|c_buy_cnt_m9|c_buy_cnt_m10|c_buy_cnt_m11|c_atf_cnt_m5|c_atf_cnt_m6|c_atf_cnt_m7|c_atf_cnt_m8|c_atf_cnt_m9|c_atf_cnt_m10|c_atf_cnt_m11|c_ttl_clk_atc_ratio|c_ttl_buy_ratio|c_ttl_atf_ratio|c_clk_atc_ratio_m5|c_clk_atc_ratio_m6|c_clk_atc_ratio_m7|c_clk_atc_ratio_m8|c_clk_atc_ratio_m9|c_clk_atc_ratio_m10|c_clk_atc_ratio_m11|c_buy_ratio_m5|c_buy_ratio_m6|c_buy_ratio_m7|c_buy_ratio_m8|c_buy_ratio_m9|c_buy_ratio_m10|c_buy_ratio_m11|c_atf_ratio_m5|c_atf_ratio_m6|c_atf_ratio_m7|c_atf_ratio_m8|c_atf_ratio_m9|c_atf_ratio_m10|c_atf_ratio_m11|c_clk_atc_day_cnt|c_buy_day_cnt|c_atf_day_cnt|c_clk_atc_day_cnt_m5|c_clk_atc_day_cnt_m6|c_clk_atc_day_cnt_m7|c_clk_atc_day_cnt_m8|c_clk_atc_day_cnt_m9|c_clk_atc_day_cnt_m10|c_clk_atc_day_cnt_m11|c_buy_day_cnt_m5|c_buy_day_cnt_m6|c_buy_day_cnt_m7|c_buy_day_cnt_m8|c_buy_day_cnt_m9|c_buy_day_cnt_m10|c_buy_day_cnt_m11|c_atf_day_cnt_m5|c_atf_day_cnt_m6|c_atf_day_cnt_m7|c_atf_day_cnt_m8|c_atf_day_cnt_m9|c_atf_day_cnt_m10|c_atf_day_cnt_m11|c_clk_atc_mean|c_buy_mean|c_atf_mean|c_clk_atc_median|c_buy_median|c_atf_median|c_clk_atc_std|c_buy_std|c_atf_std|c_clk_atc_max|c_buy_max|c_atf_max|c_clk_atc_cnt_double11|c_buy_cnt_double11|c_atf_cnt_double11|c_clk_atc_cnt_1wpre_double11|c_buy_cnt_1wpre_double11|c_atf_cnt_1wpre_double11|c_clk_atc_ratio_double11|c_clk_atc_ratio_1wpre_double11|c_buy_ratio_double11|c_buy_ratio_1wpre_double11|c_atf_ratio_double11|c_atf_ratio_1wpre_double11|c_rpt_byr_num|c_rpt_byr_day|
# +------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+------------------+------------------+----------------------------+------------------------+------------------------+------------------------+------------------------------+--------------------+--------------------------+--------------------+--------------------------+-------------+-------------+
# |     1|              221|           48|           11|               8|              23|              23|              39|              28|               43|               57|           3|           7|           3|           9|          12|            9|            5|           3|           0|           0|           1|           1|            3|            3|               0.79|           0.17|           0.04|              0.57|              0.77|              0.88|               0.8|              0.68|               0.78|               0.88|          0.21|          0.23|          0.12|          0.18|          0.29|           0.16|           0.08|          0.21|           0.0|           0.0|          0.02|          0.02|           0.05|           0.05|               94|           36|            9|                   4|                  14|                  14|                  19|                  15|                   19|                    9|               3|               6|               3|               7|               9|                7|                1|               2|               0|               0|               1|               1|                3|                2|         27.33|      7.17|      1.33|            25.5|         8.0|         1.0|         12.6|      3.6|     1.37|           43|       12|        3|                    28|                 5|                 0|                          28|                       0|                       3|                    0.85|                           0.9|                0.15|                       0.0|                 0.0|                       0.1|            5|            5|
# |    10|              249|           33|           14|              13|              20|              12|              16|              30|              122|               36|           4|           2|           0|           0|           8|           13|            6|           2|           2|           0|           3|           2|            3|            2|               0.84|           0.11|           0.05|              0.68|              0.83|               1.0|              0.84|              0.75|               0.88|               0.82|          0.21|          0.08|           0.0|           0.0|           0.2|           0.09|           0.14|          0.11|          0.08|           0.0|          0.16|          0.05|           0.02|           0.05|               77|           13|           13|                   7|                  12|                   6|                   9|                  14|                   23|                    6|               1|               1|               0|               0|               3|                7|                1|               2|               2|               0|               2|               2|                3|                2|          35.5|       4.5|       2.0|            18.0|         3.0|         2.0|        42.88|     5.13|      1.1|          122|       13|        3|                    19|                 6|                 1|                          15|                       0|                       0|                    0.73|                           1.0|                0.23|                       0.0|                0.04|                       0.0|            6|            6|
# |   100|              410|           55|           40|              21|              32|              35|              48|             119|               49|              106|           1|           3|           4|           3|          13|            9|           22|           2|           1|           8|           1|          18|            4|            6|               0.81|           0.11|           0.08|              0.88|              0.89|              0.74|              0.92|              0.79|               0.79|               0.79|          0.04|          0.08|          0.09|          0.06|          0.09|           0.15|           0.16|          0.08|          0.03|          0.17|          0.02|          0.12|           0.06|           0.04|              117|           31|           30|                   8|                  16|                  14|                  22|                  28|                   19|                   10|               1|               3|               4|               3|              11|                7|                2|               2|               1|               4|               1|              13|                4|                5|         50.67|       5.5|      5.67|            41.5|         3.5|         3.0|        35.08|     4.55|     6.59|          119|       13|       18|                    54|                21|                 0|                          45|                       0|                       4|                    0.72|                          0.92|                0.28|                       0.0|                 0.0|                      0.08|         null|         null|
# |  1000|               82|           10|            3|               0|              15|               9|               6|              14|                9|               29|           0|           3|           4|           2|           1|            0|            0|           0|           0|           0|           0|           1|            0|            2|               0.86|           0.11|           0.03|               0.0|              0.83|              0.69|              0.75|              0.88|                1.0|               0.94|           0.0|          0.17|          0.31|          0.25|          0.06|            0.0|            0.0|           0.0|           0.0|           0.0|           0.0|          0.06|            0.0|           0.06|               43|            9|            3|                   0|                   9|                   6|                   3|                   8|                    9|                    8|               0|               2|               4|               2|               1|                0|                0|               0|               0|               0|               0|               1|                0|                2|          8.83|      1.67|      0.17|             9.0|         1.5|         0.0|         5.49|     1.63|     0.41|           15|        4|        1|                    16|                 0|                 0|                           9|                       0|                       1|                     1.0|                           0.9|                 0.0|                       0.0|                 0.0|                       0.1|            1|            3|
# |  1001|              173|            9|            2|               1|              11|              13|              38|              47|               12|               51|           0|           2|           1|           3|           1|            0|            2|           0|           1|           0|           0|           0|            0|            1|               0.94|           0.05|           0.01|               1.0|              0.79|              0.93|              0.93|              0.98|                1.0|               0.94|           0.0|          0.14|          0.07|          0.07|          0.02|            0.0|           0.04|           0.0|          0.07|           0.0|           0.0|           0.0|            0.0|           0.02|               64|            8|            2|                   1|                   8|                   7|                  18|                  15|                    8|                    7|               0|               2|               1|               3|               1|                0|                1|               0|               1|               0|               0|               0|                0|                1|         20.33|      1.17|      0.17|            12.5|         1.0|         0.0|        17.93|     1.17|     0.41|           47|        3|        1|                    27|                 2|                 0|                          24|                       0|                       1|                    0.93|                          0.96|                0.07|                       0.0|                 0.0|                      0.04|            1|            2|
# |  1002|                1|            0|            0|               0|               1|               0|               0|               0|                0|                0|           0|           0|           0|           0|           0|            0|            0|           0|           0|           0|           0|           0|            0|            0|                1.0|            0.0|            0.0|               0.0|               1.0|               0.0|               0.0|               0.0|                0.0|                0.0|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                1|            0|            0|                   0|                   1|                   0|                   0|                   0|                    0|                    0|               0|               0|               0|               0|               0|                0|                0|               0|               0|               0|               0|               0|                0|                0|          0.17|       0.0|       0.0|             0.0|         0.0|         0.0|         0.41|      0.0|      0.0|            1|        0|        0|                     0|                 0|                 0|                           0|                       0|                       0|                     0.0|                           0.0|                 0.0|                       0.0|                 0.0|                       0.0|         null|         null|
# |  1003|                2|            1|            0|               0|               0|               2|               0|               0|                0|                0|           0|           0|           1|           0|           0|            0|            0|           0|           0|           0|           0|           0|            0|            0|               0.67|           0.33|            0.0|               0.0|               0.0|              0.67|               0.0|               0.0|                0.0|                0.0|           0.0|           0.0|          0.33|           0.0|           0.0|            0.0|            0.0|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                1|            1|            0|                   0|                   0|                   1|                   0|                   0|                    0|                    0|               0|               0|               1|               0|               0|                0|                0|               0|               0|               0|               0|               0|                0|                0|          0.33|      0.17|       0.0|             0.0|         0.0|         0.0|         0.82|     0.41|      0.0|            2|        1|        0|                     0|                 0|                 0|                           0|                       0|                       0|                     0.0|                           0.0|                 0.0|                       0.0|                 0.0|                       0.0|         null|         null|
# |  1004|               58|            1|            0|               0|               0|               0|               0|               0|                0|               58|           0|           0|           0|           0|           0|            0|            1|           0|           0|           0|           0|           0|            0|            0|               0.98|           0.02|            0.0|               0.0|               0.0|               0.0|               0.0|               0.0|                0.0|               0.98|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|           0.02|           0.0|           0.0|           0.0|           0.0|           0.0|            0.0|            0.0|                3|            1|            0|                   0|                   0|                   0|                   0|                   0|                    0|                    3|               0|               0|               0|               0|               0|                0|                1|               0|               0|               0|               0|               0|                0|                0|           0.0|       0.0|       0.0|             0.0|         0.0|         0.0|          0.0|      0.0|      0.0|            0|        0|        0|                    55|                 1|                 0|                           3|                       0|                       0|                    0.98|                           1.0|                0.02|                       0.0|                 0.0|                       0.0|         null|         null|
# |  1005|             8232|         1455|          476|             497|            1171|             923|            1346|             987|             1046|             2262|         133|         181|         193|         278|         137|          148|          385|          33|          71|          61|          87|          38|           63|          123|               0.81|           0.14|           0.05|              0.75|              0.82|              0.78|              0.79|              0.85|               0.83|               0.82|           0.2|          0.13|          0.16|          0.16|          0.12|           0.12|           0.14|          0.05|          0.05|          0.05|          0.05|          0.03|           0.05|           0.04|              178|          184|          145|                  14|                  30|                  31|                  31|                  30|                   31|                   11|              21|              30|              31|              31|              30|               31|               10|              13|              23|              26|              27|              17|               28|               11|         995.0|    178.33|     58.83|          1016.5|       164.5|        62.0|       286.37|    54.48|    20.32|         1346|      278|       87|                  1164|               339|                25|                         895|                      38|                      80|                    0.76|                          0.88|                0.22|                      0.04|                0.02|                      0.08|          234|          363|
# |  1006|              695|           76|           29|              32|              68|             150|             108|              79|              110|              148|           3|           9|          16|           6|           9|           10|           23|           5|           6|           2|           1|           6|            3|            6|               0.87|            0.1|           0.04|               0.8|              0.82|              0.89|              0.94|              0.84|               0.89|               0.84|          0.08|          0.11|           0.1|          0.05|           0.1|           0.08|           0.13|          0.13|          0.07|          0.01|          0.01|          0.06|           0.02|           0.03|              142|           39|           25|                   8|                  17|                  27|                  27|                  24|                   29|                   10|               2|               6|               9|               5|               7|                8|                2|               4|               4|               2|               1|               6|                3|                5|         91.17|      8.83|      3.83|            93.5|         9.0|         4.0|        40.71|     4.36|     2.14|          150|       16|        6|                    91|                22|                 0|                          53|                       1|                       4|                    0.81|                          0.91|                0.19|                      0.02|                 0.0|                      0.07|           16|           26|
# +------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+--------------+----------+----------+----------------+------------+------------+-------------+---------+---------+-------------+---------+---------+----------------------+------------------+------------------+----------------------------+------------------------+------------------------+------------------------+------------------------------+--------------------+--------------------------+--------------------+--------------------------+-------------+-------------+


+------+-----------------+-------------+-------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+------------+------------+------------+------------+------------+-------------+-------------+------------+------------+------------+------------+------------+-------------+-------------+-------------------+---------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+-----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+----------------+----------------+---------

## Gathering

In [None]:
cat_feature_5.show(10)

## Save Parquet

In [36]:
cat_feature_5.coalesce(50) \
              .write.format("parquet") \
              .mode("overwrite") \
              .save("./drive/MyDrive/Colab Notebooks/data/feature_cat_new")