In [1]:
# Echo the `spark` object so the cell output shows it; the later cells use it
# as the entry point for reading data (`spark.read`).
# NOTE(review): presumably the SparkSession injected by the PySpark kernel -- confirm.
spark

In [82]:
# Root location (local-filesystem URI) under which every input and output
# path of this notebook is built.
dir0 = 'file:///home/cloudera/2.kkbox_churn/data01/from_raw_transactions-v1/feature_creation/'

# Last path component of the per-run input and output locations.
# NOTE(review): looks like a member-id (msno) range label -- confirm.
file_flag = 'msno_a-n'

# Input locations, assembled from the root plus a fixed sub-folder.
infile_trans = ''.join([dir0, '02b.w_features_added/', file_flag])
infile_period_lens = ''.join([dir0, '21.subscrip-period_intersected_lengths'])

In [5]:
# Transactions input, read as CSV with the first line used as column names.
# No schema is supplied and none is inferred, so columns are read as strings.
df_trans = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load(infile_trans)
)

# Period-length input; 'tid' is aliased to 'tid_' so that the two key columns
# have distinct names once the frames are joined.
df_lens = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load(infile_period_lens)
    .selectExpr('tid AS tid_', 'length', 'is_latest')
)

# joining
tid_matches = df_trans['tid'] == df_lens['tid_']
df0 = (
    df_trans
    .join(df_lens, tid_matches, 'inner')
    .drop('tid_')   # both key columns are dropped after the join
    .drop('tid')
    .withColumnRenamed('length', 'intersected_len')
)
df0.printSchema()

root
 |-- msno: string (nullable = true)
 |-- trans_date: string (nullable = true)
 |-- pay_method: string (nullable = true)
 |-- plan_days: string (nullable = true)
 |-- list_price: string (nullable = true)
 |-- actual_paid: string (nullable = true)
 |-- discount: string (nullable = true)
 |-- paid_per_day: string (nullable = true)
 |-- auto_renew: string (nullable = true)
 |-- intersected_len: string (nullable = true)
 |-- is_latest: string (nullable = true)



## latest behavior

In [9]:
from pyspark.sql.functions import col
# Rows whose 'is_latest' column holds the string '1'.
is_latest_row = col('is_latest') == '1'

# Member id plus the payment/plan columns, each re-exposed under a 'last_' name.
last_exprs = [
    'msno',
    'pay_method AS last_pay_method',
    'plan_days AS last_plan_days',
    'list_price AS last_list_price',
    'actual_paid AS last_actual_paid',
    'discount AS last_discount',
    'paid_per_day AS last_paid_per_day',
]
df_last = df0.where(is_latest_row).selectExpr(*last_exprs)

# output
outfile = ''.join([dir0, '31.features_latest/', file_flag])
df_last.write.format('csv').option('header', 'true').save(outfile)

## categorical 

In [96]:
from pyspark.sql.functions import count
from pyspark.sql.functions import concat_ws, collect_list

# Column summarised by this cell and the two cells after it.
# Alternative values (swap in by hand):
#   'auto_renew', 'discount', 'list_price', 'pay_method', 'plan_days'
field = 'actual_paid'

# Step 1: keep only the member id and the chosen column.
df1 = df0.select('msno', field)

# Step 2: number of rows per (member, value), stored as a string so it can be
# concatenated below.
rows_per_value = count('*').cast('string').alias('count_trans')
df2 = df1.groupBy('msno', field).agg(rows_per_value)

# Step 3: encode each (value, count) pair as 'value:count'.
value_count_pair = concat_ws(':', field, 'count_trans').alias('cat_count')
df3 = df2.select('msno', value_count_pair)

# Step 4: one row per member, all of its pairs joined with '/'.
all_pairs = concat_ws('/', collect_list('cat_count')).alias('cat_counts')
df4 = df3.groupBy('msno').agg(all_pairs)

In [97]:
import random

def find_uniq_cnt_et_mode(cat_counts_str, rng=None):
    """Return the number of distinct categories and the most frequent one.

    Args:
        cat_counts_str: '/'-separated entries of the form 'category:count',
            e.g. 'a:3/b:5'.
        rng: optional object with a ``choice(seq)`` method (for instance a
            seeded ``random.Random``) used to break ties between categories
            that share the highest count. Defaults to the module-level
            ``random`` generator, i.e. the historical behaviour.

    Returns:
        A tuple ``(count_uniq, mode)``: the number of entries and the category
        with the highest count (one of them, picked via ``rng``, on a tie).

    Raises:
        ValueError: if there is more than one entry and a count is not an
            integer literal.
    """
    # Split each entry on its LAST ':' so that a category which itself
    # contains ':' stays intact. An entry with no ':' at all is read as an
    # empty category with that text as its count.
    # NOTE(review): such entries are what concat_ws(':', field, count) yields
    # when the field is null -- confirm against the data.
    parsed = []
    for entry in cat_counts_str.split('/'):
        category, _sep, count_text = entry.rpartition(':')
        parsed.append((category, count_text))

    count_uniq = len(parsed)
    if count_uniq == 1:
        # A lone category is trivially the mode; its count is not inspected.
        return (1, parsed[0][0])

    # Parse every count once and compare as integers.
    counts = [int(count_text) for _category, count_text in parsed]
    max_count = max(counts)
    cats_w_max_cnt = [
        category
        for (category, _count_text), cnt in zip(parsed, counts)
        if cnt == max_count
    ]

    chooser = random if rng is None else rng
    return (count_uniq, chooser.choice(cats_w_max_cnt))

#find_uniq_cnt_et_mode('a:3')    

In [98]:
from pyspark.sql import Row
from pyspark.sql.functions import col

def map_func(row):
    """Turn one row of per-member category counts into a summary Row.

    Reads the 'msno' and 'cat_counts' fields of ``row``, passes the latter to
    ``find_uniq_cnt_et_mode`` and returns a Row with the fields 'msno',
    'cnt_uniq' and 'mode'.
    """
    summary = find_uniq_cnt_et_mode(row['cat_counts'])
    return Row(msno=row['msno'], cnt_uniq=summary[0], mode=summary[1])

# Apply map_func to every row of df4 through the RDD API, convert the result
# back to a DataFrame and prefix the two summary columns with the field name.
summary_cols = [
    col('cnt_uniq').alias(field + '_cnt_uniq'),
    col('mode').alias(field + '_mode'),
]
df5 = df4.rdd.map(map_func).toDF().select('msno', *summary_cols)

# output
outfile = ''.join([dir0, '32.features_categorical/', file_flag, '/', field])
df5.write.format('csv').option('header', 'true').save(outfile)

## continuous