In [1]:
spark

In [4]:
dir0 = 'file:///home/cloudera/2.kkbox_churn/data01/'

In [9]:
infile = dir0 + 'transactions-v1_members-no-renew_cl3'
df0_ = spark.read.format('csv').option('header', 'true').load(infile)

df0_.printSchema()

root
 |-- msno: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- list_price: string (nullable = true)
 |-- actual_paid: string (nullable = true)
 |-- plan_days: string (nullable = true)
 |-- trans_date: string (nullable = true)
 |-- exp_date: string (nullable = true)
 |-- start_date: string (nullable = true)



In [13]:
df0 = df0_.selectExpr('msno', 
                      'CAST(payment_method AS int)',
                      'CAST(list_price AS int)',
                      'CAST(actual_paid AS int)',
                      'CAST(plan_days AS int)',
                      'CAST(trans_date AS date)',
                      'CAST(start_date AS date)',
                      'CAST(exp_date AS date)',
                     )
df0.printSchema()

root
 |-- msno: string (nullable = true)
 |-- payment_method: integer (nullable = true)
 |-- list_price: integer (nullable = true)
 |-- actual_paid: integer (nullable = true)
 |-- plan_days: integer (nullable = true)
 |-- trans_date: date (nullable = true)
 |-- start_date: date (nullable = true)
 |-- exp_date: date (nullable = true)



In [14]:
df0.show(5)

+--------------------+--------------+----------+-----------+---------+----------+----------+----------+
|                msno|payment_method|list_price|actual_paid|plan_days|trans_date|start_date|  exp_date|
+--------------------+--------------+----------+-----------+---------+----------+----------+----------+
|nuNWXAG/T41m2CTNT...|            38|       149|        149|       30|2016-01-25|2016-01-25|2016-02-24|
|nuNWXAG/T41m2CTNT...|            38|       149|        149|       30|2016-06-09|2016-06-10|2016-07-09|
|nuNWXAG/T41m2CTNT...|            38|       149|        149|       30|2015-11-20|2015-11-21|2015-12-20|
|nuNWXAG/T41m2CTNT...|            38|       149|        149|       30|2016-04-03|2016-04-04|2016-05-03|
|nuNWXAG/T41m2CTNT...|            38|       149|        149|       30|2017-02-13|2017-02-16|2017-03-15|
+--------------------+--------------+----------+-----------+---------+----------+----------+----------+
only showing top 5 rows



In [15]:
Nrec = df0.count()
print(Nrec)

1816863


In [18]:
# check
df0.where('start_date > exp_date').count()  # should be 0

0

In [27]:
from pyspark.sql.functions import when, col
df1 = df0.select('msno',
                 when(col('trans_date') < col('start_date'), col('trans_date')) \
                   .otherwise(col('start_date')) \
                   .alias('active_begin_date'),
                 'exp_date'
                )
df1.show(5)

+--------------------+-----------------+----------+
|                msno|active_begin_date|  exp_date|
+--------------------+-----------------+----------+
|nuNWXAG/T41m2CTNT...|       2016-01-25|2016-02-24|
|nuNWXAG/T41m2CTNT...|       2016-06-09|2016-07-09|
|nuNWXAG/T41m2CTNT...|       2015-11-20|2015-12-20|
|nuNWXAG/T41m2CTNT...|       2016-04-03|2016-05-03|
|nuNWXAG/T41m2CTNT...|       2017-02-13|2017-03-15|
+--------------------+-----------------+----------+
only showing top 5 rows



In [30]:
from pyspark.sql.functions import concat_ws, col
df2 = df1.select('msno', 
                 concat_ws('_', col('active_begin_date').cast('string'), col('exp_date').cast('string')) \
                   .alias('period')
                )
df2.show(5, truncate=False)

+--------------------------------------------+---------------------+
|msno                                        |period               |
+--------------------------------------------+---------------------+
|nuNWXAG/T41m2CTNT+WvM/m/Cg8zKaeC4xFvwfRzvPY=|2016-01-25_2016-02-24|
|nuNWXAG/T41m2CTNT+WvM/m/Cg8zKaeC4xFvwfRzvPY=|2016-06-09_2016-07-09|
|nuNWXAG/T41m2CTNT+WvM/m/Cg8zKaeC4xFvwfRzvPY=|2015-11-20_2015-12-20|
|nuNWXAG/T41m2CTNT+WvM/m/Cg8zKaeC4xFvwfRzvPY=|2016-04-03_2016-05-03|
|nuNWXAG/T41m2CTNT+WvM/m/Cg8zKaeC4xFvwfRzvPY=|2017-02-13_2017-03-15|
+--------------------------------------------+---------------------+
only showing top 5 rows



In [37]:
from pyspark.sql.functions import concat_ws, collect_list
df3 = df2.groupBy('msno') \
         .agg(concat_ws('|', collect_list('period')).alias('active_periods'))

In [38]:
df3.rdd.take(5)

[Row(msno='++7jKYbuIJPXry8Oh1NcEh9fCsqcQgUaaxXsgG15kMg=', active_periods='2016-03-12_2016-03-22'),
 Row(msno='+0RJtbyhoPAHPa+34MkYcE2Ox0cjMgJOTXMXVBYgkJE=', active_periods='2016-01-13_2017-02-27'),
 Row(msno='+3tRPOto/e58mdDA6oluFztHfwlqjJeNXb1vmYQjWfw=', active_periods='2016-01-18_2016-02-17'),
 Row(msno='+43UqHvqzu5STDVIuBLWryllqYYP4UrG9XyiajeGePk=', active_periods='2015-12-26_2017-02-28|2015-12-08_2017-01-19|2015-12-26_2017-01-29|2015-12-26_2017-02-18|2015-12-26_2017-02-08'),
 Row(msno='+4dMilwe46UsxpXXH0O7/mF3g+5fn4MAF2VTWPNzuoo=', active_periods='2016-03-05_2016-04-04')]

In [41]:
outfile = dir0 + 'active_periods'
df3.coalesce(1).write.format('csv').option('header','false').save(outfile)