In [1]:
# `spark` is not defined anywhere in this notebook - presumably the SparkSession
# pre-created by the PySpark kernel (TODO confirm). Evaluating it here just
# displays the object as a quick check that it is available.
spark

In [2]:
# Base directory of the feature-creation stage; reused further down when the
# result is written back out.
dir0 = 'file:///home/cloudera/2.kkbox_churn/data01/from_raw_transactions-v1/feature_creation/'
infile = '{}{}'.format(dir0, '20.init-final-et-gaps_for_each_member')

# Read the CSV with its header row. No schema is supplied, so every column is
# loaded as a string (see the printed schema).
df0 = spark.read.csv(infile, header=True)
df0.printSchema()

root
 |-- msno: string (nullable = true)
 |-- init_date: string (nullable = true)
 |-- final_date: string (nullable = true)
 |-- gaps: string (nullable = true)



In [7]:
# Preview the first 40 rows with truncate=False so the full msno and gaps
# values are visible.
df0.show(40, truncate=False)

+--------------------------------------------+----------+----------+-------------------------------------------+
|msno                                        |init_date |final_date|gaps                                       |
+--------------------------------------------+----------+----------+-------------------------------------------+
|++VRDCn5gLo3BcAdq/KFqyn7wP/okNzGVU0yEZ4Ri9k=|2016-12-01|2017-02-28|null                                       |
|++cfKQHwGGB4zdPP19jxYuvB4cS0S6hYzp0NMY/nsOU=|2016-12-01|2017-02-28|null                                       |
|++v7Q4BYjprrVuqZpChNEXHoXWZQSU3HR5hGJo4+sWk=|2016-12-01|2017-02-28|null                                       |
|++y36YWzjZTGJplpAAY19R0s4ktvmUsuq6fCbTv4Duc=|2016-12-01|2017-02-28|null                                       |
|+/OXTCS/xccwbuw/IBoOiO80bmJDJBDECRueoVmTgxs=|2016-12-01|2017-02-28|null                                       |
|+/zCVLTUUN2zy0P/Km0iDLW5d+xi2XSqyNq4++XGwxg=|2016-12-01|2017-02-28|null                        

In [4]:
# Total number of rows in the loaded input.
df0.count()

948512

In [46]:
import datetime as DT

def count_days(start_date, end_date):
    """Return the inclusive number of days from ``start_date`` to ``end_date``.

    Both arguments are strings in ``YYYY-MM-DD`` form. Because both endpoints
    are counted, identical dates give 1. If ``end_date`` precedes
    ``start_date`` the result is zero or negative.
    """
    date_format = '%Y-%m-%d'
    first_day = DT.datetime.strptime(start_date, date_format)
    last_day = DT.datetime.strptime(end_date, date_format)
    # +1 so that the end date itself is included in the count.
    return (last_day - first_day).days + 1


def gap_stats(gaps_str, long_gap_days=30):
    """Summarise a member's encoded list of gaps.

    ``gaps_str`` is a '/'-separated list of gaps, each written as
    ``<start>_<end>`` with both dates in ``YYYY-MM-DD`` form, for example
    ``'2016-11-10_2017-01-08'``.

    Args:
        gaps_str: Encoded gaps, or ``None`` / ``''`` when there are none.
        long_gap_days: A gap is considered "long" when its inclusive length in
            days (as computed by ``count_days``) is strictly greater than this
            value. Defaults to 30.

    Returns:
        A tuple ``(gaps_count, total_gap_length, has_long_gap)``, where
        ``has_long_gap`` is 1 if any gap is long and 0 otherwise.
    """
    # Treat both None and the empty string as "no gaps". Without this guard an
    # empty string would be split into a single empty piece and fail when the
    # end date is looked up.
    if not gaps_str:
        return (0, 0, 0)

    gap_lengths = []
    for piece in gaps_str.split('/'):
        bounds = piece.split('_')
        gap_lengths.append(count_days(bounds[0], bounds[1]))

    has_long_gap = any(length > long_gap_days for length in gap_lengths)
    return (len(gap_lengths), sum(gap_lengths), int(has_long_gap))

In [53]:
from pyspark.sql import Row

def map_func(row):
    """Return a new Row with the input fields plus the gap statistics.

    Copies ``msno``, ``init_date``, ``final_date`` and ``gaps`` from ``row``
    and appends ``gaps_count``, ``total_gap_len`` and ``has_long_gap`` as
    computed by ``gap_stats`` from the ``gaps`` field.
    """
    n_gaps, gap_days, long_flag = gap_stats(row['gaps'])

    # Carry the original columns over unchanged, then add the derived ones.
    fields = {}
    for key in ('msno', 'init_date', 'final_date', 'gaps'):
        fields[key] = row[key]
    fields['gaps_count'] = n_gaps
    fields['total_gap_len'] = gap_days
    fields['has_long_gap'] = long_flag
    return Row(**fields)
    
# Compute the gap features row by row on the underlying RDD, then convert the
# result back to a DataFrame. The final select pins the column order
# explicitly instead of relying on whatever order toDF() yields.
df1 = (
    df0.rdd
    .map(map_func)
    .toDF()
    .select('msno', 'init_date', 'final_date', 'gaps', 'gaps_count', 'total_gap_len', 'has_long_gap')
)

In [54]:
# Spot-check the first rows to see the three derived gap columns.
df1.show(5)

+--------------------+----------+----------+----+----------+-------------+------------+
|                msno| init_date|final_date|gaps|gaps_count|total_gap_len|has_long_gap|
+--------------------+----------+----------+----+----------+-------------+------------+
|++VRDCn5gLo3BcAdq...|2016-12-01|2017-02-28|null|         0|            0|           0|
|++cfKQHwGGB4zdPP1...|2016-12-01|2017-02-28|null|         0|            0|           0|
|++v7Q4BYjprrVuqZp...|2016-12-01|2017-02-28|null|         0|            0|           0|
|++y36YWzjZTGJplpA...|2016-12-01|2017-02-28|null|         0|            0|           0|
|+/OXTCS/xccwbuw/I...|2016-12-01|2017-02-28|null|         0|            0|           0|
+--------------------+----------+----------+----+----------+-------------+------------+
only showing top 5 rows



In [55]:
from pyspark.sql.functions import when, col
# Flag rows whose final_date is on or before 2017-02-26 (1 = expired, 0 = not).
# final_date is a string column, so this is a lexicographic comparison; that
# matches chronological order for zero-padded YYYY-MM-DD values.
df2 = df1.withColumn(
    'has_expired',
    when(col('final_date') <= '2017-02-26', 1).otherwise(0)
)

df2.show(5)

+--------------------+----------+----------+----+----------+-------------+------------+-----------+
|                msno| init_date|final_date|gaps|gaps_count|total_gap_len|has_long_gap|has_expired|
+--------------------+----------+----------+----+----------+-------------+------------+-----------+
|++VRDCn5gLo3BcAdq...|2016-12-01|2017-02-28|null|         0|            0|           0|          0|
|++cfKQHwGGB4zdPP1...|2016-12-01|2017-02-28|null|         0|            0|           0|          0|
|++v7Q4BYjprrVuqZp...|2016-12-01|2017-02-28|null|         0|            0|           0|          0|
|++y36YWzjZTGJplpA...|2016-12-01|2017-02-28|null|         0|            0|           0|          0|
|+/OXTCS/xccwbuw/I...|2016-12-01|2017-02-28|null|         0|            0|           0|          0|
+--------------------+----------+----------+----+----------+-------------+------------+-----------+
only showing top 5 rows



In [50]:
# Number of rows flagged as expired.
df2.where('has_expired = 1').count()

80105

In [51]:
# Number of rows with at least one gap longer than the 30-day threshold used in gap_stats.
df2.where('has_long_gap = 1').count()

63030

In [58]:
# Show one untruncated example of a long-gap row to sanity-check the derived columns.
df2.where('has_long_gap = 1').show(1, truncate=False)

+--------------------------------------------+----------+----------+---------------------+----------+-------------+------------+-----------+
|msno                                        |init_date |final_date|gaps                 |gaps_count|total_gap_len|has_long_gap|has_expired|
+--------------------------------------------+----------+----------+---------------------+----------+-------------+------------+-----------+
|+BRI975tMnCUD6GU4gVUaWCX6KGC5sjuunh3ex4HTs0=|2016-11-10|2017-02-07|2016-11-10_2017-01-08|1         |60           |1           |1          |
+--------------------------------------------+----------+----------+---------------------+----------+-------------+------------+-----------+
only showing top 1 row



In [60]:
# Persist the feature table next to the input, as CSV with a header row.
# No save mode is set, so Spark's default applies.
outfile = '{}{}'.format(dir0, '30.features_1')
df2.write.csv(outfile, header=True)