In [1]:
# Display the Spark session object. It is not created in any cell shown here,
# so it is presumably predefined by the PySpark kernel — TODO confirm.
spark

In [2]:
# Root directory of the feature-creation step; it ends with '/' so that file
# names can be appended to it directly.
dir0 = '/home/cloudera/2.kkbox_churn/data01/from_raw_transactions-v1/feature_creation/'

# Input: subscription periods (one row per transaction).
infile_subscrip_periods = '{}{}'.format(dir0, '13.members-w-last-end-in-2017_ends-after-20161001')
# Input: per-member [init_date, final_date] range.
infile_init_final = '{}{}'.format(dir0, '20.init-final-et-gaps_for_each_member')



## subscription periods

In [3]:
# Load the subscription periods; no schema is inferred, so every column is
# read as a string.
df_periods = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load(infile_subscrip_periods)
)
df_periods.printSchema()

root
 |-- msno: string (nullable = true)
 |-- tid: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- end_date: string (nullable = true)



In [4]:
# Preview the first three subscription periods.
df_periods.show(n=3)

+--------------------+------------+----------+----------+
|                msno|         tid|start_date|  end_date|
+--------------------+------------+----------+----------+
|++oyL81blqseBfQMo...|111669204178|2016-09-02|2016-10-01|
|++oyL81blqseBfQMo...|120259132531|2016-11-02|2016-12-01|
|++oyL81blqseBfQMo...|137439351298|2016-10-03|2016-11-01|
+--------------------+------------+----------+----------+
only showing top 3 rows



In [5]:
from pyspark.sql.functions import col, concat_ws, collect_list
# Encode every period as the single string 'tid_start_end'.
df1 = df_periods.select(
    'msno',
    concat_ws('_', 'tid', 'start_date', 'end_date').alias('period')
)

# Collapse to one row per member: all of the member's encoded periods,
# joined by '/'.
df_periods_memberwise = (
    df1.groupBy('msno')
       .agg(concat_ws('/', collect_list('period')).alias('subscrip_periods'))
)
df_periods_memberwise.rdd.take(1)

[Row(msno='++4RuqBw0Ss6bQU4oMxaRlbBPoWzoEiIZaxPM04Y4+U=', subscrip_periods='85899821409_2016-10-15_2016-11-13/68720069836_2016-12-15_2017-01-13/761587_2017-02-13_2017-03-13/120259666476_2016-09-14_2016-10-13/60129699746_2016-11-14_2016-12-13/77310103266_2017-01-15_2017-02-13')]

## range to be considered

In [6]:
# Per-member date range. 'gaps' is dropped, and the key is renamed to 'msno_'
# so it can be told apart from 'msno' of the periods table when joining.
df_range = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load(infile_init_final)
    .drop('gaps')
    .withColumnRenamed('msno', 'msno_')
)
df_range.printSchema()

root
 |-- msno_: string (nullable = true)
 |-- init_date: string (nullable = true)
 |-- final_date: string (nullable = true)



In [7]:
# Preview the first three member date ranges.
df_range.show(n=3)

+--------------------+----------+----------+
|               msno_| init_date|final_date|
+--------------------+----------+----------+
|++VRDCn5gLo3BcAdq...|2016-12-01|2017-02-28|
|++cfKQHwGGB4zdPP1...|2016-12-01|2017-02-28|
|++v7Q4BYjprrVuqZp...|2016-12-01|2017-02-28|
+--------------------+----------+----------+
only showing top 3 rows



In [20]:
# Row count of the date-range table, for comparison with the joined result.
df_range.count()

948512

## joining the date range with the per-member periods

In [8]:
# Inner join on the member id, then keep a single key column named 'msno'.
df0 = (
    df_range
    .join(df_periods_memberwise,
          on=df_range['msno_'] == df_periods_memberwise['msno'],
          how='inner')
    .drop('msno')
    .withColumnRenamed('msno_', 'msno')
)
df0.printSchema()

root
 |-- msno: string (nullable = true)
 |-- init_date: string (nullable = true)
 |-- final_date: string (nullable = true)
 |-- subscrip_periods: string (nullable = false)



In [9]:
# write to disk
# 'temp' is a scratch checkpoint of the join result. With Spark's default save
# mode the write raises as soon as the target directory exists, which breaks
# any re-run of this cell; overwriting makes the cell idempotent.
outfile1 = dir0 + 'temp'
df0.write.format('csv').option('header','true').mode('overwrite').save(outfile1)

In [3]:
# read from disk
# Reload the checkpointed join result; all columns come back as strings.
infile1 = dir0 + 'temp'
df0 = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load(infile1)
)
df0.printSchema()

root
 |-- msno: string (nullable = true)
 |-- init_date: string (nullable = true)
 |-- final_date: string (nullable = true)
 |-- subscrip_periods: string (nullable = true)



In [4]:
# Sanity check: row count of the reloaded join result.
df0.count()

948512

In [4]:
# Peek at one joined row to inspect the encoded 'subscrip_periods' string.
df0.rdd.take(1)

[Row(msno='++VRDCn5gLo3BcAdq/KFqyn7wP/okNzGVU0yEZ4Ri9k=', init_date='2016-12-01', final_date='2017-02-28', subscrip_periods='17179888760_2016-09-04_2016-10-03/94489487715_2017-01-05_2017-02-03/85900042891_2016-10-05_2016-11-03/111669836797_2016-12-05_2017-01-03/146029549201_2016-11-04_2016-12-03/51540116595_2017-02-04_2017-03-03')]

## intersect each period with [init, final]

In [16]:
import datetime as DT

def len_of_interval(i, f):
    """Return the number of days in the closed interval [i, f].

    Both arguments are date strings formatted 'YYYY-MM-DD'. The result is
    zero or negative when `f` is earlier than `i`.
    """
    date_format = '%Y-%m-%d'
    first_day = DT.datetime.strptime(i, date_format)
    last_day = DT.datetime.strptime(f, date_format)
    # +1 because both end points belong to the interval.
    return (last_day - first_day).days + 1
    
def len_of_intersection(i1, f1, i2, f2):
    """Return the length in days of the overlap of [i1, f1] and [i2, f2].

    All four arguments are date strings. The result is zero or negative
    when the two intervals do not overlap.
    """
    # The dates are compared as plain strings; this is chronological as long
    # as they are zero-padded 'YYYY-MM-DD'.
    overlap_start = max(i1, i2)
    overlap_end = min(f1, f2)
    return len_of_interval(overlap_start, overlap_end)

def intersect_periods(init_date_str, final_date_str, periods_str):
    """Clip every subscription period to [init_date_str, final_date_str].

    Parameters
    ----------
    init_date_str, final_date_str : str
        Inclusive bounds of the range, formatted 'YYYY-MM-DD'.
    periods_str : str or None
        Periods encoded as 'tid_start_end' items joined by '/'.
        An empty string or None yields an empty result.

    Returns
    -------
    list of [tid, length, is_latest]
        One entry per period that overlaps the range by at least one day.
        `length` is the number of overlapping days. `is_latest` is 1 for the
        overlapping period with the greatest (end, start) pair and 0
        otherwise; periods sharing that exact (start, end) are all flagged.
    """
    # Nothing to intersect for a missing value (the column is nullable) or
    # an empty string; without this guard the parsing below would raise.
    if not periods_str:
        return []

    latest_key = ('0000-00-00', '0000-00-00')  # (end, start), sorts before any real date
    overlapping = []
    for item in periods_str.split('/'):
        fields = item.split('_')
        tid, start, end = fields[0], fields[1], fields[2]
        days = len_of_intersection(init_date_str, final_date_str, start, end)
        if days > 0:
            overlapping.append((tid, start, end, days))
            # Only periods that overlap the range compete for "latest".
            latest_key = max(latest_key, (end, start))

    return [
        [tid, days, 1 if (end, start) == latest_key else 0]
        for tid, start, end, days in overlapping
    ]


#intersect_periods('2016-12-01', '2017-02-28', 
#                  '17179888760_2016-09-04_2016-10-03/94489487715_2017-01-05_2017-02-03/85900042891_2016-10-05_2016-11-03/111669836797_2016-12-05_2017-01-03/146029549201_2016-11-04_2016-12-03/51540116595_2017-02-04_2017-03-03')

In [18]:
from pyspark.sql.functions import expr
from pyspark.sql import Row

def map_func(row):
    """Expand one member row into one Row per period overlapping its range.

    Reads 'msno', 'init_date', 'final_date' and 'subscrip_periods' from `row`
    and returns a list of Rows with fields msno, tid, length and is_latest.
    """
    clipped = intersect_periods(row['init_date'], row['final_date'], row['subscrip_periods'])
    out_rows = []
    for entry in clipped:
        out_rows.append(Row(msno=row['msno'], tid=entry[0], length=entry[1], is_latest=entry[2]))
    return out_rows
    
# One output row per (member, overlapping period); the select pins the
# column order of the result.
df1 = (
    df0.rdd
    .flatMap(map_func)
    .toDF()
    .select('msno', 'tid', 'length', 'is_latest')
)
df1.show(n=10)

+--------------------+------------+------+---------+
|                msno|         tid|length|is_latest|
+--------------------+------------+------+---------+
|++VRDCn5gLo3BcAdq...| 94489487715|    30|        0|
|++VRDCn5gLo3BcAdq...|111669836797|    30|        0|
|++VRDCn5gLo3BcAdq...|146029549201|     3|        0|
|++VRDCn5gLo3BcAdq...| 51540116595|    25|        1|
|++cfKQHwGGB4zdPP1...| 17180183272|    17|        0|
|++cfKQHwGGB4zdPP1...|137439288614|    12|        1|
|++cfKQHwGGB4zdPP1...|120259199484|    30|        0|
|++cfKQHwGGB4zdPP1...| 51539653291|    30|        0|
|++v7Q4BYjprrVuqZp...|120259440005|    30|        0|
|++v7Q4BYjprrVuqZp...|137439153240|    30|        0|
+--------------------+------------+------+---------+
only showing top 10 rows



In [19]:
# output
# Final result of this notebook: intersected length and latest-flag per period.
outfile = dir0 + '21.subscrip-period_intersected_lengths'
(
    df1.write
    .format('csv')
    .option('header', 'true')
    .save(outfile)
)