In [1]:
# Echo the SparkSession handle to confirm it is available.
# NOTE(review): `spark` is not defined in this notebook; presumably the PySpark kernel injects it -- confirm.
spark

In [2]:
# Base directory of the feature-creation data.
# Alternative location, kept for reference:
#   '/home/master/iii_projects_data/kkbox_churn/data01/from_raw_transactions_v1/feature_creation/'
dir0 = '/home/cloudera/2.kkbox_churn/data01/from_raw_transactions-v1/feature_creation/'
infile = dir0 + '13.members-w-last-end-in-2017_ends-after-20161001'

# Read the CSV (first line is the header) and preview a single row.
csv_reader = spark.read.format('csv').option('header', 'true')
df0 = csv_reader.load(infile)
df0.show(1)

+--------------------+------------+----------+----------+
|                msno|         tid|start_date|  end_date|
+--------------------+------------+----------+----------+
|++oyL81blqseBfQMo...|111669204178|2016-09-02|2016-10-01|
+--------------------+------------+----------+----------+
only showing top 1 row



In [3]:
from pyspark.sql.functions import concat_ws

# Fuse each row's start and end date into one 'start_end' token named `period`.
period_col = concat_ws('_', 'start_date', 'end_date').alias('period')
df1 = df0.select('msno', period_col)
df1.show(1, truncate=False)

+--------------------------------------------+---------------------+
|msno                                        |period               |
+--------------------------------------------+---------------------+
|++oyL81blqseBfQMoxZgHD1yxxHclr+8pc61dVPsoRc=|2016-09-02_2016-10-01|
+--------------------------------------------+---------------------+
only showing top 1 row



In [4]:
from pyspark.sql.functions import concat_ws, collect_list, col
# One row per member: all of that member's periods joined with '/'.
# collect_list gives no ordering guarantee, so the periods may appear in any order.
periods_per_member = concat_ws('/', collect_list('period')).alias('subscrip_periods')
df2 = df1.groupBy('msno').agg(periods_per_member)

# Spot-check a single member, selected by msno prefix.
df2.where(col('msno').startswith('fF1YY7d')).show(truncate=False)

+--------------------------------------------+-----------------------------------------------------------------+
|msno                                        |subscrip_periods                                                 |
+--------------------------------------------+-----------------------------------------------------------------+
|fF1YY7dQ4AjyMvfSi53Xu7Kdr6+U2Cgv4032gvmCF5Q=|2016-12-02_2016-12-31/2016-09-02_2016-10-01/2016-10-11_2017-02-07|
+--------------------------------------------+-----------------------------------------------------------------+



In [5]:
import datetime as DT

def add_one_day(date_str):
    '''
    Return the calendar day following `date_str`.
    Input and output are both 'YYYY-MM-DD' strings.
    '''
    parsed = DT.datetime.strptime(date_str, '%Y-%m-%d')
    next_day = parsed + DT.timedelta(days=1)
    return next_day.strftime('%Y-%m-%d')


def subtract_one_day(date_str):
    '''
    Return the calendar day preceding `date_str`.
    Input and output are both 'YYYY-MM-DD' strings.
    '''
    parsed = DT.datetime.strptime(date_str, '%Y-%m-%d')
    previous_day = parsed - DT.timedelta(days=1)
    return previous_day.strftime('%Y-%m-%d')


def count_days(start_date, end_date):
    '''
    Count the days in the closed interval [start_date, end_date].
    Both arguments are 'YYYY-MM-DD' strings; a one-day interval
    (start_date == end_date) counts as 1.
    '''
    date_format = '%Y-%m-%d'
    first = DT.datetime.strptime(start_date, date_format)
    last = DT.datetime.strptime(end_date, date_format)
    return (last - first).days + 1
    

def take_union(date_intervals):
    '''
    Merge closed date intervals into their union.

    `date_intervals` is a sequence of items whose first two elements are the
    start and end dates (inclusive) as 'YYYY-MM-DD' strings.  Overlapping
    intervals are merged, and so are intervals that touch (one ends the day
    before the next starts).  Returns a list of [start, end] pairs
    (inclusive), ordered by start date.
    '''
    # Sweep-line bookkeeping: +1 on the day an interval opens, -1 on the day
    # after it closes (the first day no longer covered by it).
    deltas = {}
    for interval in date_intervals:
        opening = interval[0]
        deltas[opening] = deltas.get(opening, 0) + 1

        closing = add_one_day(interval[1])
        deltas[closing] = deltas.get(closing, 0) - 1

    merged = []
    depth = 0          # number of intervals covering the current day
    run_start = None
    for day in sorted(deltas):
        before = depth
        depth += deltas[day]
        if before == 0 and depth > 0:
            # Coverage begins.
            run_start = day
        elif before > 0 and depth == 0:
            # Coverage stopped on `day`; the last covered day is the one before.
            merged.append([run_start, subtract_one_day(day)])

    return merged
    
    
def find_gaps(periods, init_date, final_date, min_gap_len):
    '''
    Find gaps within [init_date, final_date] that are no shorter than `min_gap_len` days.

    periods     -- iterable of [start, end] pairs of 'YYYY-MM-DD' strings,
                   both ends inclusive.  It is left unmodified.
    init_date   -- first day of the window, 'YYYY-MM-DD'.
    final_date  -- last day of the window, 'YYYY-MM-DD'.
    min_gap_len -- minimum length, in days, for a gap to be reported.

    Returns a list of [gap_start, gap_end] pairs (inclusive), ordered by date.
    '''
    # Sentinel intervals cover everything outside the window, so an uncovered
    # stretch at either edge of the window shows up as a gap between two union
    # intervals, and nothing outside the window can be reported.
    # A new list is built on purpose: `periods += [...]` would extend the
    # caller's list in place and leak the sentinels into the caller's data.
    padded_periods = list(periods) + [
        ['1970-01-01', subtract_one_day(init_date)],
        [add_one_day(final_date), '2099-12-31'],
    ]
    union_periods = take_union(padded_periods)

    gaps = []
    # Each pair of consecutive union intervals encloses exactly one gap.
    for earlier, later in zip(union_periods, union_periods[1:]):
        a_gap = [add_one_day(earlier[1]), subtract_one_day(later[0])]
        if count_days(a_gap[0], a_gap[1]) >= min_gap_len:
            gaps.append(a_gap)

    return gaps


def determin_init_final_et_gaps(subscrip_periods_str, cutoff_date='2017-02-28',
                                window_days=90, min_gap_len=5):
    '''
    Determine init. date and final date to be considered for feature generation.
    Also, find gaps within [init_date, final_date] that are no shorter than
    `min_gap_len` days.

    subscrip_periods_str -- periods encoded as 'start_end/start_end/...', each
                            date a zero-padded 'YYYY-MM-DD' string.
    cutoff_date          -- latest allowed final date (default '2017-02-28').
    window_days          -- length of [init_date, final_date] in days, both
                            ends included (default 90, i.e. final_date - 89 days).
    min_gap_len          -- minimum gap length in days (default 5).

    Return (init_date, final_date, gaps).
    '''
    subscrip_periods = [p.split('_') for p in subscrip_periods_str.split('/')]

    # Final date: the latest end date, capped at the cutoff.  Comparing the
    # strings directly is valid only because the dates are zero-padded
    # 'YYYY-MM-DD', for which lexicographic order equals chronological order.
    final_date = min(max(p[1] for p in subscrip_periods), cutoff_date)

    # Init. date: chosen so that the window spans exactly `window_days` days.
    init_date = (DT.datetime.strptime(final_date, '%Y-%m-%d')
                 - DT.timedelta(days=window_days - 1)).strftime('%Y-%m-%d')

    gaps = find_gaps(subscrip_periods, init_date, final_date, min_gap_len)

    return (init_date, final_date, gaps)


In [7]:
# Sanity check on a hand-written example whose latest end date (2017-03-07)
# lies after 2017-02-28.
subscrip_periods = '2016-12-02_2016-12-31/2016-09-02_2016-10-01/2016-10-11_2017-03-07'
determin_init_final_et_gaps(subscrip_periods)

('2016-12-01', '2017-02-28', [])

In [8]:
from pyspark.sql import Row

def map_func(row):
    '''
    Turn one row with fields `msno` and `subscrip_periods` into a Row holding
    msno, init_date, final_date and gaps, where gaps is encoded as
    'start_end/start_end/...' (an empty string when there are no gaps).
    '''
    init_date, final_date, gap_list = determin_init_final_et_gaps(row['subscrip_periods'])
    encoded_gaps = '/'.join('_'.join(gap) for gap in gap_list)
    return Row(
        msno=row['msno'],
        init_date=init_date,
        final_date=final_date,
        gaps=encoded_gaps,
    )

# Compute init/final dates and gaps per member.  The explicit select pins the
# column order of the resulting DataFrame.
member_rows = df2.rdd.map(map_func)
df3 = member_rows.toDF().select('msno', 'init_date', 'final_date', 'gaps')

In [9]:
# output
outfile = dir0 + '20.init-final-et-gaps_for_each_member'
df3.write.format('csv').option('header', 'true').save(outfile)