In [1]:
spark

In [50]:
dir0 = '/home/cloudera/2.kkbox_churn/data01/from_raw_transactions-v1+v2/'

In [51]:
infile = dir0 + '10.effective_subscr_periods'
df0 = spark.read.format('parquet').load(infile)

In [22]:
df0.printSchema()

root
 |-- msno: string (nullable = true)
 |-- tid: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- end_date: string (nullable = true)



In [49]:
########################
df0.where(col('msno').startswith('+k34uNljb84Fx+8Cb')).sort('start_date').show()

+--------------------+----+----------+----------+
|                msno| tid|start_date|  end_date|
+--------------------+----+----------+----------+
|+k34uNljb84Fx+8Cb...|1551|2015-01-14|2015-02-13|
|+k34uNljb84Fx+8Cb...|1556|2015-03-04|2015-04-03|
|+k34uNljb84Fx+8Cb...|1554|2016-09-28|2016-10-27|
|+k34uNljb84Fx+8Cb...|1558|2016-10-27|2016-11-26|
|+k34uNljb84Fx+8Cb...|1555|2016-11-28|2016-12-27|
|+k34uNljb84Fx+8Cb...|1552|2016-12-28|2017-01-27|
|+k34uNljb84Fx+8Cb...|1557|2017-01-28|2017-02-27|
|+k34uNljb84Fx+8Cb...|1553|2017-03-01|2017-03-30|
|+k34uNljb84Fx+8Cb...|1559|2017-04-01|2017-04-30|
+--------------------+----+----------+----------+



# select members

In [52]:
# exclude members with indeterminate periods overlapping [2017-02-01, 2017-03-31]
from pyspark.sql.functions import col

df = df0.where(col('tid').startswith('indet')) \
     .where(col('end_date') >= '2017-02-01') \
     .where(col('start_date') <= '2017-03-31') \
     .selectExpr('msno AS msno_')
df1 = df0.join(df, df0['msno']==df['msno_'], 'left_anti')

In [53]:
# retain only members with a subscription period ending within Feb 2017
df = df1.where(col('end_date') >= '2017-02-01') \
        .where(col('start_date') < '2017-03-01')

df2 = df1.join(df, df1['msno']==df['msno'], 'left_semi')

# output
outfile = dir0 + 'temp'
df2.write.format('parquet').save(outfile)

# find long gap across 2017 Feb and Mar

In [54]:
# input
infile = dir0 + 'temp'
df2 = spark.read.format('parquet').load(infile)

df2.printSchema()

root
 |-- msno: string (nullable = true)
 |-- tid: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- end_date: string (nullable = true)



In [56]:
################################################
df2.where(col('msno').startswith('+k34uNljb84Fx+8Cb')).sort('start_date').show()

+--------------------+----+----------+----------+
|                msno| tid|start_date|  end_date|
+--------------------+----+----------+----------+
|+k34uNljb84Fx+8Cb...|1551|2015-01-14|2015-02-13|
|+k34uNljb84Fx+8Cb...|1556|2015-03-04|2015-04-03|
|+k34uNljb84Fx+8Cb...|1554|2016-09-28|2016-10-27|
|+k34uNljb84Fx+8Cb...|1558|2016-10-27|2016-11-26|
|+k34uNljb84Fx+8Cb...|1555|2016-11-28|2016-12-27|
|+k34uNljb84Fx+8Cb...|1552|2016-12-28|2017-01-27|
|+k34uNljb84Fx+8Cb...|1557|2017-01-28|2017-02-27|
|+k34uNljb84Fx+8Cb...|1553|2017-03-01|2017-03-30|
|+k34uNljb84Fx+8Cb...|1559|2017-04-01|2017-04-30|
+--------------------+----+----------+----------+



In [57]:
from pyspark.sql import Row
from util_time_ranges import find_gaps

def map_func(row):
    msno = row['msno']
    periods = [s.split('_') for s in row['sub_periods']]
    gaps = find_gaps(periods, '2017-02-01', '2017-03-31', min_gap_len=30)
    
    is_churn = False
    for gap in gaps:
        if gap[0] <= '2017-02-28' and gap[1] >= '2017-03-01':
            is_churn = True
            break
        
    return Row(msno=msno, is_churn=is_churn)

In [58]:
from pyspark.sql.functions import col, collect_list, concat_ws

df3 = df2.groupBy('msno').agg(collect_list(concat_ws('_', 'start_date', 'end_date')).alias('sub_periods'))
df4 = df3.rdd.map(map_func).toDF() \
      .select('msno', 'is_churn')

df4.printSchema()

root
 |-- msno: string (nullable = true)
 |-- is_churn: boolean (nullable = true)



In [59]:
# output
outfile = dir0 + '11.is-churn_across-201702-03_members-exp-in-201702'
df4.write.format('parquet').save(outfile)

# compare with "train.csv"

In [60]:
infile = dir0 + '11.is-churn_across-201702-03_members-exp-in-201702'
df4 = spark.read.format('parquet').load(infile)
df4.printSchema()

root
 |-- msno: string (nullable = true)
 |-- is_churn: boolean (nullable = true)



In [61]:
df4.count()

1181333

In [62]:
df4.where('is_churn').count()

38202

In [63]:
infile = '/home/cloudera/2.kkbox_churn/raw_data/train.csv'
df5 = spark.read.format('csv').option('header','true').load(infile) \
      .selectExpr('msno AS msno_', 'is_churn = 1 AS is_churn_')

In [64]:
df5.printSchema()

root
 |-- msno_: string (nullable = true)
 |-- is_churn_: boolean (nullable = true)



In [65]:
df5.show(4)

+--------------------+---------+
|               msno_|is_churn_|
+--------------------+---------+
|waLDQMmcOu2jLDaV1...|     true|
|QA7uiXy8vIbUSPOkC...|     true|
|fGwBva6hikQmTJzrb...|     true|
|mT5V8rEpa+8wuqi6x...|     true|
+--------------------+---------+
only showing top 4 rows



In [66]:
df5.count()

992931

In [67]:
df6 = df4.join(df5, df4['msno']==df5['msno_'], 'inner').drop('msno_')
df6.count()

953553

In [68]:
df6.printSchema()

root
 |-- msno: string (nullable = true)
 |-- is_churn: boolean (nullable = true)
 |-- is_churn_: boolean (nullable = true)



In [69]:
df6.crosstab('is_churn', 'is_churn_').show()

+------------------+------+-----+
|is_churn_is_churn_| false| true|
+------------------+------+-----+
|             false|897347|18013|
|              true|   297|37896|
+------------------+------+-----+



In [71]:
##########################################
df6.where('is_churn').where('NOT is_churn_').show(5)

+--------------------+--------+---------+
|                msno|is_churn|is_churn_|
+--------------------+--------+---------+
|JlOCq1KfgxTTky0ek...|    true|    false|
|qp2ZM62QvCW2lA33s...|    true|    false|
|BYR+BL8wSy5kQP3iR...|    true|    false|
|gBiQxnbw1Z9oZC4Rp...|    true|    false|
|lDYK3CRe6Q3LuKAV0...|    true|    false|
+--------------------+--------+---------+
only showing top 5 rows



# compare with output of churn labeler

In [72]:
infile = '/home/cloudera/2.kkbox_churn/raw_data/churn_labeler_output.csv'
df = spark.read.format('csv').option('header','true').load(infile) \
     .selectExpr('msno AS msno_', 'is_churn = true AS is_churn2')
df7 = df4.join(df, df4['msno']==df['msno_'], 'inner').drop('msno_')
df7.count()

829786

In [73]:
df7.crosstab('is_churn', 'is_churn2').show()

+------------------+------+-----+
|is_churn_is_churn2| false| true|
+------------------+------+-----+
|             false|797681|  839|
|              true|  4220|27046|
+------------------+------+-----+

