In [1]:
# Evaluate the `spark` session object so the notebook displays it. It is not
# created anywhere in this notebook -- presumably the PySpark kernel/shell
# predefines it (TODO confirm); it is used below as `spark.read`.
spark

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Input transactions CSV (has a header row).
infile = '/home/cloudera/Desktop/KKBox_churn_predict/raw_data/transactions.csv'

# Explicit schema: `msno` is a nullable string, every remaining column is a
# nullable integer. The list keeps the columns in their declared order.
int_columns = [
    "payment_method",
    "plan_days",
    "list_price",
    "actual_paid",
    "is_auto_renew",
    "trans_date",
    "exp_date",
    "is_cancel",
]
schema = StructType(
    [StructField("msno", StringType(), True)]
    + [StructField(name, IntegerType(), True) for name in int_columns]
)

# Read with the explicit schema instead of letting Spark infer the types.
df0 = (
    spark.read
    .schema(schema)
    .option('header', 'true')
    .format('csv')
    .load(infile)
)
df0.printSchema()

root
 |-- msno: string (nullable = true)
 |-- payment_method: integer (nullable = true)
 |-- plan_days: integer (nullable = true)
 |-- list_price: integer (nullable = true)
 |-- actual_paid: integer (nullable = true)
 |-- is_auto_renew: integer (nullable = true)
 |-- trans_date: integer (nullable = true)
 |-- exp_date: integer (nullable = true)
 |-- is_cancel: integer (nullable = true)



In [3]:
# cleaning -- a row is kept only if it passes all three rules:
#   1. trans_date is 20151101 or later (dates are yyyymmdd integers in the
#      sample rows shown below);
#   2. plan_days is non-zero, or the row is a cancellation (is_cancel = 1);
#   3. the row is not a cancellation, or it is flagged is_auto_renew = 1.
df1 = (
    df0
    .filter('trans_date >= 20151101')
    .filter('plan_days != 0 OR is_cancel = 1')
    .filter('is_cancel = 0 OR is_auto_renew = 1')
)
df1.show(3)

+--------------------+--------------+---------+----------+-----------+-------------+----------+--------+---------+
|                msno|payment_method|plan_days|list_price|actual_paid|is_auto_renew|trans_date|exp_date|is_cancel|
+--------------------+--------------+---------+----------+-----------+-------------+----------+--------+---------+
|FT4moGxOj6tzwkTSA...|            34|       30|       149|        149|            1|  20151130|20151231|        0|
|z1s1E/gm6xiwjNb8T...|            34|       30|       149|        149|            1|  20151130|20151231|        0|
|lZyYiuAJW3qzDnicN...|            34|       30|       149|        149|            1|  20151130|20151231|        0|
+--------------------+--------------+---------+----------+-----------+-------------+----------+--------+---------+
only showing top 3 rows



In [4]:
# Import the functions module under an alias rather than
# `from pyspark.sql.functions import count, sum`: that form rebinds the name
# `sum` and silently shadows Python's built-in `sum` in every later cell.
from pyspark.sql import functions as F

# One row per member (msno):
#   count            -- number of cleaned transactions for the member
#   count_auto_renew -- sum of the is_auto_renew flag; this equals the number
#                       of auto-renew transactions assuming the flag only
#                       takes the values 0/1 (as in the sample rows).
df2 = df1.groupBy('msno').agg(
    F.count('*').alias('count'),
    F.sum('is_auto_renew').alias('count_auto_renew'),
)
df2.show(3)

+--------------------+-----+----------------+
|                msno|count|count_auto_renew|
+--------------------+-----+----------------+
|DQH8jGMrXq1FgWyr+...|   16|              16|
|jENtJsb8LoDxqKMpG...|   16|              16|
|5F7G3pHKf5ijGQpoK...|   16|              16|
+--------------------+-----+----------------+
only showing top 3 rows



In [5]:
# Total number of members after cleaning: df2 holds one row per msno because
# it was produced by groupBy('msno'). Used later as the denominator when
# reporting a cohort's share of all members.
Nmem = df2.count()
Nmem

1944388

# Members who used auto-renewal for every transaction

In [7]:
# Cohort: members whose auto-renew transaction count equals their total
# transaction count, i.e. every kept transaction was auto-renewed.
df3 = df2.filter('count = count_auto_renew')
df3.show(3)

+--------------------+-----+----------------+
|                msno|count|count_auto_renew|
+--------------------+-----+----------------+
|DQH8jGMrXq1FgWyr+...|   16|              16|
|jENtJsb8LoDxqKMpG...|   16|              16|
|5F7G3pHKf5ijGQpoK...|   16|              16|
+--------------------+-----+----------------+
only showing top 3 rows



In [10]:
# Size of the current cohort (rows of df3 = members) ...
n = df3.count()
print(n)
# ... and its share of all members (Nmem), printed as a percentage.
print(n/Nmem*100, '%')

1170236
60.18531280793751 %


In [11]:
# Keep only the transactions (rows of df1) whose member appears in df3.
# A left-semi join returns df1's columns only and never duplicates df1 rows.
#
# Join on the column *name* instead of `df1['msno'] == df3['msno']`: df3 is
# derived from df1 (groupBy -> agg -> where), so both column objects refer to
# the same underlying attribute and the expression form only yields a real
# equi-join thanks to Spark's automatic self-join disambiguation.
df4 = df1.join(df3, on='msno', how='left_semi')
df4.printSchema()

root
 |-- msno: string (nullable = true)
 |-- payment_method: integer (nullable = true)
 |-- plan_days: integer (nullable = true)
 |-- list_price: integer (nullable = true)
 |-- actual_paid: integer (nullable = true)
 |-- is_auto_renew: integer (nullable = true)
 |-- trans_date: integer (nullable = true)
 |-- exp_date: integer (nullable = true)
 |-- is_cancel: integer (nullable = true)



In [12]:
# Destination (local filesystem) for the transactions of the all-renew cohort.
outfile = 'file:///home/cloudera/Desktop/KKBox_churn_predict/data01/transactions_v1_members-all-renew.csv'

# Collapse to a single partition before writing, then save as CSV with a
# header row.
(
    df4.coalesce(1)
    .write
    .option('header', 'true')
    .format('csv')
    .save(outfile)
)

# Members who never used auto-renewal in any transaction

In [6]:
# Cohort: members with zero auto-renew transactions among their kept
# transactions.
df3 = df2.filter('count_auto_renew = 0')
df3.show(3)

+--------------------+-----+----------------+
|                msno|count|count_auto_renew|
+--------------------+-----+----------------+
|m4Wf4EZuJJODjisMW...|   12|               0|
|H9q5fhYo6VRVqGDxw...|    8|               0|
|2+BjzPtI4IIyT5Aps...|    7|               0|
+--------------------+-----+----------------+
only showing top 3 rows



In [7]:
# Keep only the transactions (rows of df1) whose member appears in df3.
# A left-semi join returns df1's columns only and never duplicates df1 rows.
#
# Join on the column *name* instead of `df1['msno'] == df3['msno']`: df3 is
# derived from df1 (groupBy -> agg -> where), so both column objects refer to
# the same underlying attribute and the expression form only yields a real
# equi-join thanks to Spark's automatic self-join disambiguation.
df4 = df1.join(df3, on='msno', how='left_semi')
df4.printSchema()

root
 |-- msno: string (nullable = true)
 |-- payment_method: integer (nullable = true)
 |-- plan_days: integer (nullable = true)
 |-- list_price: integer (nullable = true)
 |-- actual_paid: integer (nullable = true)
 |-- is_auto_renew: integer (nullable = true)
 |-- trans_date: integer (nullable = true)
 |-- exp_date: integer (nullable = true)
 |-- is_cancel: integer (nullable = true)



In [8]:
# Preview the first 10 transaction rows of the selected cohort (df4).
df4.show(10)

+--------------------+--------------+---------+----------+-----------+-------------+----------+--------+---------+
|                msno|payment_method|plan_days|list_price|actual_paid|is_auto_renew|trans_date|exp_date|is_cancel|
+--------------------+--------------+---------+----------+-----------+-------------+----------+--------+---------+
|++7jKYbuIJPXry8Oh...|            38|       10|         0|          0|            0|  20160312|20160322|        0|
|+0RJtbyhoPAHPa+34...|            32|      410|      1788|       1788|            0|  20160114|20170227|        0|
|+3tRPOto/e58mdDA6...|            38|       30|       149|        149|            0|  20160118|20160217|        0|
|+43UqHvqzu5STDVIu...|            38|       10|         0|          0|            0|  20151226|20170228|        0|
|+43UqHvqzu5STDVIu...|            32|      400|      1599|       1599|            0|  20151208|20170119|        0|
|+43UqHvqzu5STDVIu...|            38|       10|         0|          0|          

In [10]:
# Number of transaction rows in df4 (transactions, not members: df4 is a
# row-filtered view of df1).
df4.count()

1821142

In [9]:
# Destination (local filesystem) for the transactions of the no-renew cohort.
outfile = 'file:///home/cloudera/Desktop/KKBox_churn_predict/data01/transactions_v1_members-no-renew.csv'

# coalesce(1) collapses the data to one partition so the export is written the
# same way as the all-renew and partial-renew exports in this notebook, which
# both coalesce before writing; this cell previously omitted it.
df4.coalesce(1).write.format('csv').option('header', 'true').save(outfile)

In [5]:
# Members who used auto-renewal for some, but not all, transactions:
# at least one auto-renew transaction and fewer auto-renew transactions than
# total transactions. The result is persisted so later actions on df3 can
# reuse it.
df3 = (
    df2
    .filter('count_auto_renew < count')
    .filter('count_auto_renew > 0')
    .persist()
)
df3.show(10)

+--------------------+-----+----------------+
|                msno|count|count_auto_renew|
+--------------------+-----+----------------+
|VbCNHUZZoCRjoCu3y...|    8|               6|
|T92xj92xZjD3k41Vu...|   12|              11|
|ULmIjp5BmUC3PX7S4...|   15|               9|
|dV9KSnGQmobtPwKDE...|    6|               5|
|RQ3KgSJH/pz9GUvha...|    6|               5|
|zsz/HPr9dDEuCGTns...|    5|               3|
|UKw0drb/2YPnnwddd...|   16|              15|
|K7UtuHqM7tT9njbB/...|    4|               2|
|4ZDIdd3OSGSLz2bPd...|   14|               9|
|DhwRAg/4TwLcGRkxO...|    6|               3|
+--------------------+-----+----------------+
only showing top 10 rows



In [6]:
# Keep only the transactions (rows of df1) whose member appears in df3.
# A left-semi join returns df1's columns only and never duplicates df1 rows.
#
# Join on the column *name* instead of `df1['msno'] == df3['msno']`: df3 is
# derived from df1 (groupBy -> agg -> where), so both column objects refer to
# the same underlying attribute and the expression form only yields a real
# equi-join thanks to Spark's automatic self-join disambiguation.
df4 = df1.join(df3, on='msno', how='left_semi')
df4.printSchema()

root
 |-- msno: string (nullable = true)
 |-- payment_method: integer (nullable = true)
 |-- plan_days: integer (nullable = true)
 |-- list_price: integer (nullable = true)
 |-- actual_paid: integer (nullable = true)
 |-- is_auto_renew: integer (nullable = true)
 |-- trans_date: integer (nullable = true)
 |-- exp_date: integer (nullable = true)
 |-- is_cancel: integer (nullable = true)



In [7]:
# Destination (local filesystem) for the transactions of the partial-renew
# cohort.
outfile = 'file:///home/cloudera/Desktop/KKBox_churn_predict/data01/transactions_v1_members-partial-renew.csv'

# Collapse to a single partition before writing, then save as CSV with a
# header row.
(
    df4.coalesce(1)
    .write
    .option('header', 'true')
    .format('csv')
    .save(outfile)
)