```
input: transactions_370k

Find the correspondence between actual-paid and plan-days. If more than one value of 'plan-days' is found for a value of 'actual-paid', choose the one that has the largest number of records. Only non-cancelled records (is_cancel = 0) with transaction_date < 20160101, plan_days > 0 and actual_paid > 0 are considered.

output: 'master' local: /home/master/iii_projects_data/kkbox_churn/data01/004.plan-days_vs_actual-paid.csv
```

In [1]:
# Echo the SparkSession so the cell output shows that a session is available.
# NOTE(review): `spark` is not created anywhere in this notebook — presumably
# it is injected by the PySpark kernel; confirm before running elsewhere.
spark

In [2]:
# Location of the raw transactions extract (local filesystem, CSV with a header row).
infile = 'file:///home/cloudera/Desktop/KKBox_churn_predict/data01/transactions_370k.csv'

# Configure the reader once: the first line holds column names and column
# types are inferred from the data instead of defaulting to string.
csv_reader = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
)
df0 = csv_reader.load(infile)

In [3]:
# Print the schema inferred from the CSV to check column names and types.
df0.printSchema()

root
 |-- msno: string (nullable = true)
 |-- payment_method_id: integer (nullable = true)
 |-- payment_plan_days: integer (nullable = true)
 |-- plan_list_price: integer (nullable = true)
 |-- actual_amount_paid: integer (nullable = true)
 |-- is_auto_renew: integer (nullable = true)
 |-- transaction_date: integer (nullable = true)
 |-- membership_expire_date: integer (nullable = true)
 |-- is_cancel: integer (nullable = true)
 |-- ID: string (nullable = true)



In [4]:
# Keep only the columns needed for the analysis, giving the long ones shorter
# aliases that the following cells refer to.
renamed_columns = [
    'msno',
    'transaction_date as trans_date',
    'payment_plan_days AS plan_days',
    'actual_amount_paid AS actual_paid',
    'is_cancel',
]
df1 = df0.selectExpr(*renamed_columns)

In [5]:
# Preview the first 20 rows of the selected/renamed columns.
df1.show(20)

+--------------------+----------+---------+-----------+---------+
|                msno|trans_date|plan_days|actual_paid|is_cancel|
+--------------------+----------+---------+-----------+---------+
|+/namlXq+u3izRjHC...|  20150831|        0|        149|        0|
|+/namlXq+u3izRjHC...|  20170228|       30|        149|        0|
|+/namlXq+u3izRjHC...|  20161031|       30|        149|        0|
|+/namlXq+u3izRjHC...|  20160531|       30|        149|        0|
|+/namlXq+u3izRjHC...|  20151231|       30|        149|        0|
|+/namlXq+u3izRjHC...|  20170131|       30|        149|        0|
|+/namlXq+u3izRjHC...|  20150930|        0|        149|        0|
|+/namlXq+u3izRjHC...|  20160430|       30|        149|        0|
|+/namlXq+u3izRjHC...|  20151031|        0|        149|        0|
|+/namlXq+u3izRjHC...|  20160831|       30|        149|        0|
|+/namlXq+u3izRjHC...|  20160331|       30|        149|        0|
|+/namlXq+u3izRjHC...|  20151130|       30|        149|        0|
|+/namlXq+

In [6]:
from pyspark.sql.functions import col

# Count records per (actual_paid, plan_days) pair, keeping only rows that are
# not cancellations, have trans_date below 20160101, and have positive
# plan_days and actual_paid.
# trans_date is an integer column (yyyymmdd), so it is compared with an integer
# literal rather than a string to avoid relying on implicit type coercion.
df2 = (
    df1
    .where('is_cancel = 0')
    .where(col('trans_date') < 20160101)
    .where('plan_days > 0')
    .where('actual_paid > 0')
    .groupBy('actual_paid', 'plan_days')
    .count()
    # plan_days as a secondary key keeps the display order stable when several
    # plan_days values share the same actual_paid.
    .sort('actual_paid', 'plan_days')
)

df2.show(50)

+-----------+---------+-------+
|actual_paid|plan_days|  count|
+-----------+---------+-------+
|         10|        2|      1|
|         15|        3|      1|
|         35|        7|    770|
|         50|       10|     29|
|         99|       30|  24797|
|        100|       30|   2147|
|        105|       21|     48|
|        119|       30|  67883|
|        127|       30|     17|
|        129|       30|  97136|
|        131|       30|     22|
|        134|       30|    394|
|        149|       35|     54|
|        149|       30|1477207|
|        149|       31| 395702|
|        150|       30|  63400|
|        210|       30|     15|
|        265|       60|      1|
|        300|       60|    751|
|        350|       70|     73|
|        400|       80|     45|
|        447|       90|    281|
|        450|       90|    172|
|        480|      100|   1256|
|        500|       30|      1|
|        500|      100|     95|
|        536|      180|   2439|
|        596|      120|      1|
|       

In [7]:
def to_keyed_pair(row):
    """Turn one df2 row into (actual_paid, (count, plan_days)).

    Column 0 of df2 is actual_paid (first groupBy column); the record count is
    placed first in the value tuple so pairs can be compared by count.
    """
    return (row[0], (row['count'], row['plan_days']))


# Drop down to the RDD API and key every aggregated row by actual_paid.
rdd2 = df2.rdd
rdd3 = rdd2.map(to_keyed_pair)

rdd3.take(20)

[(10, (1, 2)),
 (15, (1, 3)),
 (35, (770, 7)),
 (50, (29, 10)),
 (99, (24797, 30)),
 (100, (2147, 30)),
 (105, (48, 21)),
 (119, (67883, 30)),
 (127, (17, 30)),
 (129, (97136, 30)),
 (131, (22, 30)),
 (134, (394, 30)),
 (149, (1477207, 30)),
 (149, (54, 35)),
 (149, (395702, 31)),
 (150, (63400, 30)),
 (210, (15, 30)),
 (265, (1, 60)),
 (300, (751, 60)),
 (350, (73, 70))]

In [8]:
# For each actual_paid key keep the (count, plan_days) pair with the highest
# record count. Tuples compare element by element, so max() selects the larger
# count and, when counts are equal, the larger plan_days — ties therefore
# resolve the same way on every run instead of depending on partition order.
rdd4 = rdd3.reduceByKey(max)

In [9]:
# Discard the record count so each element becomes (actual_paid, plan_days),
# then order the pairs by actual_paid.
rdd5 = rdd4.mapValues(lambda count_and_days: count_and_days[1]).sortByKey()

rdd5.take(15)

[(10, 2),
 (15, 3),
 (35, 7),
 (50, 10),
 (99, 30),
 (100, 30),
 (105, 21),
 (119, 30),
 (127, 30),
 (129, 30),
 (131, 30),
 (134, 30),
 (149, 30),
 (150, 30),
 (210, 30)]

In [10]:
# Convert the pair RDD back to a DataFrame. No column names are supplied here,
# so the schema printed below shows the default names Spark assigned.
df5 = spark.createDataFrame(rdd5)
df5.printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: long (nullable = true)



In [11]:
# Preview the (actual_paid, plan_days) pairs; the columns are still named
# _1 and _2 at this point.
df5.show()

+---+---+
| _1| _2|
+---+---+
| 10|  2|
| 15|  3|
| 35|  7|
| 50| 10|
| 99| 30|
|100| 30|
|105| 21|
|119| 30|
|127| 30|
|129| 30|
|131| 30|
|134| 30|
|149| 30|
|150| 30|
|210| 30|
|265| 60|
|300| 60|
|350| 70|
|400| 80|
|447| 90|
+---+---+
only showing top 20 rows



In [14]:
# Replace the positional column names with meaningful ones:
# _1 holds the key (actual_paid) and _2 the selected plan_days.
df6 = df5.selectExpr(
    "_1 AS actual_paid",
    "_2 AS plan_days",
)

In [15]:
# Write the mapping as CSV with a header row; coalesce(1) collapses the data
# to one partition so a single part file is produced.
# mode('overwrite') lets the notebook be re-run: with the default save mode the
# write raises an error when 'result.csv' already exists.
# Note: Spark creates a directory named 'result.csv' that contains the part file.
df6.coalesce(1).write.format('csv').option('header','true').mode('overwrite').save('result.csv')