In [50]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Tutorial 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

22/09/12 15:21:08 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Read the given datasets

In [34]:
ori_transaction1 = spark.read.parquet('../data/tables/transactions_20210228_20210827_snapshot').sort('order_datetime')
ori_transaction2 = spark.read.parquet('../data/tables/transactions_20210828_20220227_snapshot').sort('order_datetime')
ori_transaction3 = spark.read.parquet('../data/tables/transactions_20220228_20220828_snapshot').sort('order_datetime')

#read the parquet ori_transaction
ori_transaction = ori_transaction1.union(ori_transaction2)
ori_transaction = ori_transaction.union(ori_transaction3)
ori_transaction.groupby('merchant_abn').count().count()

                                                                                

4422

In [40]:
tbl_merchants = spark.read.parquet('../data/tables/tbl_merchants.parquet')
tbl_merchants.count()

4026

In [41]:
consumer_detail = spark.read.parquet('../data/tables/consumer_user_details.parquet')

In [5]:
import pandas as pd
tbl_consumer = spark.read.option("delimiter", "|").option("header",True).csv("../data/tables/tbl_consumer.csv")

In [6]:
import pandas as pd
tbl_consumer = spark.read.option("delimiter", "|").option("header",True).csv("../data/tables/tbl_consumer.csv")

# Merchant datasets

In [36]:
from pyspark.sql.functions import round
grouped_transaction = ori_transaction.groupBy('merchant_abn').agg(F.sum('dollar_value').alias('Amount'), F.count('dollar_value').alias('Count')).sort('merchant_abn')
grouped_transaction.drop(F.col('order_id'))
grouped_transaction = grouped_transaction.withColumn('Monthly Amount', round(grouped_transaction['Amount']/12, 2))
#grouped_transaction = grouped_transaction.withColumn('Round Monthly Amount', round(grouped_transaction['Monthly Amount'], 2))
grouped_transaction = grouped_transaction.withColumn('Monthly Count', round(grouped_transaction['Count']/12, 2))
grouped_transaction

                                                                                

4422

In [38]:
grouped_transaction_pd = grouped_transaction.toPandas()
tbl_merchants_pd = tbl_merchants.toPandas()
merchant = pd.merge(grouped_transaction_pd, tbl_merchants_pd)
merchant.to_parquet("../data/curated/merchant.parquet")

Amount            4026
Count             4026
Monthly Amount    4026
Monthly Count     4026
name              4026
tags              4026
dtype: int64

In [9]:
for i in range(int(merchant['tags'].count())):
    merchant['tags'].iloc[i] = merchant['tags'].iloc[i].replace(r'[', r'(').replace(r']', r')')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merchant['tags'].iloc[i] = merchant['tags'].iloc[i].replace(r'[', r'(').replace(r']', r')')


In [10]:
#merchant['tags'].str.split(pat, n=-1, expand=False)
# to split into multiple columns by delimiter
merchant_tags = merchant['tags'].str.split(')', expand=True)

In [11]:
for row in range(int(len(merchant_tags))):
    for col in range(3):
        merchant_tags.iloc[row,col] = merchant_tags.iloc[row,col].replace(r'((', r'').replace(r', (', r'').replace(r'take rate:', r'')
merchant_tags.rename(columns = {0 : 'Store type', 1 : 'Revenue levels', 2 : 'Take rate'}, inplace = True)
merchant_tags = merchant_tags[['Store type', 'Revenue levels', 'Take rate']]

In [12]:
import pyspark.sql.functions as f
merchant[['Store type', 'Revenue levels', 'Take rate']] = merchant_tags[['Store type', 'Revenue levels', 'Take rate']]
#merchant.drop(columns=['tags'])
merchant['Store type'] = merchant['Store type'].str.lower()

for i in range(len(merchant)):
    merchant['Store type'][i] = ' '.join(merchant['Store type'][i].split())

#merchant
merchant.groupby('Store type').count()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merchant['Store type'][i] = ' '.join(merchant['Store type'][i].split())


Unnamed: 0_level_0,merchant_abn,Amount,Count,Monthly Amount,Monthly Count,name,tags,Revenue levels,Take rate
Store type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"antique shops - sales, repairs, and restoration services",129,129,129,129,129,129,129,129,129
art dealers and galleries,112,112,112,112,112,112,112,112,112
artist supply and craft shops,193,193,193,193,193,193,193,193,193
bicycle shops - sales and service,170,170,170,170,170,170,170,170,170
"books, periodicals, and newspapers",164,164,164,164,164,164,164,164,164
"cable, satellite, and other pay television and radio services",175,175,175,175,175,175,175,175,175
"computer programming , data processing, and integrated systems design services",191,191,191,191,191,191,191,191,191
"computers, computer peripheral equipment, and software",181,181,181,181,181,181,181,181,181
"digital goods: books, movies, music",195,195,195,195,195,195,195,195,195
"equipment, tool, furniture, and appliance rent al and leasing",134,134,134,134,134,134,134,134,134


In [13]:
merchant['Store type'][0]

'furniture, home furnishings and equipment shops, and manufacturers, except appliances'

# Consumer datasets

In [14]:
consumer_detail_pd = consumer_detail.toPandas()
tbl_consumer_pd = tbl_consumer.toPandas()
tbl_consumer_pd['consumer_id']=tbl_consumer_pd['consumer_id'].astype(int)
consumer = pd.merge(tbl_consumer_pd, consumer_detail_pd)
consumer

                                                                                

Unnamed: 0,name,address,state,postcode,gender,consumer_id,user_id
0,Yolanda Williams,413 Haney Gardens Apt. 742,WA,6935,Female,1195503,1
1,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208,2
2,Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530,3
3,Lindsay Jimenez,00653 Davenport Crossroad,NSW,2780,Female,154128,4
4,Rebecca Blanchard,9271 Michael Manors Suite 651,WA,6355,Female,712975,5
...,...,...,...,...,...,...,...
499994,Jessica Avila,508 Miranda Overpass Apt. 218,QLD,4400,Female,1385608,499995
499995,Steven Thornton,7913 Schwartz Mission Suite 483,VIC,3097,Undisclosed,1466964,499996
499996,Christy Smith,5681 Zachary Mountain Apt. 060,NSW,2756,Undisclosed,1253484,499997
499997,Donna Sutton,54140 Jacob Point,VIC,3989,Female,175005,499998


In [15]:
grouped_transaction = ori_transaction.groupBy('user_id').agg(F.avg('dollar_value').alias('avg'), F.sum('dollar_value').alias('sum'), F.count('dollar_value').alias('count')).sort('user_id')
grouped_transaction.drop(F.col('order_id'))
grouped_transaction_df = grouped_transaction.toPandas()
consumer_transaction = pd.merge(consumer, grouped_transaction_df)
consumer_transaction.to_parquet("../data/curated/consumer_transaction.parquet")

22/09/12 14:11:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 14:11:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 14:11:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 14:11:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 14:11:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 14:11:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 14:11:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 14:11:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 14:11:07 WARN RowBasedKeyValueBatch: Calling spill() on

                                                                                

Unnamed: 0,name,address,state,postcode,gender,consumer_id,user_id,avg,sum,count
0,Yolanda Williams,413 Haney Gardens Apt. 742,WA,6935,Female,1195503,1,162.546514,94927.163969,584
1,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208,2,211.886336,123953.506822,585
2,Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530,3,165.307193,97035.322010,587
3,Lindsay Jimenez,00653 Davenport Crossroad,NSW,2780,Female,154128,4,147.710376,87592.252770,593
4,Rebecca Blanchard,9271 Michael Manors Suite 651,WA,6355,Female,712975,5,155.797170,94880.476559,609
...,...,...,...,...,...,...,...,...,...,...
24076,Maria Lopez,6036 Rosario Mount Apt. 283,QLD,4474,Female,831103,24077,140.012095,83727.232529,598
24077,Mark Fischer,31055 Scott Brooks Apt. 953,WA,6566,Male,487249,24078,160.996582,93378.017587,580
24078,Tony Schmidt,670 Burke Turnpike Apt. 079,VIC,3029,Male,256441,24079,205.771055,122228.006616,594
24079,Amy Russo,4525 Frazier Meadows,NSW,2809,Female,940951,24080,149.980159,88638.274060,591


## Guess postcode for the merchant

In [51]:
merchant_abn_and_consumer_id = ori_transaction['merchant_abn', 'user_id']
#merchant_and_consumer_postcode['consumer postcode'] = c
merchant_abn_and_consumer_id
user_id_and_postcode = consumer[['postcode','user_id']]

user_id_and_postcode=spark.createDataFrame(user_id_and_postcode) 
#merchant_and_consumer_postcode = spark.merge(merchant_abn_and_consumer_id, user_id_and_postcode)
merchant_and_consumer_postcode = merchant_abn_and_consumer_id.join(user_id_and_postcode,['user_id'])
merchant_and_consumer_postcode = merchant_and_consumer_postcode['merchant_abn', 'postcode']
#merchant_and_consumer_postcode

#https://stackoverflow.com/questions/36654162/mode-of-grouped-data-in-pyspark
counts = merchant_and_consumer_postcode.groupBy(['merchant_abn', 'postcode']).count().alias('counts')
merchant_postcode = (counts
          .groupBy('merchant_abn')
          .agg(F.max(F.struct(F.col('count'),
                              F.col('postcode'))).alias('max'))
          .select(F.col('merchant_abn'), F.col('max.postcode'))
         )
merchant_postcode




22/09/12 15:21:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:25 WARN RowBasedKeyValueBatch: Calling spill() on

[Stage 774:>                                                        (0 + 8) / 9]

22/09/12 15:21:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:26 WARN RowBasedKeyValueBatch: Calling spill() on



22/09/12 15:21:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:33 WARN RowBasedKeyValueBatch: Calling spill() on

[Stage 806:>                                                        (0 + 8) / 9]

22/09/12 15:21:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:34 WARN RowBasedKeyValueBatch: Calling spill() on

                                                                                

merchant_abn,postcode
10023283211,5582
10142254217,6438
10165489824,6421
10187291046,7054
10192359162,9015
10206519221,5273
10255988167,3287
10264435225,6001
10279061213,6321
10323485998,4856


## Percentage of regular customer

In [52]:
ori_transaction_1 = ori_transaction.groupby('merchant_abn', 'user_id').count()
o_t = ori_transaction.groupby('merchant_abn').agg(F.count('user_id').alias('cnt'))
ori_con = ori_transaction_1.join(o_t, ori_transaction_1.merchant_abn == o_t.merchant_abn).drop(o_t.merchant_abn)
ori_con_drop = ori_con.filter(F.col("count") > 10)
ori_con_fix = ori_con_drop.groupby('merchant_abn').agg(F.count('user_id').alias('fix_cus_num'), F.avg('cnt').alias('total_cus_num'))
ori_con_fix_prob = ori_con_fix.withColumn("fix_cus_prob", ori_con_fix.fix_cus_num /ori_con_fix.total_cus_num)
ori_con_fix_prob

22/09/12 15:21:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:46 WARN RowBasedKeyValueBatch: Calling spill() on

[Stage 831:>                (0 + 8) / 8][Stage 832:>                (0 + 0) / 8]

22/09/12 15:21:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:48 WARN RowBasedKeyValueBatch: Calling spill() on



22/09/12 15:21:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:50 WARN TaskMemoryManager: Failed to allocate a page (67108864 bytes), try again.
22/09/12 15:21:51 WARN TaskMemoryManager: Failed to allocate a page (67108864 bytes), try again.
22/09/12 15:21:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/09/12 15:21:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/09/12 15:21:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/09/12 15:21:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

22/09/12 15:21:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:55 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:55 WARN RowBasedKeyValueBatch: Calling spill() on

[Stage 852:>                (0 + 8) / 8][Stage 853:>                (0 + 0) / 8]

22/09/12 15:21:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:57 WARN RowBasedKeyValueBatch: Calling spill() on



22/09/12 15:21:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:21:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/09/12 15:22:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:22:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/12 15:22:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

merchant_abn,fix_cus_num,total_cus_num,fix_cus_prob
10648956813,10,21981.0,4.54938355852782E-4
86662713230,10,21545.0,4.641448131817127E-4
41383736952,1,11659.0,8.577064928381508E-5
27093785141,20,25928.0,7.713668620796051E-4
49891706470,22666,247526.0,0.09157017848630043
15560455575,40,29246.0,0.001367708404568...
52959528548,220,42507.0,0.005175618133483...
71528203369,46,32404.0,0.001419577829897...
67202032418,23,25504.0,9.018193224592221E-4
89726005175,21227,215963.0,0.09828998485851743


22/09/12 17:41:51 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 947443 ms exceeds timeout 120000 ms
22/09/12 17:41:52 WARN SparkContext: Killing executors is not supported by current scheduler.
22/09/12 20:48:43 WARN TransportChannelHandler: Exception in connection from /10.12.231.16:56224
java.io.IOException: Operation timed out
	at java.base/sun.nio.ch.SocketDispatcher.read0(Native Method)
	at java.base/sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:47)
	at java.base/sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:340)
	at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:294)
	at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:269)
	at java.base/sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:417)
	at io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:258)
	at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1132)
	at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:350)
	at io.netty.channe