In [1]:
import pandas as pd
import numpy as np
import os
import re

# Set working directory
if not "/data/tables" in os.getcwd():
    os.chdir("../data/tables")

from pyspark.sql import SparkSession
from pyspark.shell import spark
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import matplotlib.pyplot as plt

spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.3.0
      /_/

Using Python version 3.9.12 (main, Apr  4 2022 05:22:27)
Spark context Web UI available at http://192.168.1.6:4042
Spark context available as 'sc' (master = local[*], app id = local-1662966280318).
SparkSession available as 'spark'.


In [2]:
# Read csv file
consumer = spark.read.option("delimiter", "|").csv('tbl_consumer.csv', header = True)
consumer_postcode = consumer.drop("name","address")
consumer_postcode

state,postcode,gender,consumer_id
WA,6935,Female,1195503
NSW,2782,Female,179208
NT,862,Female,1194530
NSW,2780,Female,154128
WA,6355,Female,712975
NSW,2033,Female,407340
QLD,4606,Female,511685
WA,6056,Male,448088
NSW,2482,Female,650435
VIC,3220,Female,1058499


In [3]:
consumer_postcode = consumer_postcode.toPandas()
consumer_postcode

Unnamed: 0,state,postcode,gender,consumer_id
0,WA,6935,Female,1195503
1,NSW,2782,Female,179208
2,NT,862,Female,1194530
3,NSW,2780,Female,154128
4,WA,6355,Female,712975
...,...,...,...,...
499994,QLD,4400,Female,1385608
499995,VIC,3097,Undisclosed,1466964
499996,NSW,2756,Undisclosed,1253484
499997,VIC,3989,Female,175005


In [4]:
consumer_postcode["postcode"] = consumer_postcode["postcode"].apply(lambda x : int(x))

In [5]:
import pandas as pd
import io
import requests
url="https://www.matthewproctor.com/Content/postcodes/australian_postcodes.csv"
s=requests.get(url).content
df=pd.read_csv(io.StringIO(s.decode('utf-8')))
sa2_code = df[['postcode','SA2_MAINCODE_2016']]
sa2_code.rename(columns = {'SA2_MAINCODE_2016':'SA2_code'},inplace=True)
sa2_code = sa2_code.drop_duplicates()

sa2_code

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sa2_code.rename(columns = {'SA2_MAINCODE_2016':'SA2_code'},inplace=True)


Unnamed: 0,postcode,SA2_code
0,200,801051049.0
2,800,701011002.0
4,801,701011002.0
5,804,701011007.0
6,810,701021010.0
...,...,...
18437,9013,305011105.0
18438,9015,305011105.0
18439,9464,302031038.0
18440,9726,309101268.0


In [6]:
income = pd.read_csv("./SA2/AUS/2021Census_G02_AUST_SA2.csv")
income = income[['SA2_CODE_2021','Median_age_persons','Median_tot_prsnl_inc_weekly']]
income.rename(columns = {'SA2_CODE_2021':'SA2_code'},inplace=True)
income

Unnamed: 0,SA2_code,Median_age_persons,Median_tot_prsnl_inc_weekly
0,101021007,51,760
1,101021008,38,975
2,101021009,37,996
3,101021010,36,1104
4,101021012,37,1357
...,...,...,...
2467,901021002,40,741
2468,901031003,38,585
2469,901041004,50,736
2470,997979799,0,0


# join

In [7]:
merge1 = pd.merge(consumer_postcode, sa2_code, how='left', on = 'postcode')
merge2 = pd.merge(merge1,income, how = 'left',on='SA2_code')
merge2

Unnamed: 0,state,postcode,gender,consumer_id,SA2_code,Median_age_persons,Median_tot_prsnl_inc_weekly
0,WA,6935,Female,1195503,504031066.0,40.0,749.0
1,NSW,2782,Female,179208,124011455.0,50.0,740.0
2,NT,862,Female,1194530,702021055.0,26.0,416.0
3,NT,862,Female,1194530,702051066.0,28.0,276.0
4,NT,862,Female,1194530,702021056.0,33.0,671.0
...,...,...,...,...,...,...,...
882478,NSW,2756,Undisclosed,1253484,124041466.0,39.0,790.0
882479,NSW,2756,Undisclosed,1253484,115031300.0,40.0,907.0
882480,NSW,2756,Undisclosed,1253484,115041301.0,39.0,920.0
882481,VIC,3989,Female,175005,,,


In [8]:
#!pip install pandasql

In [30]:
from pandasql import sqldf

query = """
SELECT consumer_id,
avg(Median_age_persons) as mean_age,
avg(Median_tot_prsnl_inc_weekly) as mean_tot_prsnl_inc_weekly

from merge2
group by consumer_id
order by consumer_id

"""


df1 = sqldf(query)

df1

Unnamed: 0,consumer_id,mean_age,mean_tot_prsnl_inc_weekly
0,10,42.0,770.0
1,100,,
2,1000002,55.0,562.0
3,1000003,40.0,780.0
4,1000006,45.0,830.0
...,...,...,...
499994,999993,50.0,533.0
499995,999994,55.0,881.0
499996,999995,52.0,783.0
499997,999997,40.4,889.6


In [32]:
df1.to_csv("../curated/weekly_income.csv")

# VIP data 만들기

In [10]:
path = "transactions_20210228_20210827_snapshot/"
list_files = os.listdir(path)
list_files = list_files[1:(len(list_files)-1)]

# import modules
from pyspark.sql import SparkSession
import functools
 
# explicit function
def unionAll(dfs):
    return functools.reduce(lambda df1, df2: df1.union(df2.select(df1.columns)), dfs)

# read files
file_name = os.listdir(path+ list_files[0])[1]
transaction = spark.read.parquet(path+ list_files[0] +"/" + file_name)
transaction = transaction.withColumn('order_datetime',lit(list_files[0][15:]))
for i in list_files[1:]:
    file_name = os.listdir(path + i)[1]
    tmp = spark.read.parquet(path+ i +"/" + file_name)
    tmp = tmp.withColumn('order_datetime',lit(i[15:]))
    transaction = unionAll([transaction, tmp] )

transaction

user_id,merchant_abn,dollar_value,order_id,order_datetime
1,28000487688,133.22689421562643,0c37b3f7-c7f1-48c...,2021-02-28
18485,62191208634,79.13140006851712,9e18b913-0465-4fd...,2021-02-28
1,83690644458,30.441348317517228,40a2ff69-ea34-465...,2021-02-28
18488,39649557865,962.8133405407584,f4c1a5ae-5b76-40d...,2021-02-28
2,80779820715,48.12397733548124,cd09bdd6-f56d-489...,2021-02-28
18489,43186523025,98.14878546968934,9008a98e-1b02-4de...,2021-02-28
3,29566626791,46.33087226118639,26b7574e-81c2-455...,2021-02-28
18490,93558142492,232.83335268750145,2bda0665-796f-4f2...,2021-02-28
3,32361057556,87.34942171371054,633a7656-2fcc-4b8...,2021-02-28
18491,64974914166,130.12601873970038,4bc15338-83eb-43d...,2021-02-28


In [11]:
user_detail = spark.read.parquet("consumer_user_details.parquet")
user_detail

user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975
6,407340
7,511685
8,448088
9,650435
10,1058499


In [12]:
# Join transaction data with customer data
customer_transaction = (transaction.join(user_detail, transaction.user_id == user_detail.user_id)
                                   .drop(user_detail.user_id))

customer_transaction = (customer_transaction.join(consumer, customer_transaction.consumer_id == consumer.consumer_id)
                                            .drop(consumer.consumer_id)
                                            .select(transaction['*'], consumer.postcode, consumer.state, consumer.gender))
#make 'month' column extracting from 'order_datetime'
customer_transaction = customer_transaction.select(col("*"),col("order_datetime"),
          month(col("order_datetime")).alias('month')
                           )
customer_transaction

user_id,merchant_abn,dollar_value,order_id,order_datetime,postcode,state,gender,order_datetime.1,month
3698,55778594682,21.941266654463465,8a806e0d-558d-468...,2021-02-28,2299,NSW,Male,2021-02-28,2
3698,10648956813,99.30549322421652,e493f287-efe0-425...,2021-03-03,2299,NSW,Male,2021-03-03,3
3698,75089928159,3.363306277086005,94b7fb1a-82d1-422...,2021-03-06,2299,NSW,Male,2021-03-06,3
3698,42543374304,351.2979463642349,451bdc08-cc6e-41b...,2021-03-08,2299,NSW,Male,2021-03-08,3
3698,54611298155,2034.6296050908,dccaacc6-a0fd-44c...,2021-03-10,2299,NSW,Male,2021-03-10,3
3698,24852446429,22.143273437576056,c6b2cfa3-2494-4fa...,2021-03-12,2299,NSW,Male,2021-03-12,3
3698,63123845164,384.9929053083648,ee723c3a-ad57-4b3...,2021-03-12,2299,NSW,Male,2021-03-12,3
3698,63290521567,42.178347325192256,7d5afc38-5be8-4c6...,2021-03-12,2299,NSW,Male,2021-03-12,3
3698,42355028515,76.8823425915479,274dfcce-a369-46c...,2021-03-14,2299,NSW,Male,2021-03-14,3
3698,65674339048,80.50821804740839,dd90c0e0-e343-40c...,2021-03-16,2299,NSW,Male,2021-03-16,3


In [13]:
# 상점 별로, user id당 몇명씩 방문했는지
# 예) 5024가 상점 43186523025에 12번 방문했다.
customer_transaction.createOrReplaceTempView("customer_transaction")


customer_transaction_count = spark.sql(
    """
    Select  user_id, merchant_abn,
    count(user_id) as count_user_id 

    from customer_transaction
    group by user_id, merchant_abn 
    order by  count_user_id desc
    """)
customer_transaction_count

user_id,merchant_abn,count_user_id
5024,43186523025,12
18011,24852446429,12
15077,24852446429,12
11998,86578477987,11
20370,24852446429,11
16270,24852446429,11
1640,64203420245,11
8633,86578477987,11
21030,64203420245,11
1782,45629217853,11


In [14]:
# 각 상점 별 1인당 평균 방문횟수와 표준편차
customer_transaction_count.createOrReplaceTempView("customer_transaction_count")


customer_transaction_count_mean = spark.sql(
    """
    Select  merchant_abn,
    avg(count_user_id) as mean_visited_user ,
    std(count_user_id) as std_visited_user 

    from customer_transaction_count
    group by merchant_abn 
    """)
customer_transaction_count_mean

merchant_abn,mean_visited_user,std_visited_user
83412691377,1.0780834072759538,0.2802080696053156
38700038932,1.0318613750698715,0.1756800491057901
73841664453,1.0077220077220077,0.0877044893920449
73256306726,1.029126213592233,0.1682231010041045
24406529929,1.0215475024485798,0.1452717364051517
35344855546,1.0078740157480317,0.0885018280030642
19839532017,1.0106382978723405,0.1028659738635932
15613631617,1.005859375,0.0763966100389298
29216160692,1.0239574090505768,0.1586833871813572
59128133246,1.0303217821782178,0.1750970233885955


In [19]:
#https://sixsigmadsi.com/standard-deviation-measure-of-dispersion/ -> 2인 이유
# merge / vip_standard: 
customer_transaction_count_mean.createOrReplaceTempView("customer_transaction_count_mean")\

join_data = spark.sql(
    """
    Select a.*, b.mean_visited_user, b.std_visited_user,
    b.mean_visited_user + 2* b.std_visited_user as vip_standard

    from customer_transaction_count as a left join customer_transaction_count_mean as b
    on  a.merchant_abn = b.merchant_abn 
    """)
join_data


user_id,merchant_abn,count_user_id,mean_visited_user,std_visited_user,vip_standard
21223,49891706470,11,2.843536198184501,1.5045697489880956,5.852675696160692
5024,43186523025,12,2.426085320539956,1.3264281797203297,5.078941679980615
1120,45629217853,11,2.640505638413969,1.4262816579485622,5.493068954311093
1782,45629217853,11,2.640505638413969,1.4262816579485622,5.493068954311093
8009,86578477987,11,3.060587461639632,1.6002360205596278,6.261059502758887
11998,86578477987,11,3.060587461639632,1.6002360205596278,6.261059502758887
8951,86578477987,11,3.060587461639632,1.6002360205596278,6.261059502758887
8633,86578477987,11,3.060587461639632,1.6002360205596278,6.261059502758887
21272,86578477987,11,3.060587461639632,1.6002360205596278,6.261059502758887
9928,64203420245,11,2.958622518898369,1.538645566811654,6.0359136525216766


In [20]:
# 각 customer들이 vip인지 아닌지 여부 확인
vip = join_data.withColumn("VIP", \
   when((join_data.count_user_id > join_data.vip_standard), lit(1)) \
     .otherwise(lit(0)) \
  )
vip

user_id,merchant_abn,count_user_id,mean_visited_user,std_visited_user,vip_standard,VIP
3539,73256306726,1,1.029126213592233,0.1682231010041045,1.365572415600442,0
14703,83412691377,1,1.0780834072759538,0.2802080696053156,1.638499546486585,0
17474,19839532017,1,1.0106382978723405,0.1028659738635932,1.216370245599527,0
4014,73841664453,1,1.0077220077220077,0.0877044893920449,1.1831309865060975,0
20627,24406529929,1,1.0215475024485798,0.1452717364051517,1.3120909752588834,0
21162,34440496342,1,1.0,0.0,1.0,0
18306,24406529929,1,1.0215475024485798,0.1452717364051517,1.3120909752588834,0
15297,24406529929,1,1.0215475024485798,0.1452717364051517,1.3120909752588834,0
8586,19839532017,1,1.0106382978723405,0.1028659738635932,1.216370245599527,0
6238,73256306726,1,1.029126213592233,0.1682231010041045,1.365572415600442,0


In [25]:
vip.createOrReplaceTempView("vip")


vip_count = spark.sql(
    """
    Select   merchant_abn,
    sum(VIP) as sum_vip,
    avg(VIP) * 100 as percentage_vip 

    from vip
    group by merchant_abn 
    order by  sum_vip desc, percentage_vip desc
    """)
vip_count

merchant_abn,sum_vip,percentage_vip
80324045558,1433,6.777656907723596
49891706470,1260,5.634306667262889
91923722701,1206,8.824820722962096
68216911708,1108,5.389367187119996
63290521567,1049,5.11059144499659
46804135891,1028,4.652636343064041
21439773999,995,5.713137344970142
50315283629,972,14.95154591601292
24852446429,936,4.079497907949791
49505931725,925,7.421373555840821


In [26]:
vip_count = vip_count.toPandas()

In [28]:
vip_count.to_csv("../curated/vip_count.csv")

In [18]:
# 각 상점을 방문하는 손님의 수입이 평균 + 2sd 이상이 사람의 N수 