In [1]:
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by the rest of the notebook.
session_options = {
    # Eager evaluation lets a bare DataFrame expression render its rows in the REPL.
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
}
session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for option_name, option_value in session_options.items():
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()

In [3]:
def _daily_dates(start, end):
    """Return every calendar day from `start` to `end` (inclusive) as 'YYYY-MM-DD' strings."""
    return [day.strftime("%Y-%m-%d") for day in pd.date_range(start, end, freq='D')]


# Each snapshot directory holds one sub-directory per day, named
# `order_datetime=<YYYY-MM-DD>`; the prefix plus a date string gives that path.
path_prefix1 = '../data/tables/transactions_20210228_20210827_snapshot/order_datetime='
period1 = _daily_dates('2021-2-28', '2021-08-27')

path_prefix2 = '../data/tables/transactions_20210828_20220227_snapshot/order_datetime='
period2 = _daily_dates('2021-8-28', '2022-02-27')

path_prefix3 = '../data/tables/transactions_20220228_20220828_snapshot/order_datetime='
period3 = _daily_dates('2022-2-28', '2022-08-28')


In [4]:
# `lit` is used below to attach each partition's date as a literal column
from pyspark.sql.functions import lit

In [6]:
# Read a single day's partition just to inspect the column names and types
# stored in the transaction parquet files.
sample_partition_path = '../data/tables/transactions_20210228_20210827_snapshot/order_datetime=2021-02-28'
dfSchema = spark.read.parquet(sample_partition_path)
dfSchema.schema

StructType([StructField('user_id', LongType(), True), StructField('merchant_abn', LongType(), True), StructField('dollar_value', DoubleType(), True), StructField('order_id', StringType(), True)])

In [5]:
# Create an empty DataFrame that the daily transaction partitions are
# unioned onto, one day at a time.

from pyspark.sql.types import StructType,StructField, StringType, LongType, DoubleType, IntegerType
emptyRDD = spark.sparkContext.emptyRDD()
# The first four fields mirror the parquet schema of a daily partition.
# `date` is declared as a string because the rows appended to this frame carry
# the date as a 'YYYY-MM-DD' string literal; declaring it as an integer made
# every positional union depend on implicit int/string type coercion.
schema = StructType([
    StructField('user_id', LongType(), True),
    StructField('merchant_abn', LongType(), True),
    StructField('dollar_value', DoubleType(), True),
    StructField('order_id', StringType(), True),
    StructField('date', StringType(), False),
])
df = spark.createDataFrame(emptyRDD, schema)
df

user_id,merchant_abn,dollar_value,order_id,date


In [6]:

# Read each daily partition of the first snapshot, tag its rows with that
# day's date string, and append them to the running `df`.
for day in period1:
    daily = spark.read.parquet(path_prefix1 + day).withColumn("date", lit(day))
    df = df.union(daily)
    


In [7]:
# Append the second and then the third snapshot to `df`: every daily partition
# is read, tagged with its date string, and unioned on in date order.
for prefix, days in ((path_prefix2, period2), (path_prefix3, period3)):
    for day in days:
        daily = spark.read.parquet(prefix + day).withColumn("date", lit(day))
        df = df.union(daily)

In [None]:
# Peek at up to three rows of the combined transactions DataFrame.
df.limit(3)

In [None]:
# Keep only what the monthly aggregation needs: the transaction's month
# (formatted as 'yyyy-MM'), the merchant and the transaction value.
month_column = F.date_format(F.col('date'), 'yyyy-MM').alias('month')
df1 = df.select(month_column, F.col('merchant_abn'), F.col('dollar_value'))
df1.limit(5)

In [None]:
# Total transaction value per merchant per month.
# `F.sum` is referenced explicitly: a bare `sum` only resolves to Spark's
# aggregate while the wildcard import of pyspark.sql.functions shadows
# Python's builtin `sum`, which cannot aggregate a column name.
df2 = (
    df1.groupby('month', 'merchant_abn')
    .agg(F.sum('dollar_value').alias('monthly_value'))
)

In [None]:
from pyspark.sql.window import Window

# For every merchant, walk its months in ascending order and carry the
# previous row's monthly value alongside the current one. The first month of
# each merchant has no predecessor, so its `prev_value` is null.
price_window = (
    Window
    .partitionBy("merchant_abn")
    .orderBy("month")
)
previous_month_value = F.lag('monthly_value').over(price_window)
df5 = df2.withColumn("prev_value", previous_month_value)


In [None]:
# Inspect up to five rows to check the lagged `prev_value` column.
df5.limit(5)

In [None]:
# Month-over-month growth rate of each merchant's monthly value:
#     (monthly_value - prev_value) / prev_value
# The change is measured relative to the PREVIOUS month; dividing by the
# current month's value does not give a growth rate.
# Rows where the change is null (e.g. a merchant's first month, which has no
# previous value) are assigned a growth rate of 0.
monthly_change = F.col("monthly_value") - F.col("prev_value")
df6 = df5.withColumn(
    "growth_rate_per_month",
    F.when(monthly_change.isNull(), 0)
    .otherwise(monthly_change / F.col("prev_value")),
)

In [None]:
# Inspect up to five rows of the computed monthly growth rates.
df6.limit(5)

In [None]:
# Sum each merchant's monthly growth rates into a single figure.
# `F.sum` is referenced explicitly so the aggregation does not depend on the
# wildcard import of pyspark.sql.functions shadowing Python's builtin `sum`.
# No alias is given, so the output column keeps Spark's default name.
df7 = df6.groupby("merchant_abn").agg(F.sum("growth_rate_per_month"))

In [None]:
# Show up to ten merchants with their summed monthly growth rates.
df7.limit(10)