In [0]:
# Update_Transactions_and_2-gram_Table_for_Dashboard

In [0]:
from pyspark.sql.functions import countDistinct, col, sum as sum_, concat, lit, year, month
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
import re

In [0]:
# Load the retailer event data (with category levels) and derive the date
# parts that are later used as join keys against the monthly weights.
event_data_delta_file_path = "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/retailers_us_with_levels"

# NOTE(review): `ds` (timestamp cast to a date) is not referenced anywhere else
# in the visible cells; year/month are taken from the existing `date` column.
# Confirm `ds` is still needed.
all_retailers_with_category_levels = (
    spark.read
    .option("header", "true")
    .format("delta")
    .load(event_data_delta_file_path)
    .withColumn("ds", col("timestamp").cast("date"))
    .withColumn("date_year", year(col("date")))
    .withColumn("date_month", month(col("date")))
)

In [0]:
# Load the two lookup tables joined onto the events further down:
# per-user demographics and the monthly weighting dictionary.

main_demo_data_delta_file_path = "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/dashboard_demo_data"
weighting_file_path = "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/monthly_weighting_dictionary"

demo_data = (
    spark.read
    .option("header", "true")
    .format("delta")
    .load(main_demo_data_delta_file_path)
)

weighting_df = (
    spark.read
    .option("header", "true")
    .format("delta")
    .load(weighting_file_path)
)

In [0]:
# The top category level (level_0) is exposed as `category`.
all_retailers_with_category_levels = all_retailers_with_category_levels.withColumn("category", col("level_0"))

# addToCart events are used as a proxy for checkOut events, because the
# checkOut data between Sep and Dec seems to be broken.
is_usable_add_to_cart = (
    col("category").isNotNull()
    & (col("event_name") == "addToCart")
    & col("price").isNotNull()
    & col("quantity").isNotNull()
)

# Keys shared by the events (after the demographics join) and the weights table.
weight_join_keys = ["date_year", "date_month", "state", "gender", "age_group", "ethnicity"]

# Output column names (`total_spend`, `total_spent_weighted`) are the schema of
# the saved table and are kept exactly as they were.
filtered_event_data_df = (
    all_retailers_with_category_levels
    .filter(is_usable_add_to_cart)
    # add demo information (inner join on user_id)
    .join(demo_data, ["user_id"])
    # add weight information (inner join on the demographic/month keys)
    .join(weighting_df, weight_join_keys)
    .select("user_id", "date", "quantity", "price", "category", "weights")
    .withColumn("total_amount_weighted", col("price") * col("quantity") * col("weights"))
    .withColumn("total_amount", col("price") * col("quantity"))
    # one row per category / user / date / weight with the summed spend
    .groupBy("category", "user_id", "date", "weights")
    .agg(
        sum_("total_amount").alias("total_spend"),
        sum_("total_amount_weighted").alias("total_spent_weighted"),
    )
)

In [0]:
# Replace the stored per-category / per-person transaction table: the target
# directory is removed recursively, then the aggregate is written as Delta.
# NOTE(review): remove-then-write is not atomic; if the write fails the old
# data is already gone. Consider a Delta overwrite instead - confirm first.
all_retailers_transaction_data_delta_file_path = "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/all_retailers_transaction_per_category_per_person"

dbutils.fs.rm(all_retailers_transaction_data_delta_file_path, True)

(
    filtered_event_data_df
    .write
    .format("delta")
    .save(all_retailers_transaction_data_delta_file_path)
)

In [0]:
%sql
-- Re-create the dashboard table on top of the Delta files at the location below.
DROP TABLE IF EXISTS disqo_dashboard_db.all_retailers_dashboard_transaction_per_category_per_person;
CREATE TABLE disqo_dashboard_db.all_retailers_dashboard_transaction_per_category_per_person
USING DELTA
LOCATION "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/all_retailers_transaction_per_category_per_person"

# Process Product Names (all retailers) into 2-grams for Dashboard

In [0]:
def generate_ngrams(s, n, stop_words):
    """Split a text into word n-grams.

    The text is lowercased, every character that is not an ASCII letter,
    digit or whitespace is replaced by a space, and the result is split on
    any run of whitespace (spaces, tabs, newlines, ...).

    Args:
        s: Text to tokenize (e.g. a product name). ``None`` is treated like
            an empty string.
        n: Size of the n-grams.
        stop_words: Collection of lowercase stop words. Only used when
            ``n == 2``.

    Returns:
        A list of n-grams, each one a string of ``n`` tokens joined by a
        single space.

        * ``['']`` when the text has at most one token.
        * For ``n == 2``, 2-grams that contain a stop word, or that consist
          of two decimal numbers, are left out; the list may be empty.
        * For any other ``n``, all n-grams are returned unfiltered.
    """
    if s is None:
        return ['']

    # Replace all non-alphanumeric, non-whitespace characters with spaces.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', s.lower())

    # split() without an argument splits on every kind of whitespace and
    # never yields empty tokens. Splitting on " " only would leave tabs and
    # newlines (which the regex above keeps) inside the tokens.
    tokens = cleaned.split()
    if len(tokens) <= 1:
        return ['']

    # zip over n shifted views of the token list to get the sliding windows.
    ngrams = zip(*[tokens[i:] for i in range(n)])

    if n != 2:
        return [" ".join(ngram) for ngram in ngrams]

    ans = []
    for ngram in ngrams:
        # skip 2-grams that include stop words
        if not set(ngram).isdisjoint(stop_words):
            continue
        # skip 2-grams that consist of two decimal numbers
        if ngram[0].isdecimal() and ngram[1].isdecimal():
            continue
        ans.append(" ".join(ngram))
    return ans

In [0]:
# Keep only the columns that end up in the 2-gram table.
selected_all_retailers_data = all_retailers_with_category_levels.select(
    'user_id',
    'session_id',
    'page_domain',
    'event_name',
    'date',
    'level_0',
    'level_1',
    'product_name',
)

In [0]:
# Generate the 2-grams of every product name: one output row per 2-gram,
# carrying the eight source columns along unchanged.

all_retilers_product_name = selected_all_retailers_data.filter((col('product_name').isNotNull()) & (col('level_0').isNotNull())).rdd.cache()


def _product_row_to_bigram_rows(row):
    """Expand one source row into one output tuple per 2-gram of its product name.

    The fields are carried as a tuple. Joining them with '_' and splitting
    again later shifts or truncates columns as soon as any field (a product
    name, a category level, an id, ...) contains an underscore.

    Every field is passed through str(), so the output stays all-string
    (a null value such as a missing level_1 is written as the text 'None').
    """
    fields = tuple(str(value) for value in row[:8])
    # row[7] is product_name, the last of the eight selected columns.
    return [fields + (bigram,) for bigram in generate_ngrams(row[7], 2, stop_words)]


all_retailers_2gram_result = all_retilers_product_name\
    .flatMap(_product_row_to_bigram_rows)\
    .toDF(['user_id', 'session_id', 'page_domain', 'event_name', 'date', 'level_0', 'level_1', 'product_name', '2_gram_product_name'])

In [0]:
# Replace the stored 2-gram table: the target directory is removed
# recursively, then the 2-gram rows are written as Delta.
all_retailers_bigram_product_name = "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/all_retailers_bigram_product_name"

dbutils.fs.rm(all_retailers_bigram_product_name, True)

(
    all_retailers_2gram_result
    .write
    .format("delta")
    .save(all_retailers_bigram_product_name)
)

In [0]:
%sql
-- Re-create the dashboard table on top of the Delta files at the location below.
DROP TABLE IF EXISTS disqo_dashboard_db.all_retailers_bigram_product_name;
CREATE TABLE disqo_dashboard_db.all_retailers_bigram_product_name
USING DELTA
LOCATION "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/all_retailers_bigram_product_name"