# Customer retention

In this notebook, we will generate features that indicate customer retention for merchants.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Build (or reuse) the Spark session that runs every job in this notebook.
# The settings are applied to the builder in the order listed here.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,  # eager evaluation of DataFrames in the REPL/notebook
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.driver.memory": "8g",
    "spark.sql.session.timeZone": "Etc/UTC",
}

session_builder = SparkSession.builder.appName("Customer Retention")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

# Turn Spark's own logging off for this session.
spark.sparkContext.setLogLevel("OFF")

Read all data.

In [None]:
# Load the curated parquet dataset, then inspect its schema, row count and first rows.
curated_path = "../data/curated/fraud_watch/"
all_data_sdf = spark.read.parquet(curated_path)

all_data_sdf.printSchema()
row_total = all_data_sdf.count()
print(row_total)

# Last expression of the cell: displays the first five rows.
all_data_sdf.limit(5)

## 1. Derive necessary information from merchant-customer relationship.
1. Calculate the median monthly transaction frequency between each customer and merchant.

In [None]:
# Tag every transaction with its calendar month ("yyyy-MM"), taken from 'order_datetime'.
all_data_sdf = all_data_sdf.withColumn(
    "year_month", F.date_format("order_datetime", "yyyy-MM")
)

# Number of orders per (merchant, customer, month). Only months that contain at
# least one order for the pair produce a row here.
monthly_trans_count = (
    all_data_sdf
    .groupBy("merchant_abn", "user_id", "year_month")
    .agg(F.count("order_id").alias("monthly_count"))
)

# Approximate median (50th percentile) of those monthly counts for each
# (merchant, customer) pair.
median_of_monthly_counts = F.expr('percentile_approx(monthly_count, 0.5)')
median_trans_freq_sdf = (
    monthly_trans_count
    .groupBy("merchant_abn", "user_id")
    .agg(median_of_monthly_counts.alias('median_transaction_frequency'))
)

# Preview the result: merchant_abn, user_id, median_transaction_frequency.
median_trans_freq_sdf.show()


In [None]:
# Row count of median_trans_freq_sdf, i.e. the number of (merchant_abn, user_id) groups.
median_trans_freq_sdf.count()

2. Generate a boolean field that indicates whether a customer is a returning customer for a certain merchant.
A returning customer is defined as one who has more than one transaction with the same merchant.

In [None]:
# Total number of orders for each (merchant, customer) pair.
customer_transaction_count = (
    all_data_sdf
    .groupBy("merchant_abn", "user_id")
    .agg(F.count("order_id").alias("transaction_count"))
)

# Flag the pair as "returning" when it has more than one transaction.
has_repeat_purchase = F.col("transaction_count") > 1
returning_customer_sdf = customer_transaction_count.withColumn(
    "is_returning_customer",
    F.when(has_repeat_purchase, True).otherwise(False),
)

# Preview: first the flag on its own, then the full frame (which also carries
# transaction_count).
returning_customer_sdf.select("merchant_abn", "user_id", "is_returning_customer").show()

returning_customer_sdf.show()

In [None]:
# Row count of returning_customer_sdf, i.e. the number of (merchant_abn, user_id) groups.
returning_customer_sdf.count()

3. Find median transaction value between each merchant and customer.

In [None]:
# Approximate median (50th percentile) of 'dollar_value' for each
# (merchant, customer) pair.
median_dollar_value = F.expr('percentile_approx(dollar_value, 0.5)')
median_transaction_value_sdf = (
    all_data_sdf
    .groupBy("merchant_abn", "user_id")
    .agg(median_dollar_value.alias('median_transaction_value'))
)

# Preview the result: merchant_abn, user_id, median_transaction_value.
median_transaction_value_sdf.show()


Now join these new columns together, on key (merchant_abn, user_id).

In [None]:
# Combine the three per-pair frames, all keyed on (merchant_abn, user_id):
#   - median_trans_freq_sdf        -> median_transaction_frequency
#   - median_transaction_value_sdf -> median_transaction_value
#   - returning_customer_sdf       -> transaction_count, is_returning_customer
pair_key = ["merchant_abn", "user_id"]

# Left joins starting from the frequency frame: first attach the median value,
# then the returning-customer columns.
merchant_customer_sdf = (
    median_trans_freq_sdf
    .join(median_transaction_value_sdf, on=pair_key, how="left")
    .join(returning_customer_sdf, on=pair_key, how="left")
)

# Preview the combined frame.
merchant_customer_sdf.show()


We now have the following information for each merchant-customer relationship:
1. median_transaction_frequency
2. is_returning_customer
3. median_transaction_value

Do some visualisation to check for reasonableness.

In [None]:
# This cell takes a while to run.

# Fraction of merchant-customer rows pulled to the driver for plotting.
# 0.0001 is 0.01% of the rows (not 1%); raise it for a denser sample.
SAMPLE_FRACTION = 0.0001
SAMPLE_SEED = 42


def plot_histogram(values, title, xlabel, color, bins=10):
    """Draw one histogram of `values` on a new 10x5 figure and show it.

    Args:
        values: Sequence of numbers to bin (here, a pandas Series).
        title: Figure title.
        xlabel: Label for the x-axis; the y-axis is always labelled 'Frequency'.
        color: Fill colour of the bars.
        bins: Number of histogram bins.
    """
    plt.figure(figsize=(10, 5))
    plt.hist(values, bins=bins, color=color, edgecolor='black')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()


# Step 1: Take a seeded random sample of the Spark DataFrame (without replacement)
merchant_customer_sample_sdf = merchant_customer_sdf.sample(
    fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED
)

# Step 2: Convert the sampled Spark DataFrame to a Pandas DataFrame
merchant_customer_pdf = merchant_customer_sample_sdf.toPandas()

# Step 3: Visualise distributions using the Pandas DataFrame

# Median Transaction Frequency Distribution
plot_histogram(
    merchant_customer_pdf['median_transaction_frequency'],
    title='Distribution of Median Transaction Frequency',
    xlabel='Median Transaction Frequency',
    color='blue',
)

# Median Transaction Value Distribution
plot_histogram(
    merchant_customer_pdf['median_transaction_value'],
    title='Distribution of Median Transaction Value',
    xlabel='Median Transaction Value',
    color='green',
)

# Is Returning Customer Distribution (Boolean Distribution)
# value_counts() orders its result by descending frequency, so the bar order is
# data-dependent. Reindex to a fixed [True, False] order so each tick label
# always matches its bar; a value missing from the sample gets a count of 0.
returning_counts = (
    merchant_customer_pdf['is_returning_customer']
    .value_counts()
    .reindex([True, False], fill_value=0)
)

plt.figure(figsize=(10, 5))
returning_counts.plot(kind='bar', color='orange', edgecolor='black')
plt.title('Distribution of Returning Customers')
plt.xlabel('Is Returning Customer')
plt.ylabel('Count')
plt.xticks(
    range(len(returning_counts)),
    [str(flag) for flag in returning_counts.index],
    rotation=0,
)
plt.grid(True)
plt.show()


## 2. Aggregate information by merchant

1. Count number of customers per merchant.
2. Find proportion of returning customers per merchant.
3. Find median of median monthly customer transaction frequencies.
4. Find median of median customer transaction value.

In [None]:
# Building blocks for the per-merchant aggregation.
# 1 for a returning customer, 0 otherwise.
returning_indicator = F.when(F.col("is_returning_customer") == True, 1).otherwise(0)
# Number of distinct customers seen for the merchant.
distinct_customers = F.countDistinct("user_id")

# Per merchant: how many distinct customers it has, and what share of them are
# returning customers.
merchant_aggregated_sdf = merchant_customer_sdf.groupBy("merchant_abn").agg(
    distinct_customers.alias("number_of_customers"),
    (F.sum(returning_indicator) / distinct_customers).alias("returning_customer_proportion"),
)

# Preview the aggregates, then report the row count.
merchant_aggregated_sdf.show()
merchant_aggregated_sdf.count()


In [None]:
# Persist the per-merchant customer counts and returning-customer proportions as parquet,
# overwriting any previous output at this path.
# NOTE(review): only merchant_aggregated_sdf is written; the per-merchant median aggregates
# (merchant_two_aggregated_sdf) are built in a later cell and never saved — confirm this is intended.
merchant_aggregated_sdf.write.mode("overwrite").parquet("../data/curated/customer_retention/")

In [None]:
# Per merchant, take the approximate median (50th percentile) of each per-customer
# median column; every output column keeps the name of its input column.
median_feature_cols = ["median_transaction_frequency", "median_transaction_value"]

merchant_median_exprs = [
    F.expr(f"percentile_approx({feature_col}, 0.5)").alias(feature_col)
    for feature_col in median_feature_cols
]

merchant_two_aggregated_sdf = (
    merchant_customer_sdf
    .groupBy("merchant_abn")
    .agg(*merchant_median_exprs)
)
merchant_two_aggregated_sdf.show()

In [None]:
# Re-display the per-merchant customer-count / returning-proportion aggregates.
merchant_aggregated_sdf.show()