# ReadMe

This notebook requires: 

1. Estimate the CTM-3 model in Python, then upload **counts_phi.csv** to Databricks (make sure to uncheck the box 'First row contains header')
2. Run the Data Pre-Processing notebook, which processes the set of 'to-predict' customers and uploads **y_segmentation.csv** and **customers_segmentation.csv** to Databricks

Then: 

3. Run this notebook to generate the M3 customer segmentation

Note: Keep in mind that the order of the motivations may vary between model fits. Hence, please review the motivations_result.csv file in the Python output to identify the order of the motivations and their corresponding labels (e.g. the healthy/conscious motivation may be motivation 1 in one fitted model, but become motivation 2 in the next fitted model).

In [0]:
import pyspark.sql.functions as F


## Prerequisites

- Upload counts_phi from Python, and y_segmentation and customers_segmentation from the Data Pre-Processing notebook

### 1. Load the motivations (counts_phi)
- This file contains the 3 motivations, with corresponding product probabilities

In [0]:
# Read the latest counts_phi table (uploaded from the fitted model) and give the
# header-less columns their motivation names
phi_column_aliases = {'_c0': 'M0', '_c1': 'M1', '_c2': 'M2'}
df_phi_data = table('default.counts_phi').select(
    *[F.col(source).alias(target) for source, target in phi_column_aliases.items()]
)

# Sanity check: the probabilities within each motivation column should add up to 1
phi_sum_expressions = [
    F.sum('M0').alias('Sum_M0'),
    F.sum('M1').alias('Sum_M1'),
    F.sum('M2').alias('Sum_M2'),
]
# An aggregation without grouping yields exactly one row
column_sums = df_phi_data.agg(*phi_sum_expressions).first()

# Report the column totals
print("Sum of M0:", column_sums['Sum_M0'])
print("Sum of M1:", column_sums['Sum_M1'])
print("Sum of M2:", column_sums['Sum_M2'])

# If sums are 1, continue

# Render the renamed counts_phi dataframe
display(df_phi_data)

## 2. Loading the customers (y_segmentation and customers_segmentation)
- Getting the purchase data from the customers we want to segment and getting their original customer IDs

In [0]:
# Purchase records of the customers to segment.
# Expected layout: customer_id, basket_id, product_id
df_y_segmentation = table('default.y_segmentation')

# Distinct products (lowest levels) that occur in y_segmentation
distinct_products = df_y_segmentation.select("product_id").distinct()
unique_product_count = distinct_products.count()

# Check: this number must not exceed the number of rows of counts_phi
# (otherwise the alignment of product ids failed). It is usually slightly lower,
# because some lowest levels are never bought.
display(unique_product_count)

In [0]:
# Load the customers_segmentation table. It is joined on customer_id after the
# segmentation so that the original CustomerID of each customer can be displayed.
df_customers_segmentation = table('default.customers_segmentation')

In [0]:
# Visual check of the loaded purchase records
df_y_segmentation.display()

#### Necessary pandas on spark imports

In [0]:
import pandas as pd
# Make DataFrame.iteritems available again as an alias of DataFrame.items.
# NOTE(review): presumably required because the Spark runtime still calls
# DataFrame.iteritems when converting pandas DataFrames (it no longer exists in
# pandas 2.x) - confirm, and drop this line once the runtime no longer needs it.
pd.DataFrame.iteritems = pd.DataFrame.items
import pyspark.pandas as ps
from pyspark.pandas.config import option_context
# Required option: allows operations that combine two different pandas-on-Spark
# objects, such as the Series.dot(DataFrame) call in process_customer_simple.
ps.set_option('compute.ops_on_diff_frames', True)

#### Count the number of unique products

In [0]:
# counts_phi has one row per product, so its row count is the number of unique
# products that were present when the model was fitted
unique_products = df_phi_data.count()
print("Total number of unique products when fitting the model:", unique_products)

# All-zero pandas-on-Spark Series with one entry per product (index 0 .. unique_products - 1)
empty_customer_product_count = ps.Series(data=0, index=range(unique_products))

## 3. Processing

##### Function to generate segmentation for each customer

In [0]:
def process_customer_simple(df_y_segmentation, df_phi_data, i):
  """Compute the motivation percentages of one customer.

  The customer's purchase counts per product are laid out as a vector with one
  entry per product, multiplied with counts_phi (dot product) and rescaled so
  that the motivations add up to 100.

  Parameters
  ----------
  df_y_segmentation : Spark DataFrame
      Purchase records; the columns customer_id and product_id are used.
  df_phi_data : Spark DataFrame
      counts_phi, one row per product and one column per motivation.
  i :
      Value matched against customer_id to select the customer.

  Returns
  -------
  pandas-on-Spark Series
      One percentage per motivation. When the motivation scores do not add up
      to a positive number, a Series of zeros indexed 0 .. k-1 is returned.

  Notes
  -----
  Reads the notebook-level variable `unique_products` for the vector length and
  relies on the pandas-on-Spark option compute.ops_on_diff_frames being enabled.
  product_id values are used as row positions of counts_phi; ids outside
  0 .. unique_products - 1 do not contribute to the result.
  """

  # Purchases per product of this customer, collected to the driver as pandas
  customer_product_counts = (
      df_y_segmentation
      .filter(F.col("customer_id") == i)
      .groupBy("product_id")
      .count()
      .toPandas()
  )

  # Build the dense count vector locally and convert it once. Writing the counts
  # one by one into a pandas-on-Spark Series extends the Spark plan per product,
  # which made each customer very slow to process.
  counts_by_product = pd.Series(
      customer_product_counts["count"].to_numpy(),
      index=customer_product_counts["product_id"].to_numpy(),
  )
  dense_counts = (
      counts_by_product
      .reindex(range(unique_products), fill_value=0)  # products not bought -> 0
      .astype("int64")
  )
  customer_purchase_frequencies = ps.from_pandas(dense_counts)

  # Convert df_phi_data to a pandas-on-Spark dataframe
  phi_data = ps.DataFrame(df_phi_data)

  # Dot product between the purchase-count vector and counts_phi gives the
  # (unnormalised) motivation scores of the customer
  motivation_probabilities = customer_purchase_frequencies.dot(phi_data)

  # Total score, used for normalisation
  sum_motivation_probabilities = motivation_probabilities.sum()

  # Normalise to percentages; fall back to zeros when there is nothing to normalise
  if sum_motivation_probabilities > 0:
    motivation_percentages = motivation_probabilities / sum_motivation_probabilities * 100
  else:
    motivation_percentages = ps.Series(0, index=range(len(motivation_probabilities)))

  # Percentages of each motivation for the specific customer
  return motivation_percentages

##### Generate the segmentations for whole set of customers!

In [0]:
# This needs to be optimised to become faster!
# (Each customer still costs a separate Spark round-trip in process_customer_simple.)

# Necessary settings
import pyspark.pandas as ps
from pyspark.pandas.config import option_context

# Number of customers to segment in this run: customer_id 0 .. n - 1 are processed.
# NOTE(review): assumes customer_id values are consecutive integers starting at 0;
# raise this value to cover the whole customer base - confirm against the
# Data Pre-Processing notebook.
n_customers_to_segment = 50

# One [customer_id, M0, M1, M2] entry per processed customer
customer_results = []

# Settings carried over unchanged: max rows and operations across different frames
with option_context(
  'compute.max_rows', 2200, "compute.ops_on_diff_frames", True
):

  # For each customer, calculate the motivation percentages
  for i in range(n_customers_to_segment):

      motivation_percentages = process_customer_simple(df_y_segmentation, df_phi_data, i)

      # Collect the three values in one go instead of three separate indexed reads
      m0, m1, m2 = motivation_percentages.to_numpy()[:3]

      customer_result = [i, m0, m1, m2]
      customer_results.append(customer_result)

# Build the pandas DataFrame once, after the loop, rather than on every iteration
result_pdf = pd.DataFrame(customer_results, columns=['customer_id', 'M0', 'M1', 'M2'])

# Convert to a pyspark dataframe and retrieve the original customer IDs by
# joining the customers_segmentation table
result_df = spark.createDataFrame(result_pdf).join(
    df_customers_segmentation, "customer_id", "left"
)

## Final segmentation table 

In [0]:
# Show the original customer ID next to the three motivation percentages
final_columns = ["CustomerID", "M0", "M1", "M2"]
result_df.select(*final_columns).display()