In [1]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

spark = SparkSession.builder.appName("customer_profile").getOrCreate()

### MySQL and Spark Connection

In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

DB_USERNAME = os.getenv('DB_USERNAME')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')

def table_df(schema_name, table_name):
    url = f"jdbc:mysql://{DB_HOST}:{DB_PORT}/{schema_name}"
    properties = {
        "user": DB_USERNAME,
        "password": DB_PASSWORD,
        "driver": "com.mysql.cj.jdbc.Driver"
    }
    df = spark.read.jdbc(url=url, table=table_name, properties=properties)
    return df

In [3]:
rw_transaction_data = table_df('customer', 'rw_transaction_data')
product_category_map = table_df('customer', 'product_category_map')

In [4]:
rw_transaction_data.show(5)

+---------+------------------+---------------------+------------+------+------+---------+----------+---------------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+
|   txn_id|last_modified_date|last_modified_date_bs|created_date|amount|status|module_id|product_id|product_type_id|payer_account_id|receiver_account_id|reward_point|cash_back_amount|revenue_amount|transactor_module_id|    time|
+---------+------------------+---------------------+------------+------+------+---------+----------+---------------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+
|660612529|        2023-03-29|           2079-12-15|  2022-11-23|  50.0|     1|        1|        77|             29|             531|                  2|         0.0|             0.0|           0.0|                   4|14:07:40|
|666435422|        2022-12-01|           2079-08-15|  2022-12-01| 750.0|     1|     

In [5]:
rw_transaction_data.count()

193367

In [6]:
product_category_map.show(5)

+---------+----------+---------------+--------------------+-------------------+-----------+
|module_id|product_id|product_type_id|        product_name|product_category_id|   txn_flow|
+---------+----------+---------------+--------------------+-------------------+-----------+
|        1|       670|             11|Siddhartha Sishu ...|                 33|Value Chain|
|        2|      2545|              3|Multipurpose Fina...|               NULL|    OutFlow|
|        2|      1444|              6|Century Corporate...|               NULL|    OutFlow|
|        2|      1008|              7|Srijana Corporate...|               NULL|    OutFlow|
|        2|       885|             10| Pokhara_Convergence|               NULL|    OutFlow|
+---------+----------+---------------+--------------------+-------------------+-----------+
only showing top 5 rows



In [7]:
product_category_map.count()

183

### Join

In [8]:
df = rw_transaction_data.join(product_category_map, on = ['product_id', 'product_type_id', 'module_id'], how='inner')

In [9]:
df.count()

216423

In [10]:
df.show(5)

+----------+---------------+---------+---------+------------------+---------------------+------------+------+------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+--------------------+-------------------+-----------+
|product_id|product_type_id|module_id|   txn_id|last_modified_date|last_modified_date_bs|created_date|amount|status|payer_account_id|receiver_account_id|reward_point|cash_back_amount|revenue_amount|transactor_module_id|    time|        product_name|product_category_id|   txn_flow|
+----------+---------------+---------+---------+------------------+---------------------+------------+------+------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+--------------------+-------------------+-----------+
|       143|             59|        1|693893736|        2023-01-06|           2079-09-22|  2023-01-06| 175.0|     1|             531|                466| 

### Date Operations

In [11]:

df = df.withColumn("last_modified_date", col("last_modified_date").cast("string"))
df = df.withColumn("dates", concat_ws(" ", col("last_modified_date"), col("time")))
df = df.withColumn('dates', to_timestamp('dates', 'yyyy-MM-dd HH:mm:ss'))
df = df.withColumn("last_modified_date", to_date('last_modified_date', 'yyyy-MM-dd'))


In [12]:
df = df.withColumn("month", month('last_modified_date'))

In [13]:
df.show(5)

+----------+---------------+---------+---------+------------------+---------------------+------------+------+------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+--------------------+-------------------+-----------+-------------------+-----+
|product_id|product_type_id|module_id|   txn_id|last_modified_date|last_modified_date_bs|created_date|amount|status|payer_account_id|receiver_account_id|reward_point|cash_back_amount|revenue_amount|transactor_module_id|    time|        product_name|product_category_id|   txn_flow|              dates|month|
+----------+---------------+---------+---------+------------------+---------------------+------------+------+------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+--------------------+-------------------+-----------+-------------------+-----+
|       143|             59|        1|693893736|        2023-01-06|         

### Txn Flow

In [14]:
grouped_df = df.groupBy(['payer_account_id', 'month', 'txn_flow']).agg(count('amount').alias('count'), sum('amount').alias('sum'))

In [15]:
grouped_df.show()

+----------------+-----+-----------+-----+-----------+
|payer_account_id|month|   txn_flow|count|        sum|
+----------------+-----+-----------+-----+-----------+
|              34|    1|Value Chain|  517|  1198676.0|
|              34|    1|     InFlow|  730|1.2991992E7|
|             222|    5|    OutFlow|    1|      650.0|
|             531|    3|Value Chain|17121|  5572004.0|
|              26|    5|     InFlow|   79|   331040.0|
|             471|    1|Value Chain|  508|    97023.0|
|              56|    1|Value Chain|  790|   329529.0|
|             531|    1|Value Chain|20695|  5440140.0|
|            1056|    1|Value Chain|    8|      650.0|
|            1056|    1|    OutFlow|    1|      755.0|
|              56|    5|    OutFlow|   76|   710800.0|
|              34|    1|    OutFlow|  793|1.3915812E7|
|              34|    3|Value Chain|  557|   960637.0|
|              56|    1|     InFlow|   94|   114575.0|
|             222|    1|Value Chain|   20|     2090.0|
|         

In [16]:

pivot_grouped_df_sum = grouped_df.groupBy('payer_account_id').pivot('txn_flow').agg(sum('sum').alias('total_amount'), avg('sum').alias('monthly_amount'))

pivot_grouped_df_sum.show()

+----------------+-------------------+---------------------+--------------------+----------------------+------------------------+--------------------------+
|payer_account_id|InFlow_total_amount|InFlow_monthly_amount|OutFlow_total_amount|OutFlow_monthly_amount|Value Chain_total_amount|Value Chain_monthly_amount|
+----------------+-------------------+---------------------+--------------------+----------------------+------------------------+--------------------------+
|             471|            54320.0|              10864.0|             55320.0|               11064.0|                454930.0|         75821.66666666667|
|              34|        6.2469919E7| 1.0411653166666666E7|         6.5480803E7|  1.0913467166666666E7|               4258781.0|         709796.8333333334|
|              26|          1437011.0|             287402.2|           2997891.0|              599578.2|                878291.0|        146381.83333333334|
|            1056|             6255.0|               3127.

In [18]:
pivot_grouped_df_count = grouped_df.groupBy('payer_account_id').pivot('txn_flow').agg(sum('count').alias('total_count'), avg('count').alias('monthly_count'))

pivot_grouped_df_count.show()

+----------------+------------------+--------------------+-------------------+---------------------+-----------------------+-------------------------+
|payer_account_id|InFlow_total_count|InFlow_monthly_count|OutFlow_total_count|OutFlow_monthly_count|Value Chain_total_count|Value Chain_monthly_count|
+----------------+------------------+--------------------+-------------------+---------------------+-----------------------+-------------------------+
|             471|                65|                13.0|                 66|                 13.2|                   2392|        398.6666666666667|
|              34|              3477|               579.5|               3709|    618.1666666666666|                   2379|                    396.5|
|              26|               469|                93.8|                571|                114.2|                   1733|        288.8333333333333|
|            1056|                 2|                 1.0|                  3|                

### Reward Points

In [48]:
reward_agg = df.groupBy('payer_account_id').agg(sum('reward_point').alias('reward_point'))

In [49]:
reward_agg.show()

+----------------+------------+
|payer_account_id|reward_point|
+----------------+------------+
|             471|       180.0|
|              34|      2639.0|
|              26|       207.0|
|            1056|        52.0|
|             222|        22.0|
|            1176|         0.0|
|             531|         0.0|
|             538|         0.0|
|              56|       805.0|
+----------------+------------+



### Revenue

In [21]:
monthly_revenue = df.groupBy(['payer_account_id', 'month']).agg(sum('revenue_amount').alias('revenue_amount'))

In [22]:
revenue = monthly_revenue.groupBy('payer_account_id').agg(avg('revenue_amount').alias('monthly_average_lifetime_revenue'), sum('revenue_amount').alias('total_revenue'))

In [23]:
revenue.show()

+----------------+--------------------------------+-------------------+
|payer_account_id|monthly_average_lifetime_revenue|      total_revenue|
+----------------+--------------------------------+-------------------+
|             471|              229.40999999999977| 1376.4599999999987|
|              34|              2050.7116666666657| 12304.269999999993|
|              26|              285.86833333333396|  1715.210000000004|
|            1056|               21.88333333333333| 131.29999999999998|
|             222|              18.179999999999996| 109.07999999999998|
|            1176|                            0.56|               0.56|
|             531|              19670.311666666665|          118021.87|
|             538|             0.08000000000000007|0.08000000000000007|
|              56|              1172.4933333333402|  7034.960000000041|
+----------------+--------------------------------+-------------------+



### Latest Transaction Date

In [24]:
latest_trans_date = df.groupBy('payer_account_id').agg(max('dates').alias('latest_transaction_date'))

In [25]:
df.show(5)

+----------+---------------+---------+---------+------------------+---------------------+------------+------+------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+--------------------+-------------------+-----------+-------------------+-----+
|product_id|product_type_id|module_id|   txn_id|last_modified_date|last_modified_date_bs|created_date|amount|status|payer_account_id|receiver_account_id|reward_point|cash_back_amount|revenue_amount|transactor_module_id|    time|        product_name|product_category_id|   txn_flow|              dates|month|
+----------+---------------+---------+---------+------------------+---------------------+------------+------+------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+--------------------+-------------------+-----------+-------------------+-----+
|       143|             59|        1|693893736|        2023-01-06|         

In [26]:
latest_trans_date.show()

+----------------+-----------------------+
|payer_account_id|latest_transaction_date|
+----------------+-----------------------+
|             471|    2023-05-24 18:30:54|
|              34|    2023-05-24 19:53:52|
|              26|    2023-05-24 20:46:04|
|            1056|    2023-05-21 19:34:59|
|             222|    2023-05-16 14:58:00|
|            1176|    2023-03-04 15:19:19|
|             531|    2023-05-24 23:59:25|
|             538|    2023-05-18 18:48:37|
|              56|    2023-05-24 19:57:46|
+----------------+-----------------------+



In [27]:
df.join(latest_trans_date, on=latest_trans_date['latest_transaction_date'] == df['dates'], how = 'semi').show()

+----------+---------------+---------+---------+------------------+---------------------+------------+------+------+----------------+-------------------+------------+----------------+--------------------+--------------------+--------+--------------------+-------------------+-----------+-------------------+-----+
|product_id|product_type_id|module_id|   txn_id|last_modified_date|last_modified_date_bs|created_date|amount|status|payer_account_id|receiver_account_id|reward_point|cash_back_amount|      revenue_amount|transactor_module_id|    time|        product_name|product_category_id|   txn_flow|              dates|month|
+----------+---------------+---------+---------+------------------+---------------------+------------+------+------+----------------+-------------------+------------+----------------+--------------------+--------------------+--------+--------------------+-------------------+-----------+-------------------+-----+
|         2|             12|        1|809054178|        20

In [28]:
latest = df.join(latest_trans_date, on=latest_trans_date['latest_transaction_date'] == df['dates'], how = 'semi').select(df['payer_account_id'], df['dates'].alias('latest_transaction_date'), df['product_name'].alias('latest_used_product'))

In [29]:
latest = latest.dropDuplicates(['payer_account_id'])

In [30]:
latest.show()

+----------------+-----------------------+--------------------+
|payer_account_id|latest_transaction_date| latest_used_product|
+----------------+-----------------------+--------------------+
|              26|    2023-05-24 20:46:04|           WorldLink|
|              34|    2023-05-24 19:53:52|          Send Money|
|              56|    2023-05-24 19:57:46|         Ncell Topup|
|             222|    2023-05-16 14:58:00|    NT Prepaid Topup|
|             471|    2023-05-24 18:30:54|         Ncell Topup|
|             531|    2023-05-24 23:59:25|Ncell Topup via Bank|
|             538|    2023-05-18 18:48:37|    NT Prepaid Topup|
|            1056|    2023-05-21 19:34:59|    NT Prepaid Topup|
|            1176|    2023-03-04 15:19:19|         Ncell Topup|
+----------------+-----------------------+--------------------+



### Product Usage

In [42]:
product_usage = df.groupBy('payer_account_id').agg(count('product_id').alias('product usage'))
product_usage.show()

+----------------+-------------+
|payer_account_id|product usage|
+----------------+-------------+
|             471|         2523|
|              34|         9565|
|              26|         2773|
|            1056|           68|
|             222|           85|
|            1176|            2|
|             531|       195965|
|             538|            2|
|              56|         5440|
+----------------+-------------+



### Nth Used Product

In [32]:
product_counts = df.groupBy('payer_account_id', 'product_name').agg(count('product_name').alias('product usage'))

In [33]:
product_counts.show()

+----------------+--------------------+-------------+
|payer_account_id|        product_name|product usage|
+----------------+--------------------+-------------+
|              34|          Send Money|         6926|
|             531| Budhabare Khanepani|           29|
|             531|Chandragadi Khane...|           33|
|              26|     Dish Home Topup|           17|
|             531|   Bhaluhi khanepani|          210|
|              34|           Prabhu TV|           13|
|             531|Khanepani Sanstha...|          531|
|              56|             Cash In|           52|
|             531|    Brihat Khanepani|          112|
|              26|         Electricity|           18|
|              34|Chhimek Laghubitt...|            5|
|             531| Besisahar Khanepani|          125|
|              56|eSewa to Citizens...|          126|
|              56|          Ncell Pack|          338|
|              34|          Ncell Pack|            2|
|             471|          

In [34]:
# Sort by payer_account_id ascending and count descending

from pyspark.sql.window import Window

In [35]:
ranked_products = product_counts.withColumn("row_number", row_number().over(Window.partitionBy("payer_account_id").orderBy(col("product usage").desc())))

In [36]:
ranked_products.show()

+----------------+--------------------+-------------+----------+
|payer_account_id|        product_name|product usage|row_number|
+----------------+--------------------+-------------+----------+
|              26|          Send Money|          622|         1|
|              26|         Ncell Topup|          546|         2|
|              26|    NT Prepaid Topup|          472|         3|
|              26|       Prepaid Topup|          472|         4|
|              26|             Cash In|          276|         5|
|              26|     Fonepay Payment|           75|         6|
|              26|      eScrow Service|           40|         7|
|              26|Linked Account Wi...|           31|         8|
|              26|           WorldLink|           30|         9|
|              26|      Postpaid Topup|           27|        10|
|              26|   NT Postpaid Topup|           27|        11|
|              26|eSewa to Mahalaxm...|           21|        12|
|              26|       

In [37]:
# most_used_product = ranked_products.filter(col("row_number") == 1).drop("row_number")
# second_used_product = ranked_products.filter(col("row_number") == 2).drop("row_number")
# third_used_product = ranked_products.filter(col("row_number") == 3).drop("row_number")


In [38]:
nth_result_df = ranked_products.groupBy("payer_account_id").agg(
    max(when(col("row_number") == 1, col("product_name"))).alias("most_used_product"),
    max(when(col("row_number") == 2, col("product_name"))).alias("second_most_used_product"),
    max(when(col("row_number") == 3, col("product_name"))).alias("third_most_used_product")
)

In [39]:
nth_result_df.show()

+----------------+-----------------+------------------------+-----------------------+
|payer_account_id|most_used_product|second_most_used_product|third_most_used_product|
+----------------+-----------------+------------------------+-----------------------+
|              26|       Send Money|             Ncell Topup|       NT Prepaid Topup|
|              34|       Send Money|             Electricity|       NT Prepaid Topup|
|              56|      Ncell Topup|             Electricity|             Send Money|
|             222|      Ncell Topup|          Postpaid Topup|      NT Postpaid Topup|
|             471|      Ncell Topup|           Prepaid Topup|       NT Prepaid Topup|
|             531|NT Topup via Bank|    TOPUP VIA BANK DI...|   Ncell Topup via Bank|
|             538|    Prepaid Topup|        NT Prepaid Topup|                   NULL|
|            1056|    Prepaid Topup|        NT Prepaid Topup|            Ncell Topup|
|            1176|      Ncell Topup|                  

In [52]:
final_result = (
    pivot_grouped_df_sum
    .join(pivot_grouped_df_count, on='payer_account_id')
    .join(reward_agg, on='payer_account_id')
    .join(revenue, on='payer_account_id')
    .join(latest, on='payer_account_id')
    .join(product_usage, on='payer_account_id')
    .join(nth_result_df, on='payer_account_id')
)

In [53]:
final_result.show()

+----------------+-------------------+---------------------+--------------------+----------------------+------------------------+--------------------------+------------------+--------------------+-------------------+---------------------+-----------------------+-------------------------+------------+--------------------------------+-------------------+-----------------------+--------------------+-------------+-----------------+------------------------+-----------------------+
|payer_account_id|InFlow_total_amount|InFlow_monthly_amount|OutFlow_total_amount|OutFlow_monthly_amount|Value Chain_total_amount|Value Chain_monthly_amount|InFlow_total_count|InFlow_monthly_count|OutFlow_total_count|OutFlow_monthly_count|Value Chain_total_count|Value Chain_monthly_count|reward_point|monthly_average_lifetime_revenue|      total_revenue|latest_transaction_date| latest_used_product|product usage|most_used_product|second_most_used_product|third_most_used_product|
+----------------+-------------------+

In [47]:
len(final_result.columns)

22