In [1]:
from pyspark.sql import SparkSession
from pyspark import  SparkContext
import os
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Start (or reuse) a Spark session whose master is set to "yarn".
spark = (
    SparkSession.builder
    .master("yarn")
    .getOrCreate()
)

/spark/bin/load-spark-env.sh: line 68: ps: command not found
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/14 15:56:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/14 15:56:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/03/14 15:56:24 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [3]:
import pandas as pd

# Visits fixture: user ids paired with the date (ISO string) of each visit.
visits_data = {
    'user_id': [1, 2, 12, 19, 1, 2, 1, 7, 9, 8],
    'visit_date': [
        '2020-01-01', '2020-01-02', '2020-01-01', '2020-01-03', '2020-01-02',
        '2020-01-03', '2020-01-04', '2020-01-11', '2020-01-25', '2020-01-28',
    ],
}

# Transactions fixture: user id, transaction date (ISO string) and amount.
transactions_data = {
    'user_id': [1, 2, 7, 1, 9, 9, 8, 9],
    'transaction_date': [
        '2020-01-02', '2020-01-03', '2020-01-11', '2020-01-04',
        '2020-01-25', '2020-01-25', '2020-01-28', '2020-01-25',
    ],
    'amount': [120, 22, 232, 7, 33, 66, 1, 99],
}

# Build the pandas frames and parse the date strings into datetime columns.
visits_df = pd.DataFrame(visits_data).assign(
    visit_date=lambda frame: pd.to_datetime(frame['visit_date'])
)
transactions_df = pd.DataFrame(transactions_data).assign(
    transaction_date=lambda frame: pd.to_datetime(frame['transaction_date'])
)

# Register each pandas frame with Spark as a temp view so it can be queried via SQL.
# `df_person` ends up bound to the Transactions Spark DataFrame.
for view_name, pandas_frame in (("Visits", visits_df), ("Transactions", transactions_df)):
    df_person = spark.createDataFrame(pandas_frame)
    df_person.createOrReplaceTempView(view_name)



In [4]:
# Display the pandas frame that was registered as the "Visits" temp view.
visits_df

Unnamed: 0,user_id,visit_date
0,1,2020-01-01
1,2,2020-01-02
2,12,2020-01-01
3,19,2020-01-03
4,1,2020-01-02
5,2,2020-01-03
6,1,2020-01-04
7,7,2020-01-11
8,9,2020-01-25
9,8,2020-01-28


In [5]:
# Display the pandas frame that was registered as the "Transactions" temp view.
transactions_df

Unnamed: 0,user_id,transaction_date,amount
0,1,2020-01-02,120
1,2,2020-01-03,22
2,7,2020-01-11,232
3,1,2020-01-04,7
4,9,2020-01-25,33
5,9,2020-01-25,66
6,8,2020-01-28,1
7,9,2020-01-25,99


In [6]:
# Pair every visit with the transactions made by the same user on the same date.
# The left join keeps visits that have no matching transaction (their amount is NULL),
# and a visit that matches several transactions yields one row per transaction.
query = """
    select
        v.user_id,
        v.visit_date,
        t.amount
    from Visits v
    left join Transactions t on v.user_id = t.user_id and v.visit_date = t.transaction_date
    order by v.user_id asc, v.visit_date asc

"""

In [7]:
# Run the visit/transaction join and print the resulting rows.
visit_amounts = spark.sql(query)
visit_amounts.show()

                                                                                

+-------+-------------------+------+
|user_id|         visit_date|amount|
+-------+-------------------+------+
|      1|2020-01-01 00:00:00|  NULL|
|      1|2020-01-02 00:00:00|   120|
|      1|2020-01-04 00:00:00|     7|
|      2|2020-01-02 00:00:00|  NULL|
|      2|2020-01-03 00:00:00|    22|
|      7|2020-01-11 00:00:00|   232|
|      8|2020-01-28 00:00:00|     1|
|      9|2020-01-25 00:00:00|    99|
|      9|2020-01-25 00:00:00|    66|
|      9|2020-01-25 00:00:00|    33|
|     12|2020-01-01 00:00:00|  NULL|
|     19|2020-01-03 00:00:00|  NULL|
+-------+-------------------+------+



In [10]:
# Count the transactions attached to each (user_id, visit_date) visit.
# `query` yields one row per visit/transaction pair, with a NULL amount when the
# visit had no matching transaction. count(amount) ignores NULLs, so those visits
# get 0 -- the same result as the previous sum(case ... end), stated directly.
# The derived table is given an explicit alias for readability and portability.
query_1 = f"""

    select
        user_id,
        visit_date,
        count(amount) as total_transaction
    from ({query}) as visit_transactions
    group by user_id, visit_date
    order by user_id, visit_date
"""

In [11]:
# Run the per-visit transaction count and print the resulting rows.
transactions_per_visit = spark.sql(query_1)
transactions_per_visit.show()

+-------+-------------------+-----------------+
|user_id|         visit_date|total_transaction|
+-------+-------------------+-----------------+
|      1|2020-01-01 00:00:00|                0|
|      1|2020-01-02 00:00:00|                1|
|      1|2020-01-04 00:00:00|                1|
|      2|2020-01-02 00:00:00|                0|
|      2|2020-01-03 00:00:00|                1|
|      7|2020-01-11 00:00:00|                1|
|      8|2020-01-28 00:00:00|                1|
|      9|2020-01-25 00:00:00|                3|
|     12|2020-01-01 00:00:00|                0|
|     19|2020-01-03 00:00:00|                0|
+-------+-------------------+-----------------+



In [18]:
# Histogram: for every transaction count from 0 up to the maximum observed,
# how many visits had exactly that many transactions.
#
# - visits_per_count aggregates the per-visit counts produced by `query_1`.
# - sequence(0, max) + explode generates every count in the range, so counts
#   that no visit has still appear; the left join plus ifnull reports them as 0.
# - The final column is aliased as visits_count; without the alias Spark names it
#   after the expression text ("ifnull(visits_count, 0)").
# - The CTE carries no ORDER BY: only the outer ORDER BY affects the result order.
# - The CTE is not called `t`, to avoid confusion with the `Transactions t` alias
#   inside the embedded base query.
query_2 = f"""

    with visits_per_count as (
        select
            total_transaction as transactions_count,
            count(distinct user_id, visit_date) as visits_count
        from ({query_1}) as per_visit
        group by total_transaction
    )

    select
        k.transactions_count,
        ifnull(c.visits_count, 0) as visits_count
    from
        (select explode(sequence(0, (select max(transactions_count) from visits_per_count))) as transactions_count) as k
        left join visits_per_count c on k.transactions_count = c.transactions_count
    order by k.transactions_count asc
"""

In [19]:
# Run the visits-per-transaction-count histogram and print the resulting rows.
visits_histogram = spark.sql(query_2)
visits_histogram.show()

+------------------+-----------------------+
|transactions_count|ifnull(visits_count, 0)|
+------------------+-----------------------+
|                 0|                      4|
|                 1|                      5|
|                 2|                      0|
|                 3|                      1|
+------------------+-----------------------+

