In [1]:
from pyspark.sql import SparkSession
from pyspark import  SparkContext
import os
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Create the notebook-wide SparkSession, or reuse one that already exists,
# with YARN as the cluster manager.
spark = (
    SparkSession.builder
    .master("yarn")
    .getOrCreate()
)

/spark/bin/load-spark-env.sh: line 68: ps: command not found
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/19 16:46:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/19 16:46:36 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [3]:
# Build a small pandas fixture for the Transactions table and register it as the
# Spark SQL temp view "Transactions" used by the queries below.
# (pandas is already imported as `pd` in the first cell.)

# Sample data for Transactions table
transactions_data = {
    'transaction_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    'customer_id': [101, 101, 101, 102, 102, 102, 105, 105, 105, 105, 105, 105, 105],
    'transaction_date': ['2023-05-01', '2023-05-02', '2023-05-03', '2023-05-01', '2023-05-03', '2023-05-04',
                         '2023-05-01', '2023-05-02', '2023-05-03', '2023-05-04', '2023-05-12', '2023-05-13', '2023-05-14'],
    'amount': [100, 150, 200, 50, 100, 200, 100, 150, 200, 300, 250, 260, 270]
}

# One chained expression instead of step-wise in-place mutation:
#   * parse the ISO date strings into datetime64 values
#   * use transaction_id as the index (the table's primary key)
transactions_df = (
    pd.DataFrame(transactions_data)
    .assign(transaction_date=lambda frame: pd.to_datetime(frame['transaction_date']))
    .set_index('transaction_id')
)

# Display the DataFrame
print("Transactions table:")
print(transactions_df)

# spark.createDataFrame() does not carry the pandas index over, so handing it
# `transactions_df` directly would silently drop transaction_id from the view.
# reset_index() turns the index back into an ordinary column first; the other
# columns (customer_id, transaction_date, amount) are unchanged.
df_person = spark.createDataFrame(transactions_df.reset_index())
df_person.createOrReplaceTempView("Transactions")



Transactions table:
                customer_id transaction_date  amount
transaction_id                                      
1                       101       2023-05-01     100
2                       101       2023-05-02     150
3                       101       2023-05-03     200
4                       102       2023-05-01      50
5                       102       2023-05-03     100
6                       102       2023-05-04     200
7                       105       2023-05-01     100
8                       105       2023-05-02     150
9                       105       2023-05-03     200
10                      105       2023-05-04     300
11                      105       2023-05-12     250
12                      105       2023-05-13     260
13                      105       2023-05-14     270


In [21]:
# Gap-filled daily amounts per customer.
#
# cus_and_day: for every customer, one row per calendar day from their first to
#   their last transaction date (offsets 0..N are exploded and added to the
#   first date with date_add, which yields a DATE).
# Final select: left-join the real transactions onto that calendar so days
#   without a transaction show up with amount 0.
#
# Notes:
#   * ORDER BY lives on the outermost query. Ordering applied inside a derived
#     table is not a guaranteed property of the enclosing query's result.
#   * transaction_date is a timestamp (it is built with pd.to_datetime) while
#     `day` is a date, so the join compares on to_date(transaction_date); a
#     timestamp carrying a time-of-day still matches its calendar day.
query = """
    with cus_and_day as (
        select
            customer_id,
            date_add(min_transaction_date, day_add) as day
        from (
            select
                customer_id,
                min(transaction_date) as min_transaction_date,
                explode(sequence(0, date_diff(max(transaction_date), min(transaction_date)))) as day_add
            from Transactions t
            group by customer_id
        )
    )

    select
        c.customer_id,
        c.day,
        ifnull(t.amount, 0) as amount
    from cus_and_day c
    left join Transactions t
        on c.customer_id = t.customer_id
        and c.day = to_date(t.transaction_date)
    order by c.customer_id, c.day
"""

In [22]:
# Run the daily-amount query and print up to 10000 rows.
spark.sql(query).show(n=10000)

+-----------+----------+------+
|customer_id|       day|amount|
+-----------+----------+------+
|        101|2023-05-01|   100|
|        101|2023-05-02|   150|
|        101|2023-05-03|   200|
|        102|2023-05-01|    50|
|        102|2023-05-02|     0|
|        102|2023-05-03|   100|
|        102|2023-05-04|   200|
|        105|2023-05-01|   100|
|        105|2023-05-02|   150|
|        105|2023-05-03|   200|
|        105|2023-05-04|   300|
|        105|2023-05-05|     0|
|        105|2023-05-06|     0|
|        105|2023-05-07|     0|
|        105|2023-05-08|     0|
|        105|2023-05-09|     0|
|        105|2023-05-10|     0|
|        105|2023-05-11|     0|
|        105|2023-05-12|   250|
|        105|2023-05-13|   260|
|        105|2023-05-14|   270|
+-----------+----------+------+



In [42]:
# Streaks of at least 3 transactions on consecutive days with strictly
# increasing amounts, reported as (customer_id, consecutive_start, consecutive_end).
#
# Pipeline (one CTE per step):
#   with_previous: attach the previous transaction's date and amount per
#       customer. lag() defaults to the row's own value, so a customer's first
#       row compares against itself and is never flagged as consecutive.
#   flagged:  is_consecutive = exactly one day after the previous transaction
#       AND a larger amount than the previous transaction.
#   grouped:  running count of non-consecutive rows per customer. Every row that
#       breaks a streak bumps the counter, so all rows of one streak share a
#       group_id.
#   streaks:  first date, last date and size of each group.
#
# The lagged columns are computed once and reused (they were previously
# recomputed inside each flag), and the final ORDER BY makes the row order
# deterministic after the GROUP BY.
query_1 = """
    with with_previous as (
        select
            customer_id,
            transaction_date,
            amount,
            lag(transaction_date, 1, transaction_date) over w as previous_transaction_date,
            lag(amount, 1, amount) over w as previous_amount
        from Transactions t
        window w as (partition by customer_id order by transaction_date asc)
    ),
    flagged as (
        select
            *,
            (
                date_diff(transaction_date, previous_transaction_date) = 1
                and amount > previous_amount
            ) as is_consecutive
        from with_previous
    ),
    grouped as (
        select
            *,
            sum(
                if(is_consecutive, 0, 1)
            ) over w as group_id
        from flagged
        window w as (partition by customer_id order by transaction_date asc)
    ),
    streaks as (
        select
            customer_id,
            min(transaction_date) as consecutive_start,
            max(transaction_date) as consecutive_end,
            count(*) as number_in_group
        from grouped
        group by customer_id, group_id
    )

    select
        customer_id,
        consecutive_start,
        consecutive_end
    from streaks
    where number_in_group >= 3
    order by customer_id, consecutive_start
"""

In [43]:
# Run the streak query and print the result table.
(
    spark
    .sql(query_1)
    .show()
)

+-----------+-------------------+-------------------+
|customer_id|  consecutive_start|    consecutive_end|
+-----------+-------------------+-------------------+
|        101|2023-05-01 00:00:00|2023-05-03 00:00:00|
|        105|2023-05-01 00:00:00|2023-05-04 00:00:00|
|        105|2023-05-12 00:00:00|2023-05-14 00:00:00|
+-----------+-------------------+-------------------+

