In [53]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [8]:
# Build the Spark session used by every cell below, or reuse the one that is
# already active in this kernel. 'local' runs Spark inside this process.
spark = SparkSession.builder.master('local').appName('Interview_training').getOrCreate()

### Q1: While ingesting customer data from an external source, you notice duplicate entries. How would you remove the duplicates and retain only the latest entry, based on a timestamp column?

In [38]:
# Sample input: two rows per customer_id, one for each of two consecutive days.
data = [
    (101, "2023-12-01", 100),
    (101, "2023-12-02", 150),
    (102, "2023-12-01", 200),
    (102, "2023-12-02", 250),
]
schema = 'customer_id int, date string, amount int'

# `date` is loaded as a string and then cast, so the cells below work with a
# real DateType column instead of text.
df = (
    spark.createDataFrame(data, schema)
         .withColumn('date', col('date').cast(DateType()))
)
df.printSchema()
df.show()

+-----------+----------+------+
|customer_id|      date|amount|
+-----------+----------+------+
|        101|2023-12-01|   100|
|        101|2023-12-02|   150|
|        102|2023-12-01|   200|
|        102|2023-12-02|   250|
+-----------+----------+------+



In [65]:
# NOTE: sorting and then calling dropDuplicates() is not a reliable way to keep
# the newest row. dropDuplicates() makes no promise about WHICH duplicate
# survives, so the sort can be lost once the data is spread over several
# partitions. Numbering the rows inside each customer_id and keeping row 1
# makes the choice explicit (only exact ties on the ordering remain arbitrary).
def _latest_per_customer(frame, newest_first):
    """Return one row per customer_id: the row that sorts first.

    Args:
        frame: DataFrame that has a `customer_id` column.
        newest_first: sort expression (e.g. ``col('date').desc()``) that puts
            the row to keep ahead of the other rows of the same customer.

    Returns:
        DataFrame with the same columns as `frame`, one row per customer_id.
    """
    per_customer = Window.partitionBy('customer_id').orderBy(newest_first)
    return (
        frame.withColumn('_row_num', row_number().over(per_customer))
             .where(col('_row_num') == 1)
             .drop('_row_num')  # helper column must not leak into the result
    )


# Four equivalent ways to spell "date, newest first".
_latest_per_customer(df, col('date').desc()).show() #solution1
_latest_per_customer(df, desc('date')).show() #solution2
_latest_per_customer(df, df['date'].desc()).show() # solution3
# desc() already sorts nulls last; this spelling just makes that explicit.
_latest_per_customer(df, col('date').desc_nulls_last()).show() # solution4

+-----------+----------+------+
|customer_id|      date|amount|
+-----------+----------+------+
|        101|2023-12-02|   150|
|        102|2023-12-02|   250|
+-----------+----------+------+

+-----------+----------+------+
|customer_id|      date|amount|
+-----------+----------+------+
|        101|2023-12-02|   150|
|        102|2023-12-02|   250|
+-----------+----------+------+

+-----------+----------+------+
|customer_id|      date|amount|
+-----------+----------+------+
|        101|2023-12-02|   150|
|        102|2023-12-02|   250|
+-----------+----------+------+

+-----------+----------+------+
|customer_id|      date|amount|
+-----------+----------+------+
|        101|2023-12-02|   150|
|        102|2023-12-02|   250|
+-----------+----------+------+



In [56]:
#solution 5
# Number each customer's rows from the newest date to the oldest, then keep
# only the row numbered 1. The helper column is called `rank`, but it holds
# row_number() values, so exactly one row per customer gets the value 1.
Window1 = Window.partitionBy('customer_id').orderBy(desc('date'))
rn = row_number().over(Window1)
df.withColumn('rank', rn).filter(col('rank') == 1).show()

+-----------+----------+------+----+
|customer_id|      date|amount|rank|
+-----------+----------+------+----+
|        101|2023-12-02|   150|   1|
|        102|2023-12-02|   250|   1|
+-----------+----------+------+----+



In [64]:
#solution 6
# Register the DataFrame as a temp view so Spark SQL can query it as `table1`.
df.createOrReplaceTempView('table1')

# The CTE numbers each customer's rows newest-first; the outer query keeps
# row 1 and selects only the original three columns.
latest_entry_sql = '''
WITH cte1 as (
    SELECT customer_id
           ,date
           ,amount
           , ROW_NUMBER() OVER(PARTITION BY customer_id ORDER BY date DESC) as rank
FROM table1
)

SELECT customer_id,
        date,
        amount FROM cte1 
        where rank = 1 

'''
spark.sql(latest_entry_sql).show()

+-----------+----------+------+
|customer_id|      date|amount|
+-----------+----------+------+
|        101|2023-12-02|   150|
|        102|2023-12-02|   250|
+-----------+----------+------+

