In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Create the Spark session used by every later cell, or reuse one that is
# already active (getOrCreate).
spark = (
    SparkSession.builder
    .appName('Spark SQL query df')
    .getOrCreate()
)

In [3]:
# Location of the input CSV file.
# NOTE(review): absolute, machine-specific path — presumably has to be adjusted
# before running this notebook anywhere else; confirm.
data_path = '/home/lorenzo/spark-repo/0_data/gkanadeb.csv'

# Load the CSV with the 'header' and 'inferSchema' reader options switched on.
df = (
    spark.read
    .option('header', 'True')
    .option('inferSchema', 'True')
    .csv(data_path)
)

In [4]:
 df = df.withColumn('snapshot',F.to_date(F.col('snapshot'),'dd/MM/yyyy')) \
        .withColumn('emissione',F.to_date(F.col('emissione'),'dd/MM/yyyy')) \

In [5]:
# Print the DataFrame's column names and types; 'snapshot' and 'emissione'
# are expected to show up as date columns here.
df.printSchema()

root
 |-- ndg: string (nullable = true)
 |-- snapshot: date (nullable = true)
 |-- emissione: date (nullable = true)
 |-- numero: string (nullable = true)
 |-- flag: string (nullable = true)



In [6]:
# Preview the data; show() without arguments prints only the first 20 rows.
df.show()

+---+----------+----------+------+----+
|ndg|  snapshot| emissione|numero|flag|
+---+----------+----------+------+----+
|AAA|2020-03-10|2020-01-05|  AAA2|   A|
|AAA|2020-03-10|2018-05-12|  AAA1|   B|
|AAA|2020-03-14|2020-01-05|  AAA2|   A|
|AAA|2020-03-14|2018-05-12|  AAA1|   B|
|AAA|2020-02-20|2020-01-05|  AAA2|   A|
|AAA|2020-02-20|2018-05-12|  AAA1|   B|
|AAA|2019-12-27|2018-05-12|  AAA1|   A|
|BBB|2020-03-10|2019-09-17|  BBB1|   B|
|BBB|2020-03-10|2019-12-20|  BBB2|   C|
|BBB|2020-03-14|2019-09-17|  BBB1|   B|
|BBB|2020-03-14|2019-12-20|  BBB2|   C|
|BBB|2020-02-20|2019-09-17|  BBB1|   B|
|BBB|2020-02-20|2019-12-20|  BBB2|   C|
|BBB|2019-12-27|2019-09-17|  BBB1|   B|
|BBB|2019-12-27|2019-12-20|  BBB2|   C|
|CCC|2020-03-10|2019-01-05|  CCC1|   A|
|CCC|2020-03-14|2019-01-05|  CCC1|   A|
|CCC|2020-03-03|2019-01-05|  CCC1|   A|
|CCC|2020-02-25|2019-01-05|  CCC1|   A|
|CCC|2020-02-18|2019-01-05|  CCC1|   A|
+---+----------+----------+------+----+
only showing top 20 rows



In [7]:
# Register df as the temporary view 'gkanadeb' so that SQL text passed to
# spark.sql() can refer to it by that name.
df.createOrReplaceTempView('gkanadeb')

In [8]:
# One row per (ndg, month-end of snapshot), built with a plain GROUP BY.
# "GROUP BY 1,2" refers to the first two SELECT items: ndg and dt_rif.
# Caveat: max(emissione), first(numero) and first(flag) are aggregated
# independently of each other, so numero/flag are NOT guaranteed to come from
# the row that holds the latest emissione. In the recorded output, BBB pairs
# emissione 2019-12-20 with numero BBB1 / flag B, although the input data shows
# that 2019-12-20 belongs to BBB2 / C.
spark.sql('SELECT ndg, \
                  last_day(snapshot) as dt_rif, \
                  max(emissione), \
                  first(numero), \
                  first(flag) \
           FROM gkanadeb \
           GROUP BY 1,2 \
           ORDER BY ndg, dt_rif DESC').show()

+---+----------+--------------+--------------------+------------------+
|ndg|    dt_rif|max(emissione)|first(numero, false)|first(flag, false)|
+---+----------+--------------+--------------------+------------------+
|AAA|2020-03-31|    2020-01-05|                AAA2|                 A|
|AAA|2020-02-29|    2020-01-05|                AAA2|                 A|
|AAA|2019-12-31|    2018-05-12|                AAA1|                 A|
|BBB|2020-03-31|    2019-12-20|                BBB1|                 B|
|BBB|2020-02-29|    2019-12-20|                BBB1|                 B|
|BBB|2019-12-31|    2019-12-20|                BBB1|                 B|
|CCC|2020-03-31|    2019-01-05|                CCC1|                 A|
|CCC|2020-02-29|    2019-01-05|                CCC1|                 A|
+---+----------+--------------+--------------------+------------------+



In [9]:
# One row per (ndg, month-end of snapshot), keeping the entire row that has the
# latest emissione:
#   - the CTE 'summary' numbers the rows of each (ndg, last_day(snapshot))
#     partition with ROW_NUMBER(), ordered by emissione descending;
#   - the outer query keeps only the rows numbered 1.
# Because whole rows are selected, numero and flag come from the same row as
# emissione. The helper column rk is part of the result because of "s.*".
# NOTE(review): rows tied on emissione inside a partition are presumably
# numbered in an arbitrary order — add a tie-breaker to ORDER BY if that matters.
spark.sql('WITH summary AS (\
                            SELECT g.ndg, \
                            last_day(g.snapshot) as dt_rif, \
                            g.emissione, \
                            g.numero, \
                            g.flag, \
                            ROW_NUMBER() OVER(PARTITION BY g.ndg, last_day(g.snapshot) \
                                              ORDER BY g.emissione DESC \
                                              ) as rk \
                            FROM gkanadeb as g) \
           SELECT s.* \
           FROM summary as s \
           WHERE s.rk = 1 \
           ORDER BY ndg, dt_rif DESC').show()

+---+----------+----------+------+----+---+
|ndg|    dt_rif| emissione|numero|flag| rk|
+---+----------+----------+------+----+---+
|AAA|2020-03-31|2020-01-05|  AAA2|   A|  1|
|AAA|2020-02-29|2020-01-05|  AAA2|   A|  1|
|AAA|2019-12-31|2018-05-12|  AAA1|   A|  1|
|BBB|2020-03-31|2019-12-20|  BBB2|   C|  1|
|BBB|2020-02-29|2019-12-20|  BBB2|   C|  1|
|BBB|2019-12-31|2019-12-20|  BBB2|   C|  1|
|CCC|2020-03-31|2019-01-05|  CCC1|   A|  1|
|CCC|2020-02-29|2019-01-05|  CCC1|   A|  1|
+---+----------+----------+------+----+---+

