In [None]:
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder.appName("partitioning").master("local[4]").getOrCreate()

In [None]:
spark

In [4]:
import pyspark.sql.functions as F
from pyspark.sql.types import * 

In [14]:
spotify_active_Listen = (
    spark.read.option('inferschema',True).option('header',True)
    .csv('./_spotify_partitioned_data/raw/Spotify_Listening_Activity.csv')
)

In [22]:
spotify_active_Listen.printSchema()
spotify_active_Listen.show(5)

root
 |-- activity_id: integer (nullable = true)
 |-- song_id: integer (nullable = true)
 |-- listen_date: string (nullable = true)
 |-- listen_duration: integer (nullable = true)

+-----------+-------+--------------------+---------------+
|activity_id|song_id|         listen_date|listen_duration|
+-----------+-------+--------------------+---------------+
|          1|     12|2023-06-27 10:15:...|             69|
|          2|     44|2023-06-27 10:15:...|            300|
|          3|     75|2023-06-27 10:15:...|             73|
|          4|     48|2023-06-27 10:15:...|            105|
|          5|     10|2023-06-27 10:15:...|            229|
+-----------+-------+--------------------+---------------+
only showing top 5 rows



In [26]:
spotify_active_Listen = (
    spotify_active_Listen
    .withColumnRenamed('listen_date','listen_datetime')
    .withColumn('listen_datetime', F.to_timestamp(F.col('listen_datetime'),'yyyy-MM-dd HH:mm:ss.SSSSSS'))
    .withColumn('listen_date',F.to_date(F.col('listen_datetime')))
    .withColumn('listen_hour',F.hour(col=F.col('listen_datetime')))
)

In [32]:
(
    spotify_active_Listen
        .write
        .partitionBy('listen_date')
        .mode("overwrite")
        .save('./_spotify_partitioned_data/raw/partitioned_data')
)

                                                                                

In [None]:
# It can be writtern with repartition or coalesce also to take control of how many files 

(
    spotify_active_Listen
        .repartitionBy(3)  # This controls how many files for each partition 
        .write
        .partitionBy('listen_date')
        .mode("overwrite")
        .save('./_spotify_partitioned_data/raw/partitioned_data')
)


In [None]:
# Reading the partitioned files 

df_partitioned_data  = (
        spark.read.parquet('/home/hari/python-notebooks/_spotify_partitioned_data/raw/partitioned_data/')
)

                                                                                

In [37]:
df_partitioned_data.filter(F.col('listen_date') == '2023-05-02').show()

+-----------+-------+--------------------+---------------+-----------+-----------+
|activity_id|song_id|     listen_datetime|listen_duration|listen_hour|listen_date|
+-----------+-------+--------------------+---------------+-----------+-----------+
|      10922|     73|2023-05-02 10:15:...|            225|         10| 2023-05-02|
|      10923|     16|2023-05-02 10:15:...|            270|         10| 2023-05-02|
|      10924|     14|2023-05-02 10:15:...|            172|         10| 2023-05-02|
|      10925|     78|2023-05-02 10:15:...|            167|         10| 2023-05-02|
|      10926|     53|2023-05-02 10:15:...|            244|         10| 2023-05-02|
|      10927|     63|2023-05-02 10:15:...|            145|         10| 2023-05-02|
|      10928|     32|2023-05-02 10:15:...|            222|         10| 2023-05-02|
|      10929|      6|2023-05-02 10:15:...|            146|         10| 2023-05-02|
|      10930|     37|2023-05-02 10:15:...|            190|         10| 2023-05-02|
|   

In [41]:
spark.conf.get('spark.sql.shuffle.partitions')

'200'

In [None]:
df_partitioned_data.rdd.getNumPartitions()
cached = df_partitioned_data.cache()
cached.write.format('noop')

rere = df_partitioned_data.repartition(4,'listen_date')

In [66]:
rere.cache()
rere.write.format('noop')

<pyspark.sql.readwriter.DataFrameWriter at 0x758397d26980>

In [68]:
(
    rere.withColumn('part_id',F.spark_partition_id())
    .groupBy(F.col('part_id')).agg(F.count('*').alias('count_id'))
).show(50)

+-------+--------+
|part_id|count_id|
+-------+--------+
|      1|    3742|
|      3|    3001|
|      2|    2495|
|      0|    2541|
+-------+--------+



In [None]:
(
    rere.write.mode("overwrite").parquet('./output_data/test_part/')
)

                                                                                

In [70]:
spark.stop()