In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, avg, desc, col
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window


In [2]:
# Obtain a SparkSession for this notebook under the application name 'df_ex';
# getOrCreate() hands back an already-running session if there is one.
spark = SparkSession.builder.appName('df_ex').getOrCreate()

24/01/25 08:40:43 WARN Utils: Your hostname, geoffroy-XPS-15-9520 resolves to a loopback address: 127.0.1.1; using 192.168.13.63 instead (on interface wlp0s20f3)
24/01/25 08:40:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/25 08:40:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Relative path to the JSON event log used throughout this notebook.
file = 'data/sparkify_log_small.json'

# Read the log into a DataFrame; no explicit schema is passed to the reader.
df = spark.read.json(path=file)

                                                                                

In [4]:
# Print each column's name, data type and nullability for the loaded log.
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)


# Q1
Which page did the user with id "" (empty string) NOT visit?

In [5]:
# List the distinct pages recorded for rows whose userId is the empty string.
# The answer to the question is whichever page is missing from this list.
(
    df.where(df['userId'] == '')
      .select('page')
      .distinct()
      .show()
)

+-----+
| page|
+-----+
| Home|
|About|
|Login|
| Help|
+-----+


# Q2
How many female users do we have in the data set?

In [6]:
# Keep rows with gender 'F', collapse them to one row per distinct
# (userId, gender) pair, and count those rows.
(
    df.where(df['gender'] == 'F')
      .select('userId', 'gender')
      .distinct()
      .count()
)

462

# Q3
How many songs were played by the most played artist?

In [7]:
# Row count per artist, largest first; show the top five.
# Rows without an artist form their own NULL group, so the most played
# artist is the first non-NULL row of the result.
(
    df.groupBy('artist')
      .count()
      .orderBy(desc('count'))
      .show(5)
)

+--------------------+-----+
|              artist|count|
+--------------------+-----+
|                NULL| 1653|
|            Coldplay|   83|
|       Kings Of Leon|   69|
|Florence + The Ma...|   52|
|            BjÃÂ¶rk|   46|
+--------------------+-----+


# Q4
How many songs do users listen to on average between visiting our home page? Please round your answer to the closest integer.

In [10]:
def function(page):
    """Turn a page-name column into a binary Home-visit flag.

    Built from native column operations instead of a Python UDF, so the
    comparison is evaluated by Spark itself rather than row by row in Python.

    Args:
        page: Column holding the page name.

    Returns:
        Integer Column: 1 where the page equals 'Home', 0 otherwise.
        The null-safe comparison makes a null page yield 0 as well.
    """
    return page.eqNullSafe('Home').cast(IntegerType())


# Per-user window ordered by timestamp, newest first. The frame spans from the
# first row of the partition up to the current row (a range frame, so rows
# sharing the current ts are included). A sum over it is therefore a running
# total that accumulates backwards in time.
# NOTE(review): the schema spells this column 'userId'; 'userID' presumably
# resolves because Spark's column resolution is case-insensitive by default.
# The spelling is kept so the resulting column header stays 'userID'.
user_window = (
    Window.partitionBy('userID')
          .orderBy(desc('ts'))
          .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)

# Keep only song plays and Home visits, flag the Home rows, and label every row
# with its period: the number of Home visits by the same user at or after it.
cusum = (
    df.filter((df.page == 'NextSong') | (df.page == 'Home'))
      .select('userID', 'page', 'ts')
      .withColumn('homevisit', function(col('page')))
      .withColumn('period', Fsum('homevisit').over(user_window))
)

# Count the songs in each (user, period) group, then average those counts
# over all groups.
(
    cusum.filter(cusum.page == 'NextSong')
         .groupBy('userID', 'period')
         .agg({'period': 'count'})
         .agg({'count(period)': 'avg'})
         .show()
)



+------------------+
|avg(count(period))|
+------------------+
| 6.898347107438017|
+------------------+


                                                                                

### Let's analyse, step by step, the process used in the solution above

#### step 1: cumsum
A new column, homevisit, is created using the UDF. It marks rows where the page is 'Home'.
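Another column, period, is created to cumulatively sum the homevisit flags within each user's partition, effectively segmenting the data into periods based on home page visits. Each period represents the time from one home page visit to the next for a given user. Because the window is ordered by ts in descending order, the periods are numbered backwards in time: period 1 holds the songs played just before the user's most recent home page visit, period 2 the songs before the visit preceding that, and so on.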

In [15]:
# Show the flag and period columns for a single user.
# NOTE(review): userId is a string column in the schema while the literal here
# is an integer; presumably Spark casts for the comparison — confirm.
cusum.where(cusum['userID'] == 100).show()

+------+--------+-------------+---------+------+
|userID|    page|           ts|homevisit|period|
+------+--------+-------------+---------+------+
|   100|    Home|1513839673284|        1|     1|
|   100|NextSong|1513776308284|        0|     1|
|   100|    Home|1513776194284|        1|     2|
|   100|NextSong|1513775710284|        0|     2|
|   100|NextSong|1513775556284|        0|     2|
|   100|    Home|1513775431284|        1|     3|
|   100|NextSong|1513750442284|        0|     3|
|   100|NextSong|1513750214284|        0|     3|
+------+--------+-------------+---------+------+


#### step 2: count the number of songs played in each period

In [17]:
# Songs per (user, period): keep only the NextSong rows and count the rows
# in each group.
period_count = (
    cusum.where(cusum['page'] == 'NextSong')
         .groupBy('userID', 'period')
         .agg({'period': 'count'})
)

# For user 100, the step 1 table shows 1 song in period 1, 2 songs in
# period 2 and 2 songs in period 3; the counts below should match.
period_count.where(period_count['userID'] == 100).show()

+------+------+-------------+
|userID|period|count(period)|
+------+------+-------------+
|   100|     1|            1|
|   100|     2|            2|
|   100|     3|            2|
+------+------+-------------+


#### step 3: average the number of songs played across all periods

In [19]:
# Average the per-period song counts over every (user, period) group and
# return the single-row result as a list of Row objects.
period_count.agg({'count(period)':'avg'}).collect()

[Row(avg(count(period))=6.898347107438017)]