# LAST.FM Project

In [28]:
import $ivy.`org.apache.spark::spark-sql:3.5.1`
import $ivy.`org.plotly-scala::plotly-almond:0.9.0`
import plotly._, plotly.element._, plotly.layout._, plotly.Almond._
init()

import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
import org.apache.logging.log4j.{LogManager, Level => LogLevel}
import org.apache.logging.log4j.core.Logger

import org.apache.spark.sql.types._

// Ajusta o nível de log para suprimir INFO antes de iniciar o Spark
System.setProperty("log4j2.level", "WARN")

val spark = SparkSession.builder()
  .appName("LastFM-EDA")
  .master("local[*]")
  .config("spark.sql.shuffle.partitions", "4")
  .getOrCreate()

// Reduz log para ERROR em loggers Spark e Hadoop
Seq(
  "org.apache.spark",
  "org.apache.spark.sql.execution",
  "org.apache.spark.storage",
  "org.apache.hadoop",
  "org.spark_project"
).foreach { name =>
  LogManager.getLogger(name).asInstanceOf[Logger].setLevel(LogLevel.ERROR)
}

LogManager.getRootLogger.asInstanceOf[Logger].setLevel(LogLevel.ERROR)

import spark.implicits._

Downloading https://repo1.maven.org/maven2/org/plotly-scala/plotly-almond_2.13/0.9.0/plotly-almond_2.13-0.9.0.pom
Failed to download https://repo1.maven.org/maven2/org/plotly-scala/plotly-almond_2.13/0.9.0/plotly-almond_2.13-0.9.0.pom
Downloading https://repo1.maven.org/maven2/org/plotly-scala/plotly-almond_2.13/0.9.0/plotly-almond_2.13-0.9.0.pom.sha1
Failed to download https://repo1.maven.org/maven2/org/plotly-scala/plotly-almond_2.13/0.9.0/plotly-almond_2.13-0.9.0.pom.sha1
Downloading https://jitpack.io/org/plotly-scala/plotly-almond_2.13/0.9.0/plotly-almond_2.13-0.9.0.pom
Failed to download https://jitpack.io/org/plotly-scala/plotly-almond_2.13/0.9.0/plotly-almond_2.13-0.9.0.pom
Downloading https://jitpack.io/org/plotly-scala/plotly-almond_2.13/0.9.0/plotly-almond_2.13-0.9.0.pom.sha1
Failed to download https://jitpack.io/org/plotly-scala/plotly-almond_2.13/0.9.0/plotly-almond_2.13-0.9.0.pom.sha1
Failed to resolve ivy dependencies:Error downloading org.plotly-scala:plotly-almond_2.13

## Creating DataFrames

In [4]:
val listeningSchema = StructType(Seq(
  StructField("userid", StringType, true),
  StructField("timestamp", TimestampType, true),
  StructField("artist_id", StringType, true),
  StructField("artist_name", StringType, true),
  StructField("track_id", StringType, true),
  StructField("track_name", StringType, true)
))

[36mlisteningSchema[39m: [32mStructType[39m = [33mSeq[39m(
  [33mStructField[39m(
    name = [32m"userid"[39m,
    dataType = StringType,
    nullable = [32mtrue[39m,
    metadata = {}
  ),
  [33mStructField[39m(
    name = [32m"timestamp"[39m,
    dataType = TimestampType,
    nullable = [32mtrue[39m,
    metadata = {}
  ),
  [33mStructField[39m(
    name = [32m"artist_id"[39m,
    dataType = StringType,
    nullable = [32mtrue[39m,
    metadata = {}
  ),
  [33mStructField[39m(
    name = [32m"artist_name"[39m,
    dataType = StringType,
    nullable = [32mtrue[39m,
    metadata = {}
  ),
  [33mStructField[39m(
    name = [32m"track_id"[39m,
    dataType = StringType,
    nullable = [32mtrue[39m,
    metadata = {}
  ),
  [33mStructField[39m(
    name = [32m"track_name"[39m,
    dataType = StringType,
    nullable = [32mtrue[39m,
    metadata = {}
  )
)

In [5]:
val profileSchema = StructType(Seq(
  StructField("userid", StringType, true),
  StructField("gender", StringType, true),
  StructField("age", IntegerType, true),
  StructField("country", StringType, true),
  StructField("signup", StringType, true)
))

[36mprofileSchema[39m: [32mStructType[39m = [33mSeq[39m(
  [33mStructField[39m(
    name = [32m"userid"[39m,
    dataType = StringType,
    nullable = [32mtrue[39m,
    metadata = {}
  ),
  [33mStructField[39m(
    name = [32m"gender"[39m,
    dataType = StringType,
    nullable = [32mtrue[39m,
    metadata = {}
  ),
  [33mStructField[39m(
    name = [32m"age"[39m,
    dataType = IntegerType,
    nullable = [32mtrue[39m,
    metadata = {}
  ),
  [33mStructField[39m(
    name = [32m"country"[39m,
    dataType = StringType,
    nullable = [32mtrue[39m,
    metadata = {}
  ),
  [33mStructField[39m(
    name = [32m"signup"[39m,
    dataType = StringType,
    nullable = [32mtrue[39m,
    metadata = {}
  )
)

## Importing the datasets

In [6]:
val df_listens = spark.read
  .option("delimiter", "\t")
  .schema(listeningSchema)
  .csv("../data/lastfm/lastfm-dataset-1k/userid-timestamp-artid-artname-traid-traname.tsv")

val df_profile = spark.read
  .option("delimiter", "\t")
  .schema(profileSchema)
  .csv("../data/lastfm/lastfm-dataset-1k/userid-profile.tsv")

[36mdf_listens[39m: [32mDataFrame[39m = [userid: string, timestamp: timestamp ... 4 more fields]
[36mdf_profile[39m: [32mDataFrame[39m = [userid: string, gender: string ... 3 more fields]

Looking at the data

In [7]:
df_listens.show()

+-----------+-------------------+--------------------+---------------+--------------------+--------------------+
|     userid|          timestamp|           artist_id|    artist_name|            track_id|          track_name|
+-----------+-------------------+--------------------+---------------+--------------------+--------------------+
|user_000001|2009-05-05 03:08:57|f1b1cf71-bd35-4e9...|      Deep Dish|                NULL|Fuck Me Im Famous...|
|user_000001|2009-05-04 17:54:10|a7f7df4a-77d8-4f1...|       坂本龍一|                NULL|Composition 0919 ...|
|user_000001|2009-05-04 17:52:04|a7f7df4a-77d8-4f1...|       坂本龍一|                NULL|Mc2 (Live_2009_4_15)|
|user_000001|2009-05-04 17:42:52|a7f7df4a-77d8-4f1...|       坂本龍一|                NULL|Hibari (Live_2009...|
|user_000001|2009-05-04 17:42:11|a7f7df4a-77d8-4f1...|       坂本龍一|                NULL|Mc1 (Live_2009_4_15)|
|user_000001|2009-05-04 17:38:31|a7f7df4a-77d8-4f1...|       坂本龍一|                NULL|To Stanford (Live...|
|us

## Exploratory Analysis

- Showing TOP 10 Artists

In [8]:
df_listens.groupBy($"artist_name")
  .count()
  .orderBy(desc("count"))
  .show(10, truncate = false)

+-------------------+------+
|artist_name        |count |
+-------------------+------+
|Radiohead          |115209|
|The Beatles        |100338|
|Nine Inch Nails    |84421 |
|Muse               |63351 |
|Coldplay           |62251 |
|Depeche Mode       |59910 |
|Pink Floyd         |58561 |
|Death Cab For Cutie|58083 |
|Placebo            |53543 |
|Elliott Smith      |50278 |
+-------------------+------+
only showing top 10 rows



- Showing the amount of distinct users

In [9]:
df_listens.select("userid").distinct().count()

[36mres9[39m: [32mLong[39m = [32m992L[39m

- Creating the joined DataFrame

In [11]:
val df_joined = df_listens.join(df_profile, "userid")

df_joined.select("userid", "gender", "country", "artist_name", "track_name")
  .show(5, truncate = false)

+-----------+------+-------+-----------+------------------------------------------+
|userid     |gender|country|artist_name|track_name                                |
+-----------+------+-------+-----------+------------------------------------------+
|user_000001|m     |Japan  |Deep Dish  |Fuck Me Im Famous (Pacha Ibiza)-09-28-2007|
|user_000001|m     |Japan  |坂本龍一   |Composition 0919 (Live_2009_4_15)         |
|user_000001|m     |Japan  |坂本龍一   |Mc2 (Live_2009_4_15)                      |
|user_000001|m     |Japan  |坂本龍一   |Hibari (Live_2009_4_15)                   |
|user_000001|m     |Japan  |坂本龍一   |Mc1 (Live_2009_4_15)                      |
+-----------+------+-------+-----------+------------------------------------------+
only showing top 5 rows



[36mdf_joined[39m: [32mDataFrame[39m = [userid: string, timestamp: timestamp ... 8 more fields]

- Looking for the TOP 10 Brazilian played songs

In [12]:
df_joined.filter($"country" === "Brazil")
  .groupBy("track_name")
  .count()
  .orderBy(desc("count"))
  .show(10, truncate = false)

+--------------------------------------------------+-----+
|track_name                                        |count|
+--------------------------------------------------+-----+
|Intro                                             |487  |
|Hechicera                                         |387  |
|Clavado En Un Bar                                 |350  |
|¡Justicia, Tierra Y Libertad! (Revolución De Amor)|321  |
|Friends                                           |309  |
|Angel De Amor                                     |296  |
|Helter Skelter                                    |292  |
|Decode                                            |291  |
|Floods                                            |285  |
|Somos Mar Y Arena                                 |282  |
+--------------------------------------------------+-----+
only showing top 10 rows



- Looking for the age distribuition 

In [13]:
df_profile
  .filter($"age".isNotNull)
  .groupBy("age")
  .count()
  .orderBy("age")
  .show(30)

+---+-----+
|age|count|
+---+-----+
|  3|    1|
|  4|    1|
|  7|    1|
| 15|    1|
| 16|    1|
| 17|    4|
| 18|   10|
| 19|   18|
| 20|   19|
| 21|   36|
| 22|   32|
| 23|   20|
| 24|   18|
| 25|   14|
| 26|   15|
| 27|   12|
| 28|   14|
| 29|   14|
| 30|    8|
| 31|    5|
| 32|    6|
| 33|    6|
| 34|    7|
| 35|    5|
| 36|    4|
| 38|    4|
| 39|    2|
| 40|    1|
| 42|    2|
| 48|    1|
+---+-----+
only showing top 30 rows



- Looking for user countries.

In [14]:
df_profile.groupBy("country")
  .count()
  .orderBy(desc("count"))
  .show(20, truncate = false)

+------------------+-----+
|country           |count|
+------------------+-----+
|United States     |228  |
|United Kingdom    |126  |
|NULL              |85   |
|Poland            |50   |
|Germany           |36   |
|Norway            |35   |
|Finland           |32   |
|Canada            |32   |
|Turkey            |28   |
|Italy             |27   |
|Sweden            |24   |
|Netherlands       |23   |
|Australia         |22   |
|Russian Federation|22   |
|Brazil            |20   |
|Spain             |17   |
|France            |14   |
|Mexico            |12   |
|Belgium           |9    |
|Argentina         |9    |
+------------------+-----+
only showing top 20 rows



- Looking for count of how many listeners for year-month

In [15]:
val df_by_month = df_listens
  .withColumn("year_month", date_format($"timestamp", "yyyy-MM"))
  .groupBy("year_month")
  .count()
  .orderBy("year_month")

df_by_month.show(20)

+----------+------+
|year_month| count|
+----------+------+
|   2005-02| 24269|
|   2005-03| 49394|
|   2005-04| 73815|
|   2005-05| 69510|
|   2005-06| 81892|
|   2005-07| 87185|
|   2005-08|100277|
|   2005-09|109551|
|   2005-10|135338|
|   2005-11|141079|
|   2005-12|198346|
|   2006-01|237711|
|   2006-02|244817|
|   2006-03|309983|
|   2006-04|321729|
|   2006-05|360120|
|   2006-06|352437|
|   2006-07|374190|
|   2006-08|392413|
|   2006-09|380929|
+----------+------+
only showing top 20 rows



[36mdf_by_month[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mDataset[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mRow[39m] = [year_month: string, count: bigint]

In [16]:
val monthlyCounts = df_by_month.collect()

[36mmonthlyCounts[39m: [32mArray[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mRow[39m] = [33mArray[39m(
  [2005-02,24269],
  [2005-03,49394],
  [2005-04,73815],
  [2005-05,69510],
  [2005-06,81892],
  [2005-07,87185],
  [2005-08,100277],
  [2005-09,109551],
  [2005-10,135338],
  [2005-11,141079],
  [2005-12,198346],
  [2006-01,237711],
  [2006-02,244817],
  [2006-03,309983],
  [2006-04,321729],
  [2006-05,360120],
  [2006-06,352437],
  [2006-07,374190],
  [2006-08,392413],
  [2006-09,380929],
  [2006-10,405260],
  [2006-11,421417],
  [2006-12,452020],
  [2007-01,471410],
  [2007-02,450666],
  [2007-03,447195],
  [2007-04,443335],
  [2007-05,467688],
  [2007-06,410891],
  [2007-07,406194],
  [2007-08,428135],
  [2007-09,433184],
  [2007-10,461423],
  [2007-11,473953],
  [2007-12,464276],
  [2008-01,464156],
  [2008-02,430047],
  [2008-03,482453],
...

- Looking for the count of each track

In [17]:
df_listens.groupBy("track_name")
  .count()
  .orderBy(desc("count"))
  .show(10, truncate = false)

+------------------+-----+
|track_name        |count|
+------------------+-----+
|Intro             |17601|
|[Untitled]        |6942 |
|Untitled          |6641 |
|Home              |6327 |
|All I Need        |6236 |
|Angel             |6020 |
|Heartbeats        |5678 |
|Wake Up           |5664 |
|Crazy             |5645 |
|Such Great Heights|5577 |
+------------------+-----+
only showing top 10 rows



- Looking for TOP artists in Brazil

In [18]:
df_joined.filter($"country" === "Brazil")
  .groupBy("artist_name")
  .count()
  .orderBy(desc("count"))
  .show(10, truncate = false)

+---------------------+-----+
|artist_name          |count|
+---------------------+-----+
|The Beatles          |12372|
|Mägo De Oz           |7202 |
|Maná                 |5758 |
|Engenheiros Do Hawaii|5745 |
|Mando Diao           |5400 |
|Metallica            |5104 |
|Backyard Babies      |5091 |
|Johnny Cash          |4788 |
|Pink Floyd           |4646 |
|Frank Sinatra        |4350 |
+---------------------+-----+
only showing top 10 rows



In [18]:
val gendered = df_joined.filter($"gender".isin("m", "f"))

gendered.groupBy("gender", "artist_name")
  .count()
  .withColumn("rank", row_number().over(
    Window.partitionBy("gender").orderBy(desc("count"))
  ))
  .filter($"rank" <= 5)
  .orderBy("gender", "rank")
  .select("gender", "artist_name", "count")
  .show(truncate = false)

cmd19.sc:6: not found: value Window
    Window.partitionBy("gender").orderBy(desc("count"))
    ^
cmd19.sc:11: not found: value truncate
  .show(truncate = false)
        ^
Compilation Failed

In [19]:
df_joined.filter($"age".isNotNull)
  .groupBy($"age", $"artist_name")
  .count()
  .orderBy($"age", desc("count"))
  .show(30, truncate = false)

+---+----------------------------+-----+
|age|artist_name                 |count|
+---+----------------------------+-----+
|3  |Oasis                       |387  |
|3  |Kasabian                    |358  |
|3  |Stereophonics               |227  |
|3  |The Strokes                 |209  |
|3  |The Killers                 |159  |
|3  |The Ends                    |130  |
|3  |Kaiser Chiefs               |80   |
|3  |The Beatles                 |71   |
|3  |Noel Gallagher              |69   |
|3  |Blur                        |62   |
|3  |Black Rebel Motorcycle Club |51   |
|3  |The Redwalls                |41   |
|3  |Giant Drag                  |41   |
|3  |Rilo Kiley                  |31   |
|3  |Arctic Monkeys              |29   |
|3  |The Rolling Stones          |23   |
|3  |Pulp                        |21   |
|3  |Black Market                |20   |
|3  |Spiritualized               |16   |
|3  |The Stone Roses             |16   |
|3  |Coldplay                    |14   |
|3  |John Lennon

In [20]:
df_listens.groupBy("userid")
  .count()
  .orderBy(desc("count"))
  .show(1)

+-----------+------+
|     userid| count|
+-----------+------+
|user_000949|183103|
+-----------+------+
only showing top 1 row



In [21]:
df_listens.groupBy("userid")
  .agg(countDistinct("track_id").as("unique_tracks"))
  .orderBy(desc("unique_tracks"))
  .show(10, truncate = false)

+-----------+-------------+
|userid     |unique_tracks|
+-----------+-------------+
|user_000691|59850        |
|user_000861|43860        |
|user_000681|36746        |
|user_000800|31872        |
|user_000774|29997        |
|user_000427|28592        |
|user_000882|24426        |
|user_000702|24225        |
|user_000345|24110        |
|user_000910|21812        |
+-----------+-------------+
only showing top 10 rows



In [22]:
val df_with_hour = df_listens.withColumn("hour", hour($"timestamp"))

df_with_hour.filter($"hour" >= 0 && $"hour" < 6)
  .groupBy("track_name")
  .count()
  .orderBy(desc("count"))
  .show(10, truncate = false)

+------------------+-----+
|track_name        |count|
+------------------+-----+
|Intro             |4650 |
|[Untitled]        |2321 |
|Untitled          |2138 |
|All I Need        |1768 |
|Home              |1708 |
|Crazy             |1648 |
|Heartbeats        |1611 |
|Such Great Heights|1606 |
|Hallelujah        |1601 |
|Angel             |1594 |
+------------------+-----+
only showing top 10 rows



[36mdf_with_hour[39m: [32mDataFrame[39m = [userid: string, timestamp: timestamp ... 5 more fields]

In [23]:
val topUser = df_listens.groupBy("userid")
  .count()
  .orderBy(desc("count"))
  .limit(1)
  .collect()
  .head
  .getString(0)

val topUserProfile = df_profile.filter($"userid" === topUser)
topUserProfile.show()

df_listens.filter($"userid" === topUser)
  .groupBy("artist_name")
  .count()
  .orderBy(desc("count"))
  .show(10, truncate = false)

+-----------+------+----+-------------+------------+
|     userid|gender| age|      country|      signup|
+-----------+------+----+-------------+------------+
|user_000949|     f|NULL|United States|May 30, 2005|
+-----------+------+----+-------------+------------+

+--------------+-----+
|artist_name   |count|
+--------------+-----+
|Everclear     |6910 |
|Rocky Votolato|5540 |
|Metric        |4545 |
|Elliott Smith |4520 |
|Girl Talk     |4392 |
|Four Tet      |4275 |
|Pedro The Lion|4221 |
|The Killers   |4221 |
|Daft Punk     |4159 |
|Cake          |4032 |
+--------------+-----+
only showing top 10 rows



[36mtopUser[39m: [32mString[39m = [32m"user_000949"[39m
[36mtopUserProfile[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mDataset[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mRow[39m] = [userid: string, gender: string ... 3 more fields]

### Creating some Plots to better see the Data

In [None]:
import plotly._, plotly.element._, plotly.layout._, plotly.Almond._
import org.apache.spark.sql.functions._

val genderCountsDF = df_profile
  .withColumn("gender_norm",
    when(trim(col("gender")).isNull || trim(col("gender")) === "", lit("Unknown"))
      .otherwise(col("gender"))
  )
  .groupBy("gender_norm").count()
  .orderBy(desc("count"))

val rows   = genderCountsDF.collect().toSeq
val labels = rows.map(_.getAs[String]("gender_norm"))  // guaranteed non-null
val values = rows.map(_.getAs[Long]("count"))

val fig = Bar(labels, values)
plot(Seq(fig), Layout().withTitle("Gender Distribution in LastFM Dataset"))