In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SpotifySongs").getOrCreate()

df = spark.read.csv("stream-processing-template/assets/data/spotify-2023.csv", header=True, inferSchema=True)
df.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/11 23:02:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/11 23:02:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

+--------------------+--------------------+------------+-------------+--------------+------------+--------------------+-----------------+----------+------------------+---------------+-------------------+----------------+----------------+---+----+-----+--------------+---------+--------+--------------+------------------+----------+-------------+
|          track_name|      artist(s)_name|artist_count|released_year|released_month|released_day|in_spotify_playlists|in_spotify_charts|   streams|in_apple_playlists|in_apple_charts|in_deezer_playlists|in_deezer_charts|in_shazam_charts|bpm| key| mode|danceability_%|valence_%|energy_%|acousticness_%|instrumentalness_%|liveness_%|speechiness_%|
+--------------------+--------------------+------------+-------------+--------------+------------+--------------------+-----------------+----------+------------------+---------------+-------------------+----------------+----------------+---+----+-----+--------------+---------+--------+--------------+-------

In [4]:
from pyspark.sql.functions import col

# Filter for the year 2023 and order by streams
top_5_streamed_2023 = df.filter(col("released_year") == 2023)\
                        .orderBy(col("streams").desc())\
                        .limit(5)
top_5_streamed_2023.show()

# ANOTHER WAY OF DOING THIS WOULD BE... (without `col`)
# Filter for the year 2023 and order by streams
# top_5_streamed_2023 = df.filter(df.released_year == 2023)\
#                         .orderBy(df.streams.desc())\
#                         .limit(5)
# top_5_streamed_2023.show()


+--------------------+------------------+------------+-------------+--------------+------------+--------------------+-----------------+--------+------------------+---------------+-------------------+----------------+----------------+---+---+-----+--------------+---------+--------+--------------+------------------+----------+-------------+
|          track_name|    artist(s)_name|artist_count|released_year|released_month|released_day|in_spotify_playlists|in_spotify_charts| streams|in_apple_playlists|in_apple_charts|in_deezer_playlists|in_deezer_charts|in_shazam_charts|bpm|key| mode|danceability_%|valence_%|energy_%|acousticness_%|instrumentalness_%|liveness_%|speechiness_%|
+--------------------+------------------+------------+-------------+--------------+------------+--------------------+-----------------+--------+------------------+---------------+-------------------+----------------+----------------+---+---+-----+--------------+---------+--------+--------------+------------------+---

In [5]:
# Assuming 'ranking' is a combination of charts across platforms
ranking_cols = ["in_spotify_charts", "in_apple_charts", "in_deezer_charts", "in_shazam_charts"]

# Calculate a combined ranking score and find top 5
df_with_ranking = df.withColumn("combined_ranking", sum(df[col] for col in ranking_cols))
top_5_ranking_tracks = df_with_ranking.orderBy(col("combined_ranking").desc()).limit(5)
top_5_ranking_tracks.show()

+--------------------+-----------------+------------+-------------+--------------+------------+--------------------+-----------------+---------+------------------+---------------+-------------------+----------------+----------------+---+---+-----+--------------+---------+--------+--------------+------------------+----------+-------------+----------------+
|          track_name|   artist(s)_name|artist_count|released_year|released_month|released_day|in_spotify_playlists|in_spotify_charts|  streams|in_apple_playlists|in_apple_charts|in_deezer_playlists|in_deezer_charts|in_shazam_charts|bpm|key| mode|danceability_%|valence_%|energy_%|acousticness_%|instrumentalness_%|liveness_%|speechiness_%|combined_ranking|
+--------------------+-----------------+------------+-------------+--------------+------------+--------------------+-----------------+---------+------------------+---------------+-------------------+----------------+----------------+---+---+-----+--------------+---------+--------+---

The error you're encountering is because the intersect method in PySpark requires both DataFrames to have the same number of columns and the same column names. In your code, top_5_streamed_2023 has the original number of columns from df, while top_5_ranking_tracks has an extra column combined_ranking added to it.

To find the crossover between the top streamed and top ranking tracks, you should ensure both DataFrames have the same schema. One way to do this is to select only the common columns you're interested in before performing the intersect operation. Assuming the common columns are track_name and artist(s)_name, here's how you can modify your code:

By selecting only the common columns ("track_name" and "artist(s)_name") from both DataFrames, you ensure they have the same schema, allowing the intersect operation to work as expected. This will give you the tracks that are common in both the top 5 streamed and top 5 ranking tracks lists.

In [9]:
from pyspark.sql.functions import col

top_5_streamed_2023 = df.filter(df.released_year == 2023)\
                        .orderBy(df.streams.desc())\
                        .limit(5)\
                        .select("track_name", "artist(s)_name")

# Assuming 'ranking' is a combination of charts across platforms
ranking_cols = ["in_spotify_charts", "in_apple_charts", "in_deezer_charts", "in_shazam_charts"]

# Calculate a combined ranking score
df_with_ranking = df.withColumn("combined_ranking", sum(col(col_name) for col_name in ranking_cols))

# Find top 5 based on combined ranking
top_5_ranking_tracks = df_with_ranking.orderBy(col("combined_ranking").desc())\
                                      .limit(5)\
                                      .select("track_name", "artist(s)_name")

# Check for crossover between top streamed and top ranking tracks
crossover_tracks = top_5_streamed_2023.intersect(top_5_ranking_tracks)
crossover_tracks.show()


+----------+--------------+
|track_name|artist(s)_name|
+----------+--------------+
|  fukumean|         Gunna|
+----------+--------------+



Task 2: Artists with Most Tracks in Spotify Charts

In [10]:
from pyspark.sql.functions import count

# Group by artist's name and count tracks in Spotify charts
artists_spotify_charts = df.filter(col("in_spotify_charts") > 0)\
                           .groupBy("artist(s)_name")\
                           .agg(count("track_name").alias("tracks_in_charts"))\
                           .orderBy(col("tracks_in_charts").desc())\
                           .limit(10)
artists_spotify_charts.show()


+--------------+----------------+
|artist(s)_name|tracks_in_charts|
+--------------+----------------+
|  Taylor Swift|              22|
|     Bad Bunny|              12|
|  Harry Styles|              10|
|    The Weeknd|               8|
|          Feid|               7|
| Morgan Wallen|               6|
|    Ed Sheeran|               6|
|      NewJeans|               6|
|Olivia Rodrigo|               5|
|       Karol G|               4|
+--------------+----------------+



Task 3: Filter and Select Attributes of Danceable Tracks

In [11]:
# Filter for danceability > 80%
danceable_tracks = df.filter(col("danceability_%") > 80)\
                     .select("track_name", "artist(s)_name", "bpm", "danceability_%")
danceable_tracks.show()


+--------------------+--------------------+---+--------------+
|          track_name|      artist(s)_name|bpm|danceability_%|
+--------------------+--------------------+---+--------------+
|            Sprinter|   Dave, Central Cee|141|            92|
|            fukumean|               Gunna|130|            85|
|     La Bebe - Remix|Peso Pluma, Yng L...|170|            81|
|          Classy 101|    Feid, Young Miko|100|            86|
|Peso Pluma: Bzrp ...|Bizarrap, Peso Pluma|133|            85|
|Popular (with Pla...|The Weeknd, Madon...| 99|            85|
|        MOJABI GHOST|    Tainy, Bad Bunny|122|            81|
|     Los del Espacio|Big One, Duki, Li...|120|            81|
|            AMARGURA|             Karol G|107|            92|
|          La Bachata|       Manuel Turizo|125|            84|
|                 S91|             Karol G|128|            86|
|                 T��|dennis, MC Kevin ...|130|            86|
|Left and Right (F...|Charlie Puth, BTS...|101|        

In [12]:
# Compute correlation between 'danceability_%' and 'bpm'
correlation = danceable_tracks.stat.corr("danceability_%", "bpm")
print(f"Correlation between danceability and bpm: {correlation}")


Correlation between danceability and bpm: 0.016846197056879803
