In [None]:
# Installing required packages
!pip install pyspark
!pip install findspark

In [79]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd

In [80]:
df = pd.read_csv("/content/music_streaming.csv")

In [None]:
# Creating a spark context class
sc = SparkContext()

# Creating a spark session
spark = SparkSession \
    .builder \
    .appName("music streaming") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [81]:
import findspark
findspark.init()

In [82]:
spark

In [83]:
#data cleaning 

# handling invalid rows and Null Values this is provided by spark
sdf=spark.read.option("header",True).option("mode",'DROPMALFORMED').csv('music_streaming.csv')

#showing missing values in all columns
from pyspark.sql.functions import col,isnan, when, count
sdf.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in sdf.columns]
   ).show()

#the mising values didnt exceed  even 10% inn any of the 3 columns that have them so 
#we wont remove them just fill with 0
sdf = sdf.fillna(0)


# droping Duplicates
sdf = sdf.dropDuplicates()  # Drop duplicate rows
sdf=sdf.dropDuplicates(['Track Name'])

+-----------+----------+----------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-----+------------------+--------------+-----+
|Artist Name|Track Name|Popularity|danceability|energy| key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|duration_in min/ms|time_signature|Genre|
+-----------+----------+----------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-----+------------------+--------------+-----+
|          0|         0|       394|           0|     0|1743|       0|   0|          0|           0|            3587|       0|      0|    0|                 0|             0|    0|
+-----------+----------+----------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-----+------------------+--------------+-----+



In [84]:
# Import required libraries
from pyspark.sql.types import StringType
from pyspark.sql.functions import col, expr,when


# Convert milliseconds to minutes
#if i understand right the values of duration have some in mins and some in ms
#So my logic is that if the value bigger than 1000 then definetly thats ms so i transform it to min else we leave it as it is

#so here i put a new column that if the value of duration_in_min bigger than 100O we put the old value/60000 
#otherwise we put the same old value
sdf = sdf.withColumn("duration_in min", when(col("duration_in min/ms") > 10000, col("duration_in min/ms") / 60000).otherwise(col("duration_in min/ms")))

#we drop the old column as we dont need it anymore
sdf =sdf.drop('duration_in min/ms')
sdf.show(10)

# Classification

#mapping = {   0: "Rock",1: "Indie",2: "Alt",3: "Pop",4: "Metal",5: "HipHop",6: "Alt_Music",7: "Blues",8: "Acoustic/Folk",9: "Instrumental",10: "Country"}

sdf = sdf.withColumn("Genre", when(col("Genre") == 0, "Rock").when(col("Genre") == 1, "Indie").when(col("Genre") == 2, "Alt").when(col("Genre") == 3, "Pop")
.when(col("Genre") == 4, "Metal").when(col("Genre") == 5, "HipHop").when(col("Genre") == 6, "Alt_Music").when(col("Genre") == 7, "Blues").when(col("Genre") == 8, "Acoustic/Folk")
.when(col("Genre") == 9, "Instrumental").when(col("Genre") == 10, "Country").otherwise(col("Genre")))



sdf.show(10)

#renaming the columns to be able to use them in queries
sdf = sdf.withColumnRenamed('duration_in min', 'duration_in_min') \
       .withColumnRenamed('Track Name', 'Track_Name') \
       .withColumnRenamed('Artist Name', 'Artist_Name')

+--------------------+--------------------+----------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----+------------------+
|         Artist Name|          Track Name|Popularity|danceability|energy| key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|Genre|   duration_in min|
+--------------------+--------------------+----------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----+------------------+
|Shaarib Toshi, Ar...|"Bandeya (feat. A...|        60|       0.465| 0.551|   7|   -6.58|   1|     0.0362|       0.619|        6.03E-06|   0.109|  0.415| 89.459|             4|    3|       3.077766667|
|B Praak, Tanishk ...|"Baras Baras (Fro...|        54|       0.493| 0.402|   2|  -6.629|   1|     0.0309|       0.619|            null|  0.0629|  0.431| 155.57|             4|    3|       3.335916

In [85]:
sdf.createOrReplaceTempView("music")

deleting songs that are more than 5 minutes

In [86]:
sdf = spark.sql("SELECT * FROM music WHERE duration_in_min <= 5.0").show(5)

+--------------------+--------------------+----------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+------+--------------+------------+---------------+
|         Artist_Name|          Track_Name|Popularity|danceability|energy| key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence| tempo|time_signature|       Genre|duration_in_min|
+--------------------+--------------------+----------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+------+--------------+------------+---------------+
|Shaarib Toshi, Ar...|"Bandeya (feat. A...|        60|       0.465| 0.551|   7|   -6.58|   1|     0.0362|       0.619|        6.03E-06|   0.109|  0.415|89.459|             4|         Pop|    3.077766667|
|B Praak, Tanishk ...|"Baras Baras (Fro...|        54|       0.493| 0.402|   2|  -6.629|   1|     0.0309|       0.619|            null|  0.0629|  0.431|155.57|             4|         P

Songs by J. Cole, Novo Amor and Anson Seabra

In [87]:
# Display songs by J. Cole, Novo Amor and Anson Seabra
spark.sql("SELECT * from music WHERE Artist_Name = 'J. Cole' OR Artist_Name = 'Novo Amor' OR Artist_Name = 'Anson Seabra' ").show(14)

+------------+--------------------+----------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+------------+------------------+
| Artist_Name|          Track_Name|Popularity|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|       Genre|   duration_in_min|
+------------+--------------------+----------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+------------+------------------+
|     J. Cole|     9 5 . s o u t h|        84|       0.713| 0.793|  2|  -5.277|   1|      0.397|       0.271|            null|   0.727|  0.203| 71.724|             4|      HipHop|3.2824333333333335|
|     J. Cole|           a m a r i|        86|       0.725| 0.713|  4|  -6.173|   0|      0.187|       0.127|            null|   0.617|  0.207| 65.988|             4|Instrumental|2.4736833333333332|
|    

Counting th Genre to see no of songs in every Genre as the category

In [88]:
#  How many songs are included in every category?

spark.sql("SELECT Genre , COUNT (*) AS no_of_songs FROM music GROUP BY Genre ORDER BY Genre ASC").show(12)

+-------------+-----------+
|        Genre|no_of_songs|
+-------------+-----------+
|      117.017|          1|
|Acoustic/Folk|       1504|
|          Alt|       1066|
|    Alt_Music|       1757|
|        Blues|        439|
|      Country|       3514|
|       HipHop|       1121|
|        Indie|        836|
| Instrumental|       1529|
|        Metal|        355|
|          Pop|        370|
|         Rock|        529|
+-------------+-----------+



Getting the average populatity for every of the songs to determine the hotest artists

In [89]:
spark.sql("SELECT Artist_Name, AVG(Popularity) AS AvgPopularity FROM music GROUP BY Artist_Name ORDER BY AVG(Popularity) DESC").show(10)

+--------------------+-------------+
|         Artist_Name|AvgPopularity|
+--------------------+-------------+
|            Måneskin|        100.0|
|The Kid LAROI, Ju...|         97.0|
|     Los Legendarios|         95.0|
|Justin Bieber, Da...|         95.0|
|          Nio Garcia|         93.0|
|          Tion Wayne|         90.0|
|        Bella Poarch|         90.0|
|             Cardi B|         90.0|
|    Dua Lipa, DaBaby|         90.0|
|          The Weeknd|        89.25|
+--------------------+-------------+
only showing top 10 rows



Getting top 10 billboard based on the songs popularity

In [90]:
spark.sql("SELECT COUNT(*) as CNT , Popularity , Artist_Name , Track_Name from music GROUP BY Popularity , Track_Name , Artist_Name ORDER BY Popularity DESC").show(10)

+---+----------+--------------------+--------------------+
|CNT|Popularity|         Artist_Name|          Track_Name|
+---+----------+--------------------+--------------------+
|  1|        99|      Olivia Rodrigo|            good 4 u|
|  1|        98|            Doja Cat|Kiss Me More (fea...|
|  1|        97|The Kid LAROI, Ju...|STAY (with Justin...|
|  1|        95|     Los Legendarios|                Fiel|
|  1|        95|Justin Bieber, Da...|Peaches (feat. Da...|
|  1|        95|                 BTS| Permission to Dance|
|  1|        95|      Rauw Alejandro|          Todo De Ti|
|  1|        95|      Olivia Rodrigo|             deja vu|
|  1|        94|               Riton|Friday (feat. Muf...|
|  1|        94|          The Weeknd|Save Your Tears (...|
+---+----------+--------------------+--------------------+
only showing top 10 rows



Ranking the songs that could be played in a party based on the danceability as it is a party

In [91]:
spark.sql("SELECT COUNT(*) as COUNT ,  danceability, Track_Name from music GROUP BY Track_Name , danceability ORDER BY danceability DESC").show(5)

+-----+------------+--------------------+
|COUNT|danceability|          Track_Name|
+-----+------------+--------------------+
|    1|       0.989|      Gucci Umbrella|
|    1|       0.982|Divine Gosa - Swi...|
|    1|        0.98|           kawamurra|
|    1|       0.979|  Dancing in My Room|
|    1|       0.974|    OG Bobby Johnson|
+-----+------------+--------------------+
only showing top 5 rows

