In [1]:
# Import Relevant Modules
import numpy as np 
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
# pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, split

In [2]:
# Create the Spark session for this notebook (getOrCreate reuses a session if one is already running)
spark = (
    SparkSession.builder
    .appName("Spotify Analysis Spark Session")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/10 20:06:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the raw Spotify CSV with Spark, taking column names from the header row.
# No schema is supplied and schema inference is not enabled, so every column is loaded as a string.
# Pandas equivalent --> spotify_data = pd.read_csv('data/Final database.csv')
csv_reader = spark.read.option("header", "true")
spotify_data = csv_reader.csv('data/Final database.csv')

In [4]:
# Secondary Feature 1: Lyric Mood Score
# Each mood flag column is mapped to a score from 1 ("low" mood) to 8 ("high" mood).
# Flags are tested in the order listed, so the first flag equal to 1 decides the score;
# rows where none of the flags equals 1 receive 0.

# Mood flag columns in ascending score order (position 1 -> score 1, ..., position 8 -> score 8)
MOOD_SCALE = ("Thug", "Hope", "Desire", "Explore", "Nostalgia", "Love", "Fun", "Celebrate")

# Build the chained CASE WHEN expression one branch at a time
mood_case = None
for mood_score, mood_column in enumerate(MOOD_SCALE, start=1):
    is_flagged = col(mood_column) == 1
    if mood_case is None:
        mood_case = when(is_flagged, mood_score)
    else:
        mood_case = mood_case.when(is_flagged, mood_score)

feelings_conditions = mood_case.otherwise(0)

# Attach the score as a new column
spotify_data = spotify_data.withColumn('lyric_mood_score', feelings_conditions)

In [5]:
# Secondary Feature 2: Song Positvity Feature -------------------------------------------------------------
# Binary Encoding of If Song Is Positive of Not -----------------------------------------------------------
# ---------------------------------------------------------------------------------------------------------

spotify_data = spotify_data.withColumn('song_positivity', when(col("positive") >  col("negative"), 1)
                                       .otherwise(0))

In [6]:
# Secondary Features 3-4: English / Spanish speaking cluster flags
# Each flag is 1 when the row's "Cluster" value matches the named cluster exactly, otherwise 0
# (rows with a missing Cluster therefore get 0 for both flags).

cluster_name = col("Cluster")
in_english_cluster = cluster_name == "english speaking and nordic"
in_spanish_cluster = cluster_name == "spanish speaking"

spotify_data = (
    spotify_data
    .withColumn('english_song', when(in_english_cluster, 1).otherwise(0))
    .withColumn('spanish_song', when(in_spanish_cluster, 1).otherwise(0))
)

In [7]:
# Secondary Features 5-7: Split Release_date into year, month and quarter
# Release_date is split on '-': the first piece becomes `year`, the second `month`
# (both remain strings). `quarter` is 1-4 derived from the month, or 0 when the month
# does not fall in 1..12 (including when it is missing).

release_parts = split(col('Release_date'), '-')
spotify_data = (
    spotify_data
    .withColumn('year', release_parts.getItem(0))
    .withColumn('month', release_parts.getItem(1))
)

release_month = col("month")
quarters = (
    when(release_month.between(1, 3), 1)
    .when(release_month.between(4, 6), 2)
    .when(release_month.between(7, 9), 3)
    .when(release_month.between(10, 12), 4)
    .otherwise(0)
)

spotify_data = spotify_data.withColumn('quarter', quarters)

In [8]:
# Remove unwanted columns from the data.
# This runs after the feature-engineering cells because several of the dropped columns
# ("Cluster", "positive", "negative" and the mood flags) are the inputs those features were built from.
# DataFrame.drop accepts several names at once and silently ignores names that are not present,
# so one call replaces the long chain of single-column drops.
# The group comments below are based on the column names only.
UNWANTED_COLUMNS = [
    # Track / album metadata
    "Album/Single", "Artist_followers", "Explicit", "Track_number", "Tracks_in_album",
    # Audio attributes
    "danceability", "energy", "key", "loudness", "mode", "speechiness", "acoustics",
    "instrumentalness", "liveliness", "valence", "tempo", "duration_ms", "time_signature",
    # Release / format indicators
    "Days_since_release", "Released_after_2017", "Explicit_false", "Explicit_true",
    "album", "compilation", "single",
    # Lexicon scores and word count
    "syuzhet_norm", "bing_norm", "afinn_norm", "nrc_norm",
    "syuzhet", "bing", "afinn", "nrc", "n_words",
    # Normalised emotion columns
    "anger_norm", "anticipation_norm", "disgust_norm", "fear_norm", "joy_norm",
    "sadness_norm", "surprise_norm", "trust_norm", "negative_norm", "positive_norm",
    "anger_norm2", "anticipation_norm2", "disgust_norm2", "fear_norm2", "joy_norm2",
    "sadness_norm2", "surprise_norm2", "trust_norm2", "negative_norm2", "positive_norm2",
    # Other model / classifier outputs
    "negative_bog_jr", "positive_bog_jr",
    "Bayes", "Negative_Bayes", "Neutral_Bayes", "Positive_Bayes", "LDA_Topic",
    "bing_norm_negative", "bing_norm_neutral", "bing_norm_positive",
    # Country-named columns
    "Argentina", "Australia", "Austria", "Belgium", "Brazil", "Canada", "Chile",
    "Colombia", "Costa Rica", "Denmark", "Ecuador", "Finland", "France", "Germany",
    "Global", "Indonesia", "Ireland", "Italy", "Malaysia", "Mexico", "Netherlands",
    "New Zealand", "Norway", "Peru", "Philippines", "Poland", "Portugal", "Singapore",
    "Spain", "Sweden", "Switzerland", "Taiwan", "Turkey", "UK", "USA",
    # Popularity summaries and cluster label
    "Popu_max", "Top10_dummy", "Top50_dummy", "Cluster",
    # Genre-named columns
    "bolero", "boy band", "country36", "dance/electronic", "else", "funk", "hip hop",
    "house", "indie", "jazz", "k-pop", "latin", "metal", "opm", "pop", "r&b/soul",
    "rap", "reggae", "reggaeton", "rock", "trap",
    # Raw emotion columns
    "anticipation", "anger", "disgust", "fear", "joy", "sadness", "surprise", "trust",
    "negative", "positive",
    # Mood flags (already folded into lyric_mood_score)
    "Celebrate", "Desire", "Explore", "Fun", "Hope", "Love", "Nostalgia", "Thug",
    # Remaining duplicates / superseded columns
    "Genre", "album31",
]

spotify_data = spotify_data.drop(*UNWANTED_COLUMNS)

In [9]:
# Clean DataSet - Deal with N/A Values For Remaining Columns Using Imputation
# Nulls are replaced with 0 and the result is stored under a new name.
# NOTE(review): with an integer fill value Spark only fills numeric columns, so the string
# columns (most of the remaining ones, including Popularity at this point) keep their
# nulls - confirm that this is the intended imputation.
clean_spotify_data = spotify_data.na.fill(0)

In [10]:
# Rename columns to their presentation names: (current name, new name), applied in order.
# Renaming a column that is not present leaves the frame unchanged.
COLUMN_RENAMES = [
    ("Country0", "Country"),
    ("Uri", "SpotifySongID"),
    ("Album9", "Album"),
    ("Release_date", "ReleaseDate"),
    ("Genre_new", "Genre"),
    ("genre_score", "GenreScore"),
    ("lyric_mood_score", "LyricMoodScore"),
    ("song_positivity", "IsPositiveBool"),
    ("english_song", "IsEnglishBool"),
    ("spanish_song", "IsSpanishBool"),
    ("month", "ReleaseMonth"),
    ("year", "ReleaseYear"),
    ("quarter", "ReleaseQuarter"),
]

renamed_data = clean_spotify_data
for current_name, new_name in COLUMN_RENAMES:
    renamed_data = renamed_data.withColumnRenamed(current_name, new_name)

# Keep only the columns used downstream, in display order
FINAL_COLUMN_ORDER = [
    "SpotifySongID", "Title", "Artist", "Album", "Genre", "ReleaseDate",
    "ReleaseYear", "ReleaseMonth", "ReleaseQuarter", "Country",
    "IsEnglishBool", "IsSpanishBool", "IsPositiveBool", "Popularity",
    "LyricMoodScore",
]
ordered_data = renamed_data.select(*FINAL_COLUMN_ORDER)

# Popularity was read from the CSV as a string; convert it to double so it can be aggregated
clean_spotify_data = ordered_data.withColumn("Popularity", col("Popularity").cast('double'))

# Show the resulting schema
clean_spotify_data.printSchema()

root
 |-- SpotifySongID: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Artist: string (nullable = true)
 |-- Album: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- ReleaseDate: string (nullable = true)
 |-- ReleaseYear: string (nullable = true)
 |-- ReleaseMonth: string (nullable = true)
 |-- ReleaseQuarter: integer (nullable = false)
 |-- Country: string (nullable = true)
 |-- IsEnglishBool: integer (nullable = false)
 |-- IsSpanishBool: integer (nullable = false)
 |-- IsPositiveBool: integer (nullable = false)
 |-- Popularity: double (nullable = true)
 |-- LyricMoodScore: integer (nullable = false)



In [11]:
# Give short alias: `df` is the name the visualisation cells below use for the cleaned frame
df = clean_spotify_data

In [12]:
# Descriptions & Notes
# - Additional Features Engineered Above
# - Further ML Capabilities with these features include:
#     - Predicting Lyric Mood (Secondary Feature 1), given other Features (Artist, Year, Country, Album, etc.)
#     - Ability to predict if a song is English or Spanish (Secondary Features 3 & 4) given other details
#     - Ability to predict if a song will be deemed as overall "Positive" (Secondary Feature 2) given other details
#     - Date Splitting (Secondary Features 5 - 7) would allow better visualizations and more precise trends
#     (i.e. artist release predictions and trend forecasts).
    
# Possible Visualizations Given New Features:
#     - Artist Evolution over time In Respect to lyric-tone.
#     - Geographical Visualization: Proportion of Spanish/English speaking Song Releases by Country
#     - With Date Split Features: Statistics over time by year, month, quarter, etc.

In [13]:
# Visual 1: Genre Popularity by Year for USA since 2010.
# Sums Popularity per (ReleaseYear, Genre) over rows whose Country is "USA" and whose
# ReleaseYear is 2010 or later, ordered by ReleaseYear.
usa_since_2010 = df.filter((col("Country") == "USA") & (col("ReleaseYear") >= 2010))

vis1 = (
    usa_since_2010
    .groupBy("ReleaseYear", "Genre")
    .sum("Popularity")
    .orderBy("ReleaseYear")
)
vis1.toPandas()

22/08/10 20:06:41 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Country, Popularity, Release_date, Genre_new
 Schema: Country0, Popularity, Release_date, Genre_new
Expected: Country0 but found: Country
CSV file: file:///Users/kkissoon/Desktop/UWaterloo%20-%20Final%20Project/data/Final%20database.csv


                                                                                

Unnamed: 0,ReleaseYear,Genre,sum(Popularity)
0,2010,house,33.60
1,2010,rock,7783.40
2,2010,pop,836.45
3,2010,hip hop,401.65
4,2010,indie,394.95
...,...,...,...
123,2020,trap,168605.30
124,2020,dance/electronic,36788.95
125,2020,house,11422.00
126,2020,indie,21158.00


In [14]:
# Visual 2: Number of records per release year (2010 onward) flagged as coming from an
# English speaking or a Spanish speaking country cluster.
# Summing the 0/1 flags gives the count of flagged records in each year.
records_since_2010 = df.filter(col("ReleaseYear") >= 2010)

vis2 = (
    records_since_2010
    .groupBy("ReleaseYear")
    .sum("IsEnglishBool", "IsSpanishBool")
    .orderBy("ReleaseYear")
)
vis2.toPandas()

                                                                                

Unnamed: 0,ReleaseYear,sum(IsEnglishBool),sum(IsSpanishBool)
0,2010,476,134
1,2011,886,189
2,2012,677,199
3,2013,940,242
4,2014,846,297
5,2015,1575,649
6,2016,4819,1649
7,2017,23029,4695
8,2018,26086,5571
9,2019,23592,4874


In [15]:
# Visual 3: Artist Evolution over time In Respect to Average lyric-tone for the 10 most
# frequent artists among USA rows since 2010.
# The year filter is inclusive (>= 2010) so that "since 2010" means the same thing here as
# in the other visuals; it previously used > 2010 and silently dropped 2010 releases.

# Precursor used to obtain the 10 artists with the most USA rows:
# df.filter(df.Country == "USA").filter(df.ReleaseYear >= 2010).groupBy("Artist")\
# .count().orderBy("count", ascending=False).limit(10).toPandas()
# NOTE(review): the list below was hard-coded from an earlier run of that query - re-run it
# if the data or the year cut-off changes.
TOP_USA_ARTISTS = [
    "Future", "Drake", "Taylor Swift", "Juice WRLD", "Trippie Redd",
    "YoungBoy Never Broke Again", "Logic", "BTS", "XXXTENTACION", "Lil Uzi Vert",
]

vis3 = (
    df.filter(col("Country") == "USA")
    .filter(col("ReleaseYear") >= 2010)
    .filter(col("Artist").isin(TOP_USA_ARTISTS))  # same matches as the chained == / | tests
    .groupBy("ReleaseYear", "Artist")
    .mean("LyricMoodScore")  # rows with no mood flag carry a score of 0 and pull the mean down
    .orderBy("ReleaseYear")
)

vis3.toPandas()

22/08/10 20:06:44 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Country, Artist, Release_date, Celebrate, Desire, Explore, Fun, Hope, Love, Nostalgia, Thug
 Schema: Country0, Artist, Release_date, Celebrate, Desire, Explore, Fun, Hope, Love, Nostalgia, Thug
Expected: Country0 but found: Country
CSV file: file:///Users/kkissoon/Desktop/UWaterloo%20-%20Final%20Project/data/Final%20database.csv


Unnamed: 0,ReleaseYear,Artist,avg(LyricMoodScore)
0,2011,Drake,2.0
1,2012,Taylor Swift,2.0
2,2013,Drake,0.0
3,2014,Taylor Swift,4.714286
4,2015,Drake,2.5
5,2015,Future,0.0
6,2016,Lil Uzi Vert,4.75
7,2016,Drake,2.0
8,2016,Future,6.0
9,2017,Taylor Swift,2.315789


In [16]:
# Visual 4: Number of Albums Released Each Year & Quarter Since 2010
# An album is counted once per (ReleaseYear, ReleaseQuarter) it appears in; rows with
# ReleaseQuarter 0 (no usable month) are excluded.
albums_by_quarter = (
    df.filter(col("ReleaseYear") >= 2010)
    .filter(col("ReleaseQuarter") != 0)
    .select("Album", "ReleaseYear", "ReleaseQuarter")
    .distinct()
)

vis4 = (
    albums_by_quarter
    .groupBy("ReleaseYear", "ReleaseQuarter")
    .count()
    .orderBy("ReleaseYear", "ReleaseQuarter")
)

vis4.toPandas()

22/08/10 20:06:45 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Album, Release_date
 Schema: Album9, Release_date
Expected: Album9 but found: Album
CSV file: file:///Users/kkissoon/Desktop/UWaterloo%20-%20Final%20Project/data/Final%20database.csv


                                                                                

Unnamed: 0,ReleaseYear,ReleaseQuarter,count
0,2010,1,76
1,2010,2,34
2,2010,3,35
3,2010,4,56
4,2011,1,100
5,2011,2,30
6,2011,3,45
7,2011,4,52
8,2012,1,92
9,2012,2,45
