In [0]:
# Import Relevant Modules
import numpy as np 
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
# pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, split, udf

In [0]:
# Initialize Spark Session
# getOrCreate() hands back the session that is already running (if any) instead of always building a new one.
spark = (
    SparkSession.builder
    .appName("Spotify Analysis Spark Session")
    .getOrCreate()
)

In [0]:
# Load Data Using Spark
# Only the header option is set, so no schema is inferred here.
#
# Alternative sources kept for reference:
#   Local Machine: spark.read.csv('data/Final database.csv', header=True)
#   Pandas Method: pd.read_csv('data/Final database.csv')

# DataBricks:
spotify_data = spark.read.csv('/FileStore/tables/Final_database-1.csv', header=True)

In [0]:
# Secondary Feature 1: Lyric Mood Score
# Scale: 1 - 8, where 1 is "low" mood and 8 is "high" mood.
# A row whose mood flags are all unset receives 0.

# Mood flag column -> score, listed in evaluation order.
# Order matters: the first flag equal to 1 decides the score.
mood_scores = [
    ("Thug", 1),
    ("Hope", 2),
    ("Desire", 3),
    ("Explore", 4),
    ("Nostalgia", 5),
    ("Love", 6),
    ("Fun", 7),
    ("Celebrate", 8),
]

# Build the CASE WHEN chain from the list above
(first_mood, first_score), *remaining_moods = mood_scores
feelings_conditions = when(col(first_mood) == 1, first_score)
for mood_column, mood_score in remaining_moods:
    feelings_conditions = feelings_conditions.when(col(mood_column) == 1, mood_score)
feelings_conditions = feelings_conditions.otherwise(0)

# Attach the score as a new column
spotify_data = spotify_data.withColumn('lyric_mood_score', feelings_conditions)

In [0]:
# Secondary Feature 2: Song Positivity Feature -------------------------------------------------------------
# Binary Encoding of Whether a Song Is Positive or Not -----------------------------------------------------
# ---------------------------------------------------------------------------------------------------------

# The CSV is read without schema inference, so "positive" and "negative" are string columns.
# Comparing them directly is a lexicographic string comparison (e.g. "10" > "9" is False),
# so both sides are cast to double to compare the actual numeric values.
# Rows where either value is missing or non-numeric compare as null and fall through to 0.
spotify_data = spotify_data.withColumn(
    'song_positivity',
    when(col("positive").cast("double") > col("negative").cast("double"), 1).otherwise(0),
)

In [0]:
# Secondary Features 3-4: English / Spanish Speaking Country Flags
# 1 when the record's "Cluster" value is the English-speaking (incl. Nordic) or the
# Spanish-speaking cluster respectively, 0 for every other value (including missing ones).

in_english_cluster = col("Cluster") == "english speaking and nordic"
in_spanish_cluster = col("Cluster") == "spanish speaking"

spotify_data = (
    spotify_data
    .withColumn('english_song', when(in_english_cluster, 1).otherwise(0))
    .withColumn('spanish_song', when(in_spanish_cluster, 1).otherwise(0))
)

In [0]:
# Secondary Features 5-7: Split Release_date into Year, Month & Quarter

# Release_date is split on '-': piece 0 becomes the year, piece 1 the month.
release_parts = split(col("Release_date"), "-")
spotify_data = (
    spotify_data
    .withColumn('year', release_parts.getItem(0))
    .withColumn('month', release_parts.getItem(1))
)

# Months 1-3 -> quarter 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4; anything else -> 0.
# between() is inclusive on both ends.
quarters = (
    when(col("month").between(1, 3), 1)
    .when(col("month").between(4, 6), 2)
    .when(col("month").between(7, 9), 3)
    .when(col("month").between(10, 12), 4)
    .otherwise(0)
)

spotify_data = spotify_data.withColumn('quarter', quarters)

In [0]:
# Remove Unwanted Columns From Data
# All names are passed to a single drop() call instead of one chained .drop() per column.
# The engineered features above have already been computed from the source columns
# (mood flags, "positive"/"negative", "Cluster") that are removed here.
unwanted_columns = [
    "Album/Single", "Artist_followers", "Explicit", "Track_number", "Tracks_in_album",
    "danceability", "energy", "key", "loudness", "mode", "speechiness", "acoustics",
    "instrumentalness", "liveliness", "valence", "tempo", "duration_ms", "time_signature",
    "Days_since_release", "Released_after_2017", "Explicit_false", "Explicit_true",
    "album", "compilation", "single",
    "syuzhet_norm", "bing_norm", "afinn_norm", "nrc_norm",
    "syuzhet", "bing", "afinn", "nrc", "n_words",
    "anger_norm", "anticipation_norm", "disgust_norm", "fear_norm", "joy_norm",
    "sadness_norm", "surprise_norm", "trust_norm", "negative_norm", "positive_norm",
    "anger_norm2", "anticipation_norm2", "disgust_norm2", "fear_norm2", "joy_norm2",
    "sadness_norm2", "surprise_norm2", "trust_norm2", "negative_norm2", "positive_norm2",
    "negative_bog_jr", "positive_bog_jr",
    "Bayes", "Negative_Bayes", "Neutral_Bayes", "Positive_Bayes", "LDA_Topic",
    "bing_norm_negative", "bing_norm_neutral", "bing_norm_positive",
    "Argentina", "Australia", "Austria", "Belgium", "Brazil", "Canada", "Chile",
    "Colombia", "Costa Rica", "Denmark", "Ecuador", "Finland", "France", "Germany",
    "Global", "Indonesia", "Ireland", "Italy", "Malaysia", "Mexico", "Netherlands",
    "New Zealand", "Norway", "Peru", "Philippines", "Poland", "Portugal", "Singapore",
    "Spain", "Sweden", "Switzerland", "Taiwan", "Turkey", "UK", "USA",
    "Popu_max", "Top10_dummy", "Top50_dummy", "Cluster",
    "bolero", "boy band", "country36", "dance/electronic", "else", "funk", "hip hop",
    "house", "indie", "jazz", "k-pop", "latin", "metal", "opm", "pop", "r&b/soul",
    "rap", "reggae", "reggaeton", "rock", "trap",
    "anticipation", "anger", "disgust", "fear", "joy", "sadness", "surprise", "trust",
    "negative", "positive",
    "Celebrate", "Desire", "Explore", "Fun", "Hope", "Love", "Nostalgia", "Thug",
    "Genre", "album31",
]

spotify_data = spotify_data.drop(*unwanted_columns)

In [0]:
# Clean DataSet - Impute Missing Values In The Remaining Columns With 0
# NOTE(review): a numeric fill value is only applied to numeric columns; the CSV is loaded
# without schema inference, so the string columns presumably keep their nulls - confirm this is intended.
clean_spotify_data = spotify_data.fillna(0)

In [0]:
# Rename Columns: current name -> final name
column_renames = {
    "Country0": "Country",
    "Uri": "SpotifySongID",
    "Album9": "Album",
    "Release_date": "ReleaseDate",
    "Genre_new": "Genre",
    "genre_score": "GenreScore",
    "lyric_mood_score": "LyricMoodScore",
    "song_positivity": "IsPositiveBool",
    "english_song": "IsEnglishBool",
    "spanish_song": "IsSpanishBool",
    "month": "ReleaseMonth",
    "year": "ReleaseYear",
    "quarter": "ReleaseQuarter",
}
for current_name, final_name in column_renames.items():
    clean_spotify_data = clean_spotify_data.withColumnRenamed(current_name, final_name)

# Reorder Columns: only the columns listed here are kept, in this order
final_column_order = [
    "SpotifySongID", "Title", "Artist", "Album", "Genre", "ReleaseDate",
    "ReleaseYear", "ReleaseMonth", "ReleaseQuarter", "Country",
    "IsEnglishBool", "IsSpanishBool", "IsPositiveBool", "Popularity",
    "LyricMoodScore",
]
clean_spotify_data = clean_spotify_data.select(*final_column_order)

# Convert Popularity From String to Double
clean_spotify_data = clean_spotify_data.withColumn("Popularity", col("Popularity").cast('double'))

# Print the resulting schema
clean_spotify_data.printSchema()

root
 |-- SpotifySongID: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Artist: string (nullable = true)
 |-- Album: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- ReleaseDate: string (nullable = true)
 |-- ReleaseYear: string (nullable = true)
 |-- ReleaseMonth: string (nullable = true)
 |-- ReleaseQuarter: integer (nullable = false)
 |-- Country: string (nullable = true)
 |-- IsEnglishBool: integer (nullable = false)
 |-- IsSpanishBool: integer (nullable = false)
 |-- IsPositiveBool: integer (nullable = false)
 |-- Popularity: double (nullable = true)
 |-- LyricMoodScore: integer (nullable = false)



In [0]:
# Give short alias: `df` is the name the visualization cells below query
df = clean_spotify_data

In [0]:
# Descriptions & Notes
# - Additional Features Engineered Above
# - Further ML Capabilities with these features include:
#     - Predicting Lyric Mood (Secondary Feature 1), given other Features (Artist, Year, Country, Album, etc.)
#     - Ability to predict if a record comes from an English or Spanish speaking country (Secondary Features 3 & 4) given other details
#     - Ability to predict if a song will be deemed as overall "Positive" (Secondary Feature 2) given other details
#     - Date Splitting (Secondary Features 5 - 7) would allow better visualizations and more precise trends
#     (i.e. artist release predictions and trend forecasts).

In [0]:
# Visual 1: Genre Popularity by Year for USA since 2010.
# Sums Popularity per (ReleaseYear, Genre) over USA records released in 2010 or later.
vis1 = (
    df
    .where((col("Country") == "USA") & (col("ReleaseYear") >= 2010))
    .groupBy("ReleaseYear", "Genre")
    .sum("Popularity")
    .orderBy("ReleaseYear")
)
display(vis1)

ReleaseYear,Genre,sum(Popularity)
2010,rock,7783.400000000002
2010,house,33.6
2010,hip hop,401.65
2010,indie,394.95000000000016
2010,pop,836.4499999999999
2010,else,154.4
2010,country,4.800000000000002
2011,hip hop,901.65
2011,else,223.45
2011,funk,96.75


In [0]:
# Visual 2: Number of Records From English-speaking and Spanish-speaking Countries Over Time (since 2010).
# Summing the 0/1 flags per release year gives the record count for each group.
vis2 = (
    df
    .where(col("ReleaseYear") >= 2010)
    .groupBy("ReleaseYear")
    .sum("IsEnglishBool", "IsSpanishBool")
    .orderBy("ReleaseYear")
)
display(vis2)

ReleaseYear,sum(IsEnglishBool),sum(IsSpanishBool)
2010,476,134
2011,886,189
2012,677,199
2013,940,242
2014,846,297
2015,1575,649
2016,4819,1649
2017,23029,4695
2018,26086,5571
2019,23592,4874


In [0]:
# Visual 3: Artist Evolution Over Time With Respect to Average Lyric Mood for Top 10 USA-based Artists since 2010.

# Precursor query used to get the top 10 artists by record count (kept for reference;
# the list below is hard-coded from its result):
# df.filter(df.Country == "USA").filter(df.ReleaseYear > 2010).groupBy("Artist")\
# .count().orderBy("count", ascending=False).limit(10).toPandas()
top_usa_artists = [
    "Future", "Drake", "Taylor Swift", "Juice WRLD", "Trippie Redd",
    "YoungBoy Never Broke Again", "Logic", "BTS", "XXXTENTACION", "Lil Uzi Vert",
]

vis3 = (
    df
    .filter(col("Country") == "USA")
    # ">= 2010" so that 2010 itself is included, matching "since 2010" and Visuals 1, 2 and 4
    .filter(col("ReleaseYear") >= 2010)
    .filter(col("Artist").isin(top_usa_artists))
    .groupBy("ReleaseYear", "Artist")
    .mean("LyricMoodScore")
    .orderBy("ReleaseYear")
)

display(vis3)

ReleaseYear,Artist,avg(LyricMoodScore)
2011,Drake,2.0
2012,Taylor Swift,2.0
2013,Drake,0.0
2014,Taylor Swift,4.714285714285714
2015,Drake,2.5
2015,Future,0.0
2016,Future,6.0
2016,Drake,2.0
2016,Lil Uzi Vert,4.75
2017,Taylor Swift,2.3157894736842106


In [0]:
# Visual 4: Number of Albums Released Each Year & Quarter Since 2010
# Step 1: reduce to one row per distinct (Album, ReleaseYear, ReleaseQuarter) combination.
# Step 2: count those rows per (ReleaseYear, ReleaseQuarter).
# Quarter 0 (no usable month) is excluded from the result.
albums_per_quarter = (
    df
    .where(col("ReleaseYear") >= 2010)
    .select("Album", "ReleaseYear", "ReleaseQuarter")
    .distinct()
)

vis4 = (
    albums_per_quarter
    .groupBy("ReleaseYear", "ReleaseQuarter")
    .count()
    .where(col("ReleaseQuarter") != 0)
    .orderBy("ReleaseYear", "ReleaseQuarter")
)

display(vis4)

ReleaseYear,ReleaseQuarter,count
2010,1,76
2010,2,34
2010,3,35
2010,4,56
2011,1,100
2011,2,30
2011,3,45
2011,4,52
2012,1,92
2012,2,45


In [0]:
# Visual 5: Total Popularity By Country

# Requires pycountry (pip install pycountry)
import pycountry

# Lookup table: country name -> alpha-3 ISO code
countries = {entry.name: entry.alpha_3 for entry in pycountry.countries}

# The data labels two countries "USA" and "UK"; rewrite them to the names used as keys in `countries`.
# NOTE(review): this rebinds `df`, so re-running an earlier cell that filters on Country == "USA"
# after this one would match no rows.
df = df.withColumn(
    "Country",
    when(col("Country") == "USA", "United States")
    .when(col("Country") == "UK", "United Kingdom")
    .otherwise(col("Country")),
)

# UDF that maps a country name to its ISO code; names missing from the table become 'UNKNOWN'
udf_country_lookup = udf(lambda name: countries.get(name, 'UNKNOWN'))

# Add the code column and keep only the rows whose country could be resolved
vis5 = (
    df
    .withColumn("CountryCode", udf_country_lookup(col("Country")))
    .filter(col("CountryCode") != "UNKNOWN")
)

# Using DataBricks Display Map Visualization
display(vis5)

SpotifySongID,Title,Artist,Album,Genre,ReleaseDate,ReleaseYear,ReleaseMonth,ReleaseQuarter,Country,IsEnglishBool,IsSpanishBool,IsPositiveBool,Popularity,LyricMoodScore,CountryCode
https://open.spotify.com/track/6FyRXC8tJUh863JCkyWqtk,adan y eva,Paulo Londra,Adan y Eva,hip hop,2018-11-05,2018,11,4,United States,1,0,0,8.0,0,USA
https://open.spotify.com/track/6FyRXC8tJUh863JCkyWqtk,adan y eva,Paulo Londra,Adan y Eva,hip hop,2018-11-05,2018,11,4,Argentina,0,1,0,76924.4,0,ARG
https://open.spotify.com/track/6FyRXC8tJUh863JCkyWqtk,adan y eva,Paulo Londra,Adan y Eva,hip hop,2018-11-05,2018,11,4,Belgium,1,0,0,849.6000000000001,0,BEL
https://open.spotify.com/track/6FyRXC8tJUh863JCkyWqtk,adan y eva,Paulo Londra,Adan y Eva,hip hop,2018-11-05,2018,11,4,Switzerland,1,0,0,20739.1,0,CHE
https://open.spotify.com/track/6FyRXC8tJUh863JCkyWqtk,adan y eva,Paulo Londra,Adan y Eva,hip hop,2018-11-05,2018,11,4,Chile,0,1,0,60264.19999999997,0,CHL
https://open.spotify.com/track/6FyRXC8tJUh863JCkyWqtk,adan y eva,Paulo Londra,Adan y Eva,hip hop,2018-11-05,2018,11,4,Colombia,0,1,0,60131.15000000014,0,COL
https://open.spotify.com/track/6FyRXC8tJUh863JCkyWqtk,adan y eva,Paulo Londra,Adan y Eva,hip hop,2018-11-05,2018,11,4,Costa Rica,0,1,0,64081.30000000009,0,CRI
https://open.spotify.com/track/6FyRXC8tJUh863JCkyWqtk,adan y eva,Paulo Londra,Adan y Eva,hip hop,2018-11-05,2018,11,4,Germany,1,0,0,208.0,0,DEU
https://open.spotify.com/track/6FyRXC8tJUh863JCkyWqtk,adan y eva,Paulo Londra,Adan y Eva,hip hop,2018-11-05,2018,11,4,Ecuador,0,1,0,75636.9,0,ECU
https://open.spotify.com/track/6FyRXC8tJUh863JCkyWqtk,adan y eva,Paulo Londra,Adan y Eva,hip hop,2018-11-05,2018,11,4,Spain,0,1,0,69318.34999999998,0,ESP
