## Reading the spotify-2023.csv

In [238]:
import pandas as pd

# Load the raw dataset into `df`.
# The file is decoded as ISO-8859-1 because reading it with the default UTF-8
# codec fails with a UnicodeDecodeError.
df = pd.read_csv('spotify-2023.csv', encoding='ISO-8859-1')


In [240]:
# Look at the "streams" cell of the row labelled 574: it holds a run of text
# instead of a stream count, which is the reason that row is removed later on.
suspect_row = 574
df.loc[[suspect_row], ['streams']]

Unnamed: 0,streams
574,BPM110KeyAModeMajorDanceability53Valence75Ener...


In [242]:
# Remove every row whose "streams" value is present but is not a number. In
# this file that is the row labelled 574, whose cell holds concatenated feature
# text instead of a stream count.
# The rows are selected by content rather than by the hard-coded label so the
# cell is safe to re-run: once the index has been reset further down, label 574
# belongs to a different, perfectly valid song.
streams_as_number = pd.to_numeric(df['streams'], errors='coerce')
# errors='coerce' turns unparseable text into NaN; `notna()` on the original
# column keeps genuinely missing values from being treated as malformed.
unparseable = streams_as_number.isna() & df['streams'].notna()
# `drop` (instead of boolean indexing) returns an independent frame, so later
# column assignments do not raise SettingWithCopyWarning.
df = df.drop(index=df.index[unparseable])

In [244]:
# List the dtype pandas inferred for every column, to spot the numeric columns
# that were read as text (shown as `object`) and still need converting.
df.dtypes

track_name              object
artist(s)_name          object
artist_count             int64
released_year            int64
released_month           int64
released_day             int64
in_spotify_playlists     int64
in_spotify_charts        int64
streams                 object
in_apple_playlists       int64
in_apple_charts          int64
in_deezer_playlists     object
in_deezer_charts         int64
in_shazam_charts        object
bpm                      int64
key                     object
mode                    object
danceability_%           int64
valence_%                int64
energy_%                 int64
acousticness_%           int64
instrumentalness_%       int64
liveness_%               int64
speechiness_%            int64
dtype: object

In [246]:
# Remove the commas from the two count columns that were read as text, so that
# their values can be parsed as numbers in the next step.
for comma_col in ('in_deezer_playlists', 'in_shazam_charts'):
    df[comma_col] = df[comma_col].str.replace(',', '')


In [248]:
# Parse the three text columns as numbers. Columns that contain missing values
# (in_shazam_charts in the output below) come out as float rather than int64.
text_count_cols = ['streams', 'in_deezer_playlists', 'in_shazam_charts']
df[text_count_cols] = df[text_count_cols].apply(pd.to_numeric)

# Order the songs from most to least streamed and renumber the rows from 0,
# discarding the old index labels.
df = df.sort_values('streams', ascending=False).reset_index(drop=True)
df

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,in_apple_charts,in_deezer_playlists,in_deezer_charts,in_shazam_charts,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Blinding Lights,The Weeknd,1,2019,11,29,43899,69,3703895074,672,199,3421,20,,171,C#,Major,50,38,80,0,0,9,7
1,Shape of You,Ed Sheeran,1,2017,1,6,32181,10,3562543890,33,0,6808,7,0.0,96,C#,Minor,83,93,65,58,0,9,8
2,Someone You Loved,Lewis Capaldi,1,2018,11,8,17836,53,2887241814,440,125,1800,0,,110,C#,Major,50,45,41,75,0,11,3
3,Dance Monkey,Tones and I,1,2019,5,10,24529,0,2864791672,533,167,3595,6,,98,F#,Minor,82,54,59,69,0,18,10
4,Sunflower - Spider-Man: Into the Spider-Verse,"Post Malone, Swae Lee",2,2018,10,9,24094,78,2808096550,372,117,843,4,69.0,90,D,Major,76,91,50,54,0,7,5
5,One Dance,"Drake, WizKid, Kyla",3,2016,4,4,43257,24,2713922350,433,107,3631,0,26.0,104,C#,Major,77,36,63,1,0,36,5
6,STAY (with Justin Bieber),"Justin Bieber, The Kid Laroi",2,2021,7,9,17050,36,2665343922,492,99,798,31,0.0,170,C#,Major,59,48,76,4,0,10,5
7,Believer,Imagine Dragons,1,2017,1,31,18986,23,2594040133,250,121,2969,10,31.0,125,A#,Minor,77,74,78,4,0,23,11
8,Closer,"The Chainsmokers, Halsey",2,2016,5,31,28032,0,2591224264,315,159,2179,0,44.0,95,G#,Major,75,64,52,41,0,11,3
9,Starboy,"The Weeknd, Daft Punk",2,2016,9,21,29536,79,2565529693,281,137,2445,1,140.0,186,G,Major,68,49,59,16,0,13,28


In [250]:
# Collect every row that shares its (track name, artist) pair with at least one
# other row. keep=False flags all members of each group, not just the repeats.
song_key = ['track_name', 'artist(s)_name']
appears_more_than_once = df.duplicated(subset=song_key, keep=False)
df_duplicate = df.loc[appears_more_than_once].sort_values(by='track_name')
df_duplicate

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,in_apple_charts,in_deezer_playlists,in_deezer_charts,in_shazam_charts,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
214,About Damn Time,Lizzo,1,2022,7,15,2332,2,723894473,0,0,25,0,0.0,109,A#,Minor,84,72,74,10,0,34,7
215,About Damn Time,Lizzo,1,2022,4,14,9021,0,723894473,242,49,272,21,24.0,109,A#,Minor,84,72,74,10,0,34,7
212,SNAP,Rosa Linn,1,2022,3,19,3202,18,726307468,148,80,226,24,0.0,170,,Major,56,53,64,11,0,45,6
221,SNAP,Rosa Linn,1,2022,3,19,1818,0,711366595,3,0,63,0,353.0,170,,Major,56,52,64,11,0,45,7
458,SPIT IN MY FACE!,ThxSoMch,1,2022,10,31,629,14,303216294,32,3,9,0,0.0,94,G#,Major,73,65,79,5,2,11,6
460,SPIT IN MY FACE!,ThxSoMch,1,2022,10,31,573,0,301869854,1,0,18,0,24.0,166,C#,Major,70,57,57,9,20,11,7
346,Take My Breath,The Weeknd,1,2021,8,6,6392,0,432702334,174,73,344,0,0.0,121,G#,Major,75,53,74,2,0,11,5
739,Take My Breath,The Weeknd,1,2021,8,6,2597,0,130655803,17,80,38,0,0.0,121,A#,Minor,70,35,77,1,0,26,4


In [252]:
# Preview which entry of each duplicated song survives de-duplication.
# For every (track name, artist) pair the entry with the most streams is kept;
# if streams tie, the entry present in more Spotify playlists wins.
# The redundant rows are derived from the data instead of being listed by index
# label, so the cell neither depends on how ties were ordered by the earlier
# sort nor fails with a KeyError when it is run a second time.
song_key = ['track_name', 'artist(s)_name']
best_first = df_duplicate.sort_values(
    ['streams', 'in_spotify_playlists'], ascending=False
)
# Within each song, everything after the first (best) entry is redundant.
redundant_labels = best_first.index[best_first.duplicated(subset=song_key, keep='first')]
# Dropping by label leaves the remaining rows in their existing order.
df_duplicate = df_duplicate.drop(index=redundant_labels)
df_duplicate


Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,in_apple_charts,in_deezer_playlists,in_deezer_charts,in_shazam_charts,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
215,About Damn Time,Lizzo,1,2022,4,14,9021,0,723894473,242,49,272,21,24.0,109,A#,Minor,84,72,74,10,0,34,7
212,SNAP,Rosa Linn,1,2022,3,19,3202,18,726307468,148,80,226,24,0.0,170,,Major,56,53,64,11,0,45,6
458,SPIT IN MY FACE!,ThxSoMch,1,2022,10,31,629,14,303216294,32,3,9,0,0.0,94,G#,Major,73,65,79,5,2,11,6
346,Take My Breath,The Weeknd,1,2021,8,6,6392,0,432702334,174,73,344,0,0.0,121,G#,Major,75,53,74,2,0,11,5


In [263]:
# Finalise the main dataframe: remove the redundant copies of duplicated songs,
# replace the remaining NaN values with 0 and renumber the rows from 0.
# For every (track name, artist) pair the entry with the most streams is kept;
# if streams tie, the entry present in more Spotify playlists wins.
# The redundant rows are derived from the data rather than hard-coded as index
# labels: because the index is reset at the end of this cell, a re-run with
# fixed labels would silently delete four unrelated songs.
song_key = ['track_name', 'artist(s)_name']
best_first = df.sort_values(['streams', 'in_spotify_playlists'], ascending=False)
# Within each song, everything after the first (best) entry is redundant.
redundant_labels = best_first.index[best_first.duplicated(subset=song_key, keep='first')]
# NOTE(review): fillna(0) applies to every column, so songs with a missing
# `key` end up with the number 0 in that text column - confirm this is intended.
df = df.drop(index=redundant_labels).fillna(0).reset_index(drop=True)
df

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,in_apple_charts,in_deezer_playlists,in_deezer_charts,in_shazam_charts,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Blinding Lights,The Weeknd,1,2019,11,29,43899,69,3703895074,672,199,3421,20,0.0,171,C#,Major,50,38,80,0,0,9,7
1,Shape of You,Ed Sheeran,1,2017,1,6,32181,10,3562543890,33,0,6808,7,0.0,96,C#,Minor,83,93,65,58,0,9,8
2,Someone You Loved,Lewis Capaldi,1,2018,11,8,17836,53,2887241814,440,125,1800,0,0.0,110,C#,Major,50,45,41,75,0,11,3
3,Dance Monkey,Tones and I,1,2019,5,10,24529,0,2864791672,533,167,3595,6,0.0,98,F#,Minor,82,54,59,69,0,18,10
4,Sunflower - Spider-Man: Into the Spider-Verse,"Post Malone, Swae Lee",2,2018,10,9,24094,78,2808096550,372,117,843,4,69.0,90,D,Major,76,91,50,54,0,7,5
5,One Dance,"Drake, WizKid, Kyla",3,2016,4,4,43257,24,2713922350,433,107,3631,0,26.0,104,C#,Major,77,36,63,1,0,36,5
6,STAY (with Justin Bieber),"Justin Bieber, The Kid Laroi",2,2021,7,9,17050,36,2665343922,492,99,798,31,0.0,170,C#,Major,59,48,76,4,0,10,5
7,Believer,Imagine Dragons,1,2017,1,31,18986,23,2594040133,250,121,2969,10,31.0,125,A#,Minor,77,74,78,4,0,23,11
8,Closer,"The Chainsmokers, Halsey",2,2016,5,31,28032,0,2591224264,315,159,2179,0,44.0,95,G#,Major,75,64,52,41,0,11,3
9,Starboy,"The Weeknd, Daft Punk",2,2016,9,21,29536,79,2565529693,281,137,2445,1,140.0,186,G,Major,68,49,59,16,0,13,28


In [265]:
# Write the cleaned dataframe to a new CSV file for the data-analysis step.
# The row index is written as well (the default), as an unnamed first column.
df.to_csv('spotify-2023Updated.csv', index=True)