# Exploratory data analysis


### Importing the data

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5000)
%matplotlib inline

Settings for getting figures in high resolution

In [3]:
# Optional seaborn settings for high-resolution figures (currently disabled).
# Uncomment the lines below to apply them.
# sns.set(rc={"figure.dpi": 300, 'savefig.dpi': 300})
# sns.set_context('notebook')
# sns.set_style("ticks")

In [4]:
# Load the cleaned popularity dataset, drop the leftover 'index' and 'level_0'
# columns (the original note attributes them to pickling the file), and
# renumber the rows with a fresh default index.
df = (
    pd.read_pickle('./dataset_/dataset_cleaned_popularity.pkl')
    .drop(columns=['index', 'level_0'])
    .reset_index(drop=True)
)

In [5]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,country,uri,popularity,title,artist,release_type,sub_genre,artist_followers,explicit,album,release_date,track_number,tracks_in_album,danceability,energy,key,loudness,mode,speechiness,acoustics,instrumentalness,liveness,valence,tempo,time_signature,genre,days_since_release,released_after_2017,explicit_false,explicit_true,popu_max,top10_dummy,top50_dummy,cluster,duration_min,release_year,release_month,release_day
0,Global,https://open.spotify.com/track/6FyRXC8tJUh863J...,31833.95,adan y eva,Paulo Londra,single,argentine hip hop,11427104.0,False,Adan y Eva,2018-11-05,1,1,0.767,0.709,1,-4.47,1,0.336,0.323,0.0,0.0676,0.72,171.993,4,hip hop,740.0,1.0,1,0,10,1.0,1.0,global,4.31065,2018,11,5
1,USA,https://open.spotify.com/track/6FyRXC8tJUh863J...,8.0,adan y eva,Paulo Londra,single,argentine hip hop,11427104.0,False,Adan y Eva,2018-11-05,1,1,0.767,0.709,1,-4.47,1,0.336,0.323,0.0,0.0676,0.72,171.993,4,hip hop,740.0,1.0,1,0,191,0.0,0.0,english speaking and nordic,4.31065,2018,11,5
2,Argentina,https://open.spotify.com/track/6FyRXC8tJUh863J...,76924.4,adan y eva,Paulo Londra,single,argentine hip hop,11427104.0,False,Adan y Eva,2018-11-05,1,1,0.767,0.709,1,-4.47,1,0.336,0.323,0.0,0.0676,0.72,171.993,4,hip hop,740.0,1.0,1,0,1,1.0,1.0,spanish speaking,4.31065,2018,11,5
3,Belgium,https://open.spotify.com/track/6FyRXC8tJUh863J...,849.6,adan y eva,Paulo Londra,single,argentine hip hop,11427104.0,False,Adan y Eva,2018-11-05,1,1,0.767,0.709,1,-4.47,1,0.336,0.323,0.0,0.0676,0.72,171.993,4,hip hop,740.0,1.0,1,0,126,0.0,0.0,english speaking and nordic,4.31065,2018,11,5
4,Switzerland,https://open.spotify.com/track/6FyRXC8tJUh863J...,20739.1,adan y eva,Paulo Londra,single,argentine hip hop,11427104.0,False,Adan y Eva,2018-11-05,1,1,0.767,0.709,1,-4.47,1,0.336,0.323,0.0,0.0676,0.72,171.993,4,hip hop,740.0,1.0,1,0,21,0.0,1.0,english speaking and nordic,4.31065,2018,11,5


Take a look at the Global playlist to understand why it contains so many songs. The assumption is that each playlist should contain only 200 songs.

In [15]:
# Count how often each title occurs among the rows of the 'Global' playlist.
df.loc[df['country'].eq('Global'), 'title'].value_counts()

intro                                                                                        8
paradise                                                                                     6
all night                                                                                    6
sleigh ride                                                                                  6
have yourself a merry little christmas                                                       5
x                                                                                            5
baby                                                                                         5
christmas time                                                                               5
changes                                                                                      5
stay                                                                                         5
christmas                                         

Inspect the titles that occur multiple times, using the song "sleigh ride" as an example.

In [16]:
# All 'Global' rows whose title is 'sleigh ride'.
df.loc[df['country'].eq('Global') & df['title'].eq('sleigh ride')]

Unnamed: 0,country,uri,popularity,title,artist,release_type,sub_genre,artist_followers,explicit,album,release_date,track_number,tracks_in_album,danceability,energy,key,loudness,mode,speechiness,acoustics,instrumentalness,liveness,valence,tempo,time_signature,genre,days_since_release,released_after_2017,explicit_false,explicit_true,popu_max,top10_dummy,top50_dummy,cluster,duration_min,release_year,release_month,release_day
15215,Global,https://open.spotify.com/track/4ukUoXLuFzMixyZ...,200.15,sleigh ride,Ella Fitzgerald,album,adult standards,1766898.0,False,Ella Wishes You A Swinging Christmas (Expanded...,1960-01-01,5,18,0.508,0.287,1,-12.472,1,0.0523,0.764,0.0,0.153,0.644,154.759,4,pop,22224.0,0.0,1,0,79,0.0,0.0,global,2.933117,1960,1,1
16619,Global,https://open.spotify.com/track/597IVBYr60vm0HK...,248.55,sleigh ride,Carpenters,album,adult standards,1562181.0,False,Christmas Portrait (Special Edition),1978,5,21,0.694,0.243,11,-15.632,1,0.0723,0.871,0.0,0.165,0.578,100.774,4,pop,15115.0,False,1,0,90,0.0,0.0,global,2.657333,1978,1,1
38848,Global,https://open.spotify.com/track/6XAdcAseYtijN0Q...,76.0,sleigh ride,Johnny Mathis,album,adult standards,243064.0,False,Merry Christmas,1958-10-06,3,12,0.605,0.663,1,-8.949,1,0.0274,0.699,0.0,0.121,0.893,103.18,4,pop,22676.0,0.0,1,0,132,0.0,0.0,global,2.978667,1958,10,6
46766,Global,https://open.spotify.com/track/1ODZGydlown4Ves...,2169.35,sleigh ride,The Ronettes,album,brill building pop,153854.0,False,NOW 100 Hits Christmas,2019-11-08,9,100,0.527,0.768,2,-7.557,1,0.0304,0.457,2e-06,0.304,0.799,91.75,4,pop,372.0,1.0,1,0,23,0.0,1.0,global,3.013883,2019,11,8
49133,Global,https://open.spotify.com/track/5ymVfeMK8cDew5v...,76.0,sleigh ride,Andy Williams,album,adult standards,315467.0,False,Merry Christmas,1965,1,12,0.571,0.634,5,-9.155,0,0.0537,0.724,0.0,0.0864,0.805,109.575,4,pop,19863.0,False,1,0,138,0.0,0.0,global,2.14,1965,1,1
76936,Global,https://open.spotify.com/track/5ASM6Qjiav2xPe7...,1368.15,sleigh ride,The Ronettes,compilation,brill building pop,153854.0,False,A Christmas Gift For You From Phil Spector,1963,5,13,0.529,0.772,2,-7.013,1,0.0287,0.403,2e-06,0.316,0.853,91.751,4,pop,20594.0,False,1,0,20,0.0,1.0,global,3.021117,1963,1,1


In this example we have multiple songs with the same name, or the same song placed in different albums. The problem is that a song listed in two playlists should still have the same popularity. The numeric music features also differ between the two versions of the same song, which may be because one version has a newer release date and possibly better audio quality.

In [20]:
# Count how often each track URI occurs among the rows of the 'Global' playlist.
df.loc[df['country'].eq('Global'), 'uri'].value_counts()


https://open.spotify.com/track/6FyRXC8tJUh863JCkyWqtk    1
https://open.spotify.com/track/3kml9ZdpT7UYk1ugkjqOzt    1
https://open.spotify.com/track/5p3JJehpKHjr9pJT5eD2Lz    1
https://open.spotify.com/track/64kfyGcf5dvbw92Vv4THCj    1
https://open.spotify.com/track/6cX1iTffMIAU8lEWSOYIIO    1
                                                        ..
https://open.spotify.com/track/4TzXqUIcdpAmre24puQ6Uu    1
https://open.spotify.com/track/6XjEOPgbgHKcaHcunEHSJJ    1
https://open.spotify.com/track/05KOgYg8PGeJyyWBPi5ja8    1
https://open.spotify.com/track/3ZVw1HR1QAQoKBygbhG4CE    1
https://open.spotify.com/track/7cfvP13Bqb1EyCZQiYsLum    1
Name: uri, Length: 5423, dtype: int64

The top-200 ranking is of course not static over a period of 3 years, so many more than 200 distinct songs are listed over the years.

In [10]:
# Columns that make up df_numeric, in the order they appear in the result.
NUMERIC_COLUMNS = [
    'popularity',
    'artist_followers',
    'track_number',
    'tracks_in_album',
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acoustics',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'days_since_release',
    'popu_max',
    'duration_min',
]

# Sub-frame of df restricted to the columns listed above.
df_numeric = df[NUMERIC_COLUMNS]