# Library imports

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Data Exploration

In [3]:
# Read the Spotify tracks dataset; the relative path means the CSV is looked up
# in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="spotify_data.csv")

In [4]:
# Preview the first five rows to get a feel for the columns and their values.
df.head(n=5)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [8]:
# Dataset dimensions, displayed as a (rows, columns) tuple.
row_count, column_count = df.shape
(row_count, column_count)

(169909, 19)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary_stats = df.describe()
summary_stats

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,year
count,169909.0,169909.0,169909.0,169909.0,169909.0,169909.0,169909.0,169909.0,169909.0,169909.0,169909.0,169909.0,169909.0,169909.0,169909.0
mean,0.493214,0.53815,231406.2,0.488593,0.084863,0.161937,5.200519,0.20669,-11.370289,0.708556,31.55661,0.094058,116.948017,0.532095,1977.223231
std,0.376627,0.175346,121321.9,0.26739,0.278679,0.309329,3.515257,0.176796,5.666765,0.454429,21.582614,0.149937,30.726937,0.262408,25.593168
min,0.0,0.0,5108.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,1921.0
25%,0.0945,0.417,171040.0,0.263,0.0,0.0,2.0,0.0984,-14.47,0.0,12.0,0.0349,93.516,0.322,1957.0
50%,0.492,0.548,208600.0,0.481,0.0,0.000204,5.0,0.135,-10.474,1.0,33.0,0.045,114.778,0.544,1978.0
75%,0.888,0.667,262960.0,0.71,0.0,0.0868,8.0,0.263,-7.118,1.0,48.0,0.0754,135.712,0.749,1999.0
max,0.996,0.988,5403500.0,1.0,1.0,1.0,11.0,1.0,3.855,1.0,100.0,0.969,244.091,1.0,2020.0


In [9]:
# Storage type that pandas inferred for each column when reading the CSV.
column_dtypes = df.dtypes
column_dtypes

acousticness        float64
artists              object
danceability        float64
duration_ms           int64
energy              float64
explicit              int64
id                   object
instrumentalness    float64
key                   int64
liveness            float64
loudness            float64
mode                  int64
name                 object
popularity            int64
release_date         object
speechiness         float64
tempo               float64
valence             float64
year                  int64
dtype: object

## Are there missing values ?

In [29]:
# Flag every column that holds at least one missing value, then reduce those
# per-column flags to a single yes/no answer for the whole dataset.
columns_with_missing = df.isna().any()
columns_with_missing.any()

False

# Data Restructuring

In [31]:
# release_date is inconsistent: some rows carry a full date, others only a
# year. Keeping just the year column seems sufficient, so release_date is
# dropped here (df itself is left untouched; the result is a new frame).
new_df = df.drop(labels="release_date", axis="columns")

In [32]:
# Sanity check on the transform: dropping release_date must remove exactly one
# column and leave the row count untouched. Fail loudly instead of relying on
# a visual comparison of the two shapes.
assert new_df.shape == (df.shape[0], df.shape[1] - 1), (
    "unexpected shape after dropping release_date"
)
new_df.shape

(169909, 18)

# Correlation between columns

In [39]:
# Pairwise Pearson correlation between the remaining columns. name, artists
# and id hold text (object dtype), and year and key are left out of this
# analysis as well, so all five are removed before computing the matrix.
excluded_columns = ["name", "artists", "year", "id", "key"]
new_df2 = new_df.drop(columns=excluded_columns)
new_df2.corr(method="pearson", numeric_only=False)

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,liveness,loudness,mode,popularity,speechiness,tempo,valence
acousticness,1.0,-0.26595,-0.079311,-0.750283,-0.25369,0.335821,-0.023871,-0.567072,0.046475,-0.593345,-0.056077,-0.204982,-0.18554
danceability,-0.26595,1.0,-0.1345,0.220569,0.241891,-0.281429,-0.105532,0.29417,-0.045306,0.221077,0.225305,-0.004872,0.560242
duration_ms,-0.079311,-0.1345,1.0,0.036396,-0.043811,0.084814,0.03427,-0.014687,-0.046981,0.063292,-0.058449,-0.028816,-0.19876
energy,-0.750283,0.220569,0.036396,1.0,0.142677,-0.287692,0.126293,0.782982,-0.038355,0.497488,-0.045226,0.249936,0.350086
explicit,-0.25369,0.241891,-0.043811,0.142677,1.0,-0.138292,0.039272,0.152695,-0.083221,0.214044,0.413074,0.011484,-0.022327
instrumentalness,0.335821,-0.281429,0.084814,-0.287692,-0.138292,1.0,-0.047397,-0.417033,-0.035051,-0.299829,-0.115735,-0.10757,-0.193929
liveness,-0.023871,-0.105532,0.03427,0.126293,0.039272,-0.047397,1.0,0.052985,0.005393,-0.075293,0.147667,0.008124,-0.000426
loudness,-0.567072,0.29417,-0.014687,0.782982,0.152695,-0.417033,0.052985,1.0,-0.013147,0.466546,-0.105796,0.211114,0.308418
mode,0.046475,-0.045306,-0.046981,-0.038355,-0.083221,-0.035051,0.005393,-0.013147,1.0,-0.032854,-0.057493,0.014539,0.014727
popularity,-0.593345,0.221077,0.063292,0.497488,0.214044,-0.299829,-0.075293,0.466546,-0.032854,1.0,-0.135707,0.135047,0.009327


From this pairwise correlation matrix, the following pairs of columns have a "strong" correlation (here, we consider
a correlation strong when the absolute value of the Pearson coefficient satisfies |r| >= 0.5):
- acousticness and energy are strongly correlated with r = -0.75
- acousticness is also correlated to loudness with r = -0.57
- acousticness and popularity are correlated with r = -0.59
- danceability is correlated to valence with r = 0.56
- energy is strongly correlated to loudness with r = 0.78

From those observations, some questions pop up :
- will PCA be enough to visualize the underlying structure of these data? In fact, 6 of the 13 columns appear in the strongly correlated pairs listed above, and we know that PCA relies on linear dependence between features. So, is there any linear dependence among all these features?
- can we use acousticness to predict energy, loudness and/or popularity?
- can we use danceability to predict valence ?
- can we use energy to predict loudness ?

See you on the next episode(which is the remainder of this project 😉) !!!