The goal here is to get songs that were popular in different time intervals and:

1. check what feature was most correlated with popularity at that interval

2. see whether the features of popular songs changed over time, for example, whether danceability increased over the years in the popular songs or remained the same.

Here, a popular song is defined as one with a popularity score greater than 50.

In [14]:
# ! pip install pandas
import pandas as pd

df = pd.read_csv('spotify_data.csv')

X = df['year']

# Maps an interval label such as "1928 - 1933" to the correlation of every
# numeric column with 'popularity', computed over the popular songs
# (popularity > 50) of that interval.
popularity_corr = {}
for i in range(1928, 2020, 5):
    # NOTE(review): Series.between is inclusive on both ends by default, so
    # this window covers the years i through i + 5 and the boundary year
    # i + 5 is also part of the next window - confirm the overlap is intended.
    new_interval = df[(X.between(i, i + 5)) & (df['popularity'] > 50)]

    # numeric_only=True states explicitly that only numeric columns take part.
    # Relying on the implicit default triggers a FutureWarning in pandas 1.5
    # and raises in pandas 2.x if the frame holds non-numeric columns.
    popularity_corr[f"{i} - {i+5}"] = new_interval.corr(numeric_only=True)['popularity']


print(popularity_corr)


{'1928 - 1933': acousticness       -1.0
danceability        1.0
duration_ms        -1.0
energy             -1.0
explicit            NaN
instrumentalness    1.0
key                -1.0
liveness            1.0
loudness           -1.0
mode                NaN
popularity          1.0
speechiness         1.0
tempo              -1.0
valence             1.0
year                NaN
Name: popularity, dtype: float64, '1933 - 1938': acousticness        0.269212
danceability       -0.320882
duration_ms         0.202466
energy             -0.468155
explicit                 NaN
instrumentalness   -0.403839
key                -0.927708
liveness            0.581691
loudness           -0.996341
mode               -0.409644
popularity          1.000000
speechiness        -0.153886
tempo              -0.292582
valence             0.071775
year               -0.409644
Name: popularity, dtype: float64, '1938 - 1943': acousticness        8.313420e-01
danceability       -7.141084e-01
duration_ms        -5.011

  popularity_corr[f"{i} - {i+5}"] = new_interval.corr()['popularity']


From the first few results above, we see that when the interval is 5 years, we are not getting a lot of information in the correlation matrix. Therefore, we will now increase the interval to 10 years.

In [15]:
# ! pip install pandas
import pandas as pd

df = pd.read_csv('spotify_data.csv')

X = df['year']

# Maps an interval label such as "1928 - 1938" to the correlation of every
# numeric column with 'popularity', computed over the popular songs
# (popularity > 50) of that interval.
popularity_corr = {}

for i in range(1928, 2020, 10):
    # NOTE(review): Series.between is inclusive on both ends by default, so
    # this window covers the years i through i + 10 and the boundary year
    # i + 10 is also part of the next window - confirm the overlap is intended.
    new_interval = df[(X.between(i, i + 10)) & (df['popularity'] > 50)]

    # numeric_only=True states explicitly that only numeric columns take part.
    # Relying on the implicit default triggers a FutureWarning in pandas 1.5
    # and raises in pandas 2.x if the frame holds non-numeric columns.
    popularity_corr[f"{i} - {i+10}"] = new_interval.corr(numeric_only=True)['popularity']


print(popularity_corr)

{'1928 - 1938': acousticness        0.269212
danceability       -0.320882
duration_ms         0.202466
energy             -0.468155
explicit                 NaN
instrumentalness   -0.403839
key                -0.927708
liveness            0.581691
loudness           -0.996341
mode               -0.409644
popularity          1.000000
speechiness        -0.153886
tempo              -0.292582
valence             0.071775
year               -0.409644
Name: popularity, dtype: float64, '1938 - 1948': acousticness       -0.081570
danceability       -0.260637
duration_ms        -0.544042
energy             -0.206500
explicit                 NaN
instrumentalness   -0.421290
key                 0.209416
liveness            0.494983
loudness            0.053249
mode                0.036555
popularity          1.000000
speechiness        -0.472676
tempo               0.165327
valence             0.108494
year                0.157662
Name: popularity, dtype: float64, '1948 - 1958': acousticness    

  popularity_corr[f"{i} - {i+10}"] = new_interval.corr()['popularity']


Now we are getting a lot more information about the features strongly correlated with popularity in each interval, so we can compute the full correlation matrix and save it in a variable. We now want to see whether, if a song is popular, we can guess the year based on the features that make it popular.

In [18]:
import pandas as pd

df = pd.read_csv('spotify_data.csv')

X = df['year']

# Maps an interval label such as "1928 - 1938" to the full correlation matrix
# of the popular songs of that interval.
popularity_corr = {}

for i in range(1928, 2020, 10):
    # Recall that we defined popularity as a score greater than 50, so those
    # are the songs we are interested in.
    # NOTE(review): Series.between is inclusive on both ends by default, so
    # the boundary year i + 10 is also part of the next window - confirm the
    # overlap is intended.
    new_interval = df[(X.between(i, i + 10)) & (df['popularity'] > 50)]

    # numeric_only=True states explicitly that only numeric columns take part.
    # Relying on the implicit default triggers a FutureWarning in pandas 1.5
    # and raises in pandas 2.x if the frame holds non-numeric columns.
    popularity_corr[f"{i} - {i+10}"] = new_interval.corr(numeric_only=True)

  popularity_corr[f"{i} - {i+10}"] = new_interval.corr()


We want to save the results to a CSV so that the operation does not have to be repeated each time.

In [11]:

# In-memory table with one row per interval: the interval label and the
# correlation result object stored for it in popularity_corr.
corr_df = pd.DataFrame(list(popularity_corr.items()), columns=['interval', 'correlation matrix'])

# Writing corr_df itself to CSV would store each nested correlation object as
# its printed text, which cannot be loaded back as numbers. Instead, stack the
# per-interval results into one long table keyed by (interval, feature).
# It can be reloaded with:
#   pd.read_csv('popular_corr_matrix_10_yr_inter.csv', index_col=['interval', 'feature'])
pd.concat(popularity_corr, names=['interval', 'feature']).to_csv('popular_corr_matrix_10_yr_inter.csv')

Now, we use the csv created.

In [35]:
import pandas as pd
import numpy as np

popularity_corr_df = pd.read_csv('popular_corr_matrix_10_yr_inter.csv')

# NOTE(review): popularity_corr_df is loaded but not used below; the loop works
# on the in-memory corr_df, whose 'correlation matrix' column holds the
# DataFrame objects themselves.

for row_interval, row_corr_matrix in zip(corr_df['interval'], corr_df['correlation matrix']):
    # get the absolute values
    abs_vals = row_corr_matrix.abs()

    # unstack the correlation matrix, i.e. change it from the table-like
    # structure to a flat listing indexed by (variable 1, variable 2)
    unstacked_corr = abs_vals.unstack()

    # sort values in descending order
    sort_values_desc = unstacked_corr.sort_values(ascending=False)

    pairs = set()
    positions = []
    records = []

    # Series.items() replaces Series.iteritems(), which is deprecated in
    # pandas 1.5 and removed in pandas 2.0.
    for position, (index, value) in enumerate(sort_values_desc.items()):
        # index is the (variable 1, variable 2) pair, value is the absolute
        # correlation of that pair.
        # Keep the pair if the absolute correlation is at least 0.5, the two
        # variables differ, and neither the pair nor its mirror image has
        # already been saved.
        if value >= 0.5 \
            and index[0] != index[1] \
            and (index[0], index[1]) not in pairs \
            and (index[1], index[0]) not in pairs:
                # Store the signed coefficient, not the absolute value.
                records.append([index[0], index[1], row_corr_matrix.loc[(index[0], index[1])]])
                # The row label is the pair's rank in the sorted listing.
                positions.append(position)
                pairs.add((index[0], index[1]))

    # Build the table once; the columns exist even when no pair qualifies.
    result = pd.DataFrame(
        records,
        index=positions,
        columns=['variable 1', 'variable 2', 'correlation coefficient'],
    )

    # features most correlated with popularity
    feat_cor_popu = result[(result['variable 1'] == 'popularity') | (result['variable 2'] == 'popularity')]

    # features most correlated with year
    feat_cor_yr = result[(result['variable 1'] == 'year') | (result['variable 2'] == 'year')]

    yr_corr_list = list(feat_cor_yr.items())

    # TODO: get the features correlated with both popularity and year
    print(yr_corr_list)

    # Only the first interval is processed for now.
    break
    


[('variable 1', 0                  year
16     instrumentalness
24                 year
36         danceability
48                 year
56                 year
66          duration_ms
76                 year
106             valence
126                year
130                year
Name: variable 1, dtype: object), ('variable 2', 0              mode
16             year
24           energy
36             year
48     acousticness
56         liveness
66             year
76      speechiness
106            year
126           tempo
130             key
Name: variable 2, dtype: object), ('correlation coefficient', 0      1.000000
16     0.999980
24     0.997879
36     0.995453
48    -0.988847
56    -0.980316
66    -0.976291
76     0.964418
106    0.880490
126   -0.752472
130    0.720577
Name: correlation coefficient, dtype: float64)]


  for index, value in sort_values_desc.iteritems():


The aim of this section is to try to estimate the genres based on the clusters formed

In [1]:
# ! pip install scikit-learn
# TSNE comes from scikit-learn's manifold module.
from sklearn.manifold import TSNE

# Construct a TSNE object with its default arguments. The object is not
# assigned to a name and is not fitted to any data in this cell.
TSNE()



You should consider upgrading via the 'C:\Users\user\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


Here, we want to split the data into intervals of 10 years to see, over time, which feature was most correlated with popularity in each interval.

In [None]:
# Popular songs (popularity > 50) whose year lies between 1928 and 1938,
# both ends included (Series.between is inclusive by default).
# NOTE(review): the name says "5_years" but the window spans 1928-1938;
# the name is kept so other cells that refer to it still work.
df_f_5_years = df[(X.between(1928, 1938)) & (df['popularity'] > 50)]

# Correlation matrix of those songs. numeric_only=True states explicitly that
# only numeric columns take part; relying on the implicit default triggers a
# FutureWarning in pandas 1.5 and raises in pandas 2.x if the frame holds
# non-numeric columns.
rq = df_f_5_years.corr(numeric_only=True)
rq

In [None]:
# Print every (index, value) entry of s, from the largest value to the smallest.
# s is defined outside this cell.
# Series.items() replaces Series.iteritems(), which is deprecated in pandas 1.5
# and removed in pandas 2.0.
for index, value in s.sort_values(ascending=False).items():
    print(index, value)

In [33]:
# Pairs already reported, so that (a, b) and (b, a) are listed only once.
pairs = set()

# One [variable 1, variable 2, coefficient] entry per reported pair.
rows = []

print("|    Variable 1    |    Variable 2    | Correlation Coefficient    |")
print("|------------------|------------------|----------------------------|")

# s and rq are defined outside this cell. The threshold is applied to the
# values of s, while the coefficient that is printed and stored is read from rq.
# Series.items() replaces Series.iteritems(), which is deprecated in pandas 1.5
# and removed in pandas 2.0.
for index, value in s.sort_values(ascending=False).items():
    if value >= 0.5 and index[0] != index[1] and (index[0], index[1]) not in pairs and (index[1], index[0]) not in pairs:
        coefficient = rq.loc[(index[0], index[1])]
        print(f'|    {index[0]}    |    {index[1]}    |    {coefficient}    |')
        rows.append([index[0], index[1], coefficient])
        pairs.add((index[0], index[1]))

# Build the table once instead of enlarging it row by row; rows are labelled
# 0, 1, 2, ... in reporting order and the columns exist even when no pair
# qualifies.
result = pd.DataFrame(rows, columns=['Variable 1', 'Variable 2', 'Correlation Coefficient'])


|    Variable 1    |    Variable 2    | Correlation Coefficient    |
|------------------|------------------|----------------------------|
|    year    |    mode    |    1.0000000000000142    |
|    instrumentalness    |    year    |    0.9999798116611293    |
|    mode    |    instrumentalness    |    0.9999798116611152    |
|    speechiness    |    duration_ms    |    -0.9987812465198703    |
|    acousticness    |    danceability    |    -0.998537676273812    |
|    year    |    energy    |    0.9978793037385202    |
|    energy    |    mode    |    0.997879303738506    |
|    acousticness    |    duration_ms    |    0.9976411815743371    |
|    energy    |    instrumentalness    |    0.9974455520908899    |
|    loudness    |    popularity    |    -0.996340880260685    |
|    instrumentalness    |    danceability    |    0.9960377892792965    |
|    danceability    |    year    |    0.9954525926991463    |
|    mode    |    danceability    |    0.9954525926991322    |
|    acousticn

  for index, value in s.sort_values(ascending=False).iteritems():


In [5]:
# Correlation of every numeric column with 'popularity' over the whole dataset.
# numeric_only=True states explicitly that only numeric columns take part;
# relying on the implicit default triggers a FutureWarning in pandas 1.5 and
# raises in pandas 2.x if the frame holds non-numeric columns.
df.corr(numeric_only=True)['popularity']

  df.corr()['popularity']


acousticness       -0.593345
danceability        0.221077
duration_ms         0.063292
energy              0.497488
explicit            0.214044
instrumentalness   -0.299829
key                 0.010675
liveness           -0.075293
loudness            0.466546
mode               -0.032854
popularity          1.000000
speechiness        -0.135707
tempo               0.135047
valence             0.009327
year                0.880724
Name: popularity, dtype: float64