In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

### Reading CSV files

In [2]:
# Load the streaming-history CSV into the dataframe used by the cells below.
STREAM_HISTORY_CSV = './dataset/csv_data/streaming_history_data.csv'
df_stream_history = pd.read_csv(STREAM_HISTORY_CSV)

Previewing the first rows of the dataframe

In [3]:
# Preview the first five rows (five is head()'s default).
df_stream_history.head()

Unnamed: 0,song_id,artist_name,track_name,uri_id,release_date,type,year,month,day,explicit,...,ms_played,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,4PV0uE5pZSh44E3NqNNDEH,Madison Beer,Selfish,spotify:track:4PV0uE5pZSh44E3NqNNDEH,2020-02-14,track,2020,11,17,False,...,132290,0.375,0.461,-6.202,0.0279,0.627,0.0,0.386,0.233,75.217
1,4PV0uE5pZSh44E3NqNNDEH,Madison Beer,Selfish,spotify:track:4PV0uE5pZSh44E3NqNNDEH,2020-02-14,track,2020,11,18,False,...,68420,0.375,0.461,-6.202,0.0279,0.627,0.0,0.386,0.233,75.217
2,6SBJ2XHc4jm6Abqt7zocMz,Demi Lovato,Still Have Me,spotify:track:6SBJ2XHc4jm6Abqt7zocMz,2020-10-01,track,2020,11,18,False,...,11750,0.281,0.462,-6.638,0.0674,0.498,0.0,0.108,0.367,75.975
3,76cy1WJvNGJTj78UqeA5zr,Dua Lipa,IDGAF,spotify:track:76cy1WJvNGJTj78UqeA5zr,2017-06-02,track,2020,11,18,True,...,217946,0.836,0.544,-5.975,0.0943,0.0403,0.0,0.0824,0.51,97.028
4,4NSW0Km5ZG60L8FthUebPJ,Jonas Blue,What I Like About You (feat. Theresa Rex),spotify:track:4NSW0Km5ZG60L8FthUebPJ,2019-03-22,track,2020,11,18,False,...,220396,0.46,0.8,-3.584,0.05,0.289,6e-06,0.121,0.553,102.46


In [4]:
# Inspect the dtype of every column. In the recorded output release_date and
# end_time are `object`, and the play-time column is named ' ms_played' with a
# leading space, so later cells must use that exact key.
df_stream_history.dtypes

song_id              object
artist_name          object
track_name           object
uri_id               object
release_date         object
type                 object
year                  int64
month                 int64
day                   int64
explicit               bool
popularity            int64
end_time             object
 ms_played            int64
danceability        float64
energy              float64
loudness            float64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
dtype: object

Converting release_date to datetime

In [5]:
# Parse the release_date column into datetimes and store it back on the frame.
parsed_release_dates = pd.to_datetime(df_stream_history['release_date'])
df_stream_history['release_date'] = parsed_release_dates

## Streaming Analysis

In [6]:
# List the available column labels before starting the analysis.
df_stream_history.columns

Index(['song_id', 'artist_name', 'track_name', 'uri_id', 'release_date',
       'type', 'year', 'month', 'day', 'explicit', 'popularity', 'end_time',
       ' ms_played', 'danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'],
      dtype='object')

In [7]:
# Artists that appear in more than 10 rows of the streaming history.
# value_counts() already orders them from most to least frequent.
rows_per_artist = df_stream_history['artist_name'].value_counts()
frequent_artists = rows_per_artist[rows_per_artist > 10]
artist_name = frequent_artists.index
artist_count = frequent_artists.values

artist_bar = go.Bar(
    x=artist_name,
    y=artist_count,
    marker={'color': artist_count},  # shade each bar by its own count
    text=artist_name,
)
fig = go.Figure(data=[artist_bar])

artist_layout = dict(
    title_text='Count of Artists listened',
    barmode='group',
    xaxis_tickangle=45,
    xaxis_title="Name",
    yaxis_title="Count",
    hovermode='x',
    height=650,
    width=1210,
)
fig.update_layout(**artist_layout)
fig.show()


Most listened-to songs

In [8]:
# Tracks that appear in more than 5 rows of the streaming history,
# ordered from most to least frequent by value_counts().
rows_per_track = df_stream_history['track_name'].value_counts()
frequent_tracks = rows_per_track[rows_per_track > 5]
track_name = frequent_tracks.index
track_count = frequent_tracks.values

track_bar = go.Bar(
    x=track_name,
    y=track_count,
    marker={'color': track_count},  # shade each bar by its own count
    text=track_name,
    textposition='inside',
)
fig = go.Figure(data=[track_bar])

track_layout = dict(
    title_text='Count of tracks listened',
    barmode='group',
    xaxis_tickangle=45,
    xaxis_title="track_name",
    yaxis_title="track_count",
    height=800,
    width=1210,
    hovermode='x',
)
fig.update_layout(**track_layout)
fig.show()

Songs listened to per month

In [9]:
# Number of streamed rows per calendar month.
# value_counts() only returns the months that actually occur in the data, so
# the counts are reindexed against 1..12 (absent months become 0). Without
# this, a missing month would shift every later count onto the wrong label,
# because the labels below are matched to the counts purely by position.
month = (
    df_stream_history['month']
    .value_counts()
    .reindex(range(1, 13), fill_value=0)
)

# One label per calendar month, in the same 1..12 order as `month`.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'July', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec']

fig = go.Figure(
    data=[
        go.Bar(
            x = months, 
            y = month.values,
            marker = {
                'color' : month.values  # shade each bar by its own count
            },
            text = months,
            textposition = 'inside'
        )
    ]
)
fig.update_layout(
    title_text='Count of songs listened as per month',
    barmode='group', 
    xaxis_title = "month",
    yaxis_title = "count",
    height = 800,
    width = 1210,
    hovermode = 'x'
)
fig.show()

Pie chart for explicit content

In [10]:
# Count rows by their `explicit` flag. Sorting the index puts False before
# True, so when both values occur the colours pair up positionally as
# False -> lightgreen, True -> red.
explicit_content = df_stream_history['explicit'].value_counts().sort_index(ascending=True)
labels, values = explicit_content.index, explicit_content.values
colors = ['lightgreen', 'red']

donut = go.Pie(labels=labels, values=values, hole=.3)
fig = go.Figure(data=[donut])
fig.update_layout(
    title_text='Explicit songs distribution',
    height=600,
    width=900,
    hovermode='x',
)

# Show raw counts on the slices, percentages on hover, and outline each slice in black.
slice_marker = {
    'colors': colors,
    'line': {'color': '#000000', 'width': 2},
}
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='value',
    textfont_size=20,
    marker=slice_marker,
)
fig.show()

Heatmap of song attributes

In [11]:
# Keep only the numeric audio-feature columns that the correlation is computed on.
audio_feature_columns = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]
correlation = df_stream_history[audio_feature_columns]

In [12]:
# Spearman rank correlation between the audio features.
# Computed straight from df_stream_history instead of reassigning
# `correlation` from itself: the self-referential form is not idempotent,
# because re-running the cell would correlate the correlation matrix.
correlation = df_stream_history[
    [
        'danceability',
        'energy',
        'loudness',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
    ]
].corr(method='spearman')

In [13]:
# Display the feature-by-feature Spearman correlation matrix.
correlation

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
danceability,1.0,0.042307,0.131355,0.242165,-0.089197,-0.022703,-0.094747,0.397129,-0.101531
energy,0.042307,1.0,0.711381,0.04046,-0.530361,-0.057683,0.08729,0.337796,0.087466
loudness,0.131355,0.711381,1.0,0.014232,-0.388136,-0.271188,0.043753,0.328879,0.017494
speechiness,0.242165,0.04046,0.014232,1.0,0.024421,-0.124487,0.000951,0.105774,0.056379
acousticness,-0.089197,-0.530361,-0.388136,0.024421,1.0,-0.004602,-0.024566,-0.15301,-0.126377
instrumentalness,-0.022703,-0.057683,-0.271188,-0.124487,-0.004602,1.0,-0.053366,-0.132799,0.007457
liveness,-0.094747,0.08729,0.043753,0.000951,-0.024566,-0.053366,1.0,-0.01625,0.004022
valence,0.397129,0.337796,0.328879,0.105774,-0.15301,-0.132799,-0.01625,1.0,0.002955
tempo,-0.101531,0.087466,0.017494,0.056379,-0.126377,0.007457,0.004022,0.002955,1.0


In [14]:
# Render the correlation matrix as a heatmap; the colour bar is labelled "Correlation".
colorbar_labels = {'color': "Correlation"}
fig = px.imshow(correlation, labels=colorbar_labels)

heatmap_layout = {
    'title_text': 'Heatmap Correlation between songs attributes',
    'height': 600,
    'width': 800,
}
fig.update_layout(**heatmap_layout)
fig.show()

Popular artist

In [15]:
# Artist name alongside the popularity value recorded on each row.
popularity_columns = ['artist_name', 'popularity']
popularity = df_stream_history[popularity_columns]

In [16]:
# Rank artists by the highest popularity value recorded for each of them,
# then take the first row (highest) and the last row (lowest) of that ranking.
artist_peak_popularity = (
    popularity
    .groupby('artist_name')
    .max()
    .reset_index()
    .sort_values(by='popularity', ascending=False)
)
most_popular = artist_peak_popularity.head(1)
least_popular = artist_peak_popularity.tail(1)

In [17]:
# Each frame holds a single row, so the join yields just that artist's name.
top_artist = " ".join(most_popular["artist_name"])
bottom_artist = " ".join(least_popular["artist_name"])
print(f'Most popular: {top_artist}')
print(f'least popular: {bottom_artist}')

Most popular: Adele
least popular: Amel Bent


Radar Plot

In [18]:
# radar plot
song_attributes = df_stream_history[['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]

In [19]:
# One spoke per audio feature; the radius is that feature's mean over all rows.
# NOTE(review): the means are plotted unscaled, so features on larger numeric
# ranges will dominate the shape -- confirm this is intended.
feature_means = song_attributes.mean()
spoke_labels = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]
radar_trace = go.Scatterpolar(r=feature_means, theta=spoke_labels, fill='tonext')

fig = go.Figure(data=radar_trace)
fig.update_layout(
    polar={'radialaxis': {'visible': True}},
    showlegend=False,
    title_text='Radar chart for songs features',
)

fig.show()

Correlation between song attributes

In [20]:
def get_correlation_song_attribute(x, y):
    """Scatter one song attribute against another.

    Plots column ``x`` against column ``y`` of the notebook-level
    ``df_stream_history``, coloured by the ``explicit`` flag, with one facet
    row per ``year`` and a box plot in the y margin.

    Args:
        x: Column name used for the horizontal axis.
        y: Column name used for the vertical axis.

    Returns:
        The Plotly Express figure, titled with both attribute names.
    """
    scatter_options = dict(
        x=x,
        y=y,
        color="explicit",
        facet_row="year",
        marginal_y="box",
    )
    fig = px.scatter(df_stream_history, **scatter_options)
    fig.update_layout(title=f'Correlation between {x} and {y} attributes')
    return fig

In [21]:
# Scatter speechiness (x) against danceability (y); the returned figure is
# the cell's last expression, so the notebook renders it.
get_correlation_song_attribute('speechiness', 'danceability')

In [22]:
# Scatter energy (x) against liveness (y); the returned figure is the cell's
# last expression, so the notebook renders it.
get_correlation_song_attribute('energy', 'liveness')

Total time listened per artist (in seconds)

In [70]:
# Total play time per artist, largest first, keeping the top 50 artists.
minutes_listend = df_stream_history[['artist_name', 'track_name', ' ms_played']]

# Sum only ' ms_played'. Summing the whole frame would also "sum" the
# track_name strings, concatenating every title per artist into a
# meaningless column.
group_ms_played = (
    minutes_listend
    .groupby('artist_name', as_index=False)[' ms_played']
    .sum()
    .sort_values(by=' ms_played', ascending=False)
)
ms_played = group_ms_played.head(50)

In [75]:
# Total listening time per artist for the artists selected in `ms_played`.
# The ' ms_played' column is in milliseconds, while the title and y-axis
# promise seconds, so convert before plotting to keep the labels truthful.
seconds_played = ms_played[' ms_played'] / 1000

fig = go.Figure(
    data=[
        go.Bar(
            x = ms_played['artist_name'], 
            y = seconds_played,
            marker = {
                'color' : seconds_played  # shade each bar by its own total
            },
            text = ms_played['artist_name'],
        )
    ]
)
fig.update_layout(
    title_text= 'Total time listened per artist (in seconds)',
    barmode = 'group', 
    xaxis_tickangle=45,
    xaxis_title = "Artist Name",
    yaxis_title = "Seconds",
    hovermode = 'x',
    height = 650,
    width = 1210,
)
fig.show()
