In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [4]:
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

### Reading CSV files

In [5]:
history = pd.read_csv('./dataset/csv_data/streaming_history_data.csv')

In [8]:
history.head(5)

Unnamed: 0,song_id,artist_name,track_name,uri_id,release_date,type,year,month,day,explicit,popularity,end_time,ms_played,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,4PV0uE5pZSh44E3NqNNDEH,Madison Beer,Selfish,spotify:track:4PV0uE5pZSh44E3NqNNDEH,2020-02-14,track,2020,11,17,False,69,2020-11-17 12:42:00,132290,0.375,0.461,-6.202,0.0279,0.627,0.0,0.386,0.233,75.217
1,4PV0uE5pZSh44E3NqNNDEH,Madison Beer,Selfish,spotify:track:4PV0uE5pZSh44E3NqNNDEH,2020-02-14,track,2020,11,18,False,69,2020-11-18 05:27:00,68420,0.375,0.461,-6.202,0.0279,0.627,0.0,0.386,0.233,75.217
2,6SBJ2XHc4jm6Abqt7zocMz,Demi Lovato,Still Have Me,spotify:track:6SBJ2XHc4jm6Abqt7zocMz,2020-10-01,track,2020,11,18,False,59,2020-11-18 05:28:00,11750,0.281,0.462,-6.638,0.0674,0.498,0.0,0.108,0.367,75.975
3,76cy1WJvNGJTj78UqeA5zr,Dua Lipa,IDGAF,spotify:track:76cy1WJvNGJTj78UqeA5zr,2017-06-02,track,2020,11,18,True,79,2020-11-18 05:31:00,217946,0.836,0.544,-5.975,0.0943,0.0403,0.0,0.0824,0.51,97.028
4,4NSW0Km5ZG60L8FthUebPJ,Jonas Blue,What I Like About You (feat. Theresa Rex),spotify:track:4NSW0Km5ZG60L8FthUebPJ,2019-03-22,track,2020,11,18,False,67,2020-11-18 05:35:00,220396,0.46,0.8,-3.584,0.05,0.289,6e-06,0.121,0.553,102.46


Converting release_date to datetime

In [7]:
history['release_date'] =  pd.to_datetime(history['release_date'])
history['end_time'] =  pd.to_datetime(history['end_time'])

In [11]:
history['minutes_played'] = history[' ms_played'].divide(60000)
history.drop(' ms_played', axis=1, inplace=True)

KeyError: ' ms_played'

In [12]:
history.columns

Index(['song_id', 'artist_name', 'track_name', 'uri_id', 'release_date',
       'type', 'year', 'month', 'day', 'explicit', 'popularity', 'end_time',
       'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'minutes_played'],
      dtype='object')

## Most played artists name

In [25]:
most_played_artists_by_count = history.groupby(by='artist_name')['track_name'].count().sort_values(ascending=False)[:20]

In [26]:
fig = go.Figure(
    data=[
        go.Bar(
            x = most_played_artists_by_count.index, 
            y = most_played_artists_by_count.values,
            marker = {
                'color' : most_played_artists_by_count.values
            },
            text = most_played_artists_by_count.index,
        )
    ]
)
fig.update_layout(
    title_text= 'Popularity Of Artists By Number Of Times Their Song Was Played',
    barmode='group', 
    xaxis_tickangle=45,
    xaxis_title = "Name",
    yaxis_title = "Count",
    hovermode = 'x',
    height = 650,
    width = 1210,
)
fig.show()


## Count of songs listened

In [32]:
most_played_songs_count = history.groupby(by='track_name')['track_name'].count().sort_values(ascending=False)[:25]

In [36]:
# plot for count of track listened
fig = go.Figure(
    data=[
        go.Bar(
            x = most_played_songs_count.index, 
            y = most_played_songs_count.values,
            marker = {
                'color' : most_played_songs_count.values
            },
            text = most_played_songs_count.index,
            textposition = 'inside'
        )
    ]
)
fig.update_layout(
    title_text='Count of tracks listened',
    barmode='group', 
    xaxis_tickangle=45,
    xaxis_title = "track_name",
    yaxis_title = "track_count",
    height = 800,
    width = 1210,
    hovermode = 'x',
)
fig.show()

## Songs listened per month

In [37]:
# plot for count of track listened
month = history['month'].value_counts().sort_index(ascending=True)

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'July', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec']

fig = go.Figure(
    data=[
        go.Bar(
            x = months, 
            y = month.values,
            marker = {
                'color' : month.values
            },
            text = months,
            textposition = 'inside'
        )
    ]
)
fig.update_layout(
    title_text='Count of songs listened as per month',
    barmode='group', 
    xaxis_title = "month",
    yaxis_title = "count",
    height = 800,
    width = 1210,
    hovermode = 'x'
)
fig.show()

## Popularity of artist based on time listened

In [39]:
amount_of_time = history.groupby(by='artist_name')['minutes_played'].sum().sort_values(ascending=False)[:15]

In [44]:
# plot for count of track listened
fig = go.Figure(
    data=[
        go.Bar(
            x = amount_of_time.index, 
            y = amount_of_time.values,
            marker = {
                'color' : amount_of_time.values
            },
            text = amount_of_time.index,
            textposition = 'inside'
        )
    ]
)
fig.update_layout(
    title_text='Amount of Popularity of artists by amount of time spent listening to their song',
    barmode='group', 
    xaxis_tickangle=45,
    xaxis_title = "track_name",
    yaxis_title = "track_count",
    height = 800,
    width = 1210,
    hovermode = 'x',
)
fig.show()

## Some plot

In [50]:
history['days'] = [d.date() for d in history['end_time']]
history['time'] = [d.time() for d in history['end_time']]
history.drop('end_time', axis=1, inplace=True)
history.head()

KeyError: 'end_time'

In [56]:
day = history.groupby(by=['days'], as_index=False).sum()

In [63]:
fig = px.line(
    day, 
    x="days", 
    y="minutes_played",
    labels={
        "day": "Month",
        "minutes_played": "Minutes Played"
    },
    color_discrete_sequence = px.colors.sequential.Agsunset, 
    title = "Timeline Of My Streaming History"
)

fig.update_layout(
    hovermode = 'x'
)

fig.show()

In [65]:
history.head()

Unnamed: 0,song_id,artist_name,track_name,uri_id,release_date,type,year,month,day,explicit,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,minutes_played,days,time
0,4PV0uE5pZSh44E3NqNNDEH,Madison Beer,Selfish,spotify:track:4PV0uE5pZSh44E3NqNNDEH,2020-02-14,track,2020,11,17,False,69,0.375,0.461,-6.202,0.0279,0.627,0.0,0.386,0.233,75.217,2.204833,2020-11-17,12:42:00
1,4PV0uE5pZSh44E3NqNNDEH,Madison Beer,Selfish,spotify:track:4PV0uE5pZSh44E3NqNNDEH,2020-02-14,track,2020,11,18,False,69,0.375,0.461,-6.202,0.0279,0.627,0.0,0.386,0.233,75.217,1.140333,2020-11-18,05:27:00
2,6SBJ2XHc4jm6Abqt7zocMz,Demi Lovato,Still Have Me,spotify:track:6SBJ2XHc4jm6Abqt7zocMz,2020-10-01,track,2020,11,18,False,59,0.281,0.462,-6.638,0.0674,0.498,0.0,0.108,0.367,75.975,0.195833,2020-11-18,05:28:00
3,76cy1WJvNGJTj78UqeA5zr,Dua Lipa,IDGAF,spotify:track:76cy1WJvNGJTj78UqeA5zr,2017-06-02,track,2020,11,18,True,79,0.836,0.544,-5.975,0.0943,0.0403,0.0,0.0824,0.51,97.028,3.632433,2020-11-18,05:31:00
4,4NSW0Km5ZG60L8FthUebPJ,Jonas Blue,What I Like About You (feat. Theresa Rex),spotify:track:4NSW0Km5ZG60L8FthUebPJ,2019-03-22,track,2020,11,18,False,67,0.46,0.8,-3.584,0.05,0.289,6e-06,0.121,0.553,102.46,3.673267,2020-11-18,05:35:00


## Time spend listening on each day of week

In [67]:
history['week_day_name'] = pd.DatetimeIndex(history['days']).day_name()

In [68]:
week_day_date = history.groupby(by=['week_day_name'], as_index=False).sum()
week_day_date.head()

Unnamed: 0,week_day_name,year,month,day,explicit,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,minutes_played
0,Friday,941695,3046,7290,115,27857,300.9803,299.07096,-3187.958,41.3601,104.886935,33.258049,82.0313,236.2308,56217.641,1068.649417
1,Monday,1139757,3269,8859,163,36722,380.369,364.831,-3616.391,55.9825,136.451998,18.374698,93.4564,300.3914,67624.879,1324.638
2,Saturday,883072,2572,7083,131,27188,291.505,273.2388,-2916.636,41.9329,109.645701,10.002597,71.6116,222.7057,54057.831,1041.378383
3,Sunday,1192326,3019,10523,165,39331,398.893,384.968,-3805.52,55.2403,130.893839,15.24555,103.3164,322.2089,70319.833,1465.5014
4,Thursday,969970,3575,8356,149,31597,324.3023,294.7565,-3375.035,49.3092,117.212076,40.413943,78.7496,247.5078,56784.027,1181.557


In [75]:
fig = px.pie(
    history, 
    names="week_day_name", 
    values="minutes_played", 
    color_discrete_sequence = px.colors.sequential.Agsunset
)
fig.update_layout(
    title = 'Time spend listening on each day of week'
)
fig.show()

## Spider Graph plot

In [78]:
top_5_df = history[['track_name', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness']]
top_5_df.head(6)

Unnamed: 0,track_name,danceability,energy,loudness,speechiness,acousticness
0,Selfish,0.375,0.461,-6.202,0.0279,0.627
1,Selfish,0.375,0.461,-6.202,0.0279,0.627
2,Still Have Me,0.281,0.462,-6.638,0.0674,0.498
3,IDGAF,0.836,0.544,-5.975,0.0943,0.0403
4,What I Like About You (feat. Theresa Rex),0.46,0.8,-3.584,0.05,0.289
5,"break up with your girlfriend, i'm bored",0.726,0.554,-5.29,0.0917,0.0421


In [81]:
top_5 = top_5_df.head(6)
top_5.drop(top_5_df.index[0], inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [86]:
top_5

Unnamed: 0,track_name,danceability,energy,loudness,speechiness,acousticness
1,Selfish,0.375,0.461,-6.202,0.0279,0.627
2,Still Have Me,0.281,0.462,-6.638,0.0674,0.498
3,IDGAF,0.836,0.544,-5.975,0.0943,0.0403
4,What I Like About You (feat. Theresa Rex),0.46,0.8,-3.584,0.05,0.289
5,"break up with your girlfriend, i'm bored",0.726,0.554,-5.29,0.0917,0.0421


In [90]:
import plotly.graph_objects as go

categories = ['danceability','energy',
              'loudness', 'speechiness', 'acousticness']

fig = go.Figure()

fig.add_trace(
      go.Scatterpolar(
            r = [0.375, 0.461, -6.202, 0.0279, 0.6270],
            theta=categories,
            fill='toself',
            name='Selfish'
      )
)

fig.add_trace(go.Scatterpolar(
            r=[0.281, 0.462, -6.638, 0.0674, 0.4980],
            theta=categories,
            fill='toself',
            name='Still Have Me'
      )
)


fig.add_trace(go.Scatterpolar(
            r=[0.836, 0.544, -5.975, 0.0943, 0.0403],
            theta=categories,
            fill='toself',
            name='IDGAF'
      )
)

fig.add_trace(
      go.Scatterpolar(
            r = [0.460, 0.800, -3.584, 0.0500, 0.2890],
            theta=categories,
            fill = 'toself',
            name = 'What I Like About You (feat. Theresa Rex)'
      )
)

fig.add_trace(
      go.Scatterpolar(
            r = [0.726, 0.554, -5.290, 0.0917, 0.0421],
            theta = categories,
            fill='toself',
            name='break up with your girlfriend, i\'m bored'
      )
)

fig.update_layout(
      title = "Diversity in audio features of top 5 songs",
      polar=dict(
      radialaxis = dict(
            visible=True,
            range=[-10, 1]
            )
      ),
      showlegend=True
)
fig.show()

## Venn chart for valence

In [93]:
v = history['valence'].tolist()

less_count, more_count, middle_count = 0, 0, 0
  
# iterating each number in list 
for num in v: 
      
    # checking condition 
    if num >= 0 and num <0.5: 
        less_count += 1
        
    elif num >=0.5 and num < 0.6:
        middle_count += 1
  
    else: 
        more_count += 1
          
print("Less than 0.5: ", less_count) 
print("More than 0.6: ", more_count)
print("Between 0.5 and 0.6: ", middle_count)

Less than 0.5:  1617
More than 0.6:  1313
Between 0.5 and 0.6:  552


In [112]:
fig = go.Figure()

# Create scatter trace of text labels
fig.add_trace(go.Scatter(
    x=[1, 1.75, 2.5],
    y=[1, 1, 1],
    text=["Low Spirit: 1617", "Neutral: 552", "High Spirit: 1313"],
    mode="text",
    textfont=dict(
        color="black",
        size=18,
        family="Arail",
    )
))

# Update axes properties
fig.update_xaxes(
    showticklabels=False,
    showgrid=False,
    zeroline=False,
)

fig.update_yaxes(
    showticklabels=False,
    showgrid=False,
    zeroline=False,
)

# Add circles
fig.add_shape(type="circle",
    line_color="skyblue", fillcolor="skyblue",
    x0=0, y0=0, x1=2, y1=2
)

fig.add_shape(type="circle",
    line_color="firebrick", fillcolor="firebrick",
    x0=1.5, y0=0, x1=3.5, y1=2
)
fig.update_shapes(opacity=0.4, xref="x", yref="y")

fig.update_layout(
    margin=dict(l=20, r=20, b=100),
    height=600, width=800,
    plot_bgcolor="white",
    title = 'Happy or Sad Venn graph'
)

fig.show()

## Pie chart for explicit content

In [113]:
explicit_content = history['explicit'].value_counts().sort_index(ascending=True)
colors = ['Green', 'firebrick']

labels = explicit_content.index
values = explicit_content.values

fig = go.Figure(
    data=[
        go.Pie(
            labels=labels, 
            values=values,
            hole=.3
        )
    ]
)
fig.update_layout(
    title_text='Explicit songs distribution',
    height = 600,
    width = 900,
    hovermode = 'x'
)
fig.update_traces(
    hoverinfo='label+percent', 
    textinfo='value', 
    textfont_size=20,
    marker=dict(
        colors = colors, 
        line=dict(
            color='#000000', width=2
            )
        )
    )
fig.show()

Heatmap for songs attributes

In [11]:
correlation = history[['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]

In [12]:
correlation = correlation.corr(method='spearman')

In [13]:
correlation

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
danceability,1.0,0.042307,0.131355,0.242165,-0.089197,-0.022703,-0.094747,0.397129,-0.101531
energy,0.042307,1.0,0.711381,0.04046,-0.530361,-0.057683,0.08729,0.337796,0.087466
loudness,0.131355,0.711381,1.0,0.014232,-0.388136,-0.271188,0.043753,0.328879,0.017494
speechiness,0.242165,0.04046,0.014232,1.0,0.024421,-0.124487,0.000951,0.105774,0.056379
acousticness,-0.089197,-0.530361,-0.388136,0.024421,1.0,-0.004602,-0.024566,-0.15301,-0.126377
instrumentalness,-0.022703,-0.057683,-0.271188,-0.124487,-0.004602,1.0,-0.053366,-0.132799,0.007457
liveness,-0.094747,0.08729,0.043753,0.000951,-0.024566,-0.053366,1.0,-0.01625,0.004022
valence,0.397129,0.337796,0.328879,0.105774,-0.15301,-0.132799,-0.01625,1.0,0.002955
tempo,-0.101531,0.087466,0.017494,0.056379,-0.126377,0.007457,0.004022,0.002955,1.0


In [14]:
fig = px.imshow(
    correlation,
    labels = dict(
        color="Correlation"
        )
    )
fig.update_layout(
    title_text='Heatmap Correlation between songs attributes',
    height = 600,
    width = 800,
)
fig.show()

In [15]:
popularity = history[['artist_name', 'popularity']]

In [16]:
most_popular = popularity.groupby('artist_name').agg('max').reset_index().sort_values(by='popularity', ascending=False)[:1]
least_popular = popularity.groupby('artist_name').agg('max').reset_index().sort_values(by='popularity', ascending=False)[::-1][:1]

In [17]:
print(f'Most popular: {" ".join(most_popular["artist_name"])}')
print(f'least popular: {" ".join(least_popular["artist_name"])}')

Most popular: Adele
least popular: Amel Bent


Radar Plot

In [18]:
# radar plot
song_attributes = history[['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]

In [19]:
fig = go.Figure(
    data = go.Scatterpolar(
        r = song_attributes.mean(),
        theta = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'],
        fill = 'tonext'
    )
)
fig.update_layout(
    polar = dict(
        radialaxis = dict(
            visible = True
        ),
    ),
    showlegend=False,
    title_text = 'Radar chart for songs features'
)

fig.show()

Correlation between song attributes

In [20]:
def get_correlation_song_attribute(x, y):
    fig = px.scatter(
        history,
        x = x, 
        y = y, 
        color = "explicit", 
        facet_row = "year",
        marginal_y = "box"
    )
    fig.update_layout(
        title = f'Correlation between {x} and {y} attributes',
    )
    return fig

In [21]:
get_correlation_song_attribute('speechiness', 'danceability')

In [22]:
get_correlation_song_attribute('energy', 'liveness')

Total time listened per artist (in seconds)

In [114]:
history.head(1)

Unnamed: 0,song_id,artist_name,track_name,uri_id,release_date,type,year,month,day,explicit,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,minutes_played,days,time,week_day_name
0,4PV0uE5pZSh44E3NqNNDEH,Madison Beer,Selfish,spotify:track:4PV0uE5pZSh44E3NqNNDEH,2020-02-14,track,2020,11,17,False,69,0.375,0.461,-6.202,0.0279,0.627,0.0,0.386,0.233,75.217,2.204833,2020-11-17,12:42:00,Tuesday


In [117]:
minutes_listend = history[['artist_name', 'track_name', 'minutes_played']]
group_ms_played = minutes_listend.groupby(by=['artist_name']).agg('sum').reset_index().sort_values(by='minutes_played', ascending=False)
ms_played = group_ms_played.head(40)

In [118]:
# most artists listened
fig = go.Figure(
    data=[
        go.Bar(
            x = ms_played['artist_name'], 
            y = ms_played['minutes_played'],
            marker = {
                'color' : ms_played['minutes_played']
            },
            text = ms_played['artist_name'],
        )
    ]
)
fig.update_layout(
    title_text= 'Total time listened per artist (in seconds)',
    barmode = 'group', 
    xaxis_tickangle=45,
    xaxis_title = "Artist Name",
    yaxis_title = "Seconds",
    hovermode = 'x',
    height = 650,
    width = 1210,
)
fig.show()
